<a href="https://colab.research.google.com/github/elifoskanbas/redWineQualityPredictor/blob/main/redWineQualityPredictor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Red Wine Quality Predictor Regression Model

In [77]:
from google.colab import files
files.upload()

!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

#kaggle.json is your API file that you have to download from Kaggle

Saving kaggle.json to kaggle.json


In [78]:
!kaggle datasets download -d uciml/red-wine-quality-cortez-et-al-2009

Dataset URL: https://www.kaggle.com/datasets/uciml/red-wine-quality-cortez-et-al-2009
License(s): DbCL-1.0
red-wine-quality-cortez-et-al-2009.zip: Skipping, found more recently modified local copy (use --force to force download)


In [79]:
import zipfile
import pandas as pd
import os

zip_filename = 'red-wine-quality-cortez-et-al-2009.zip'
extract_dir = './wine_data'

if not os.path.exists(extract_dir):
    os.makedirs(extract_dir)

# opening zip
with zipfile.ZipFile(zip_filename, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)
    print("zip file opened successfully.")

file_path = os.path.join(extract_dir, 'winequality-red.csv')
df = pd.read_csv(file_path)

print("\nfirst 5 row of the data set:")
print(df.head())

zip file opened successfully.

first 5 row of the data set:
   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0            7.4              0.70         0.00             1.9      0.076   
1            7.8              0.88         0.00             2.6      0.098   
2            7.8              0.76         0.04             2.3      0.092   
3           11.2              0.28         0.56             1.9      0.075   
4            7.4              0.70         0.00             1.9      0.076   

   free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  \
0                 11.0                  34.0   0.9978  3.51       0.56   
1                 25.0                  67.0   0.9968  3.20       0.68   
2                 15.0                  54.0   0.9970  3.26       0.65   
3                 17.0                  60.0   0.9980  3.16       0.58   
4                 11.0                  34.0   0.9978  3.51       0.56   

   alcohol  quality  
0   

In [80]:
X = df.drop('quality', axis=1)
y = df['quality']

In [81]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [82]:
from sklearn.preprocessing import StandardScaler
import pandas as pd


scaler = StandardScaler()
columns_to_scale = []

for col in X_train.columns:
    if X_train[col].dtype in ['float64', 'int64']:
        if col != 'quality':
            columns_to_scale.append(col)


X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()

for col in columns_to_scale:
    X_train_scaled[col] = scaler.fit_transform(X_train_scaled[[col]])
    X_test_scaled[col] = scaler.transform(X_test_scaled[[col]])

print(X_train_scaled.head())



     fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
493       0.218332          0.889712     0.192092        0.309726  -0.049642   
354      -1.290166         -1.788783     0.652753       -0.805080  -0.455214   
342       1.494753         -0.784347     1.011045       -0.526378   0.599272   
834       0.276351          0.861811    -0.063831       -0.665729  -0.009085   
705       0.044274          2.814880    -0.626861        2.399985  -0.313264   

     free sulfur dioxide  total sulfur dioxide   density        pH  sulphates  \
493             0.691007              1.042934  1.846696  1.093500   0.458223   
354             2.388473              3.593870 -3.004491 -0.400439  -0.401197   
342            -0.957960             -0.991742  0.768655 -0.075669   0.515517   
834             0.012020             -0.718427  0.089488  0.054238  -1.088733   
705            -0.472970              0.222990  1.199871  0.379008  -0.974144   

      alcohol  
493  1.123177  


In [83]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

model = LinearRegression()
model.fit(X_train_scaled,y_train)
y_pred = model.predict(X_test_scaled)

linear_mse = mean_squared_error(y_test, y_pred)
linear_r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", linear_mse)
print("R^2 Score:", linear_r2)

Mean Squared Error: 0.3900251439639549
R^2 Score: 0.403180341279622


In [84]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score


param_grid = {
    'C': [0.1, 1, 10, 100], # regularization
    'gamma': [0.001, 0.01, 0.1, 1], # RBF kernel parameter
    'epsilon': [0.1, 0.2, 0.3, 0.4]
}


svr = SVR(kernel='rbf')

grid_search = GridSearchCV(
    estimator=svr,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5, # divide the training dataset into 5 equal parts
    verbose=3,
    n_jobs=-1 #all CPU cores
)

grid_search.fit(X_train_scaled, y_train.values.ravel())


print("Best parameters:", grid_search.best_params_)

print("Best negative MSE Score:", grid_search.best_score_)

best_svr_model = grid_search.best_estimator_


y_pred_tuned = best_svr_model.predict(X_test_scaled)

mse_tuned = mean_squared_error(y_test, y_pred_tuned)
r2_tuned = r2_score(y_test, y_pred_tuned)

print("set (Tuned) SVR MSE:", mse_tuned)
print("set (Tuned) SVR R^2:", r2_tuned)

Fitting 5 folds for each of 64 candidates, totalling 320 fits
Best parameters: {'C': 1, 'epsilon': 0.2, 'gamma': 0.1}
Best negative MSE Score: -0.4079911621807212
set (Tuned) SVR MSE: 0.34433402834611887
set (Tuned) SVR R^2: 0.4730972593337831


In [85]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, None],      # (None=infinity)
    'min_samples_split': [2, 5, 10]
}

rf = RandomForestRegressor(random_state=42)

grid_search_rf = GridSearchCV(
    estimator=rf,
    param_grid=param_grid_rf,
    scoring='neg_mean_squared_error',
    cv=5,
    verbose=2,
    n_jobs=-1
)

print("Starting Grid Search...")
grid_search_rf.fit(X_train_scaled, y_train.values.ravel())
print("Grid Search Completed.")

# Best hyperparameter combination
best_params_rf = grid_search_rf.best_params_
print("\nBest Random Forest Parameters:", best_params_rf)

# Best model's Cross Validation score (Negative MSE)
print("Best Negative MSE (CV Score):", grid_search_rf.best_score_)


best_rf_model = grid_search_rf.best_estimator_
y_pred_tuned_rf = best_rf_model.predict(X_test_scaled)

mse_tuned_rf = mean_squared_error(y_test, y_pred_tuned_rf)
r2_tuned_rf = r2_score(y_test, y_pred_tuned_rf)

print("\n--- Optimized Random Forest Performance ---")
print(f"Mean Squared Error (MSE): {mse_tuned_rf:.4f}")
print(f"R^2 Score: {r2_tuned_rf:.4f}")




Starting Grid Search...
Fitting 5 folds for each of 27 candidates, totalling 135 fits
Grid Search Completed.

Best Random Forest Parameters: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 300}
Best Negative MSE (CV Score): -0.3679196825639978

--- Optimized Random Forest Performance ---
Mean Squared Error (MSE): 0.3067
R^2 Score: 0.5307


In [86]:
print("\n--- Comparison ---")
print("\n--- Previous Linear results ---")
print(f"linear MSE: {linear_mse}")
print(f"linear R^2: {linear_r2}")

print("\n--- Previous SVR results ---")
print(f"SVR MSE: {mse_tuned}")
print(f"SVR R^2: {r2_tuned}")

print("\n--- RF results ---")
print(f"SVR MSE: {mse_tuned_rf}")
print(f"SVR R^2: {r2_tuned_rf}")


--- Comparison ---

--- Previous Linear results ---
linear MSE: 0.3900251439639549
linear R^2: 0.403180341279622

--- Previous SVR results ---
SVR MSE: 0.34433402834611887
SVR R^2: 0.4730972593337831

--- RF results ---
SVR MSE: 0.3066800694444445
SVR R^2: 0.5307156545807452


The best performance belongs to the Random Forest