In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

sns.set(color_codes=True)

<br>
<br>
<br>

### Data Collection

In [3]:
# training data
train_data = pd.read_csv("../data/train.csv")
train_data.head()

Unnamed: 0,Id,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,0,8.0,0.5,0.39,2.2,0.073,30.0,39.0,0.99572,3.33,0.77,12.1,6
1,1,9.3,0.3,0.73,2.3,0.092,30.0,67.0,0.99854,3.32,0.67,12.8,6
2,2,7.1,0.51,0.03,2.1,0.059,3.0,12.0,0.9966,3.52,0.73,11.3,7
3,3,8.1,0.87,0.22,2.6,0.084,11.0,65.0,0.9973,3.2,0.53,9.8,5
4,4,8.5,0.36,0.3,2.3,0.079,10.0,45.0,0.99444,3.2,1.36,9.5,6


In [4]:
# test data
test_data = pd.read_csv("../data/test.csv")
test_data.head()

Unnamed: 0,Id,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,2056,7.2,0.51,0.01,2.0,0.077,31.0,54.0,0.99748,3.39,0.59,9.8
1,2057,7.2,0.755,0.15,2.0,0.102,14.0,35.0,0.99586,3.33,0.68,10.0
2,2058,8.4,0.46,0.4,2.0,0.065,21.0,50.0,0.99774,3.08,0.65,9.5
3,2059,8.0,0.47,0.4,1.8,0.056,14.0,25.0,0.9948,3.3,0.65,11.7
4,2060,6.5,0.34,0.32,2.1,0.044,8.0,94.0,0.99356,3.23,0.48,12.8


<br>
<br>
<br>

### Feature Engineering

In [24]:
# creating new features
train_data['total_acid'] = train_data['fixed acidity'] + train_data['volatile acidity'] + train_data['citric acid']
train_data['acid/density'] = train_data['total_acid']  / train_data['density']
train_data['alcohol_density'] = train_data['alcohol']  * train_data['density']
train_data['sulphate/density'] = train_data['sulphates']  / train_data['density']
train_data['sulphates/acid'] = train_data['sulphates'] / train_data['volatile acidity']
train_data['sulphates/chlorides'] = train_data['sulphates'] / train_data['chlorides']
train_data['sulphates*alcohol'] = train_data['sulphates'] / train_data['alcohol']

In [25]:
# inspecting updated data
train_data.head()

Unnamed: 0,Id,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,total_acid,acid/density,alcohol_density,sulphate/density,sulphates/acid,sulphates/chlorides,sulphates*alcohol
0,0,8.0,0.5,0.39,2.2,0.073,30.0,39.0,0.99572,3.33,0.77,12.1,6,8.89,8.928213,12.048212,0.77331,1.54,10.547945,0.063636
1,1,9.3,0.3,0.73,2.3,0.092,30.0,67.0,0.99854,3.32,0.67,12.8,6,10.33,10.345104,12.781312,0.67098,2.233333,7.282609,0.052344
2,2,7.1,0.51,0.03,2.1,0.059,3.0,12.0,0.9966,3.52,0.73,11.3,7,7.64,7.666065,11.26158,0.73249,1.431373,12.372881,0.064602
3,3,8.1,0.87,0.22,2.6,0.084,11.0,65.0,0.9973,3.2,0.53,9.8,5,9.19,9.21488,9.77354,0.531435,0.609195,6.309524,0.054082
4,4,8.5,0.36,0.3,2.3,0.079,10.0,45.0,0.99444,3.2,1.36,9.5,6,9.16,9.211214,9.44718,1.367604,3.777778,17.21519,0.143158


<br>
<br>
<br>

### Data Preprocessing

In [None]:
from sklearn.preprocessing import RobustScaler

In [56]:
# creating scaler
scaler = RobustScaler()

In [57]:
# removing id and quality feature
data_1 = train_data.drop(['Id', 'quality'], axis=1)
data_1.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,total_acid,acid/density,alcohol_density,sulphate/density,sulphates/acid,sulphates/chlorides,sulphates*alcohol
0,8.0,0.5,0.39,2.2,0.073,30.0,39.0,0.99572,3.33,0.77,12.1,8.89,8.928213,12.048212,0.77331,1.54,10.547945,0.063636
1,9.3,0.3,0.73,2.3,0.092,30.0,67.0,0.99854,3.32,0.67,12.8,10.33,10.345104,12.781312,0.67098,2.233333,7.282609,0.052344
2,7.1,0.51,0.03,2.1,0.059,3.0,12.0,0.9966,3.52,0.73,11.3,7.64,7.666065,11.26158,0.73249,1.431373,12.372881,0.064602
3,8.1,0.87,0.22,2.6,0.084,11.0,65.0,0.9973,3.2,0.53,9.8,9.19,9.21488,9.77354,0.531435,0.609195,6.309524,0.054082
4,8.5,0.36,0.3,2.3,0.079,10.0,45.0,0.99444,3.2,1.36,9.5,9.16,9.211214,9.44718,1.367604,3.777778,17.21519,0.143158


In [60]:
# scaling data
train_scaled = scaler.fit_transform(data_1)
# converting back to dataframe
train_scaled = pd.DataFrame(train_scaled, columns=data_1.columns)
train_scaled.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,total_acid,acid/density,alcohol_density,sulphate/density,sulphates/acid,sulphates/chlorides,sulphates*alcohol
0,0.025,-0.08,0.424242,0.0,-0.315789,0.875,-0.116279,-0.445455,0.105263,0.941176,1.333333,0.0625,0.06609,1.305148,0.951503,0.388627,0.909747,0.283881
1,0.675,-0.88,1.454545,0.142857,0.684211,0.875,0.534884,0.836364,0.052632,0.352941,1.8,0.754808,0.746963,1.78776,0.34857,1.230653,-0.145427,-0.523106
2,-0.425,-0.04,-0.666667,-0.142857,-1.052632,-0.8125,-0.744186,-0.045455,1.105263,0.705882,0.8,-0.538462,-0.540423,0.787294,0.710994,0.256704,1.499465,0.35287
3,0.075,1.4,-0.090909,0.571429,0.263158,-0.3125,0.488372,0.272727,-0.578947,-0.470588,-0.2,0.206731,0.203845,-0.192308,-0.473633,-0.741798,-0.459874,-0.398915
4,0.275,-0.64,0.151515,0.142857,0.0,-0.375,0.023256,-1.027273,-0.578947,4.411765,-0.4,0.192308,0.202084,-0.407156,4.453107,3.106319,3.064228,5.966608


<br>
<br>
<br>

### Data Preparation

In [5]:
from sklearn.model_selection import train_test_split

In [71]:
# creating X
X = train_data.drop(['Id', 'quality'], axis=1)
# X = train_scaled
X.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,total_acid,acid/density,alcohol_density,sulphate/density,sulphates/acid,sulphates/chlorides,sulphates*alcohol
0,8.0,0.5,0.39,2.2,0.073,30.0,39.0,0.99572,3.33,0.77,12.1,8.89,8.928213,12.048212,0.77331,1.54,10.547945,0.063636
1,9.3,0.3,0.73,2.3,0.092,30.0,67.0,0.99854,3.32,0.67,12.8,10.33,10.345104,12.781312,0.67098,2.233333,7.282609,0.052344
2,7.1,0.51,0.03,2.1,0.059,3.0,12.0,0.9966,3.52,0.73,11.3,7.64,7.666065,11.26158,0.73249,1.431373,12.372881,0.064602
3,8.1,0.87,0.22,2.6,0.084,11.0,65.0,0.9973,3.2,0.53,9.8,9.19,9.21488,9.77354,0.531435,0.609195,6.309524,0.054082
4,8.5,0.36,0.3,2.3,0.079,10.0,45.0,0.99444,3.2,1.36,9.5,9.16,9.211214,9.44718,1.367604,3.777778,17.21519,0.143158


In [72]:
# creating y
y = train_data[['quality']]
y.head()

Unnamed: 0,quality
0,6
1,6
2,7
3,5
4,6


In [73]:
# creating validation set
x_train, x_val, y_train, y_val = train_test_split(X, y, test_size=0.2, shuffle=True, stratify=y.quality)

In [74]:
# verifying shape
print(x_train.shape)
print(x_val.shape)
print(y_train.shape)
print(y_val.shape)

(1644, 18)
(412, 18)
(1644, 1)
(412, 1)


<br>
<br>
<br>

### Modelling

In [18]:
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier

from sklearn.metrics import cohen_kappa_score, make_scorer
from sklearn.model_selection import cross_val_score

In [19]:
# quadratic weighted kappa
def quad_kappa(y, y_pred):
    return cohen_kappa_score(y, y_pred, weights='quadratic')

In [20]:
# custom scorer
custom_scorer = make_scorer(quad_kappa, greater_is_better=True)

<br>
<br>

In [75]:
# creatiing classifier
classifier = RandomForestClassifier()

In [76]:
# cross validation scores
scores = cross_val_score(classifier, x_train, np.ravel(y_train), cv=10, scoring=custom_scorer, n_jobs=-1)

In [77]:
# inspecting scores
scores

array([0.44354234, 0.48879757, 0.49597905, 0.48151087, 0.45753528,
       0.45315463, 0.48771973, 0.47774166, 0.44348598, 0.52965585])

In [78]:
# mean score
scores.mean()

0.4759122963344008

In [79]:
# training the model
classifier.fit(x_train, np.ravel(y_train))

In [80]:
# inspecting feature importance
importance = pd.Series(classifier.feature_importances_, index=classifier.feature_names_in_)
importance.sort_values(ascending=False)

alcohol_density         0.098013
sulphate/density        0.094447
alcohol                 0.072236
sulphates*alcohol       0.062481
sulphates/acid          0.059691
density                 0.056189
sulphates               0.056015
total sulfur dioxide    0.055944
sulphates/chlorides     0.050385
pH                      0.047683
residual sugar          0.047262
total_acid              0.046575
volatile acidity        0.045114
chlorides               0.044646
acid/density            0.043545
free sulfur dioxide     0.041916
citric acid             0.039512
fixed acidity           0.038344
dtype: float64

<br>
<br>
<br>

### Hyperparameter tunning

In [81]:
import optuna as op

In [82]:
# objective function
def objective(trial):

    n_estimators = trial.suggest_int('n_estimators', 100, 500)
    max_depth = trial.suggest_int('max_depth', 1, 20)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 20)

    clf = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, min_samples_split=min_samples_split)
    return cross_val_score(clf, x_train, np.ravel(y_train), n_jobs=-1, cv=5, scoring=custom_scorer).mean()

In [83]:
# creating study
study = op.create_study(direction='maximize')
# optimising the objective function with the study
study.optimize(objective, n_trials=100)

[32m[I 2023-02-02 16:24:19,492][0m A new study created in memory with name: no-name-33407c3a-96fd-4346-a586-d7303eae6c3a[0m
[32m[I 2023-02-02 16:24:23,105][0m Trial 0 finished with value: 0.4815750030809694 and parameters: {'n_estimators': 488, 'max_depth': 10, 'min_samples_split': 13}. Best is trial 0 with value: 0.4815750030809694.[0m
[32m[I 2023-02-02 16:24:25,888][0m Trial 1 finished with value: 0.4854157295469005 and parameters: {'n_estimators': 490, 'max_depth': 11, 'min_samples_split': 15}. Best is trial 1 with value: 0.4854157295469005.[0m
[32m[I 2023-02-02 16:24:27,077][0m Trial 2 finished with value: 0.49217777205350044 and parameters: {'n_estimators': 229, 'max_depth': 7, 'min_samples_split': 10}. Best is trial 2 with value: 0.49217777205350044.[0m
[32m[I 2023-02-02 16:24:29,433][0m Trial 3 finished with value: 0.4897381883143135 and parameters: {'n_estimators': 424, 'max_depth': 19, 'min_samples_split': 18}. Best is trial 2 with value: 0.49217777205350044.[0m

In [88]:
# objective function 2
def objective_2(trial):

    n_estimators = trial.suggest_int('n_estimators', 100, 500)
    max_depth = trial.suggest_int('max_depth', 1, 20)
    learning_rate = trial.suggest_float('learning_rate', 0.01, 1.0)

    clf = LGBMClassifier(n_estimators=n_estimators, max_depth=max_depth, learning_rate=learning_rate)
    return cross_val_score(clf, x_train, np.ravel(y_train), n_jobs=-1, cv=5, scoring=custom_scorer).mean()

In [90]:
# study 2
study = op.create_study(direction='maximize')
study.optimize(objective_2, n_trials=1000)

[32m[I 2023-02-02 16:34:52,069][0m A new study created in memory with name: no-name-f73f7a08-9441-4fe1-b3fc-7d75bdf006bf[0m
[32m[I 2023-02-02 16:35:04,823][0m Trial 0 finished with value: 0.4293571604103116 and parameters: {'n_estimators': 460, 'max_depth': 13, 'learning_rate': 0.2578960794460902}. Best is trial 0 with value: 0.4293571604103116.[0m
[32m[I 2023-02-02 16:35:08,591][0m Trial 1 finished with value: 0.18227648624628012 and parameters: {'n_estimators': 137, 'max_depth': 12, 'learning_rate': 0.5860464252826997}. Best is trial 0 with value: 0.4293571604103116.[0m
[32m[I 2023-02-02 16:35:10,206][0m Trial 2 finished with value: 0.2828150206841702 and parameters: {'n_estimators': 262, 'max_depth': 3, 'learning_rate': 0.5897680381371543}. Best is trial 0 with value: 0.4293571604103116.[0m
[32m[I 2023-02-02 16:35:14,549][0m Trial 3 finished with value: 0.43917889873184757 and parameters: {'n_estimators': 287, 'max_depth': 12, 'learning_rate': 0.35420408570934375}. Bes

KeyboardInterrupt: 