In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

sns.set(color_codes=True)

In [2]:
import time

<br>
<br>
<br>

### Data Collection

In [3]:
# importing training data
train_data = pd.read_csv("../data/train_scaled_std.csv")
train_data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,-0.214182,-0.159429,0.663801,-0.23163,-0.373311,1.303453,-0.310649,-0.562501,0.136562,0.933169,1.638215,6
1,0.548293,-1.314684,2.470184,-0.115164,0.427602,1.303453,0.539042,0.980993,0.066281,0.208049,2.318768,6
2,-0.742049,-0.101666,-1.248839,-0.348097,-0.963457,-1.394584,-1.129995,-0.080843,1.471893,0.643121,0.86044,7
3,-0.15553,1.977792,-0.23939,0.234236,0.090375,-0.595165,0.47835,0.302294,-0.777086,-0.807119,-0.597888,5
4,0.079077,-0.968107,0.185641,-0.115164,-0.120391,-0.695093,-0.128573,-1.263094,-0.777086,5.211378,-0.889553,6


In [None]:
# importing test data
test_data = pd.read_csv("../data/test_scaled_std.csv")
test_data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,-0.683397,-0.101666,-1.355097,-0.464564,-0.204698,1.403381,0.144543,0.400815,0.558245,-0.372047,-0.597888
1,-0.683397,1.31352,-0.611292,-0.464564,0.849135,-0.295383,-0.432034,-0.485873,0.136562,0.280561,-0.403444
2,0.020425,-0.39048,0.71693,-0.464564,-0.710537,0.404108,0.023158,0.543122,-1.620453,0.063025,-0.889553
3,-0.214182,-0.332717,0.71693,-0.697497,-1.089917,-0.295383,-0.735495,-1.066052,-0.07428,0.063025,1.249328
4,-1.093961,-1.083633,0.291899,-0.348097,-1.595756,-0.894947,1.358388,-1.744751,-0.566244,-1.169679,2.318768


<br>
<br>
<br>

### Data Preparation

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
# creatign X
X = train_data.drop(['quality'], axis=1)
X.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,-0.214182,-0.159429,0.663801,-0.23163,-0.373311,1.303453,-0.310649,-0.562501,0.136562,0.933169,1.638215
1,0.548293,-1.314684,2.470184,-0.115164,0.427602,1.303453,0.539042,0.980993,0.066281,0.208049,2.318768
2,-0.742049,-0.101666,-1.248839,-0.348097,-0.963457,-1.394584,-1.129995,-0.080843,1.471893,0.643121,0.86044
3,-0.15553,1.977792,-0.23939,0.234236,0.090375,-0.595165,0.47835,0.302294,-0.777086,-0.807119,-0.597888
4,0.079077,-0.968107,0.185641,-0.115164,-0.120391,-0.695093,-0.128573,-1.263094,-0.777086,5.211378,-0.889553


In [7]:
# creating y
y = train_data[['quality']]
y.head()

Unnamed: 0,quality
0,6
1,6
2,7
3,5
4,6


In [8]:
# creating validation sets
x_train, x_val, y_train, y_val = train_test_split(X, y, test_size=0.2, shuffle=True, stratify=y.quality, random_state=42)

In [9]:
# verifying shapes
print(x_train.shape)
print(x_val.shape)
print(y_train.shape)
print(y_val.shape)

(1644, 11)
(412, 11)
(1644, 1)
(412, 1)


<br>
<br>
<br>

### Model Selection

In [15]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

from sklearn.metrics import cohen_kappa_score, make_scorer
from sklearn.model_selection import StratifiedKFold, cross_val_score, GridSearchCV

In [13]:
# quadratic weighted kappa
def quad_kappa(y, y_pred):
    return cohen_kappa_score(y, y_pred, weights='quadratic')

In [14]:
# creating custom scorer
custom_scorer = make_scorer(quad_kappa, greater_is_better=True)

<br>

#### xgboost

In [21]:
# best set of parameters for xgboost
params = {
          'subsample'       : 0.1,
          'reg_lambda'      : 50,
          'min_child_weight': 1,
          'max_depth'       : 6,
          'learning_rate'   : 0.05,
          'colsample_bytree': 0.4,
          'objective'       : 'multi:softmax',
          'eval_metric'     : 'mlogloss'
         }

In [22]:
# creating classifier
xgboost = XGBClassifier(**params, n_estimators=20000, seed=42)

In [23]:
# training the model
xgboost.fit(x_train, np.ravel(y_train))

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


In [24]:
# training data score
predictions = xgboost.predict(x_train)
quad_kappa(np.ravel(y_train), predictions)

0.9016653066689642

In [25]:
# validation data score
predictions = xgboost.predict(x_val)
quad_kappa(np.ravel(y_val), predictions)

0.5152708413369189

In [73]:
# cross validation score
scores = cross_val_score(xgboost, x_train, y_train, cv=10, n_jobs=-1, scoring=custom_scorer)
scores.mean()

0.4271704021973323

<br>

#### light_gbm

In [29]:
# best set of parameters for light_gbm
params = {
          'learning_rate'    : 0.05,
          'max_depth'        : 6,
          'subsample'        : 0.4,
          'subsample_freq'   : 3,
          'colsample_bytree' : 0.7,
          'reg_lambda'       : 100,
          'num_leaves'       : 20,
          'min_child_samples': 30,
          'objective'        : 'multiclass',
          'metric'           : 'multi_logloss'
         } 

In [30]:
# creating classifier
light_gbm = LGBMClassifier(**params, n_estimators=20000, seed=42)

In [31]:
# training the model
light_gbm.fit(x_train, np.ravel(y_train))

In [32]:
# score on training data
predictions = light_gbm.predict(x_train)
quad_kappa(np.ravel(y_train), predictions)

1.0

In [33]:
# score on validation data
predictions = light_gbm.predict(x_val)
quad_kappa(np.ravel(y_val), predictions)

0.46612436131555157

<br>

#### catboost

In [45]:
# best set of parameters for catboost
params = {
          'learning_rate'   : 0.01,
          'depth'           : 3,
          'rsm'             : 1,
          'min_data_in_leaf': 25,
          'l2_leaf_reg'     : 70,
          "objective"       : "MultiClass",
          "loss_function"   : "WKappa"
         }

In [46]:
# creating classifier
cat_boost = CatBoostClassifier(**params,
                             n_estimators=20000,
                             random_state=42)

In [47]:
# training the model
cat_boost.fit(x_train, np.ravel(y_train))

0:	learn: 1.7859359	total: 159ms	remaining: 53m 8s
1:	learn: 1.7788072	total: 162ms	remaining: 26m 55s
2:	learn: 1.7734162	total: 164ms	remaining: 18m 10s
3:	learn: 1.7659143	total: 166ms	remaining: 13m 48s
4:	learn: 1.7584172	total: 168ms	remaining: 11m 11s
5:	learn: 1.7508726	total: 171ms	remaining: 9m 29s
6:	learn: 1.7444075	total: 173ms	remaining: 8m 13s
7:	learn: 1.7373677	total: 175ms	remaining: 7m 16s
8:	learn: 1.7316664	total: 177ms	remaining: 6m 33s
9:	learn: 1.7263252	total: 179ms	remaining: 5m 58s
10:	learn: 1.7206899	total: 181ms	remaining: 5m 29s
11:	learn: 1.7145153	total: 183ms	remaining: 5m 5s
12:	learn: 1.7082560	total: 185ms	remaining: 4m 45s
13:	learn: 1.7026570	total: 188ms	remaining: 4m 29s
14:	learn: 1.6962806	total: 190ms	remaining: 4m 13s
15:	learn: 1.6905702	total: 192ms	remaining: 4m
16:	learn: 1.6839131	total: 195ms	remaining: 3m 49s
17:	learn: 1.6773381	total: 198ms	remaining: 3m 39s
18:	learn: 1.6723950	total: 201ms	remaining: 3m 31s
19:	learn: 1.6665775	to

<catboost.core.CatBoostClassifier at 0x1e701007910>

In [48]:
# score on validation set
predictions = cat_boost.predict(x_val)
quad_kappa(np.ravel(y_val), predictions)

0.5563902243589745