In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

# configure seaborn's global plotting defaults (with color_codes enabled)
sns.set(color_codes=True)

In [2]:
import pycaret.classification as pcc
from sklearn.metrics import cohen_kappa_score

<br>
<br>
<br>

### Data Collection

In [3]:
# load the training split from disk and preview its first five rows
train_data = pd.read_csv("../data/train.csv")
train_data.head(n=5)

Unnamed: 0,Id,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,0,8.0,0.5,0.39,2.2,0.073,30.0,39.0,0.99572,3.33,0.77,12.1,6
1,1,9.3,0.3,0.73,2.3,0.092,30.0,67.0,0.99854,3.32,0.67,12.8,6
2,2,7.1,0.51,0.03,2.1,0.059,3.0,12.0,0.9966,3.52,0.73,11.3,7
3,3,8.1,0.87,0.22,2.6,0.084,11.0,65.0,0.9973,3.2,0.53,9.8,5
4,4,8.5,0.36,0.3,2.3,0.079,10.0,45.0,0.99444,3.2,1.36,9.5,6


In [4]:
# class distribution of the raw target column
train_data["quality"].value_counts()

5    839
6    778
7    333
4     55
8     39
3     12
Name: quality, dtype: int64

In [5]:
# shift the target down by 3 so the lowest observed label (3) becomes 0
# and the labels range from 0-5
# NOTE: the column is rewritten in place, so running this cell twice shifts twice
train_data["quality"] = train_data["quality"].sub(3)

In [6]:
# class distribution of the target after the shift
train_data["quality"].value_counts()

2    839
3    778
4    333
1     55
5     39
0     12
Name: quality, dtype: int64

In [7]:
# load the test split from disk and preview its first five rows
test_data = pd.read_csv("../data/test.csv")
test_data.head(n=5)

Unnamed: 0,Id,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,2056,7.2,0.51,0.01,2.0,0.077,31.0,54.0,0.99748,3.39,0.59,9.8
1,2057,7.2,0.755,0.15,2.0,0.102,14.0,35.0,0.99586,3.33,0.68,10.0
2,2058,8.4,0.46,0.4,2.0,0.065,21.0,50.0,0.99774,3.08,0.65,9.5
3,2059,8.0,0.47,0.4,1.8,0.056,14.0,25.0,0.9948,3.3,0.65,11.7
4,2060,6.5,0.34,0.32,2.1,0.044,8.0,94.0,0.99356,3.23,0.48,12.8


In [8]:
# preview the training frame again to check its columns after the target shift
train_data.head(n=5)

Unnamed: 0,Id,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,0,8.0,0.5,0.39,2.2,0.073,30.0,39.0,0.99572,3.33,0.77,12.1,3
1,1,9.3,0.3,0.73,2.3,0.092,30.0,67.0,0.99854,3.32,0.67,12.8,3
2,2,7.1,0.51,0.03,2.1,0.059,3.0,12.0,0.9966,3.52,0.73,11.3,4
3,3,8.1,0.87,0.22,2.6,0.084,11.0,65.0,0.9973,3.2,0.53,9.8,2
4,4,8.5,0.36,0.3,2.3,0.079,10.0,45.0,0.99444,3.2,1.36,9.5,3


In [9]:
def quad_kappa(y, y_pred):
    """Return Cohen's kappa between ``y`` and ``y_pred`` using quadratic weights.

    Parameters
    ----------
    y : array-like
        Reference labels, forwarded as the first argument of
        ``cohen_kappa_score``.
    y_pred : array-like
        Predicted labels, forwarded as the second argument.

    Returns
    -------
    The value returned by ``cohen_kappa_score`` with ``weights='quadratic'``.
    """
    score = cohen_kappa_score(y, y_pred, weights='quadratic')
    return score

<br>
<br>
<br>

### Normal Data

In [13]:
# setting up the PyCaret classification experiment on the training frame
# session_id pins the experiment's random seed so a fresh Restart & Run All
# reproduces the same results; 1747 is the seed reported in the recorded run
wine_quality = pcc.setup(
    data=train_data,
    target='quality',
    session_id=1747,
    use_gpu=True,
    # automated feature engineering / selection switches
    feature_interaction=True,
    feature_ratio=True,
    polynomial_features=True,
    trigonometry_features=True,
    feature_selection=True,
    remove_multicollinearity=True
)

Unnamed: 0,Description,Value
0,session_id,1747
1,Target,quality
2,Target Type,Multiclass
3,Label Encoded,
4,Original Data,"(2056, 13)"
5,Missing Values,False
6,Numeric Features,12
7,Categorical Features,0
8,Ordinal Features,False
9,High Cardinality Features,False


In [14]:
# inspecting the environment: display the object returned by pcc.setup
wine_quality

(5,
 None,
 Pipeline(memory=None,
          steps=[('dtypes',
                  DataTypes_Auto_infer(categorical_features=[],
                                       display_types=True, features_todrop=[],
                                       id_columns=['Id'],
                                       ml_usecase='classification',
                                       numerical_features=[], target='quality',
                                       time_features=[])),
                 ('imputer',
                  Simple_Imputer(categorical_strategy='not_available',
                                 fill_value_categorical=None,
                                 fill_value_numerical=None,
                                 numeric_...
                  Fix_multicollinearity(correlation_with_target_preference=None,
                                        correlation_with_target_threshold=0.0,
                                        target_variable='quality',
                                    

<br>
<br>

In [15]:
# register the quadratic-weighted kappa as a custom PyCaret metric
pcc.add_metric(
    'quad_kappa',   # metric id
    'Quad Kappa',   # display name shown in the score grids
    quad_kappa,     # scoring function defined in this notebook
)

Name                                                  Quad Kappa
Display Name                                          Quad Kappa
Score Function       <function quad_kappa at 0x000001BABA1E10D8>
Scorer                                   make_scorer(quad_kappa)
Target                                                      pred
Args                                                          {}
Greater is Better                                           True
Multiclass                                                  True
Custom                                                      True
Name: quad_kappa, dtype: object

In [16]:
# comparing models and ranking them by the custom quad_kappa metric
# keep the two best-ranked models: later cells stack / blend `top_2`,
# which no other visible cell defines (a fresh kernel would raise NameError)
top_2 = pcc.compare_models(sort='quad_kappa', n_select=2)

IntProgress(value=0, description='Processing: ', max=84)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,Quad Kappa,TT (Sec)
qda,Quadratic Discriminant Analysis,0.3725,0.5243,0.2678,0.4712,0.3949,0.1642,0.1746,0.4324,0.044
rf,Random Forest Classifier,0.5073,0.5784,0.2854,0.5252,0.5121,0.2796,0.282,0.4213,1.279
nb,Naive Bayes,0.2377,0.538,0.2862,0.4924,0.2976,0.0991,0.1141,0.3443,0.032
lr,Logistic Regression,0.2905,0.5682,0.3015,0.5271,0.3559,0.1397,0.155,0.3338,2.302
dt,Decision Tree Classifier,0.3913,0.4537,0.2485,0.4364,0.4095,0.1359,0.1375,0.311,0.085
ada,Ada Boost Classifier,0.1549,0.4445,0.3246,0.3555,0.1683,0.0584,0.0758,0.3034,0.431
ridge,Ridge Classifier,0.2244,0.0,0.3185,0.5036,0.273,0.1055,0.1237,0.2787,0.032
svm,SVM - Linear Kernel,0.228,0.0,0.2647,0.3226,0.1845,0.0806,0.1089,0.2198,0.15
knn,K Neighbors Classifier,0.3043,0.4821,0.2439,0.4315,0.349,0.1002,0.1054,0.1997,0.495


<br>
<br>

In [34]:
# stacking the models held in top_2
# NOTE(review): PyCaret documents `optimize` as only consulted when
# choose_better=True -- confirm whether that flag was intended here
staker = pcc.stack_models(
    estimator_list=top_2,
    optimize='Quad Kappa',
)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,Quad Kappa
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,0.5556,0.7138,0.2567,0.5317,0.5195,0.2891,0.2979,0.4181
1,0.5069,0.6878,0.2212,0.4503,0.4521,0.2021,0.2117,0.35
2,0.5486,0.7014,0.2506,0.5348,0.5092,0.2751,0.2851,0.4729
3,0.5903,0.7681,0.2749,0.5536,0.5509,0.3457,0.3576,0.5377
4,0.5486,0.7254,0.2597,0.5088,0.5194,0.2857,0.291,0.4797
5,0.5208,0.7025,0.2459,0.4952,0.4957,0.2362,0.2409,0.4636
6,0.5694,0.7485,0.2662,0.5366,0.5356,0.3109,0.3192,0.5096
7,0.5069,0.6566,0.2393,0.4949,0.4759,0.2068,0.2167,0.3773
8,0.5486,0.0,0.3102,0.5234,0.523,0.2821,0.2884,0.4644
9,0.5524,0.0,0.2945,0.5051,0.5084,0.2751,0.2849,0.3532


In [33]:
# blending the models held in top_2 into a voting ensemble
# the original note says soft voting; no `method` is passed, so this relies
# on PyCaret's default voting method -- TODO confirm
blender = pcc.blend_models(
    estimator_list=top_2,
    optimize='Quad Kappa',
)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,Quad Kappa
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,0.5556,0.7327,0.2613,0.5237,0.5298,0.2996,0.3056,0.4502
1,0.5625,0.7207,0.2636,0.517,0.5335,0.3119,0.3161,0.5372
2,0.5625,0.7312,0.2786,0.5325,0.5446,0.321,0.323,0.4964
3,0.6042,0.7864,0.3004,0.5667,0.5834,0.386,0.388,0.5733
4,0.6042,0.7822,0.2817,0.5637,0.5752,0.3733,0.3796,0.5079
5,0.5278,0.7065,0.2642,0.5028,0.5148,0.2761,0.2767,0.5601
6,0.5903,0.7501,0.28,0.5492,0.5659,0.3572,0.3605,0.5232
7,0.5139,0.6914,0.2582,0.4785,0.4918,0.2418,0.245,0.4819
8,0.5903,0.0,0.3367,0.5587,0.5657,0.3536,0.3603,0.5283
9,0.5734,0.0,0.3352,0.5542,0.5554,0.3235,0.3279,0.4764


In [35]:
# boosting the blended model (the input is `blender`, not the top_2 list)
boosted = pcc.ensemble_model(
    estimator=blender,
    method="Boosting",
    optimize='Quad Kappa',
)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,Quad Kappa
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,0.5625,0.7345,0.2642,0.5251,0.536,0.3117,0.3169,0.4449
1,0.5486,0.7285,0.2541,0.5018,0.5184,0.2888,0.2932,0.5203
2,0.5625,0.7306,0.278,0.5295,0.5426,0.3202,0.3226,0.5165
3,0.625,0.7847,0.3131,0.5862,0.6039,0.4193,0.4214,0.5779
4,0.6181,0.7912,0.2952,0.5781,0.5917,0.3971,0.4018,0.561
5,0.5208,0.7066,0.2573,0.4977,0.5087,0.2642,0.265,0.5339
6,0.5903,0.7542,0.2879,0.5524,0.5689,0.36,0.3622,0.5046
7,0.4792,0.6862,0.2357,0.4508,0.4606,0.184,0.186,0.441
8,0.625,0.0,0.3591,0.5906,0.5992,0.4079,0.4143,0.5992
9,0.5594,0.0,0.3185,0.5353,0.5382,0.2999,0.3049,0.4199


<br>
<br>

In [38]:
# scoring the test frame with the blended model
predictions = pcc.predict_model(estimator=blender, data=test_data)

In [41]:
# pull out the predicted-label column and preview its first five values
preds = predictions["Label"]
preds.head(n=5)

0    3
1    3
2    2
3    3
4    2
Name: Label, dtype: int32

In [43]:
# checking shape of the predicted-label series
preds.shape

(1372,)

In [48]:
# adding 3 to predictions to undo the earlier -3 shift of the target
# computed from `predictions` rather than from `preds` itself, so re-running
# this cell cannot add the offset a second time
preds = predictions["Label"] + 3
preds.head()

0    6
1    6
2    5
3    6
4    5
Name: Label, dtype: int32

In [44]:
# read the sample submission file to reuse its Id / quality layout
submission_df = pd.read_csv("../data/submissions/sample_submission.csv")
submission_df.head(n=5)

Unnamed: 0,Id,quality
0,2056,5
1,2057,5
2,2058,5
3,2059,5
4,2060,5


In [45]:
# checking shape of the sample submission frame
submission_df.shape

(1372, 2)

In [49]:
# overwrite the placeholder quality column with the model's predictions
# (pandas aligns the assigned Series on the row index, so `preds` and
# `submission_df` are expected to share the same index)
submission_df["quality"] = preds
submission_df.head(n=5)

Unnamed: 0,Id,quality
0,2056,6
1,2057,6
2,2058,5
3,2059,6
4,2060,5


In [50]:
# saving as csv file without the row index
# index=False is the documented boolean form; the previous index=None only
# suppressed the index because None is falsy
submission_df.to_csv("../data/submission_2.csv", index=False)