In [2]:
import pandas as pd 
import numpy as np

from matplotlib import pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import cohen_kappa_score

import xgboost as xgb 
import lightgbm as lgbm

import optuna
from optuna.samplers import TPESampler

import warnings
warnings.filterwarnings('ignore')

<br>
<br>
<br>

### Data Collection

In [3]:
# importing training data
train = pd.read_csv('../data/train.csv').drop('Id',axis=1)

# target and feature names
target = 'quality'
features = [c for c in train.columns if c not in ['id','Time', target]]

# importing red wine original dataset
orig_train = pd.read_csv('../red_white_dataset/winequality-red.csv', delimiter=';')
# removing duplicates
orig_train = orig_train[~orig_train.duplicated()]

# concatting original data with training data
train = pd.concat([train, orig_train]).reset_index(drop=True)
# adding new column that labels datapoint as train or test
train['split']= 'train'

# importing test data
test = pd.read_csv('../data/test.csv').drop('Id',axis=1)
# labelling as test
test['split'] = 'test'

# combining train and test datasets
data = pd.concat([train, test]).reset_index(drop=True)

# importing sample submission file
sub = pd.read_csv('../data/submissions/sample_submission.csv')

In [4]:
# inspecting total data
data.head(3)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,split
0,8.0,0.5,0.39,2.2,0.073,30.0,39.0,0.99572,3.33,0.77,12.1,6.0,train
1,9.3,0.3,0.73,2.3,0.092,30.0,67.0,0.99854,3.32,0.67,12.8,6.0,train
2,7.1,0.51,0.03,2.1,0.059,3.0,12.0,0.9966,3.52,0.73,11.3,7.0,train


In [5]:
# shape of train and test data
print(train.shape)
print(test.shape)

(3415, 13)
(1372, 12)


In [6]:
# subtracting 3 from target so it's value ranges from 0 - 5
train[target] = train[target] - 3

<br>
<br>
<br>

### Hyperparameter Tunning

##### xgboost

In [None]:
def objective(trial):

    # parameters searching space
    params_optuna = {
            'max_depth': trial.suggest_int('max_depth', 1, 15),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 1.0),
            'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
            'gamma': trial.suggest_float('gamma', 0.01, 1.0),
            'subsample': trial.suggest_float('subsample', 0.5, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
            'reg_alpha': trial.suggest_float('reg_alpha', 0.0001, 1.0),
            'reg_lambda': trial.suggest_float('reg_lambda', 0.0001, 1.0),
            'n_estimators': 10000,
            'objective' : "multi:softmax",
            'num_class': 5,
    }

    # cv fold setup
    n = 10
    cv = StratifiedKFold(n, shuffle=True, random_state=42)

    # will hold scores of vaslidation set
    fold_scores = []

    for i, (train_idx, val_idx) in enumerate(cv.split(train[features], train[target])):

        # splitting into training and validations set
        X_train, y_train = train.loc[train_idx, features],train.loc[train_idx, target]
        X_val, y_val = train.loc[val_idx, features],train.loc[val_idx, target]

        # training model with train set
        model = xgb.XGBClassifier(**params_optuna)
        model.fit(X_train,
                 y_train,
                 eval_set= [(X_val, y_val)],
                 early_stopping_rounds = 200,
                 verbose=200)

        # prediction score on validation set
        pred_val = model.predict(X_val)
        score = cohen_kappa_score(y_val, pred_val, weights='quadratic')

        # appending score
        fold_scores.append(score)
    
    # return mean validation score
    return np.mean(fold_scores)

In [None]:
# create optuna study
study = optuna.create_study(direction='maximize', sampler = TPESampler())

# optimise the study
study.optimize(func=objective, n_trials=1000, n_jobs=-1)

In [None]:
# get best set of parameters
study.best_params

In [None]:
# get best achieved value
study.best_value

<br>
<br>

##### lightgbm

In [None]:
def objective(trial):

    # parameters searching space
    params_optuna = {
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 1.0),
            "num_leaves": trial.suggest_int("num_leaves", 20, 3000, step=20),
            'max_depth': trial.suggest_int('max_depth', 1, 15),
            "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 200, 10000, step=100),
            "min_gain_to_split": trial.suggest_float("min_gain_to_split", 0, 15),
            "lambda_l1": trial.suggest_int("lambda_l1", 0, 100, step=5),
            "lambda_l2": trial.suggest_int("lambda_l2", 0, 100, step=5),
            "bagging_fraction": trial.suggest_float("bagging_fraction", 0.2, 0.95, step=0.1),
            "feature_fraction": trial.suggest_float("feature_fraction", 0.2, 0.95, step=0.1),
            "bagging_freq": trial.suggest_categorical("bagging_freq", [1]),
            'n_estimators': 10000,
            'objective' : "multiclass",
            'metric' :'multi_logloss',
    }

    # cv fold setup
    n = 10
    cv = StratifiedKFold(n, shuffle=True, random_state=42)

    # will hold scores of vaslidation set
    fold_scores = []

    for i, (train_idx, val_idx) in enumerate(cv.split(train[features], train[target])):

        # splitting into training and validations set
        X_train, y_train = train.loc[train_idx, features],train.loc[train_idx, target]
        X_val, y_val = train.loc[val_idx, features],train.loc[val_idx, target]

        # training model with train set
        model = lgbm.LGBMClassifier(**params_optuna)
        model.fit(X_train,
                 y_train,
                 eval_set= [(X_val, y_val)],
                 early_stopping_rounds = 200,
                 verbose=200)

        # prediction score on validation set
        pred_val = model.predict(X_val)
        score = cohen_kappa_score(y_val,pred_val, weights='quadratic')

        # appending score
        fold_scores.append(score)
    
    # return mean validation score
    return np.mean(fold_scores)

In [None]:
study = optuna.create_study(direction='maximize', sampler = TPESampler())
study.optimize(func=objective, timeout=100)

In [None]:
# get best set of parameters
study.best_params

In [None]:
# get best achieved value
study.best_value

<br>
<br>
<br>

### Modelling

In [7]:
# xg_boost parameters set

xgb_params_1 = {
    'max_depth': 1,
    'learning_rate': 0.38948972099234563,
    'gamma': 0.5992826807539955,
    'subsample': 0.760188310145391,
    'colsample_bytree': 0.9470322372755515,
    'reg_alpha': 0.8286639480742322,
    'reg_lambda': 0.6987138355697013,
    'n_estimators': 10000,
    'num_class': 5,
    'objective' : "multi:softmax",
    'metric': 'multiclass',
    'eval_metric': 'mlogloss',
    'use_label_encoder': False,
}

In [8]:
# light_gbm parameters set

lgbm_params_1 = {
    'num_leaves': 50,
    'max_depth': 8,
    'lambda_l1': 1.9528554374745727e-09,
    'lambda_l2': 6.288470302197343,
    'feature_fraction': 0.8253409987746099,
    'bagging_fraction': 0.6280124722436471,
    'bagging_freq': 4,
    'min_child_samples': 85,
    'min_data_in_leaf': 93,
    'n_estimators':10000,
    'objective' : "multiclass",
    'metric' :'multi_logloss'
}

In [15]:
# cv fold setup
k = 10
cv = StratifiedKFold(k, shuffle=True, random_state=z)

# will hold scores of validation set
fold_scores = []
# will hold predictions of test set
test_preds = []

# will hold out of fold predictions
oof_preds = []
# will hold out of fold true values
oof_true = []

for i, (train_idx, val_idx) in enumerate(cv.split(train[features], train[target])):
    
    # creating training and valisation data
    X_train = train.loc[train_idx, features]
    y_train = train.loc[train_idx, target]
    X_val = train.loc[val_idx, features]
    y_val = train.loc[val_idx, target]
    


    #************** XGBoost **************

    # training xg_boost on training data
    # with best set of parameters
    model1 = xgb.XGBClassifier(**xgb_params_1)
    model1.fit(X_train,
            y_train,
            eval_set= [(X_val,y_val)],
            early_stopping_rounds = 200,
            verbose=200)
    
    # prediction score on validation set
    pred_val1 = model1.predict(X_val)
    score1 = cohen_kappa_score(y_val, pred_val1)
    
    # discard the predictions of poor performing models
    if score1 > 0.36:

        # making predictions on test data
        # and appending them to test_preds
        test_preds.append(model1.predict(test[features]))

        # appending validation score
        fold_scores.append(score1)
    


    #************** LightGBM **************
    
    # training light_gbm on training data
    # with best set of parameters
    model2 = lgbm.LGBMClassifier(**lgbm_params_1)
    model2.fit(X_train,
            y_train,
            eval_set= [(X_val,y_val)],
            early_stopping_rounds = 200,
            verbose=200)
    
    # prediction score on validation set
    pred_val2 = model2.predict(X_val)
    score2 = cohen_kappa_score(y_val, pred_val2)
    
    # discard the predictions of poor performing models
    if score2 > 0.36:

        # making predictions on test data
        # and appending them to test_preds
        test_preds.append(model2.predict(test[features]))

        # appending validation score
        fold_scores.append(score2)
    

    # appending mean of predictions from both models
    oof_preds.extend(np.mean([pred_val1, pred_val2], axis=0))

    # appending true values
    oof_true.extend(y_val)

    # printing average validation score for each fold
    print('')
    print(f'=== Fold {i} Cohen Kappa Score {np.mean([score1, score2])} ===')
    print('')

# printing total average validation score
print('')
print(f'=== Average Cohen Kappa Score {np.mean(fold_scores)} ===')
print('')

Parameters: { "metric" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[0]	validation_0-mlogloss:1.49383
[200]	validation_0-mlogloss:0.94957
[262]	validation_0-mlogloss:0.95374
Training until validation scores don't improve for 200 rounds
[200]	valid_0's multi_logloss: 1.02068
Early stopping, best iteration is:
[57]	valid_0's multi_logloss: 0.966673

=== Fold 0 Cohen Kappa Score 0.3815267934061126 ===

Parameters: { "metric" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[0]	validation_0-mlogloss:1.49688
[200]

<br>
<br>

##### Threshold Tunning

In [16]:
class OptunaRounder:

    def __init__(self, y_true, y_pred):
        self.y_true = y_true
        self.y_pred = y_pred
        self.labels = np.unique(y_true)

    # getting best threshold values
    def __call__(self, trial):
        thresholds = []
        for i in range(len(self.labels) - 1):
            low = max(thresholds) if i > 0 else min(self.labels)
            high = max(self.labels)
            t = trial.suggest_float(f't{i}', low, high)
            thresholds.append(t)
        try:
            opt_y_pred = self.adjust(self.y_pred, thresholds)
        except: return 0
        return cohen_kappa_score(self.y_true, opt_y_pred, weights='quadratic')

    # modifying predictions as per obtained threshold
    def adjust(self, y_pred, thresholds):
        opt_y_pred = pd.cut(y_pred,
                            [-np.inf] + thresholds + [np.inf],
                            labels=self.labels)
        return opt_y_pred

In [19]:
# creating objective
objective = OptunaRounder(oof_true, oof_preds)
# creating a study
study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler(seed=42))
# optimising the study
study.optimize(objective, timeout=60, n_jobs=-1, show_progress_bar=True)

In [20]:
# getting best thresholds
best_thresholds = sorted(study.best_params.values())
print(f'Optimized thresholds: {best_thresholds}')

# modifying oof predictions
oof_pred_opt = objective.adjust(oof_preds, best_thresholds)
# getting score
best_score = cohen_kappa_score(oof_true, oof_pred_opt, weights='quadratic')
print(f'Optimized OOF Score: {best_score:.5f}')

Optimized thresholds: [1.430557892571371, 1.6233044347046632, 2.7367389713538812, 3.044667702903008, 3.885507912888741]
Optimized OOF Score: 0.52871


In [None]:
# test predictions
test_preds = np.array(test_preds).mean(axis=0)

# modifying test predictions
# and adding 3
opt_test_preds = objective.adjust(test_preds, best_thresholds).astype(int) +3

# updating sample submission
sub[target] = opt_test_preds
# saving as a csv file
sub.to_csv('../data/submission_6.csv', index=False)