In [None]:
import numpy as np
import pandas as pd
import lightgbm as lgb
import optuna
import warnings
from matplotlib import pyplot as plt
from sklearn.model_selection import KFold
from sklearn.metrics import cohen_kappa_score

In [None]:
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides library deprecation warnings.
warnings.simplefilter(action='ignore')

In [None]:
# Read the competition train and test tables into `df` and `df_test`.
df, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/playground-series-s3e5/train.csv",
        "/kaggle/input/playground-series-s3e5/test.csv",
    )
)

In [None]:
# Number of training rows per quality label, ordered by label value.
quality_counts = df["quality"].value_counts()
quality_counts.sort_index()

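## Ordinary regression

Baseline: LightGBM with the built-in `regression` objective, scored with quadratic weighted kappa (QWK) after rounding the predictions to quality labels.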

In [None]:
# Target: the quality label. Features: every column except the row id and the target.
y = df["quality"]
X = df.drop(columns=["Id", "quality"])

In [None]:
def quadratic_weighted_kappa(preds, data):
    """LightGBM custom eval metric: quadratic weighted kappa (QWK).

    Args:
        preds: array of raw model predictions.
        data: dataset object exposing ``get_label()`` with the true labels.

    Returns:
        Tuple ``(metric_name, value, is_higher_better)`` as expected for a
        ``feval`` callable; higher QWK is better.
    """
    # Limit predictions to the 3..8 range, then snap to the nearest integer.
    rounded_preds = np.round(np.clip(preds, 3, 8))
    score = cohen_kappa_score(data.get_label(), rounded_preds, weights="quadratic")
    return 'QWK', score, True

In [None]:
# Built-in L2 regression objective; the built-in metric is disabled so that
# early stopping is driven only by the custom QWK feval.
params = dict(
    objective="regression",
    metric="None",
    verbosity=-1,
    learning_rate=0.01,
)

# Out-of-fold predictions, filled one validation fold at a time.
preds_valid = np.zeros(len(df))
kf = KFold(n_splits=5, shuffle=True, random_state=0)

for idx_train, idx_valid in kf.split(df):
    X_train, y_train = X.iloc[idx_train, :], y.iloc[idx_train]
    X_valid, y_valid = X.iloc[idx_valid, :], y.iloc[idx_valid]

    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_valid = lgb.Dataset(X_valid, y_valid)

    # Stop after 100 rounds without improvement; log every 100 rounds.
    callbacks = [
        lgb.early_stopping(stopping_rounds=100, verbose=True),
        lgb.log_evaluation(100),
    ]

    model = lgb.train(
        params,
        lgb_train,
        num_boost_round=10000,
        valid_sets=[lgb_valid],
        valid_names=["valid"],
        feval=quadratic_weighted_kappa,
        callbacks=callbacks,
    )

    preds_valid[idx_valid] = model.predict(X_valid)

In [None]:
# without threshold optimization: clip to 3..8 and round to the nearest label
preds_rounded = np.round(np.clip(preds_valid, 3, 8))
qwk = cohen_kappa_score(y, preds_rounded, weights="quadratic")
print("QWK:", qwk)
# Recorded output -- QWK: 0.5117074248423772

In [None]:
# ref: https://blog.amedama.jp/entry/optuna-qwk-optimization
class OptunaRounder:
    """Optuna objective that tunes rounding thresholds to maximise QWK.

    An instance is callable with an Optuna trial. Each call samples
    ``len(labels) - 1`` thresholds, bins the continuous predictions with
    them, and returns the quadratic weighted kappa against ``y_true``.

    Args:
        y_true: true labels; their unique values become the bin labels.
        y_pred: continuous predictions to be discretised.
    """

    def __init__(self, y_true, y_pred):
        self.y_true = y_true
        self.y_pred = y_pred
        self.labels = np.unique(y_true)

    def __call__(self, trial):
        """Sample thresholds from ``trial`` and return the resulting QWK.

        Returns 0.0 when the sampled thresholds do not form valid bin edges.
        """
        thresholds = []
        # Each threshold is sampled from [previous threshold, max label], so
        # the sequence is non-decreasing by construction.
        low = min(self.labels)
        high = max(self.labels)
        for i in range(len(self.labels) - 1):
            # suggest_float replaces the deprecated suggest_uniform.
            t = trial.suggest_float(f't{i}', low, high)
            thresholds.append(t)
            low = t
        try:
            opt_y_pred = self.adjust(self.y_pred, thresholds)
        except ValueError:
            # pd.cut rejects duplicated bin edges; score such a trial as 0.
            # Any other exception is a real error and must propagate.
            return 0.0
        return cohen_kappa_score(self.y_true, opt_y_pred, weights='quadratic')

    def adjust(self, y_pred, thresholds):
        """Bin ``y_pred`` at ``thresholds`` and label each bin with ``self.labels``.

        Args:
            y_pred: continuous predictions.
            thresholds: list of strictly increasing cut points, one fewer
                than the number of labels.

        Returns:
            The binned predictions as returned by ``pd.cut``.

        Raises:
            ValueError: if ``pd.cut`` rejects the bin edges.
        """
        opt_y_pred = pd.cut(y_pred,
                            [-np.inf] + thresholds + [np.inf],
                            labels=self.labels)
        return opt_y_pred

In [None]:
# Only show Optuna warnings and errors, not per-trial logs.
optuna.logging.set_verbosity(optuna.logging.WARNING)

# Shift labels and predictions so the smallest label is 0 before tuning.
label_floor = y.min()
objective = OptunaRounder(y - label_floor, preds_valid - label_floor)

tpe_sampler = optuna.samplers.TPESampler(seed=0)
study = optuna.create_study(direction="maximize", sampler=tpe_sampler)
# Time-boxed search: the number of completed trials depends on machine speed,
# so results can differ between runs even though the sampler is seeded.
study.optimize(objective, timeout=100)

In [None]:
# Collect the best trial's thresholds in ascending order.
best_thresholds = list(study.best_params.values())
best_thresholds.sort()
print(f'Optimized thresholds: {best_thresholds}')
# Recorded output -- Optimized thresholds: [1.6563933856306514, 1.8518486651366342,
# 2.507186761950873, 3.1780508314616527, 3.564031124411718]

In [None]:
# Bin the zero-based predictions with the tuned thresholds, then shift the
# resulting labels back to the original quality scale.
label_floor = y.min()
preds_opt = objective.adjust(preds_valid - label_floor, best_thresholds).astype(int) + label_floor
qwk = cohen_kappa_score(y, preds_opt, weights="quadratic")
print("QWK:", qwk)
# Recorded output -- QWK: 0.5413127048421356

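## QWK objective

Train LightGBM with a custom objective that approximates QWK directly, instead of the built-in regression loss.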


In [None]:
# Constants of the quadratic (x - a)**2 + b that is plotted against the
# measured curve below; both are read by later cells.
a = 5.7
b = 0.7

quality_levels = [3, 4, 5, 6, 7, 8]

# g[i]: mean squared difference between the labels and quality level i + 3.
g = np.zeros(6)
for i, level in enumerate(quality_levels):
    g[i] = ((y - level)**2).mean()

fitted_curve = [(x-a)**2 + b for x in quality_levels]
plt.plot(quality_levels, g, marker=".", label="actual")
plt.plot(quality_levels, fitted_curve, label="fitting")
plt.legend()
plt.show()

In [None]:
# Shift the target by `a` for the custom-objective run.
# NOTE(review): this rebinds the global `y`; earlier cells that expect the
# unshifted labels give different results if re-run after this point.
y = df["quality"].sub(a)

In [None]:
def qwk_obj(preds, dtrain):
    """Custom LightGBM objective built from a ratio of two quadratic sums.

    Labels and raw scores are stored shifted by the module-level ``a``, so
    ``a`` is added back and the scores are clipped to 3..8. With

        f = 1/2 * sum((score - label)**2)
        g = 1/2 * sum((score - a)**2 + b)

    the gradient returned is d(f/g)/d(score) multiplied by the number of
    rows. The clip is treated as the identity when differentiating.

    Args:
        preds: array of raw scores from the booster.
        dtrain: dataset object exposing ``get_label()``.

    Returns:
        Tuple ``(grad, hess)``; ``hess`` is a constant vector of ones.
    """
    targets = dtrain.get_label() + a
    scores = (preds + a).clip(3, 8)
    n_rows = len(targets)

    numerator = 1/2*np.sum((scores-targets)**2)
    denominator = 1/2*np.sum((scores-a)**2+b)
    d_numerator = scores - targets
    d_denominator = scores - a

    # Quotient rule: (f/g)' = f'/g - f*g'/g**2
    grad = (d_numerator/denominator - numerator*d_denominator/denominator**2)*n_rows
    return grad, np.ones(n_rows)

In [None]:
def quadratic_weighted_kappa(preds, data):
    """QWK eval metric for targets and scores stored shifted by ``a``.

    NOTE(review): this redefines (shadows) the earlier function of the same
    name; cells run after this one get the shifted version.

    Args:
        preds: array of raw scores on the shifted scale.
        data: dataset object exposing ``get_label()`` with shifted labels.

    Returns:
        Tuple ``(metric_name, value, is_higher_better)``.
    """
    # Undo the shift, then round labels and clipped scores to whole numbers.
    true_labels = np.round(data.get_label() + a)
    pred_labels = np.round(np.clip(preds + a, 3, 8))
    score = cohen_kappa_score(true_labels, pred_labels, weights="quadratic")
    return 'QWK', score, True

In [None]:

# Parameters for the custom-objective run. There is no "objective" entry:
# the objective is the Python callable qwk_obj, wired in below.
params = {
    "metric": "None",
    "verbosity": -1,
    "learning_rate": 0.01,
}

# LightGBM 4.0 removed the `fobj` argument of lgb.train(); a custom objective
# is now passed as a callable under params["objective"]. Older releases still
# need `fobj`, so choose the mechanism from the installed major version.
if int(lgb.__version__.split(".")[0]) >= 4:
    train_params = {**params, "objective": qwk_obj}
    custom_obj_kwargs = {}
else:
    train_params = params
    custom_obj_kwargs = {"fobj": qwk_obj}

# Target shifted by `a`, derived from df here so this cell does not depend on
# whatever the global `y` is currently bound to.
y_shifted = df["quality"] - a

# Out-of-fold predictions on the original quality scale.
preds_valid = np.zeros(len(df))

kf = KFold(n_splits=5, shuffle=True, random_state=0)
for idx_train, idx_valid in kf.split(df):
    X_train = X.iloc[idx_train, :]
    y_train = y_shifted.iloc[idx_train]
    X_valid = X.iloc[idx_valid, :]
    y_valid = y_shifted.iloc[idx_valid]

    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_valid = lgb.Dataset(X_valid, y_valid)

    # Stop after 100 rounds without QWK improvement; log every 100 rounds.
    callbacks = [
        lgb.early_stopping(stopping_rounds=100, verbose=True),
        lgb.log_evaluation(100)
    ]

    model = lgb.train(
        params=train_params,
        train_set=lgb_train,
        num_boost_round=10000,
        valid_sets=[lgb_valid],
        valid_names=["valid"],
        feval=quadratic_weighted_kappa,
        callbacks=callbacks,
        **custom_obj_kwargs,
    )

    # Raw scores are on the shifted scale; add `a` back.
    preds_valid[idx_valid] = model.predict(X_valid) + a

In [None]:
# Restore the unshifted labels and score the out-of-fold predictions after
# clipping to 3..8 and rounding.
y = df["quality"]
preds_rounded = np.round(np.clip(preds_valid, 3, 8))
qwk = cohen_kappa_score(y, preds_rounded, weights="quadratic")
print("QWK:", qwk)
# Recorded output -- QWK: 0.545001299987572