In [24]:
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.externals import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [17]:
from basedir import SAMPLE
from utils import from_feather

In [3]:
# Fetch the "rich" train/test feature tables and the training targets through
# the project helper (presumably one Feather file per key — confirm in utils).
feather_keys = ('trn_rich', 'tst_rich', 'y_trn')
x_trn_rich, x_tst_rich, y_trn = from_feather(*feather_keys)

In [4]:
# Load the precomputed train/test feature arrays from .npy files
# (paths are relative to the current working directory).
x_trn_feat = np.load('trn_feat.npy')
x_tst_feat = np.load('tst_feat.npy')

In [5]:
# Join the two feature sources column-wise, separately for train and test.
x_trn = np.column_stack((x_trn_rich, x_trn_feat))
x_tst = np.column_stack((x_tst_rich, x_tst_feat))

# Turn the 'surface' labels into integer class codes. The fitted encoder is
# kept in `enc` so predictions can be decoded back to label names later.
surface_labels = y_trn['surface']
enc = LabelEncoder()
y = enc.fit_transform(surface_labels)

In [6]:
def accuracy(y_true, y_pred):
    """Multiclass accuracy in the form LightGBM expects from a custom eval metric.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        Integer-encoded true class labels.
    y_pred : array-like
        Per-class scores in either of two layouts:

        * flat 1-D, grouped by class first and then by sample, so the score
          of sample ``i`` for class ``j`` sits at ``y_pred[j * n_samples + i]``;
        * 2-D of shape ``(n_samples, n_classes)``.

        The number of classes is inferred from the size of ``y_pred`` rather
        than being fixed, so the metric works for any multiclass problem.

    Returns
    -------
    tuple
        ``('accuracy', value, True)``: metric name, fraction of samples whose
        highest-scoring class equals the true label, and a flag telling
        LightGBM that higher values are better.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    n = len(y_true)

    if n == 0:
        # Nothing to score: report NaN (the mean over zero samples) instead of
        # failing inside reshape/argmax.
        return 'accuracy', float('nan'), True

    if y_pred.ndim == 2 and y_pred.shape[0] == n:
        # Sample-major 2-D scores: one row per sample, one column per class.
        y_hat = y_pred.argmax(axis=1)
    else:
        # Class-major scores (flat, or already shaped (n_classes, n)):
        # let reshape infer the class count, then pick the best class per column.
        y_hat = y_pred.reshape(-1, n).argmax(axis=0)

    value = (y_true == y_hat).mean()
    return 'accuracy', value, True

In [7]:
# Hold out 10% of the training rows for validation and early stopping.
# The fixed random_state makes the split, and therefore every score computed
# on it, reproducible across kernel restarts.
X_train, X_valid, y_train, y_valid = train_test_split(
    x_trn, y, test_size=0.1, random_state=42)

In [21]:
# Hyper-parameters of the multiclass LightGBM classifier. n_estimators is only
# an upper bound on the number of boosting rounds, because the fit call uses
# early stopping on a validation set.
lgb_params = dict(
    objective='multiclass',
    num_class=9,
    n_estimators=3000,
    learning_rate=0.005,
    num_leaves=500,
    colsample_bytree=0.3,
)
model = lgb.LGBMClassifier(**lgb_params)

In [22]:
# Train on the training split and monitor the hold-out split. Training stops
# once the validation score has not improved for 300 rounds, and progress is
# logged every 150 rounds.
fit_options = dict(
    eval_set=[(X_valid, y_valid)],
    # eval_metric=accuracy,  # custom metric, currently disabled
    early_stopping_rounds=300,
    verbose=150,
)
model.fit(X_train, y_train, **fit_options)

Training until validation scores don't improve for 300 rounds.
[150]	valid_0's multi_logloss: 0.932475
[300]	valid_0's multi_logloss: 0.513065
[450]	valid_0's multi_logloss: 0.316313
[600]	valid_0's multi_logloss: 0.216597
[750]	valid_0's multi_logloss: 0.16545
[900]	valid_0's multi_logloss: 0.138421
[1050]	valid_0's multi_logloss: 0.124324
[1200]	valid_0's multi_logloss: 0.117474
[1350]	valid_0's multi_logloss: 0.114245
[1500]	valid_0's multi_logloss: 0.113495
[1650]	valid_0's multi_logloss: 0.114098
Early stopping, best iteration is:
[1459]	valid_0's multi_logloss: 0.113261


LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=0.3,
        importance_type='split', learning_rate=0.005, max_depth=-1,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=3000, n_jobs=-1, num_class=9, num_leaves=500,
        objective='multiclass', random_state=None, reg_alpha=0.0,
        reg_lambda=0.0, silent=True, subsample=1.0,
        subsample_for_bin=200000, subsample_freq=0)

In [23]:
# Score the fitted model on the held-out split: the share of validation rows
# whose predicted class equals the true class.
y_hat = model.predict(X_valid)
valid_accuracy = (y_hat == y_valid).mean()
print(f'Validation accuracy: {valid_accuracy:2.2%}')

Validation accuracy: 96.59%


In [25]:
# Persist the fitted classifier so it can be reloaded without retraining.
model_path = 'lightgbm_overfit.model'
joblib.dump(model, model_path)

['lightgbm_overfit.model']

In [19]:
# Predict integer class codes for the test set, then map them back to the
# original surface names with the encoder fitted on the training labels.
test_codes = model.predict(x_tst)
test_preds = enc.inverse_transform(test_codes)

In [20]:
submit = pd.read_csv(SAMPLE)
submit['surface'] = test_preds
submit.to_csv('submit.csv', index=None)
!kaggle c submit career-con-2019 -f 'submit.csv' -m "LightGBM deep learning features"

100%|██████████████████████████████████████| 51.8k/51.8k [00:00<00:00, 65.4kB/s]
Successfully submitted to CareerCon 2019 - Help Navigate Robots 