In [135]:
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm
from collections import Counter
import mlflow
import sklearn


# local imports
from prepare import *
from evaluate import *

In [136]:
from lightgbm import LGBMRegressor
from sklearn.ensemble import (
    RandomForestRegressor,
    GradientBoostingRegressor,
)
from catboost import CatBoostRegressor
from sklearn.svm import LinearSVR
from sklearn.ensemble import AdaBoostRegressor \
                                
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, SGDRegressor, Ridge
import warnings
warnings.filterwarnings('ignore')  #Ridge classifier throws some warnings about ill-conditioned matrix

# Try out balanced classes
reduced_train = pd.read_csv('reduce_train.csv')

#reduced_train = pd.read_csv('reduce_train.csv')
reduced_test = pd.read_csv('reduce_test.csv')
reduced_train.shape, reduced_test.shape

categoricals = ['session_title']
cols_to_drop = ['game_session', 'installation_id', 'accuracy_group']


## Select features

In [152]:
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel
print(X.shape)

lgb = LGBMRegressor(random_state=42).fit(X,y)
model = SelectFromModel(lgb, prefit=True)
X_new = model.transform(X)
X_new.shape


#model = SelectFromModel(clf, prefit=True)
feature_idx = model.get_support()
feature_name = X.columns[feature_idx]
joblib.dump(feature_name, 'sfm_features214.pkl')

(17690, 887)


['sfm_features214.pkl']

In [164]:
lgbm = LGBMRegressor()
rfr = RandomForestRegressor()
cbr = CatBoostRegressor(
    loss_function="RMSE",
    task_type="CPU",
    learning_rate=0.05,
    iterations=2000,
    od_type="Iter",
    early_stopping_rounds=500,
    random_seed=42,
    silent=True
)
gbr = GradientBoostingRegressor()
abr = AdaBoostRegressor()
lvr = LinearSVR()
lr = LinearRegression()
sgd = SGDRegressor()
rr = Ridge()
lsr = Lasso()

models = [lgbm, rfr, cbr, gbr, abr, lr, rr, lsr]

In [154]:
def my_metric(y_true, y_pred):
    y_pred = get_class_pred(y_pred, reduced_train)
    qwk = cohen_kappa_score(y_true, y_pred, weights='quadratic')
    return qwk

In [184]:
from vecstack import stacking

X_train = reduced_train.drop(cols_to_drop, axis=1, errors='ignore')
y_train = reduced_train.accuracy_group
X_test = reduced_test.drop(cols_to_drop, axis=1, errors='ignore')

#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)
#X_train, y_train = balance_classes(X_train, y_train)
S_train, S_test = stacking(models,                   
                           X_train, y_train, X_test,   
                           regression=True, 
                           mode='oof_pred_bag',        
                           needs_proba=False,         
                           save_dir=None,             
                           metric=my_metric,     
                           n_folds=4,
                           shuffle=True,             
                           random_state=42,            
                           verbose=2)

task:         [regression]
metric:       [my_metric]
mode:         [oof_pred_bag]
n_models:     [8]

model  0:     [LGBMRegressor]
    fold  0:  [0.59831243]
    fold  1:  [0.57691170]
    fold  2:  [0.56467933]
    fold  3:  [0.58872694]
    ----
    MEAN:     [0.58215760] + [0.01262086]
    FULL:     [0.58318457]

model  1:     [RandomForestRegressor]
    fold  0:  [0.57533228]
    fold  1:  [0.55489063]
    fold  2:  [0.54296599]
    fold  3:  [0.56928010]
    ----
    MEAN:     [0.56061725] + [0.01260891]
    FULL:     [0.56196284]

model  2:     [CatBoostRegressor]
    fold  0:  [0.59730389]
    fold  1:  [0.58451552]
    fold  2:  [0.56683012]
    fold  3:  [0.59949770]
    ----
    MEAN:     [0.58703681] + [0.01299379]
    FULL:     [0.58583306]

model  3:     [GradientBoostingRegressor]
    fold  0:  [0.58909148]
    fold  1:  [0.57475967]
    fold  2:  [0.54876345]
    fold  3:  [0.58376672]
    ----
    MEAN:     [0.57409533] + [0.01549650]
    FULL:     [0.57298429]

model  

In [176]:
model = RandomForestRegressor()

model = model.fit(S_train, y_train)
y_pred = model.predict(S_test)
#y_pred = get_class_pred(y_pred, reduced_test)
print('Final prediction score: [%.8f]' % my_metric(y_test, y_pred))


Final prediction score: [0.53451841]


In [177]:
model = LinearRegression()

model = model.fit(S_train, y_train)
y_pred = model.predict(S_test)
#y_pred = get_class_pred(y_pred, reduced_test)
print('Final prediction score: [%.8f]' % my_metric(y_test, y_pred))

Final prediction score: [0.55715646]


In [178]:
model = LGBMRegressor()

model = model.fit(S_train, y_train)
y_pred = model.predict(S_test)
#y_pred = get_class_pred(y_pred, reduced_test)
print('Final prediction score: [%.8f]' % my_metric(y_test, y_pred))

Final prediction score: [0.54007561]


In [179]:
model = Ridge()

model = model.fit(S_train, y_train)
y_pred = model.predict(S_test)
#y_pred = get_class_pred(y_pred, reduced_test)
print('Final prediction score: [%.8f]' % my_metric(y_test, y_pred))

Final prediction score: [0.55715646]


In [180]:
model = SGDRegressor()

model = model.fit(S_train, y_train)
y_pred = model.predict(S_test)
#y_pred = get_class_pred(y_pred, reduced_test)
print('Final prediction score: [%.8f]' % my_metric(y_test, y_pred))

Final prediction score: [0.55715646]


In [181]:
model = LinearSVR()

model = model.fit(S_train, y_train)
y_pred = model.predict(S_test)
#y_pred = get_class_pred(y_pred, reduced_test)
print('Final prediction score: [%.8f]' % my_metric(y_test, y_pred))

Final prediction score: [0.55344670]


In [182]:
model = AdaBoostRegressor()

model = model.fit(S_train, y_train)
y_pred = model.predict(S_test)
#y_pred = get_class_pred(y_pred, reduced_test)
print('Final prediction score: [%.8f]' % my_metric(y_test, y_pred))

Final prediction score: [0.48973705]


In [183]:
model = CatBoostRegressor()

model = model.fit(S_train, y_train)
y_pred = model.predict(S_test)
#y_pred = get_class_pred(y_pred, reduced_test)
print('Final prediction score: [%.8f]' % my_metric(y_test, y_pred))

0:	learn: 1.0946827	total: 7.3ms	remaining: 7.29s
1:	learn: 1.0722617	total: 11.9ms	remaining: 5.94s
2:	learn: 1.0509908	total: 17.6ms	remaining: 5.83s
3:	learn: 1.0303731	total: 21.8ms	remaining: 5.42s
4:	learn: 1.0106757	total: 25.6ms	remaining: 5.1s
5:	learn: 0.9915381	total: 29.5ms	remaining: 4.89s
6:	learn: 0.9734137	total: 33.3ms	remaining: 4.73s
7:	learn: 0.9560707	total: 37.3ms	remaining: 4.62s
8:	learn: 0.9392247	total: 41ms	remaining: 4.51s
9:	learn: 0.9232001	total: 44.9ms	remaining: 4.44s
10:	learn: 0.9077167	total: 48.9ms	remaining: 4.4s
11:	learn: 0.8929508	total: 52.9ms	remaining: 4.36s
12:	learn: 0.8789371	total: 56.7ms	remaining: 4.3s
13:	learn: 0.8653434	total: 60.5ms	remaining: 4.26s
14:	learn: 0.8523784	total: 64.3ms	remaining: 4.22s
15:	learn: 0.8399240	total: 68.2ms	remaining: 4.2s
16:	learn: 0.8280025	total: 72ms	remaining: 4.16s
17:	learn: 0.8168199	total: 75.9ms	remaining: 4.14s
18:	learn: 0.8060114	total: 79.7ms	remaining: 4.12s
19:	learn: 0.7955868	total: 83.

171:	learn: 0.5855525	total: 692ms	remaining: 3.33s
172:	learn: 0.5854483	total: 696ms	remaining: 3.33s
173:	learn: 0.5852915	total: 700ms	remaining: 3.32s
174:	learn: 0.5851818	total: 705ms	remaining: 3.33s
175:	learn: 0.5851190	total: 709ms	remaining: 3.32s
176:	learn: 0.5850070	total: 713ms	remaining: 3.31s
177:	learn: 0.5849425	total: 717ms	remaining: 3.31s
178:	learn: 0.5848657	total: 721ms	remaining: 3.31s
179:	learn: 0.5847820	total: 725ms	remaining: 3.3s
180:	learn: 0.5846510	total: 729ms	remaining: 3.3s
181:	learn: 0.5845814	total: 733ms	remaining: 3.29s
182:	learn: 0.5845051	total: 737ms	remaining: 3.29s
183:	learn: 0.5843565	total: 742ms	remaining: 3.29s
184:	learn: 0.5842176	total: 747ms	remaining: 3.29s
185:	learn: 0.5841555	total: 750ms	remaining: 3.28s
186:	learn: 0.5840476	total: 755ms	remaining: 3.28s
187:	learn: 0.5839683	total: 760ms	remaining: 3.28s
188:	learn: 0.5839089	total: 764ms	remaining: 3.28s
189:	learn: 0.5838659	total: 769ms	remaining: 3.28s
190:	learn: 0.

370:	learn: 0.5709076	total: 1.54s	remaining: 2.62s
371:	learn: 0.5708662	total: 1.55s	remaining: 2.61s
372:	learn: 0.5708033	total: 1.55s	remaining: 2.61s
373:	learn: 0.5707362	total: 1.56s	remaining: 2.6s
374:	learn: 0.5706891	total: 1.56s	remaining: 2.6s
375:	learn: 0.5705831	total: 1.56s	remaining: 2.6s
376:	learn: 0.5705018	total: 1.57s	remaining: 2.59s
377:	learn: 0.5704677	total: 1.57s	remaining: 2.59s
378:	learn: 0.5704469	total: 1.58s	remaining: 2.58s
379:	learn: 0.5704188	total: 1.58s	remaining: 2.58s
380:	learn: 0.5702999	total: 1.58s	remaining: 2.57s
381:	learn: 0.5702676	total: 1.59s	remaining: 2.57s
382:	learn: 0.5702267	total: 1.59s	remaining: 2.56s
383:	learn: 0.5701802	total: 1.59s	remaining: 2.56s
384:	learn: 0.5700634	total: 1.6s	remaining: 2.55s
385:	learn: 0.5700159	total: 1.6s	remaining: 2.55s
386:	learn: 0.5699345	total: 1.61s	remaining: 2.54s
387:	learn: 0.5698419	total: 1.61s	remaining: 2.54s
388:	learn: 0.5697737	total: 1.61s	remaining: 2.54s
389:	learn: 0.569

546:	learn: 0.5597446	total: 2.23s	remaining: 1.84s
547:	learn: 0.5596978	total: 2.23s	remaining: 1.84s
548:	learn: 0.5596502	total: 2.24s	remaining: 1.84s
549:	learn: 0.5595676	total: 2.25s	remaining: 1.84s
550:	learn: 0.5594576	total: 2.25s	remaining: 1.83s
551:	learn: 0.5594270	total: 2.26s	remaining: 1.83s
552:	learn: 0.5593686	total: 2.26s	remaining: 1.83s
553:	learn: 0.5593141	total: 2.26s	remaining: 1.82s
554:	learn: 0.5592472	total: 2.27s	remaining: 1.82s
555:	learn: 0.5592024	total: 2.27s	remaining: 1.81s
556:	learn: 0.5591177	total: 2.28s	remaining: 1.81s
557:	learn: 0.5590784	total: 2.28s	remaining: 1.81s
558:	learn: 0.5590028	total: 2.28s	remaining: 1.8s
559:	learn: 0.5589312	total: 2.29s	remaining: 1.8s
560:	learn: 0.5588765	total: 2.29s	remaining: 1.79s
561:	learn: 0.5588322	total: 2.29s	remaining: 1.79s
562:	learn: 0.5587764	total: 2.3s	remaining: 1.78s
563:	learn: 0.5587403	total: 2.3s	remaining: 1.78s
564:	learn: 0.5587134	total: 2.31s	remaining: 1.78s
565:	learn: 0.55

716:	learn: 0.5516571	total: 2.91s	remaining: 1.15s
717:	learn: 0.5516419	total: 2.91s	remaining: 1.14s
718:	learn: 0.5516090	total: 2.92s	remaining: 1.14s
719:	learn: 0.5515739	total: 2.92s	remaining: 1.14s
720:	learn: 0.5515291	total: 2.92s	remaining: 1.13s
721:	learn: 0.5514761	total: 2.93s	remaining: 1.13s
722:	learn: 0.5514530	total: 2.93s	remaining: 1.12s
723:	learn: 0.5514406	total: 2.94s	remaining: 1.12s
724:	learn: 0.5514164	total: 2.95s	remaining: 1.12s
725:	learn: 0.5513737	total: 2.95s	remaining: 1.11s
726:	learn: 0.5513585	total: 2.96s	remaining: 1.11s
727:	learn: 0.5513254	total: 2.96s	remaining: 1.1s
728:	learn: 0.5512648	total: 2.96s	remaining: 1.1s
729:	learn: 0.5512314	total: 2.97s	remaining: 1.1s
730:	learn: 0.5511838	total: 2.97s	remaining: 1.09s
731:	learn: 0.5511484	total: 2.98s	remaining: 1.09s
732:	learn: 0.5511105	total: 2.98s	remaining: 1.08s
733:	learn: 0.5510582	total: 2.98s	remaining: 1.08s
734:	learn: 0.5510079	total: 2.99s	remaining: 1.08s
735:	learn: 0.5

878:	learn: 0.5448968	total: 3.59s	remaining: 494ms
879:	learn: 0.5448709	total: 3.59s	remaining: 490ms
880:	learn: 0.5448456	total: 3.6s	remaining: 486ms
881:	learn: 0.5448180	total: 3.6s	remaining: 482ms
882:	learn: 0.5447914	total: 3.61s	remaining: 478ms
883:	learn: 0.5447560	total: 3.61s	remaining: 474ms
884:	learn: 0.5447442	total: 3.61s	remaining: 470ms
885:	learn: 0.5447092	total: 3.62s	remaining: 466ms
886:	learn: 0.5446379	total: 3.62s	remaining: 462ms
887:	learn: 0.5446074	total: 3.63s	remaining: 458ms
888:	learn: 0.5445319	total: 3.63s	remaining: 453ms
889:	learn: 0.5444971	total: 3.63s	remaining: 449ms
890:	learn: 0.5444476	total: 3.64s	remaining: 445ms
891:	learn: 0.5444327	total: 3.64s	remaining: 441ms
892:	learn: 0.5444044	total: 3.65s	remaining: 437ms
893:	learn: 0.5443759	total: 3.65s	remaining: 433ms
894:	learn: 0.5443578	total: 3.65s	remaining: 429ms
895:	learn: 0.5443291	total: 3.66s	remaining: 425ms
896:	learn: 0.5443174	total: 3.66s	remaining: 421ms
897:	learn: 0.

In [186]:
model = Ridge()

model = model.fit(S_train, y_train)
y_pred = model.predict(S_test)
#y_pred = get_class_pred(y_pred, reduced_test)
#print('Final prediction score: [%.8f]' % my_metric(y_test, y_pred))

def make_submission(preds, train_df, filename):
    preds = get_class_pred(preds, train_df)
    # assert len(preds)==1000
    sample = pd.read_csv("data/sample_submission.csv")
    submission = pd.DataFrame()
    submission["installation_id"] = sample["installation_id"]
    submission["accuracy_group"] = preds
    submission.to_csv(filename, index=False)
    return submission

make_submission(y_pred, reduced_train, 'vs_preds5.csv')

Unnamed: 0,installation_id,accuracy_group
0,00abaee7,3
1,01242218,3
2,017c5718,3
3,01a44906,3
4,01bc6cb6,3
...,...,...
995,fee254cf,3
996,ff57e602,0
997,ffc73fb2,3
998,ffe00ca8,1


In [54]:
make_submission(final_preds, reduced_train)

Unnamed: 0,installation_id,accuracy_group
0,00abaee7,1
1,01242218,3
2,017c5718,0
3,01a44906,0
4,01bc6cb6,3
...,...,...
995,fee254cf,3
996,ff57e602,1
997,ffc73fb2,3
998,ffe00ca8,3
