In [29]:
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm
from collections import Counter

# local imports
from prepare import *
from evaluate import *

### Read in initial datasets if needed

In [30]:
#raw_train, raw_train_labels, raw_test, specs, sample = read_raw_csvs()
raw_train_labels = pd.read_csv('data/train_labels.csv')

### Load compiled train/test datasets

In [31]:
train, test = load_and_prep(raw_train_labels)

In [32]:
#train = balance_classes(train)  ## try with balanced classes     --- results much worse

### Start throwing model mud at the wall

In [33]:
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, \
                                ExtraTreesClassifier, BaggingClassifier, \
                                GradientBoostingClassifier, VotingClassifier, StackingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, SGDClassifier, RidgeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
import warnings
warnings.filterwarnings('ignore')  #Ridge classifier throws some warnings about ill-conditioned matrix

### Baseline accuracy of 50% (or 25% if balanced)

In [34]:
train.accuracy_group.value_counts(normalize=True)

3    0.500000
0    0.239062
1    0.136292
2    0.124647
Name: accuracy_group, dtype: float64

# Initialize Models and start testing accuracy

#### KNN and SVC will require scaling, others shouldn't

In [35]:
rf = RandomForestClassifier()
#lr = LogisticRegression()               ## if they are commented out, they weren't performing well (or operator error...)
#sgd = SGDClassifier()
rc = RidgeClassifier()
#nb = GaussianNB()
ac = AdaBoostClassifier()
et = ExtraTreesClassifier()
bc = BaggingClassifier()
gbc = GradientBoostingClassifier()
clf = CatBoostClassifier(
    loss_function='MultiClass',
    task_type="CPU",
    learning_rate=0.01,
    iterations=2000,
    od_type="Iter",
    early_stopping_rounds=500,
    random_seed=42
    )

In [36]:
knn = KNeighborsClassifier()
svc = SVC(probability=True, verbose=1)

## Evaluate model performance

In [37]:
# Catboost - accuracy 56%a
#quick_eval(train, clf, cv=True)

In [38]:
quick_eval(train, rf, cv=True)
#quick_eval(train, lr, cv=True)
#quick_eval(train, sgd, cv=True)
quick_eval(train, rc, cv=True)
#quick_eval(train, nb, cv=True)
quick_eval(train, ac, cv=True)
quick_eval(train, et, cv=True)
quick_eval(train, bc, cv=True)
quick_eval(train, gbc, cv=True)
#quick_eval(train, clf, cv=True)


#quick_eval(train, knn, scale=True, cv=True)
#quick_eval(train, svc, scale=True, cv=True)

The CV score of RandomForestClassifier is 0.539344262295082
The CV score of RidgeClassifier is 0.5076879592990391
The CV score of AdaBoostClassifier is 0.5541548897682307
The CV score of ExtraTreesClassifier is 0.511927642736009
The CV score of BaggingClassifier is 0.48654607122668175
The CV score of GradientBoostingClassifier is 0.5579423403052572


('GradientBoostingClassifier', 0.5579423403052572)

## more models, to include sklearn neural net

In [22]:
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier

In [23]:
mlp = MLPClassifier()
#gpc = GaussianProcessClassifier(verbose=1) #takes a long time
#rbf = RBF()
dt = DecisionTreeClassifier()

In [24]:
quick_eval(train, mlp, cv=True)
#quick_eval(train, gpc, cv=True)    TRY LATER, THIS WAS VERY SLOW
#quick_eval(train, rbf, cv=True)   poor
quick_eval(train, dt, cv=True)    

The CV score of MLPClassifier is 0.25
The CV score of DecisionTreeClassifier is 0.3054421768707483


('DecisionTreeClassifier', 0.3054421768707483)

In [39]:
bc_rf = BaggingClassifier(
    base_estimator=RandomForestClassifier(max_depth=10),
    n_estimators=200)

bc_gbc = BaggingClassifier(
    base_estimator=GradientBoostingClassifier(),
    n_estimators=200)

bc_abc = BaggingClassifier(
    base_estimator=AdaBoostClassifier(),
    n_estimators=200)

quick_eval(train, bc_rf)    # 0.549 with 20 estimators max depth of 10  (.377 with balanced)    0.551 with 200 
quick_eval(train, bc_gbc)   # 0.576 with 20 estimators (.389 with balanced)                     0.578 with 200
quick_eval(train, bc_abc)   # 0.563 with 20 estimators  (.374 with balanced)                    0.561 with 200

The accuracy of BaggingClassifier is 0.5505935556811759
The accuracy of BaggingClassifier is 0.5777275296777841
The accuracy of BaggingClassifier is 0.5610514414923685


('BaggingClassifier', 0.5610514414923685)

## Model Ensembling

In [26]:
vc = VotingClassifier(estimators=[
    ('Adaboost',ac),
    ('rf', rf), 
    ('gbc', gbc),
    ('et', et),
    #('svc', svc),
    ('rc', rc),
    ('mlp', mlp),
    ('dt', dt),
    ('bc_rf', bc_rf),
    ('bc_gbc', bc_gbc),
    ('bc_abc', bc_abc),
    #('catboost', clf)
    ],
     n_jobs=-1,
     voting='hard')

quick_eval(train, vc)#, cv=True)     #initial w/o catboost .564

The accuracy of VotingClassifier is 0.3764172335600907


('VotingClassifier', 0.3764172335600907)

In [27]:
estimators = [
    ('Adaboost',ac),
    ('rf', rf), 
    ('gbc', gbc),
    ('et', et),
    #('svc', svc),
    ('rc', rc),
    ('mlp', mlp),
    ('dt', dt),
    ('bc_rf', bc_rf),
    ('bc_gbc', bc_gbc),
    ('bc_abc', bc_abc),
]
stacking_clf = StackingClassifier(estimators=estimators, 
                final_estimator=RandomForestClassifier(), n_jobs=-1)        # created, load from pickle
quick_eval(train, stacking_clf)

The accuracy of StackingClassifier is 0.3854875283446712


('StackingClassifier', 0.3854875283446712)

In [40]:
##joblib.dump(stacking_clf, 'fitted_stacked_classifier.pkl')
##joblib.dump(vc, 'fitted_voting_classifier.pkl')

['fitted_voting_classifier.pkl']

## Create submission based on Brad's data prep

In [28]:
test_numerics_only = test._get_numeric_data()
test_prediction = stacking_clf.predict(test_numerics_only)

# To create a submission:
submission = pd.DataFrame()
submission['installation_id'] = test.installation_id
submission['accuracy_group'] = test_prediction
#submission.head()
submission.to_csv('preds_balanced.csv')
submission.accuracy_group.value_counts()

3    379
0    247
2    216
1    158
Name: accuracy_group, dtype: int64

In [42]:
test

Unnamed: 0,timestamp,event_count,event_code,game_time,title_12 Monkeys,title_Air Show,title_All Star Sorting,title_Balancing Act,title_Bird Measurer (Assessment),title_Bottle Filler (Activity),...,total_event_count,avg_event_count,avg_review_incorrect_feedback,avg_review_correct_feedback,total_rounds_beat,total_movies_skipped,total_movies_watched,total_elsewhere_clicks,total_help_button_clicks,total_play_again
867,1568296332193000000,1,2000,0,2.0,0.0,79.0,1.0,0.0,0.0,...,47503,54.726959,1363.212121,2750.333333,7,0,0,190,0,0
2718,1570652596209000000,1,2000,0,1.0,72.0,56.0,3.0,61.0,221.0,...,174649,64.232806,2279.760000,2611.152778,82,2,7,385,2,0
149,1569065301757000000,1,2000,0,0.0,0.0,0.0,0.0,0.0,0.0,...,5191,34.606667,0.000000,0.000000,0,0,0,14,0,0
233,1564244890394000000,1,2000,0,1.0,0.0,78.0,0.0,0.0,0.0,...,9681,41.371795,3377.500000,2650.666667,3,0,0,28,1,0
951,1567793126197000000,1,2000,0,0.0,0.0,420.0,1.0,0.0,0.0,...,101904,107.042017,318.037037,2936.444444,25,0,1,134,0,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
211,1569635090390000000,1,2000,0,0.0,0.0,92.0,0.0,82.0,0.0,...,7407,34.938679,968.000000,2215.750000,7,0,0,41,0,1
302,1570987104776000000,1,2000,0,1.0,0.0,0.0,2.0,0.0,0.0,...,9459,31.217822,502.125000,3566.875000,7,0,0,56,0,2
525,1570480618937000000,1,2000,0,1.0,0.0,0.0,3.0,0.0,0.0,...,30362,57.722433,2216.500000,2076.300000,6,0,0,136,0,0
258,1568142042792000000,1,2000,0,0.0,0.0,0.0,1.0,0.0,0.0,...,9365,36.158301,1259.000000,3309.500000,2,0,0,45,2,0


## Try original datasets

In [88]:
#trainX = pd.read_csv('trainX.csv')
#test = pd.read_csv('trainY.csv')
#testX = pd.read_csv('testX.csv')

In [89]:
train = pd.read_csv('full_train.csv')

In [97]:
test = pd.read_csv('original_X_test.csv')

In [106]:
train = train.drop('game_session', axis=1)
test = test.drop('game_session', axis=1)

In [107]:
vc = VotingClassifier(estimators=[
    ('Adaboost',ac),
    ('rf', rf), 
    ('gbc', gbc),
    ('et', et),
    #('svc', svc),
    ('rc', rc),
    ('mlp', mlp),
    ('dt', dt),
    ('bc_rf', bc_rf),
    ('bc_gbc', bc_gbc),
    ('bc_abc', bc_abc),
    #('catboost', clf)
    ],
     n_jobs=-1,
     voting='hard')

quick_eval(train, vc)#, cv=True)     #initial w/o catboost .564

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished


The accuracy of VotingClassifier is 0.6044260027662517


('VotingClassifier', 0.6044260027662517)

In [84]:
estimators = [
    ('Adaboost',ac),
    ('rf', rf), 
    ('gbc', gbc),
    ('et', et),
    #('svc', svc),
    ('rc', rc),
    ('mlp', mlp),
    ('dt', dt),
    ('bc_rf', bc_rf),
    ('bc_gbc', bc_gbc),
    ('bc_abc', bc_abc),
]
stacking_clf = StackingClassifier(estimators=estimators, 
                final_estimator=RandomForestClassifier(), n_jobs=-1)        # created, load from pickle
quick_eval(train, stacking_clf)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished


The accuracy of StackingClassifier is 0.6085753803596127


('StackingClassifier', 0.6085753803596127)

In [104]:
test.head()

Unnamed: 0,game_session,Clip,Activity,Assessment,Game,Chow Time,Pirate's Tale,Chicken Balancer (Activity),Dino Drink,Bug Measurer (Activity),...,session_title_33,session_title_34,session_title_35,session_title_36,session_title_37,session_title_38,session_title_39,session_title_40,session_title_41,session_title_42
0,348d7f09f96af313,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,1fef5d54cb4b775a,0.0,0.0,0.111111,0.0,0.011361,0.0,0.159763,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,4b165a330a0bdd6c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,be0b655ad1fee30c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,46e8bbed71df7520,0.0,0.0,0.0,0.0,0.022516,0.0,0.326923,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


## Predict on original Datasets

In [108]:
#test_numerics_only = test._get_numeric_data()
final_prediction = vc.predict(test)
sample_sub = pd.read_csv('data/sample_submission.csv')
# To create a submission:
submission = pd.DataFrame()
submission['installation_id'] = sample_sub.installation_id
submission['accuracy_group'] = final_prediction
submission.head()

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished


Unnamed: 0,installation_id,accuracy_group
0,00abaee7,3
1,01242218,3
2,017c5718,3
3,01a44906,3
4,01bc6cb6,3


In [109]:
submission.accuracy_group.value_counts()

3    732
0    252
1     14
2      2
Name: accuracy_group, dtype: int64

In [110]:
submission.to_csv('preds2.csv', index=False)
submission.accuracy_group.value_counts()

3    732
0    252
1     14
2      2
Name: accuracy_group, dtype: int64

In [111]:
from sklearn.metrics import cohen_kappa_score

In [40]:
cohen_kappa_score(y_pred, y_test)

NameError: name 'cohen_kappa_score' is not defined

In [42]:
reduced_train = pd.read_csv('reduce_train.csv')
reduced_test = pd.read_csv('reduce_test.csv')
reduced_train.shape, reduced_test.shape

((17690, 891), (1000, 891))

In [44]:
quick_eval(train, rf, cv=True)
quick_eval(reduced_train, rf, cv=True)

The CV score of RandomForestClassifier is 0.5391746749576032
The CV score of RandomForestClassifier is 0.5603165630299605


('RandomForestClassifier', 0.5603165630299605)