## Load Environments

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from math import log

In [80]:
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
from sklearn.metrics import log_loss
from sklearn.model_selection import GridSearchCV, LeaveOneGroupOut

In [3]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier

## Read in the raw data sets: training and tournament (validation, test, live)

In [12]:
# Load the training and tournament CSV files; in both, the first column
# becomes the row index.
raw_train, raw_test = (pd.read_csv(csv_path, index_col=0)
                       for csv_path in ('train.csv', 'test.csv'))

In [13]:
# Preview the first rows of the raw training frame (era, data_type,
# feature columns and target, as shown in the rendered output).
raw_train.head()

Unnamed: 0_level_0,era,data_type,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,...,feature13,feature14,feature15,feature16,feature17,feature18,feature19,feature20,feature21,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
518647,era1,train,0.26647,0.42487,0.81401,0.22889,0.27456,0.55654,0.5531,0.71899,...,0.18847,0.77235,0.55002,0.20237,0.79605,0.82971,0.45757,0.69761,0.53739,1
581542,era1,train,0.41334,0.47533,0.71847,0.40792,0.32433,0.55806,0.59592,0.5183,...,0.32083,0.72435,0.63751,0.29143,0.6786,0.70083,0.59967,0.53103,0.47446,1
630790,era1,train,0.48937,0.5603,0.5915,0.46432,0.42291,0.54177,0.53542,0.50577,...,0.42195,0.62651,0.51604,0.42938,0.56744,0.60008,0.46966,0.50322,0.42803,1
646251,era1,train,0.61195,0.65958,0.45877,0.5673,0.51889,0.45049,0.5603,0.39115,...,0.54803,0.5912,0.5816,0.51828,0.4387,0.47011,0.56007,0.36374,0.31552,1
558386,era1,train,0.43758,0.50085,0.60446,0.46663,0.47157,0.59667,0.40161,0.5626,...,0.40535,0.54366,0.44763,0.37668,0.59931,0.59539,0.43771,0.54767,0.43742,1


In [115]:
# Carve the raw frames into subsets according to `data_type`. The
# `data_type` and `era` columns are removed from every subset; the era
# labels of the training and validation rows are kept in their own Series.
is_train = raw_train.loc[:, 'data_type'] == 'train'
train = raw_train[is_train].drop(['data_type', 'era'], axis=1)
train_eras = raw_train[is_train]['era']

is_validation = raw_test.loc[:, 'data_type'] == 'validation'
valid = raw_test[is_validation].drop(['data_type', 'era'], axis=1)
val_eras = raw_test[is_validation]['era']

# For the test and live rows the `target` column is dropped as well.
non_feature_cols = ['data_type', 'target', 'era']
test = raw_test[raw_test.loc[:, 'data_type'] == 'test'].drop(non_feature_cols, axis=1)
live = raw_test[raw_test.loc[:, 'data_type'] == 'live'].drop(non_feature_cols, axis=1)

In [116]:
# Separate the `target` column from the remaining (feature) columns for the
# training and validation subsets.
y_train = train['target']
y_val = valid['target']
x_train = train.drop('target', axis=1)
x_val = valid.drop('target', axis=1)

In [41]:
# Sanity check: after dropping `target`, only feature columns should remain.
x_train.head()

Unnamed: 0_level_0,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,feature9,feature10,...,feature12,feature13,feature14,feature15,feature16,feature17,feature18,feature19,feature20,feature21
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
518647,0.26647,0.42487,0.81401,0.22889,0.27456,0.55654,0.5531,0.71899,0.20496,0.62848,...,0.26651,0.18847,0.77235,0.55002,0.20237,0.79605,0.82971,0.45757,0.69761,0.53739
581542,0.41334,0.47533,0.71847,0.40792,0.32433,0.55806,0.59592,0.5183,0.25778,0.49711,...,0.37999,0.32083,0.72435,0.63751,0.29143,0.6786,0.70083,0.59967,0.53103,0.47446
630790,0.48937,0.5603,0.5915,0.46432,0.42291,0.54177,0.53542,0.50577,0.3274,0.58043,...,0.46203,0.42195,0.62651,0.51604,0.42938,0.56744,0.60008,0.46966,0.50322,0.42803
646251,0.61195,0.65958,0.45877,0.5673,0.51889,0.45049,0.5603,0.39115,0.33862,0.62694,...,0.6173,0.54803,0.5912,0.5816,0.51828,0.4387,0.47011,0.56007,0.36374,0.31552
558386,0.43758,0.50085,0.60446,0.46663,0.47157,0.59667,0.40161,0.5626,0.36587,0.61275,...,0.47069,0.40535,0.54366,0.44763,0.37668,0.59931,0.59539,0.43771,0.54767,0.43742


In [341]:
# Project the features onto principal components. The projection is fitted
# on the training rows only and then reused, unchanged, for validation rows.
N_PCA_COMPONENTS = 19
pca = PCA(n_components=N_PCA_COMPONENTS)
x_train_pca = pca.fit_transform(x_train.values)
x_val_pca = pca.transform(x_val.values)
# To bypass PCA, assign the raw arrays instead:
#x_train_pca = x_train.values
#x_val_pca = x_val.values

In [342]:
# Build a leave-one-era-out splitter over the training rows: each split
# holds out every row belonging to one era.
logo = LeaveOneGroupOut()
# The group labels are the training era labels (`train_eras`); the name
# `eras` used previously is not defined anywhere in this notebook.
# NOTE: `split` returns a generator, so `cv` can be iterated only once.
cv = logo.split(x_train_pca, y_train.values, train_eras.values)

In [343]:
# Hyper-parameter candidates for the random forest; only the number of
# trees varies, depth and leaf size are pinned to a single value.
param_grid = dict(n_estimators=[50, 100],
                  max_depth=[6],
                  min_samples_leaf=[1])

# Fixed seed so repeated fits build the same forest.
clf = RandomForestClassifier(random_state=42)

# NOTE(review): the search uses cv=5 rather than the era-based splitter `cv`
# built in the preceding cell, and no `scoring` is given although the
# notebook evaluates with log loss - confirm this is intended.
grid_clf = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5)

In [344]:
# Run the grid search on the PCA-transformed training features.
# NOTE(review): the saved output below lists a param_grid that differs from
# the one defined in the cell that builds `grid_clf`, so it appears to be
# left over from an earlier run - re-run the notebook top to bottom.
grid_clf.fit(x_train_pca, y_train.values)

GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=42,
            verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': [35, 50], 'max_depth': [6], 'min_samples_leaf': [1, 3]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [345]:
# Show the winning hyper-parameter combination from the grid search.
grid_clf.best_params_

{'max_depth': 6, 'min_samples_leaf': 1, 'n_estimators': 50}

In [346]:
# Keep the best estimator found by the grid search; every later scoring and
# prediction cell uses `mod`.
mod = grid_clf.best_estimator_

In [347]:
# Report `mod.score` on the training set, then on the validation set.
for features, labels in ((x_train_pca, y_train), (x_val_pca, y_val)):
    print(mod.score(features, labels.values))

# Log loss of the predicted class probabilities on the validation set.
val_probabilities = mod.predict_proba(x_val_pca)
print(log_loss(y_val.values, val_probabilities))

0.538360776717
0.517259978425
0.692260515823


In [348]:
# Score the tuned model on each validation era separately. An era "fails"
# when its log loss is worse than that of always predicting 0.5.
val_logo = LeaveOneGroupOut()
scores = []

fail = 0
guessing = -log(.5)

# Only the held-out part of each split is used: with one group left out per
# split, `index` selects the rows of exactly one era.
for _, index in val_logo.split(x_val_pca, y_val.values, val_eras.values):
    # Pass the label set explicitly so that an era containing a single
    # class can still be scored against both probability columns.
    score = log_loss(y_val.iloc[index].values,
                     mod.predict_proba(x_val_pca[index, :]),
                     labels=mod.classes_)
    # Keep every per-era score so the list can be inspected afterwards.
    scores.append(score)

    print(score)

    if score > guessing:
        fail += 1

# Fraction of eras that failed. The denominator is the number of eras that
# were actually scored; float() keeps this a true division on Python 2.
print(fail / float(len(scores)))

0.691755726606
0.690074284023
0.695349366545
0.695899246912
0.692032526394
0.691809089207
0.695757142103
0.693343938302
0.690947521345
0.690370583661
0.689315820698
0.690134475223
0.333333333333


In [340]:
# Stack the validation, test and live feature frames (in that order) into
# the single frame that will be scored; print each shape as a row-count
# check before and after concatenation.
frames_to_score = [x_val, test, live]
for frame in frames_to_score:
    print(frame.shape)
to_pred = pd.concat(frames_to_score)
print(to_pred.shape)

(16686, 21)
(27693, 21)
(1246, 21)
(45625, 21)


In [318]:
# Apply the PCA projection to the rows to be predicted.
# NOTE: the NotFittedError recorded below means this cell was last executed
# while `pca` had not been fitted; run the notebook top to bottom so the PCA
# cell has executed first.
pred_pca = pca.transform(to_pred.values)

NotFittedError: This PCA instance is not fitted yet. Call 'fit' with appropriate arguments before using this method.

In [323]:
# Take the second column of the predicted class probabilities.
# NOTE(review): presumably this is the probability of target == 1 (columns
# follow mod.classes_) - confirm.
preds = mod.predict_proba(pred_pca)[:,1]
# Alternative without PCA (predict on the raw feature values):
#preds = mod.predict_proba(to_pred.values)[:,1]

In [324]:
# Assemble the submission table: the row ids of `to_pred` in the first
# column and the predicted probabilities in the second.
id_column = pd.Series(to_pred.index)
probability_column = pd.Series(preds)
sub = pd.concat([id_column, probability_column], axis=1)
sub.columns = ['id', 'probability']
sub.head()

Unnamed: 0,id,probability
0,535632,0.521102
1,526056,0.488422
2,621500,0.513818
3,561590,0.477882
4,523587,0.517308


In [325]:
# Summary statistics of the submission table, to check the row count and
# the spread of the predicted probabilities.
sub.describe()

Unnamed: 0,id,probability
count,45625.0,45625.0
mean,587661.584285,0.504372
std,44456.299783,0.015447
min,511026.0,0.359975
25%,549026.0,0.495349
50%,587541.0,0.504477
75%,625886.0,0.515017
max,665047.0,0.658474


In [326]:
# Write the submission file; index=False keeps the DataFrame's row index out
# of the CSV so only the `id` and `probability` columns are written.
sub.to_csv('sub16_sk_pcarf.csv', index=False)