#### Load data

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [76]:
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import log_loss
from sklearn.model_selection import LeaveOneGroupOut, GridSearchCV

In [3]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

In [4]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout

Using TensorFlow backend.


In [68]:
raw_train = pd.read_csv('data/train.csv')
raw_test = pd.read_csv('data/test.csv')

In [69]:
raw_train.head()

Unnamed: 0,id,era,data_type,feature1,feature2,feature3,feature4,feature5,feature6,feature7,...,feature13,feature14,feature15,feature16,feature17,feature18,feature19,feature20,feature21,target
0,518647,era1,train,0.26647,0.42487,0.81401,0.22889,0.27456,0.55654,0.5531,...,0.18847,0.77235,0.55002,0.20237,0.79605,0.82971,0.45757,0.69761,0.53739,1
1,581542,era1,train,0.41334,0.47533,0.71847,0.40792,0.32433,0.55806,0.59592,...,0.32083,0.72435,0.63751,0.29143,0.6786,0.70083,0.59967,0.53103,0.47446,1
2,630790,era1,train,0.48937,0.5603,0.5915,0.46432,0.42291,0.54177,0.53542,...,0.42195,0.62651,0.51604,0.42938,0.56744,0.60008,0.46966,0.50322,0.42803,1
3,646251,era1,train,0.61195,0.65958,0.45877,0.5673,0.51889,0.45049,0.5603,...,0.54803,0.5912,0.5816,0.51828,0.4387,0.47011,0.56007,0.36374,0.31552,1
4,558386,era1,train,0.43758,0.50085,0.60446,0.46663,0.47157,0.59667,0.40161,...,0.40535,0.54366,0.44763,0.37668,0.59931,0.59539,0.43771,0.54767,0.43742,1


In [72]:
all_data = pd.concat([raw_train,raw_test])
proc = all_data

In [73]:
train = proc[proc.loc[:,'data_type']=='train'].drop(['data_type'], axis=1)
valid = proc[proc.loc[:,'data_type']=='validation'].drop(['data_type'], axis=1)
test = proc[proc.loc[:,'data_type']=='test'].drop(['data_type', 'target'], axis=1)
live = proc[proc.loc[:,'data_type']=='live'].drop(['data_type', 'target'], axis=1)

In [86]:
x_train = train.drop(['target'], axis=1)
x_val = valid.drop(['target'], axis=1)
y_train = train['target']
y_val = valid['target']

In [87]:
logo = LeaveOneGroupOut()
groups = LabelEncoder().fit_transform(x_train['era'])
logo.get_n_splits(x_train, y_train, groups)

96

In [90]:
param_grid = {'n_estimators': [5, 10],
              'max_depth': [5, 7, 9]}

clf = RandomForestClassifier(random_state=42)

cv = logo.split(x_train.drop(['era'],axis=1).values, y_train.values, groups)

grid_clf = GridSearchCV(clf, param_grid=param_grid, cv=5)

In [91]:
grid_clf.fit(x_train.drop(['era'],axis=1).values, y_train.values)

GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=42,
            verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': [5, 10], 'max_depth': [5, 7, 9]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [92]:
grid_clf. best_params_

{'max_depth': 5, 'n_estimators': 10}

In [18]:
pca = PCA(n_components=2)
x_train_pca = pca.fit_transform(x_train.drop(['id'], axis=1).values)
x_val_pca = pca.transform(x_val.drop(['id'], axis=1).values)

In [19]:
plotx = x_val_pca[:,1]
ploty = x_val_pca[:,0]

#### Random Forest Model

In [21]:
y_train.head()

Unnamed: 0,id,target,class_0,class_1
0,518647,1.0,0,1
1,581542,1.0,0,1
2,630790,1.0,0,1
3,646251,1.0,0,1
4,558386,1.0,0,1


In [67]:
yt = y_train['target'].values
yv = y_val['target'].values
xt = x_train.drop(['id','cluster'],axis=1).values
xv = x_val.drop(['id','cluster'],axis=1).values

In [56]:
rf = RandomForestClassifier(random_state=42, max_depth=4, n_estimators=125)
rf.fit(xt, yt)
print(rf.score(xt, yt))
print(rf.score(xv, yv))

0.527162031272
0.513244636222


In [58]:
log_loss(yv, rf.predict_proba(xv))

0.69261735626667043

In [44]:
x_train.head()

Unnamed: 0,id,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,feature9,...,feature13,feature14,feature15,feature16,feature17,feature18,feature19,feature20,feature21,cluster
0,518647,0.26647,0.42487,0.81401,0.22889,0.27456,0.55654,0.5531,0.71899,0.20496,...,0.18847,0.77235,0.55002,0.20237,0.79605,0.82971,0.45757,0.69761,0.53739,3
1,581542,0.41334,0.47533,0.71847,0.40792,0.32433,0.55806,0.59592,0.5183,0.25778,...,0.32083,0.72435,0.63751,0.29143,0.6786,0.70083,0.59967,0.53103,0.47446,4
2,630790,0.48937,0.5603,0.5915,0.46432,0.42291,0.54177,0.53542,0.50577,0.3274,...,0.42195,0.62651,0.51604,0.42938,0.56744,0.60008,0.46966,0.50322,0.42803,2
3,646251,0.61195,0.65958,0.45877,0.5673,0.51889,0.45049,0.5603,0.39115,0.33862,...,0.54803,0.5912,0.5816,0.51828,0.4387,0.47011,0.56007,0.36374,0.31552,0
4,558386,0.43758,0.50085,0.60446,0.46663,0.47157,0.59667,0.40161,0.5626,0.36587,...,0.40535,0.54366,0.44763,0.37668,0.59931,0.59539,0.43771,0.54767,0.43742,1


In [45]:
rf.feature_importances_

array([ 0.03024762,  0.04213047,  0.02711518,  0.06183945,  0.03339884,
        0.06210113,  0.03225232,  0.02819904,  0.07242089,  0.05623961,
        0.04406911,  0.06228608,  0.01610526,  0.02784571,  0.05585425,
        0.07434064,  0.02556538,  0.03658372,  0.06764198,  0.05956239,
        0.08420091])

In [46]:
rf.feature_importances_

array([ 0.03024762,  0.04213047,  0.02711518,  0.06183945,  0.03339884,
        0.06210113,  0.03225232,  0.02819904,  0.07242089,  0.05623961,
        0.04406911,  0.06228608,  0.01610526,  0.02784571,  0.05585425,
        0.07434064,  0.02556538,  0.03658372,  0.06764198,  0.05956239,
        0.08420091])

In [47]:
x_val.shape

(16686, 23)

In [48]:
test.shape

(27693, 25)

In [49]:
live.shape

(1246, 25)

In [50]:
to_pred = pd.concat([x_val,test,live])
to_pred.shape

(45625, 25)

In [59]:
rf_preds = rf.predict_proba(to_pred.drop(['id','cluster', 'class_0','class_1'],axis=1).values)[:,1]

In [60]:
sub = pd.concat([to_pred['id'], pd.Series(rf_preds)], axis=1)
sub.columns = ['id', 'probability']
sub.head()

Unnamed: 0,id,probability
0,535632,0.492988
1,526056,0.502603
2,621500,0.485187
3,561590,0.504112
4,523587,0.490503


In [63]:
sub.describe()

Unnamed: 0,id,probability
count,45625.0,45625.0
mean,587661.584285,0.497537
std,44456.299783,0.008316
min,511026.0,0.451311
25%,549026.0,0.493415
50%,587541.0,0.499085
75%,625886.0,0.503164
max,665047.0,0.527661


In [62]:
sub.to_csv('data/sub9_sk_rf.csv', index=False)

#### Keras NN Model

In [220]:
x_train.head()

Unnamed: 0,id,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,feature9,...,feature13,feature14,feature15,feature16,feature17,feature18,feature19,feature20,feature21,cluster
0,518647,0.26647,0.42487,0.81401,0.22889,0.27456,0.55654,0.5531,0.71899,0.20496,...,0.18847,0.77235,0.55002,0.20237,0.79605,0.82971,0.45757,0.69761,0.53739,4
1,581542,0.41334,0.47533,0.71847,0.40792,0.32433,0.55806,0.59592,0.5183,0.25778,...,0.32083,0.72435,0.63751,0.29143,0.6786,0.70083,0.59967,0.53103,0.47446,3
2,630790,0.48937,0.5603,0.5915,0.46432,0.42291,0.54177,0.53542,0.50577,0.3274,...,0.42195,0.62651,0.51604,0.42938,0.56744,0.60008,0.46966,0.50322,0.42803,0
3,646251,0.61195,0.65958,0.45877,0.5673,0.51889,0.45049,0.5603,0.39115,0.33862,...,0.54803,0.5912,0.5816,0.51828,0.4387,0.47011,0.56007,0.36374,0.31552,2
4,558386,0.43758,0.50085,0.60446,0.46663,0.47157,0.59667,0.40161,0.5626,0.36587,...,0.40535,0.54366,0.44763,0.37668,0.59931,0.59539,0.43771,0.54767,0.43742,1


In [258]:
yt = y_train[['class_0', 'class_1']].values
yv = y_val[['class_0', 'class_1']].values
xt = x_train.drop(['id','cluster'],axis=1).values
xv = x_val.drop(['id','cluster'],axis=1).values

In [298]:
def build_nn():
    model = Sequential()
    
    model.add(Dense(256, input_dim=xt.shape[1], activation='relu', init='normal'))
    model.add(Dropout(0.25))
    
    for i in range(0,4):    
        model.add(Dense(128, activation='relu', init='normal'))
        model.add(Dropout(0.25))
    
    model.add(Dense(2, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    
    return model

In [299]:
nn = build_nn()

In [302]:
nn.fit(xt, yt, validation_data=(xv, yv), nb_epoch=10, batch_size=1024, verbose=2)

Train on 108405 samples, validate on 16686 samples
Epoch 1/10
2s - loss: 0.6930 - acc: 0.5052 - val_loss: 0.6929 - val_acc: 0.5099
Epoch 2/10
2s - loss: 0.6930 - acc: 0.5048 - val_loss: 0.6930 - val_acc: 0.5001
Epoch 3/10
2s - loss: 0.6930 - acc: 0.5080 - val_loss: 0.6930 - val_acc: 0.5014
Epoch 4/10
2s - loss: 0.6929 - acc: 0.5100 - val_loss: 0.6927 - val_acc: 0.5137
Epoch 5/10
2s - loss: 0.6930 - acc: 0.5075 - val_loss: 0.6928 - val_acc: 0.5140
Epoch 6/10
2s - loss: 0.6930 - acc: 0.5078 - val_loss: 0.6932 - val_acc: 0.4999
Epoch 7/10
2s - loss: 0.6930 - acc: 0.5080 - val_loss: 0.6928 - val_acc: 0.5125
Epoch 8/10
2s - loss: 0.6930 - acc: 0.5053 - val_loss: 0.6930 - val_acc: 0.5043
Epoch 9/10
2s - loss: 0.6930 - acc: 0.5087 - val_loss: 0.6929 - val_acc: 0.5121
Epoch 10/10
2s - loss: 0.6930 - acc: 0.5083 - val_loss: 0.6931 - val_acc: 0.4981


<keras.callbacks.History at 0x2b3b90af7c10>

In [285]:
to_pred.head()

Unnamed: 0,class_0,class_1,cluster,feature1,feature10,feature11,feature12,feature13,feature14,feature15,...,feature20,feature21,feature3,feature4,feature5,feature6,feature7,feature8,feature9,id
0,,,4,0.54177,0.59308,0.46092,0.54357,0.46999,0.65848,0.61414,...,0.39919,0.33837,0.54365,0.53625,0.43622,0.4901,0.59626,0.4333,0.28352,535632
1,,,4,0.46513,0.60046,0.40061,0.49994,0.39465,0.66286,0.58364,...,0.46704,0.37416,0.61386,0.48131,0.3993,0.41525,0.54916,0.50796,0.30968,526056
2,,,0,0.57906,0.50209,0.44545,0.48268,0.48369,0.63943,0.56807,...,0.43983,0.43469,0.55062,0.50582,0.42531,0.53203,0.61312,0.39767,0.35517,621500
3,,,1,0.49406,0.58279,0.44185,0.5241,0.43125,0.64981,0.6419,...,0.42992,0.36342,0.59178,0.50592,0.44062,0.54272,0.55,0.46094,0.26085,561590
4,,,4,0.59719,0.47754,0.43377,0.43311,0.48639,0.57764,0.45624,...,0.53296,0.55182,0.55065,0.41892,0.45174,0.47165,0.56128,0.41783,0.48816,523587


In [286]:
nn_preds = nn.predict(to_pred.drop(['class_0', 'class_1', 'cluster', 'id'], axis=1).values)[:,1]

In [287]:
sub = pd.concat([to_pred['id'], pd.Series(nn_preds)], axis=1)
sub.columns = ['id', 'probability']
sub.head()

Unnamed: 0,id,probability
0,535632,0.501846
1,526056,0.502337
2,621500,0.502229
3,561590,0.50203
4,523587,0.503714


In [288]:
sub.describe()

Unnamed: 0,id,probability
count,45625.0,45625.0
mean,587661.584285,0.502893
std,44456.299783,0.001236
min,511026.0,0.499008
25%,549026.0,0.502141
50%,587541.0,0.502664
75%,625886.0,0.503435
max,665047.0,0.515456


In [280]:
sub.to_csv('data/sub7_sk_nn.csv', index=False)

In [255]:
#ensemble = pd.concat([pd.Series(nn_preds), pd.Series(rf_preds)], axis=1)
#ensemble['probability'] = ensemble.mean(axis=1)
#ensemble.head()

Unnamed: 0,0,1,probability
0,0.504987,0.563644,0.534316
1,0.504987,0.517274,0.511131
2,0.504987,0.504285,0.504636
3,0.504987,0.503814,0.504401
4,0.504987,0.500529,0.502758
