## Load Environments

In [1]:
from __future__ import print_function
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from math import log

In [24]:
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
from sklearn.metrics import log_loss
from sklearn.model_selection import GridSearchCV, LeaveOneGroupOut, ParameterGrid

In [3]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

In [41]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.optimizers import SGD, Adam
from keras.regularizers import l2 

## Read in raw data sets, training, tournament (val, test, live)

In [5]:
raw_train = pd.read_csv('data/train_62.csv', index_col=0)
raw_test = pd.read_csv('data/test_62.csv', index_col=0)

In [6]:
train = raw_train[raw_train.loc[:,'data_type']=='train'].drop(['data_type', 'era'], axis=1)
train_eras = raw_train[raw_train.loc[:,'data_type']=='train']['era']

valid = raw_test[raw_test.loc[:,'data_type']=='validation'].drop(['data_type', 'era'], axis=1)
val_eras = raw_test[raw_test.loc[:,'data_type']=='validation']['era']

test = raw_test[raw_test.loc[:,'data_type']=='test'].drop(['data_type', 'target', 'era'], axis=1)
live = raw_test[raw_test.loc[:,'data_type']=='live'].drop(['data_type', 'target', 'era'], axis=1)

In [7]:
x_train = train.drop(['target'], axis=1)
x_val = valid.drop(['target'], axis=1)
y_train = train['target']
y_val = valid['target']

## Data Preprocessing

In [8]:
def score_sk_model(mod, x_val_pca, y_val, val_eras):    
    print('Logloss: ' + str(log_loss(y_val.values, mod.predict_proba(x_val_pca))))
    val_logo = LeaveOneGroupOut()
    scores = []
    fail = 0
    guessing = -log(.5)
    for _, index in val_logo.split(x_val_pca, y_val.values, val_eras.values):
        score = log_loss(y_val.iloc[index].values, mod.predict_proba(x_val_pca[index,:]))
        #score = log_loss(y_val.iloc[index].values, mod.predict(x_val_pca[index,:]))
        print(score)
        if(score > guessing):
            fail += 1
    print(fail / 12.0)

In [9]:
pca = PCA(n_components=4)
x_train_pca = pca.fit_transform(x_train.values)
#x_train_pca = x_train.values
x_val_pca = pca.transform(x_val.values)
#x_val_pca = x_val.values

# logo = LeaveOneGroupOut()
# cv = logo.split(x_train_pca, y_train.values, eras.values)

In [74]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(x_train.values)
x_train_scale = scaler.transform(x_train.values)
x_val_scale = scaler.transform(x_val.values)

## Random Forest Model

In [25]:
# est: 85, md: 5, ml:1

param_grid_1 = {'n_estimators': [100],
              'max_depth': [5, 6, 7] }

rf_1 = RandomForestClassifier(random_state=42, verbose=1, n_jobs=-1)

#grid_rf_1 = ParameterGrid()
grid_rf_1 = GridSearchCV(rf_1, param_grid=param_grid_1, cv=2)

In [80]:
# est: 85, md: 5, ml:1

param_grid_2 = {'n_estimators': [10, 120, 200],
              'max_depth': [6, 12, 20] }

rf_2 = RandomForestClassifier(random_state=42, verbose=1, n_jobs=-1)

#grid_rf_1 = ParameterGrid()
grid_rf_2 = GridSearchCV(rf_2, param_grid=param_grid_2, cv=5)

In [26]:
grid_rf_1.fit(x_train.values, y_train.values)
print(grid_rf_1. best_params_)

[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    2.0s finished
[Parallel(n_jobs=24)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=24)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    2.0s finished
[Parallel(n_jobs=24)]: Done   3 tasks      | elapsed:    0.0s
[Parallel(n_jobs=24)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=24)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  95 out of  95 | elapsed:    1.9s finished
[Parallel(n_jobs=24)]: Done  95 out of  95 | elapsed:    0.1s finished
[Parallel(n_jobs=24)]: Done  95 out of  95 | elapsed:    0.1s finished
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done  95 out of  95 | elapse

{'n_estimators': 100, 'max_depth': 6}


[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    4.4s finished


In [81]:
grid_rf_2.fit(x_train.values, y_train.values)
print(grid_rf_2. best_params_)

[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:    0.3s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.3s finished
[Parallel(n_jobs=10)]: Done   2 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=10)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=10)]: Done   2 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=10)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:    0.3s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.3s finished
[Parallel(n_jobs=10)]: Done   2 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=10)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=10)]: Done   2 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=10)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:    0.4s remaining:    0.2s
[Parall

{'n_estimators': 200, 'max_depth': 12}


[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:   16.1s finished


In [27]:
-log(.5)

0.6931471805599453

In [83]:
score_sk_model(grid_rf_2.best_estimator_, x_val.values, y_val, val_eras)

[Parallel(n_jobs=24)]: Done 152 tasks      | elapsed:    0.2s
[Parallel(n_jobs=24)]: Done 200 out of 200 | elapsed:    0.2s finished


Logloss: 0.69312914851


[Parallel(n_jobs=24)]: Done 152 tasks      | elapsed:    0.1s
[Parallel(n_jobs=24)]: Done 200 out of 200 | elapsed:    0.2s finished


0.691979078755


[Parallel(n_jobs=24)]: Done 152 tasks      | elapsed:    0.1s
[Parallel(n_jobs=24)]: Done 200 out of 200 | elapsed:    0.2s finished


0.693705498676


[Parallel(n_jobs=24)]: Done 152 tasks      | elapsed:    0.1s
[Parallel(n_jobs=24)]: Done 200 out of 200 | elapsed:    0.2s finished


0.693765819997


[Parallel(n_jobs=24)]: Done 152 tasks      | elapsed:    0.1s
[Parallel(n_jobs=24)]: Done 200 out of 200 | elapsed:    0.2s finished


0.692900223006


[Parallel(n_jobs=24)]: Done 152 tasks      | elapsed:    0.1s
[Parallel(n_jobs=24)]: Done 200 out of 200 | elapsed:    0.2s finished


0.693902010011


[Parallel(n_jobs=24)]: Done 152 tasks      | elapsed:    0.1s
[Parallel(n_jobs=24)]: Done 200 out of 200 | elapsed:    0.2s finished


0.692310153744


[Parallel(n_jobs=24)]: Done 152 tasks      | elapsed:    0.1s
[Parallel(n_jobs=24)]: Done 200 out of 200 | elapsed:    0.2s finished


0.69586554289


[Parallel(n_jobs=24)]: Done 152 tasks      | elapsed:    0.1s
[Parallel(n_jobs=24)]: Done 200 out of 200 | elapsed:    0.2s finished


0.69518161862


[Parallel(n_jobs=24)]: Done 152 tasks      | elapsed:    0.1s
[Parallel(n_jobs=24)]: Done 200 out of 200 | elapsed:    0.2s finished


0.691801856541


[Parallel(n_jobs=24)]: Done 152 tasks      | elapsed:    0.1s
[Parallel(n_jobs=24)]: Done 200 out of 200 | elapsed:    0.2s finished


0.690626149927


[Parallel(n_jobs=24)]: Done 152 tasks      | elapsed:    0.1s
[Parallel(n_jobs=24)]: Done 200 out of 200 | elapsed:    0.2s finished
[Parallel(n_jobs=24)]: Done   8 tasks      | elapsed:    0.0s


0.691994542269
0.69338083111
0.5


[Parallel(n_jobs=24)]: Done 152 tasks      | elapsed:    0.1s
[Parallel(n_jobs=24)]: Done 200 out of 200 | elapsed:    0.2s finished


In [82]:
rf_1 = grid_rf_1.best_estimator_
rf_2 = grid_rf_2.best_estimator_

## SGD Classifier

In [75]:
grid_1 = {  }

sgd_1 = SGDClassifier(random_state=42, verbose=0, n_jobs=-1, loss='log', learning_rate='optimal')

grid_sgd_1 = GridSearchCV(sgd_1, param_grid=grid_1, cv=6)

In [76]:
grid_sgd_1.fit(x_train_scale, y_train.values)
print(grid_sgd_1. best_params_)

{}


In [77]:
sgd = grid_sgd_1.best_estimator_

In [78]:
score_sk_model(sgd, x_val.values, y_val, val_eras)

Logloss: 0.694700828091
0.694763506654
0.694914727404
0.696756113421
0.697200999946
0.695413070214
0.695137212918
0.694162390565
0.695230936429
0.693007787975
0.691519728511
0.69458342587
0.693676307389
0.833333333333


## Keras Model

In [35]:
def score_keras_model(mod, x_val, y_val, val_eras):    
    
    print('Logloss: ' + str(log_loss(y_val.values, mod.predict(x_val.values))))
    
    val_logo = LeaveOneGroupOut()
    scores = []
    fail = 0
    guessing = -log(.5)
    
    for _, index in val_logo.split(x_val.values, y_val.values, val_eras.values):
        score = log_loss(y_val.iloc[index].values, mod.predict(x_val.values[index,:]))
        print(score)
        if(score > guessing):
            fail += 1
    print(fail / 12.0)

In [54]:
def compile_nn(input_dim):
    
    model = Sequential()    
    model.add(Dense(512, input_dim=input_dim, activation='relu', init='normal', W_regularizer=l2(0.001)))    
    model.add(Dropout(0.4))
    
    for i in range(0,4):        
        model.add(Dense(512, activation='relu', init='normal', W_regularizer=l2(0.001)))
        model.add(Dropout(0.4))
    
    model.add(Dense(1, activation='sigmoid'))

    model.compile(loss='binary_crossentropy',
                  optimizer='rmsprop',
                  metrics=['accuracy'])
    
    return model

nn = compile_nn(x_train.shape[1])

In [79]:
nn.fit(x_train.values, y_train.values, validation_data=(x_val.values, y_val.values),
           nb_epoch=10, batch_size=500, verbose=2)

Train on 108405 samples, validate on 16686 samples
Epoch 1/10
14s - loss: 0.6934 - acc: 0.5040 - val_loss: 0.6935 - val_acc: 0.4982
Epoch 2/10
14s - loss: 0.6934 - acc: 0.5041 - val_loss: 0.6935 - val_acc: 0.4982
Epoch 3/10
14s - loss: 0.6934 - acc: 0.5041 - val_loss: 0.6935 - val_acc: 0.4982
Epoch 4/10
14s - loss: 0.6934 - acc: 0.5041 - val_loss: 0.6934 - val_acc: 0.4982
Epoch 5/10
14s - loss: 0.6934 - acc: 0.5035 - val_loss: 0.6935 - val_acc: 0.4982
Epoch 6/10
14s - loss: 0.6934 - acc: 0.5036 - val_loss: 0.6935 - val_acc: 0.4982
Epoch 7/10
14s - loss: 0.6934 - acc: 0.5040 - val_loss: 0.6935 - val_acc: 0.4982
Epoch 8/10
14s - loss: 0.6934 - acc: 0.5031 - val_loss: 0.6935 - val_acc: 0.4982
Epoch 9/10
14s - loss: 0.6934 - acc: 0.5038 - val_loss: 0.6935 - val_acc: 0.4982
Epoch 10/10
14s - loss: 0.6934 - acc: 0.5036 - val_loss: 0.6935 - val_acc: 0.4982


<keras.callbacks.History at 0x2b45c4f3a850>

In [53]:
score_keras_model(nn, x_val, y_val, val_eras)

Logloss: 0.693223366556
0.693122177049
0.693196973545
0.692975956408
0.693058342165
0.693136847739
0.692947238747
0.693346140927
0.693254197692
0.693596618931
0.693635535614
0.693149273616
0.693251757821
0.583333333333


## Voting Ensemble between models

In [93]:
def pick_extreme(row):
    
    #print(row[0])
    
    max_vote = np.max(row)
    min_vote = np.min(row)
    
    vote = 0.0
    
    if((max_vote >= .5) and (min_vote >= .5)):
        vote = max_vote
        
    elif((max_vote <= .5) and (min_vote <= .5)):
        vote = min_vote
        
    else:
        vote = np.mean
        
    return vote

In [94]:
# must switch to validation for scoring
rf_1_preds = pd.Series(rf_1.predict_proba(x_val.values)[:,1], index=y_val.index)
#rf_2_preds = pd.Series(rf_2.predict_proba(x_val_pca)[:,1], index=y_val.index)
nn_preds = pd.Series(nn.predict_proba(x_val.values)[:,0], index=y_val.index)

ens = pd.concat([rf_1_preds, nn_preds], axis=1)
ens.columns = ['rf_1', 'nn']

vote = ens.apply(func=np.mean, axis=1)
#vote = ens.mean()

ens['vote'] = vote
ens['target'] = y_val
ens.describe()

Unnamed: 0,rf_1,rf_2,target
count,16686.0,16686.0,16686.0
mean,0.504675,0.504718,0.498202
std,0.01611,0.019327,0.500012
min,0.321381,0.405391,0.0
25%,0.495364,0.495181,0.0
50%,0.505039,0.508478,0.0
75%,0.515639,0.517689,1.0
max,0.618494,0.560419,1.0


In [95]:
ens.head()

Unnamed: 0_level_0,rf_1,rf_2,vote,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
535632,0.50973,0.501092,0.50973,1.0
526056,0.492063,0.498484,0.492063,1.0
621500,0.505847,0.509371,0.509371,0.0
561590,0.482536,0.485528,0.482536,1.0
523587,0.515976,0.500454,0.515976,1.0


In [96]:
def score_ensemble(ens, val_eras):
    print('Logloss: ' + str(log_loss(ens['target'].values, ens['vote'].values)))
    val_logo = LeaveOneGroupOut()
    scores = []
    fail = 0
    guessing = -log(.5)
    for _, index in val_logo.split(ens['vote'].values, ens['target'].values, val_eras.values):
        score = log_loss(ens['target'].iloc[index].values, ens['vote'].iloc[index].values)
        print(score)
        if(score > guessing):
            fail += 1
                
    print(fail / 12.0)
                
score_ensemble(ens, val_eras)

TypeError: float() argument must be a string or a number

## Submission

In [39]:
to_pred = pd.concat([x_val,test,live])

# SK Learn model
#pred_pca = pca.transform(to_pred.values)
#rf_1_preds = pd.Series(rf_1.predict_proba(to_pred.values)[:,1])
#rf_2_preds = pd.Series(rf_2.predict_proba(pred_pca)[:,1])

#Keras model
nn_preds = pd.Series(nn.predict(to_pred.values)[:,0])

#to_vote = pd.concat([rf_1_preds, rf_2_preds], axis=1)

# Ensemble Tree
#ens_preds = pd.Series(tree.predict_proba(to_vote.values)[:,1])
#ens_preds = to_vote.mean(axis=1)
ens_preds = nn_preds

In [40]:
ens_preds.describe()

count    45630.000000
mean         0.503124
std          0.004284
min          0.498657
25%          0.499918
50%          0.502135
75%          0.505005
max          0.534094
dtype: float64

In [33]:
sub = pd.concat([pd.Series(to_pred.index), pd.Series(ens_preds)], axis=1)
sub.columns = ['id', 'probability']
sub.head()

Unnamed: 0,id,probability
0,96144,0.517328
1,17982,0.52337
2,96161,0.486145
3,53895,0.494751
4,7267,0.510167


In [34]:
sub.to_csv('sub_62_1_rf.csv', index=False)