## Load Environments

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from math import log

In [2]:
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
from sklearn.metrics import log_loss
from sklearn.model_selection import GridSearchCV, LeaveOneGroupOut

In [3]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

## Read in raw data sets, training, tournament (val, test, live)

In [4]:
raw_train = pd.read_csv('data/train_62.csv', index_col=0)
raw_test = pd.read_csv('data/test_62.csv', index_col=0)

In [5]:
raw_train.head()

Unnamed: 0_level_0,era,data_type,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,...,feature13,feature14,feature15,feature16,feature17,feature18,feature19,feature20,feature21,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
30869,era1,train,0.80581,0.59023,0.33846,0.66893,0.60931,0.44031,0.54041,0.19611,...,0.71156,0.44718,0.49803,0.65204,0.2991,0.33321,0.62169,0.30362,0.40045,1
153709,era1,train,0.54412,0.49768,0.63767,0.45867,0.34125,0.5188,0.66379,0.39711,...,0.41867,0.69744,0.60291,0.42221,0.57828,0.63202,0.59929,0.48143,0.50796,1
39100,era1,train,0.5202,0.38561,0.73734,0.28803,0.23947,0.28526,0.8193,0.40533,...,0.32548,0.85534,0.74794,0.29552,0.66491,0.74581,0.78279,0.52793,0.58432,1
146072,era1,train,0.25519,0.40359,0.83019,0.31328,0.26406,0.56522,0.50346,0.6929,...,0.18752,0.73311,0.54077,0.19624,0.80862,0.81713,0.45656,0.66848,0.51978,1
140880,era1,train,0.60276,0.43162,0.55852,0.41156,0.42314,0.28048,0.59067,0.41218,...,0.47205,0.60394,0.47279,0.47331,0.5221,0.58061,0.54359,0.52048,0.53967,1


In [8]:
train = raw_train[raw_train.loc[:,'data_type']=='train'].drop(['data_type', 'era'], axis=1)
train_eras = raw_train[raw_train.loc[:,'data_type']=='train']['era']

valid = raw_test[raw_test.loc[:,'data_type']=='validation'].drop(['data_type', 'era'], axis=1)
val_eras = raw_test[raw_test.loc[:,'data_type']=='validation']['era']

test = raw_test[raw_test.loc[:,'data_type']=='test'].drop(['data_type', 'target', 'era'], axis=1)
live = raw_test[raw_test.loc[:,'data_type']=='live'].drop(['data_type', 'target', 'era'], axis=1)

In [9]:
x_train = train.drop(['target'], axis=1)
x_val = valid.drop(['target'], axis=1)
y_train = train['target']
y_val = valid['target']

## Data Preprocessing

In [10]:
def score_sk_model(mod, x_val_pca, y_val, val_eras):    
    print('Logloss: ' + str(log_loss(y_val.values, mod.predict_proba(x_val_pca))))
    val_logo = LeaveOneGroupOut()
    scores = []
    fail = 0
    guessing = -log(.5)
    for _, index in val_logo.split(x_val_pca, y_val.values, val_eras.values):
        score = log_loss(y_val.iloc[index].values, mod.predict_proba(x_val_pca[index,:]))
        print(score)
        if(score > guessing):
            fail += 1
    print(fail / 12.0)

In [11]:
pca = PCA(n_components=7)
x_train_pca = pca.fit_transform(x_train.values)
#x_train_pca = x_train.values
x_val_pca = pca.transform(x_val.values)
#x_val_pca = x_val.values

# logo = LeaveOneGroupOut()
# cv = logo.split(x_train_pca, y_train.values, eras.values)

## Random Forest Model

In [38]:
# est: 85, md: 5, ml:1

param_grid_1 = {'n_estimators': [80],
              'max_depth': [7, 8] }

clf_1 = RandomForestClassifier(random_state=42)

grid_clf_1 = GridSearchCV(clf_1, param_grid=param_grid_1, cv=5)

In [39]:
# est: 85, md: 5, ml:1

param_grid_2 = {'n_estimators': [100, 120],
              'max_depth': [6] }

clf_2 = RandomForestClassifier(random_state=42)

grid_clf_2 = GridSearchCV(clf_2, param_grid=param_grid_2, cv=5)

In [40]:
grid_clf_1.fit(x_train.values, y_train.values)
print(grid_clf_1. best_params_)

{'n_estimators': 80, 'max_depth': 7}


In [41]:
grid_clf_2.fit(x_train_pca, y_train.values)
print(grid_clf_2. best_params_)

{'n_estimators': 120, 'max_depth': 6}


In [42]:
score_sk_model(grid_clf_1.best_estimator_, x_val.values, y_val, val_eras)

Logloss: 0.692532315799
0.692175421212
0.692359910455
0.692504542591
0.692532377914
0.692499693782
0.69219797567
0.694310042667
0.69338462236
0.691294813804
0.691555839973
0.692888058728
0.692637659196
0.166666666667


In [43]:
score_sk_model(grid_clf_2.best_estimator_, x_val_pca, y_val, val_eras)

Logloss: 0.692358821252
0.691887040127
0.690162663113
0.695965829119
0.69637016538
0.691718777066
0.6922198876
0.695817290271
0.69365445496
0.690877916363
0.69006242186
0.689105588616
0.690104485242
0.333333333333


In [44]:
rf_1 = grid_clf_1.best_estimator_
rf_2 = grid_clf_2.best_estimator_

## Keras Model

In [27]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.optimizers import SGD, Adam

Using TensorFlow backend.


In [28]:
def score_keras_model(mod, x_val, y_val, val_eras):    
    
    print('Logloss: ' + str(log_loss(y_val.values, mod.predict(x_val.values))))
    
    val_logo = LeaveOneGroupOut()
    scores = []
    fail = 0
    guessing = -log(.5)
    
    for _, index in val_logo.split(x_val.values, y_val.values, val_eras.values):
        score = log_loss(y_val.iloc[index].values, mod.predict(x_val.values[index,:]))
        print(score)
        if(score > guessing):
            fail += 1
    print(fail / 12.0)

In [41]:
def compile_nn(input_dim):
    
    model = Sequential()    
    model.add(Dense(512, input_dim=input_dim, activation='relu', init='normal'))    
    model.add(Dropout(0.1))
    
    for i in range(0,3):        
        model.add(Dense(512, activation='relu', init='normal'))
        model.add(Dropout(0.1))
    
    model.add(Dense(1, activation='sigmoid'))

    model.compile(loss='binary_crossentropy',
                  optimizer='rmsprop',
                  metrics=['accuracy'])
    
    return model

nn = compile_nn(x_train.shape[1])

In [48]:
nn.fit(x_train.values, y_train.values, validation_data=(x_val.values, y_val.values),
           nb_epoch=15, batch_size=64, verbose=2)

Train on 108405 samples, validate on 16686 samples
Epoch 1/15
23s - loss: 0.6930 - acc: 0.5077 - val_loss: 0.6928 - val_acc: 0.5125
Epoch 2/15
23s - loss: 0.6930 - acc: 0.5068 - val_loss: 0.6930 - val_acc: 0.5077
Epoch 3/15
23s - loss: 0.6930 - acc: 0.5090 - val_loss: 0.6929 - val_acc: 0.5123
Epoch 4/15
23s - loss: 0.6929 - acc: 0.5087 - val_loss: 0.6932 - val_acc: 0.4982
Epoch 5/15
23s - loss: 0.6930 - acc: 0.5084 - val_loss: 0.6928 - val_acc: 0.5149
Epoch 6/15
23s - loss: 0.6930 - acc: 0.5079 - val_loss: 0.6930 - val_acc: 0.5077
Epoch 7/15
23s - loss: 0.6929 - acc: 0.5062 - val_loss: 0.6928 - val_acc: 0.5127
Epoch 8/15
23s - loss: 0.6930 - acc: 0.5063 - val_loss: 0.6931 - val_acc: 0.4982
Epoch 9/15
23s - loss: 0.6930 - acc: 0.5097 - val_loss: 0.6928 - val_acc: 0.5143
Epoch 10/15
23s - loss: 0.6930 - acc: 0.5083 - val_loss: 0.6928 - val_acc: 0.5122
Epoch 11/15
23s - loss: 0.6929 - acc: 0.5104 - val_loss: 0.6928 - val_acc: 0.5132
Epoch 12/15
23s - loss: 0.6929 - acc: 0.5075 - val_loss:

<keras.callbacks.History at 0x2b8f95a73550>

In [50]:
score_keras_model(nn, x_val, y_val, val_eras)

Logloss: 0.692844254414
0.692775817342
0.693034849444
0.693521662513
0.693324561479
0.693058599557
0.69291375513
0.69266847655
0.692740625415
0.692540373578
0.691682733661
0.692791734761
0.693088249177
0.166666666667


## Voting Ensemble between models

In [93]:
def pick_extreme(row):
    
    #print(row[0])
    
    max_vote = np.max(row)
    min_vote = np.min(row)
    
    vote = 0.0
    
    if((max_vote >= .5) and (min_vote >= .5)):
        vote = max_vote
        #vote = 1
    elif((max_vote <= .5) and (min_vote <= .5)):
        vote = min_vote
        #vote = 0
    else:
        vote = np.mean
        
    return vote

In [94]:
# must switch to validation for scoring
rf_1_preds = pd.Series(rf_1.predict_proba(x_val.values)[:,1], index=y_val.index)
rf_2_preds = pd.Series(rf_2.predict_proba(x_val_pca)[:,1], index=y_val.index)

ens = pd.concat([rf_1_preds, rf_2_preds], axis=1)
ens.columns = ['rf_1', 'rf_2']

vote = ens.apply(func=pick_extreme, axis=1)

ens['vote'] = vote
ens['target'] = y_val
ens.describe()

Unnamed: 0,rf_1,rf_2,target
count,16686.0,16686.0,16686.0
mean,0.504675,0.504718,0.498202
std,0.01611,0.019327,0.500012
min,0.321381,0.405391,0.0
25%,0.495364,0.495181,0.0
50%,0.505039,0.508478,0.0
75%,0.515639,0.517689,1.0
max,0.618494,0.560419,1.0


In [95]:
ens.head()

Unnamed: 0_level_0,rf_1,rf_2,vote,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
535632,0.50973,0.501092,0.50973,1.0
526056,0.492063,0.498484,0.492063,1.0
621500,0.505847,0.509371,0.509371,0.0
561590,0.482536,0.485528,0.482536,1.0
523587,0.515976,0.500454,0.515976,1.0


In [96]:
def score_ensemble(ens, val_eras):
    print('Logloss: ' + str(log_loss(ens['target'].values, ens['vote'].values)))
    val_logo = LeaveOneGroupOut()
    scores = []
    fail = 0
    guessing = -log(.5)
    for _, index in val_logo.split(ens['vote'].values, ens['target'].values, val_eras.values):
        score = log_loss(ens['target'].iloc[index].values, ens['vote'].iloc[index].values)
        print(score)
        if(score > guessing):
            fail += 1
                
    print(fail / 12.0)
                
score_ensemble(ens, val_eras)

TypeError: float() argument must be a string or a number

## Submission

In [65]:
to_pred = pd.concat([x_val,test,live])

# SK Learn model
pred_pca = pca.transform(to_pred.values)
rf_1_preds = pd.Series(rf_1.predict_proba(to_pred.values)[:,1])
rf_2_preds = pd.Series(rf_2.predict_proba(pred_pca)[:,1])

#Keras model
#nn_preds = pd.Series(nn.predict(to_pred.values)[:,0])

to_vote = pd.concat([rf_1_preds, rf_2_preds], axis=1)

# Ensemble Tree
#ens_preds = pd.Series(tree.predict_proba(to_vote.values)[:,1])
ens_preds = to_vote.mean(axis=1)

In [66]:
ens_preds.describe()

count    45625.000000
mean         0.504611
std          0.015593
min          0.392300
25%          0.496025
50%          0.505737
75%          0.515293
max          0.585255
dtype: float64

In [67]:
sub = pd.concat([pd.Series(to_pred.index), pd.Series(ens_preds)], axis=1)
sub.columns = ['id', 'probability']
sub.head()

Unnamed: 0,id,probability
0,535632,0.505411
1,526056,0.495273
2,621500,0.507609
3,561590,0.484032
4,523587,0.508215


In [68]:
sub.to_csv('sub22_ens.csv', index=False)