## Load Environments

In [1]:
import warnings; warnings.filterwarnings('ignore')
from __future__ import print_function
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from math import log

In [2]:
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
from sklearn.metrics import log_loss
from sklearn.model_selection import GridSearchCV, LeaveOneGroupOut, ParameterGrid

In [4]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.optimizers import SGD, Adam
from keras.regularizers import l2 

Using TensorFlow backend.


## Read in raw data sets: training and tournament (validation, test, live)

In [31]:
def import_data_sets(train_path='numerai_training_data.csv',
                     tournament_path='numerai_tournament_data.csv'):
    """Load the training file and split the tournament file by ``data_type``.

    Both CSV files are read with their first column as the index, and the
    ``data_type`` column is dropped from every returned frame.

    Parameters
    ----------
    train_path : str
        Path of the training CSV. Defaults to the file name the notebook
        has always used, so calling with no arguments behaves as before.
    tournament_path : str
        Path of the tournament CSV whose ``data_type`` column takes the
        values ``'validation'``, ``'test'`` and ``'live'``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, valid, test, live)``.
    """
    train = pd.read_csv(train_path, index_col=0).drop('data_type', axis=1)
    tournament = pd.read_csv(tournament_path, index_col=0)

    def _subset(data_type):
        # Keep only the rows of one tournament split; the marker column is
        # no longer needed once the rows have been selected.
        rows = tournament.loc[tournament['data_type'] == data_type]
        return rows.drop('data_type', axis=1)

    return (train, _subset('validation'), _subset('test'), _subset('live'))

In [32]:
# Load the training frame and the three tournament splits (validation, test, live).
train, valid, test, live = import_data_sets()

In [33]:
# Preview the first rows of the training frame.
train.head()

Unnamed: 0_level_0,era,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,feature9,...,feature13,feature14,feature15,feature16,feature17,feature18,feature19,feature20,feature21,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
22364,era1,0.52781,0.48414,0.61717,0.41186,0.38068,0.46056,0.60864,0.45759,0.38317,...,0.41597,0.66218,0.5372,0.42039,0.57638,0.62859,0.54002,0.52455,0.51074,1
75590,era1,0.60637,0.53147,0.44517,0.59033,0.58602,0.49229,0.36727,0.42125,0.53139,...,0.57008,0.41445,0.37146,0.53461,0.44034,0.43857,0.43735,0.45014,0.4086,0
86902,era1,0.86798,0.85566,0.18941,0.79383,0.70528,0.45049,0.57506,0.1618,0.41253,...,0.81797,0.4628,0.58799,0.76926,0.16697,0.20779,0.60567,0.13658,0.16671,0
59315,era1,0.53087,0.7022,0.46779,0.51353,0.53306,0.69446,0.45941,0.53112,0.32825,...,0.51404,0.53808,0.39215,0.56777,0.46307,0.50659,0.29862,0.48974,0.36266,0
85510,era1,0.38542,0.52888,0.74849,0.35956,0.24036,0.39117,0.7024,0.55795,0.21672,...,0.26845,0.83703,0.67446,0.30819,0.70442,0.76197,0.55612,0.5333,0.44172,0


In [26]:
# Every column whose name contains "feature" is treated as a model input.
feature_cols = [column for column in train.columns if "feature" in column]

In [36]:
# Model inputs: the feature columns only.
x_train, x_val = train[feature_cols], valid[feature_cols]

# Labels to predict.
y_train, y_val = train['target'], valid['target']

# Era label of every row, kept as plain arrays for per-era grouping.
train_eras, val_eras = train['era'].values, valid['era'].values

In [39]:
def score_keras_model(model, x_val, y_val, eras):
    """Print the overall and per-era validation log loss of a fitted model.

    Parameters
    ----------
    model : object
        Fitted model exposing ``predict(array)`` that returns predicted
        probabilities.
    x_val : pandas.DataFrame
        Validation features; ``x_val.values`` is what gets passed to
        ``model.predict``.
    y_val : pandas.Series
        Validation labels, row-aligned with ``x_val``.
    eras : array-like
        Era (group) label of every validation row.

    Prints
    ------
    ``Logloss: <overall log loss>``, then one log loss per era, and finally
    the fraction of eras whose log loss is worse than ``-log(0.5)``, i.e.
    worse than always predicting 0.5.
    """
    features = x_val.values

    # Score the model and data that were passed in, not notebook globals.
    print('Logloss: ' + str(log_loss(y_val.values, model.predict(features))))

    guessing = -log(.5)
    era_count = 0
    fail = 0

    # LeaveOneGroupOut yields one held-out index set per distinct era; only
    # the held-out side is needed to score that era on its own.
    for _, index in LeaveOneGroupOut().split(x_val, y_val, eras):
        score = log_loss(y_val.iloc[index].values, model.predict(features[index, :]))
        print(score)
        era_count += 1
        if score > guessing:
            fail += 1

    # Divide by the number of eras actually scored rather than a fixed 12.
    print(fail / float(era_count))

## Data Preprocessing

In [38]:
# Project the features onto 8 whitened principal components. The PCA is
# fitted on the training features only and then applied unchanged to the
# validation features.
# NOTE(review): the network in the "Keras Model" section is fitted on the raw
# feature values (`x_train.values`), not on these projections.
pca = PCA(n_components=8, whiten=True)
x_train_pca = pca.fit_transform(x_train.values)
x_val_pca = pca.transform(x_val.values)

## Keras Model

In [43]:
def compile_nn(input_dim):
    """Build and compile the feed-forward binary classifier.

    Architecture: one 24-unit ReLU layer taking ``input_dim`` inputs, then
    four 128-unit ReLU layers, each followed by 30% dropout, and a single
    sigmoid output unit. Every hidden layer uses 'normal' initialisation and
    an L2 weight penalty of 0.001. The model is compiled with binary
    cross-entropy loss, the 'rmsprop' optimizer and an accuracy metric.

    Parameters
    ----------
    input_dim : int
        Number of input features.

    Returns
    -------
    The compiled ``Sequential`` model.
    """
    def add_hidden_block(net, units, **dense_kwargs):
        # One regularised ReLU layer followed by its dropout layer.
        net.add(Dense(units, activation='relu', init='normal',
                      W_regularizer=l2(0.001), **dense_kwargs))
        net.add(Dropout(0.3))

    net = Sequential()
    add_hidden_block(net, 24, input_dim=input_dim)
    for _ in range(4):
        add_hidden_block(net, 128)

    net.add(Dense(1, activation='sigmoid'))
    net.compile(optimizer='rmsprop',
                loss='binary_crossentropy',
                metrics=['accuracy'])
    return net

# Build the network with one input per feature column of `x_train`.
nn = compile_nn(x_train.shape[1])

In [42]:
# Train for a single epoch on the raw features, reporting validation metrics
# after the epoch.
nn.fit(
    x_train.values,
    y_train.values,
    batch_size=256,
    nb_epoch=1,
    verbose=2,
    validation_data=(x_val.values, y_val.values),
)

Train on 108405 samples, validate on 16686 samples
Epoch 1/1


KeyboardInterrupt: 

In [53]:
# Report validation log loss for the network, overall and per era.
score_keras_model(nn, x_val, y_val, val_eras)

Logloss: 0.693223366556
0.693122177049
0.693196973545
0.692975956408
0.693058342165
0.693136847739
0.692947238747
0.693346140927
0.693254197692
0.693596618931
0.693635535614
0.693149273616
0.693251757821
0.583333333333


## Submission

In [39]:
# Rows to score: validation, test and live, in that order. `test` and `live`
# still carry the non-feature columns (`era`, `target`), so restrict them to
# `feature_cols`; otherwise the concatenated frame contains extra, non-numeric
# columns and no longer matches the inputs the network was built for.
to_pred = pd.concat([x_val, test[feature_cols], live[feature_cols]])

# Keras model: the network has a single output unit, so column 0 of the
# prediction array holds the predicted probability for every row.
nn_preds = pd.Series(nn.predict(to_pred.values)[:, 0])

# Only one model is in play, so the "ensemble" is simply the network's output.
ens_preds = nn_preds

In [40]:
# Summary statistics of the predicted probabilities, as a quick check on their spread.
ens_preds.describe()

count    45630.000000
mean         0.503124
std          0.004284
min          0.498657
25%          0.499918
50%          0.502135
75%          0.505005
max          0.534094
dtype: float64

In [33]:
# Pair every row id with its predicted probability; `keys` names the two
# resulting columns directly.
sub = pd.concat(
    [pd.Series(to_pred.index), pd.Series(ens_preds)],
    axis=1,
    keys=['id', 'probability'],
)
sub.head()

Unnamed: 0,id,probability
0,96144,0.517328
1,17982,0.52337
2,96161,0.486145
3,53895,0.494751
4,7267,0.510167


In [34]:
# Write the submission file without the positional index column.
# NOTE(review): the file name mentions "rf", but `ens_preds` comes from the
# Keras network - confirm the intended name before submitting.
sub.to_csv('sub_62_1_rf.csv', index=False)