## Load Environments

In [1]:
%matplotlib inline

from __future__ import print_function
from __future__ import division
import warnings; warnings.filterwarnings('ignore')

import time
import random
from math import log

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

sns.set_style('white')
sns.set_context('notebook', font_scale=2)
random.seed(125)
np.random.seed(137)

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.metrics import log_loss
from sklearn.model_selection import GridSearchCV, LeaveOneGroupOut, ParameterGrid
from sklearn.manifold import TSNE
from sklearn.pipeline import make_pipeline, Pipeline

## Read in raw data sets, training, tournament (val, test, live)

In [3]:
!ls /input

numerai_tournament_data.csv  numerai_training_data.csv


In [4]:
def import_data_sets():
    train = pd.read_csv('/input/numerai_training_data.csv', index_col=0).drop('data_type', axis=1)
    df = pd.read_csv('/input/numerai_tournament_data.csv', index_col=0)
    valid = df.loc[df['data_type']=='validation'].drop('data_type', axis=1)
    test = df.loc[df['data_type']=='test'].drop('data_type', axis=1)
    live = df.loc[df['data_type']=='live'].drop('data_type', axis=1)
    return(train, valid, test, live)

In [5]:
train, valid, test, live = import_data_sets()

In [6]:
feature_cols = [f for f in train.columns if "feature" in f]

In [7]:
x_train = train[feature_cols]
x_val = valid[feature_cols]
x_test = test[feature_cols]
x_live = live[feature_cols]
y_train = train['target']
y_val = valid['target']


train_eras = train['era'].values
val_eras = valid['era'].values

In [8]:
def score_model(model, x_val, y_val, eras):    
    
    print('Logloss: ' + str(log_loss(y_val.values, model.predict_proba(x_val.values))))
    
    val_logo = LeaveOneGroupOut()
    scores = []
    fail = 0
    guessing = -log(.5)
    
    print('Guessing gets you: ' + str(guessing))
    
    for _, index in val_logo.split(x_val, y_val, eras):
        
        score = log_loss(y_val.iloc[index].values, model.predict_proba(x_val.iloc[index].values))
        
        print(score)
        
        if(score > guessing):
            fail += 1
            
    print(fail / 12.0)

## Grid Search CV Pipeline - Logistic Regression

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler

In [15]:
pipe = Pipeline([
    ('scaler', MinMaxScaler()),
    ('reduce_dim', PCA()),
    ('classify', LogisticRegression(penalty="l2"))
])

#N_DIM = [15, 16, 17]
N_DIM = [16]
#C_OPTIONS = [0.01, 0.001, 0.0001]
C_OPTIONS = [0.001]

param_grid = [{
        'reduce_dim__n_components': N_DIM,
        'classify__C': C_OPTIONS
    }]

lr_grid = GridSearchCV(pipe, cv=5, n_jobs=1, param_grid=param_grid, verbose=2, scoring='neg_log_loss')

In [16]:
lr_grid.fit(x_train.values, y_train.values)

lr_model = lr_grid.best_estimator_
print(lr_grid.best_params_)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV] classify__C=0.01, reduce_dim__n_components=15 ...................
[CV] .... classify__C=0.01, reduce_dim__n_components=15, total=   4.7s
[CV] classify__C=0.01, reduce_dim__n_components=15 ...................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    5.0s remaining:    0.0s


[CV] .... classify__C=0.01, reduce_dim__n_components=15, total=   4.5s
[CV] classify__C=0.01, reduce_dim__n_components=15 ...................
[CV] .... classify__C=0.01, reduce_dim__n_components=15, total=   4.4s
[CV] classify__C=0.01, reduce_dim__n_components=15 ...................
[CV] .... classify__C=0.01, reduce_dim__n_components=15, total=   4.4s
[CV] classify__C=0.01, reduce_dim__n_components=15 ...................
[CV] .... classify__C=0.01, reduce_dim__n_components=15, total=   4.3s
[CV] classify__C=0.01, reduce_dim__n_components=16 ...................
[CV] .... classify__C=0.01, reduce_dim__n_components=16, total=   4.5s
[CV] classify__C=0.01, reduce_dim__n_components=16 ...................
[CV] .... classify__C=0.01, reduce_dim__n_components=16, total=   4.6s
[CV] classify__C=0.01, reduce_dim__n_components=16 ...................
[CV] .... classify__C=0.01, reduce_dim__n_components=16, total=   4.5s
[CV] classify__C=0.01, reduce_dim__n_components=16 ...................
[CV] .

[Parallel(n_jobs=1)]: Done  45 out of  45 | elapsed:  3.5min finished


{'classify__C': 0.001, 'reduce_dim__n_components': 16}


In [21]:
from sklearn.externals import joblib

joblib.dump(lr_model, 'lr_model.pkl')
lr_model = joblib.load('lr_model.pkl') 

In [17]:
score_model(lr_model, x_val, y_val, val_eras)

Logloss: 0.6925321039
Guessing gets you: 0.6931471805599453
0.692708306802
0.691789774824
0.692840669709
0.691724231983
0.692949258116
0.691884858088
0.693575076127
0.692723084604
0.690784376419
0.692858722031
0.692621439188
0.693991242519
0.16666666666666666


## GridSearch CV Pipeline - XGBoost Classifier

In [18]:
from xgboost import XGBClassifier

In [None]:
#idx = np.random.choice(len(x_train_pca), size=90000, replace=False)

In [19]:
pipe = Pipeline([
    ('scaler', MinMaxScaler()),
    ('reduce_dim', PCA(n_components=16)),
    ('classify', XGBClassifier(eval_metric='logloss', n_jobs=-1, silent=False))
])

N_EST = [50, 60]
LR = [0.01, 0.1]
DEPTH = [10]

param_grid = [{
        'classify__n_estimators': N_EST,
        'classify__learning_rate': LR,
        'classify__max_depth': DEPTH        
    }]

xgb_grid = GridSearchCV(pipe, cv=3, n_jobs=1, param_grid=param_grid, verbose=2, scoring='neg_log_loss')

In [20]:
xgb_grid.fit(x_train.values, y_train.values)

xgb_model = xgb_grid.best_estimator_
print(xgb_grid.best_params_)

Fitting 3 folds for each of 27 candidates, totalling 81 fits
[CV] classify__max_depth=10, classify__learning_rate=0.01, classify__n_estimators=10 
[CV]  classify__max_depth=10, classify__learning_rate=0.01, classify__n_estimators=10, total=   5.5s
[CV] classify__max_depth=10, classify__learning_rate=0.01, classify__n_estimators=10 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    5.8s remaining:    0.0s


[CV]  classify__max_depth=10, classify__learning_rate=0.01, classify__n_estimators=10, total=   5.7s
[CV] classify__max_depth=10, classify__learning_rate=0.01, classify__n_estimators=10 
[CV]  classify__max_depth=10, classify__learning_rate=0.01, classify__n_estimators=10, total=   5.8s
[CV] classify__max_depth=10, classify__learning_rate=0.01, classify__n_estimators=25 
[CV]  classify__max_depth=10, classify__learning_rate=0.01, classify__n_estimators=25, total=   9.7s
[CV] classify__max_depth=10, classify__learning_rate=0.01, classify__n_estimators=25 
[CV]  classify__max_depth=10, classify__learning_rate=0.01, classify__n_estimators=25, total=   9.9s
[CV] classify__max_depth=10, classify__learning_rate=0.01, classify__n_estimators=25 
[CV]  classify__max_depth=10, classify__learning_rate=0.01, classify__n_estimators=25, total=   9.9s
[CV] classify__max_depth=10, classify__learning_rate=0.01, classify__n_estimators=50 
[CV]  classify__max_depth=10, classify__learning_rate=0.01, class

[CV]  classify__max_depth=30, classify__learning_rate=0.001, classify__n_estimators=10, total=  11.3s
[CV] classify__max_depth=30, classify__learning_rate=0.001, classify__n_estimators=10 
[CV]  classify__max_depth=30, classify__learning_rate=0.001, classify__n_estimators=10, total=  11.6s
[CV] classify__max_depth=30, classify__learning_rate=0.001, classify__n_estimators=10 
[CV]  classify__max_depth=30, classify__learning_rate=0.001, classify__n_estimators=10, total=  11.6s
[CV] classify__max_depth=30, classify__learning_rate=0.001, classify__n_estimators=25 
[CV]  classify__max_depth=30, classify__learning_rate=0.001, classify__n_estimators=25, total=  24.1s
[CV] classify__max_depth=30, classify__learning_rate=0.001, classify__n_estimators=25 
[CV]  classify__max_depth=30, classify__learning_rate=0.001, classify__n_estimators=25, total=  25.0s
[CV] classify__max_depth=30, classify__learning_rate=0.001, classify__n_estimators=25 
[CV]  classify__max_depth=30, classify__learning_rate=0

[Parallel(n_jobs=1)]: Done  81 out of  81 | elapsed: 27.1min finished


{'classify__max_depth': 10, 'classify__learning_rate': 0.01, 'classify__n_estimators': 50}


In [22]:
joblib.dump(xgb_model, 'xgb_model.pkl')
xgb_model = joblib.load('xgb_model.pkl') 

['xgb_model.pkl']

In [23]:
score_model(xgb_model, x_val, y_val, val_eras)

Logloss: 0.69285381285
Guessing gets you: 0.6931471805599453
0.693066785738
0.69238226394
0.693468806384
0.692503554093
0.69279399785
0.692856942409
0.693158837057
0.693184564591
0.691992692898
0.692502629215
0.692662535111
0.693736877074
0.3333333333333333


## Random Forest Model

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
rf = RandomForestClassifier(n_estimators=80, min_samples_split=10, max_depth=40, n_jobs=-1,
                            verbose=0).fit(x_train_pca.values, y_train.values)

In [None]:
score_model(rf, x_val_pca, y_val, val_eras)

## Ensemble

In [29]:
to_pred = pd.concat([x_val, x_test, x_live])

xgb_preds = pd.Series(xgb_model.predict_proba(to_pred.values)[:,0])
lr_preds = pd.Series(lr_model.predict_proba(to_pred.values)[:,0])

to_vote = pd.concat([xgb_preds, lr_preds], axis=1)

# Ensemble Tree
ens_preds = to_vote.mean(axis=1)
#ens_preds = lr_preds

In [30]:
ens_preds.describe()

count    243686.000000
mean          0.500042
std           0.014829
min           0.441034
25%           0.489719
50%           0.499809
75%           0.510203
max           0.560941
dtype: float64

In [31]:
sub = pd.concat([pd.Series(to_pred.index), pd.Series(ens_preds)], axis=1)
sub.columns = ['id', 'probability']
sub.head()

Unnamed: 0,id,probability
0,n0ccd86bc449a493,0.516402
1,n835f1ade074f496,0.493559
2,ne64378a45fbc4b7,0.507942
3,n1ed84cd94d77407,0.539599
4,n0e26cff583d9458,0.506431


In [41]:
pred_path = 'sub_sk_88_1.csv'

sub.to_csv(pred_path, index=False)

In [37]:
from numeraiapi.numerapi import NumerAPI

In [33]:
key = 'HOZSYDP45H7FCVV3N2QWRZVXTRYWS74QODA2ON5TOUHRJLG3STJOU63M2ZVYTZGN'

In [39]:
def upload_preds(path_to_preds):
    # set example username and round
    example_public_id = "spacekitty"
    example_secret_key = key

    # some API calls do not require logging in
    napi = NumerAPI(verbosity="info")
    
    # download current dataset
    napi.download_current_dataset(unzip=True)
    
    # provide api tokens
    napi = NumerAPI(example_public_id, example_secret_key)

    # upload predictions
    submission_id = napi.upload_predictions(path_to_preds)
    
    # check submission status
    napi.submission_status()

In [None]:
upload_preds(pred_path)