## Objective
This notebook looks at the problems posed in the Numer.AI financial prediction tournament.  Its objective is to better understand the application of ML to extremely noises problems found in the areas of financial markets and economics.

## Load data

In [1]:
import numpy as np
import pandas as pd

In [150]:
train = pd.read_csv('data/numerai_training_data.csv')
test = pd.read_csv('data/numerai_tournament_data.csv')

## Adversarial validation selection
Ensure that the training data is representative of the test data set

In [151]:
train['is_test'] = 0
test['is_test'] = 1

In [152]:
test['t_id'].shape

(264877,)

In [156]:
data = pd.concat(( train, test ))

data = data.iloc[ np.random.permutation(len( data )) ]
data.reset_index( drop = True, inplace = True )

x_to_select = data.drop( ['target', 'is_test', 't_id' ], axis = 1 )
y_to_select = data['is_test']

#### Predict if representative of test set
Use a random forest to choose which data should be included for training.  This helps to mitigate noise data or data that is irrelevant to the prediction problem.

In [159]:
from sklearn.ensemble import RandomForestClassifier

rf_model_select = RandomForestClassifier(n_estimators=10).fit(x_to_select, y_to_select)

In [160]:
x_adv = train.drop(['target','is_test'], axis=1)
y_adv = train['target']

In [None]:
pred = rf_model_select.predict_proba(x_adv) > 0.95

In [None]:
selected = (train[pred > 0]).drop('is_test', axis=1)

In [None]:
selected.describe()

## Random Forest Classifier for target prediction

In [165]:
y_selected_bin = pd.get_dummies(selected['target']).values
y_selected = selected['target'].values
x_selected = selected.drop('target', axis=1).values

In [166]:
from sklearn.model_selection import train_test_split

x_train, x_valid, y_train, y_valid = train_test_split(x_selected, y_selected, test_size=.33, random_state=42)

In [167]:
rf_model = RandomForestClassifier(n_estimators=900, max_depth=6).fit(x_train, y_train)

In [168]:
y_pred = rf_model.predict(x_valid)
y_prob = rf_model.predict_proba(x_valid)

In [169]:
from sklearn.metrics import roc_auc_score

print(roc_auc_score(y_valid, y_pred))

0.513614838109


In [170]:
print(np.max(y_prob[:,1]))
print(np.min(y_prob[:,1]))

0.600139643751
0.396400255301


In [171]:
y_sub = rf_model.predict_proba(test.drop(['t_id','is_test'], axis=1))

In [172]:
print(np.max(y_sub[:,1]))
print(np.min(y_sub[:,1]))

0.590737464136
0.420029060885


In [173]:
submit = pd.DataFrame({ 't_id':test['t_id'], 'probability':y_sub[:,1] })
submit.to_csv('sub_5_rf.csv', index=False)

## SVM Classifier for target prediction

In [174]:
from sklearn.svm import SVC

svm_model = SVC(probability=True).fit(x_train, y_train)

In [175]:
y_pred = svm_model.predict(x_valid)
y_prob = svm_model.predict_proba(x_valid)

In [176]:
print(roc_auc_score(y_valid, y_pred))

0.509301929298


In [177]:
y_sub = svm_model.predict_proba(test.drop(['t_id','is_test'], axis=1))

KeyboardInterrupt: 

In [None]:
print(np.max(y_sub[:,1]))
print(np.min(y_sub[:,1]))

In [None]:
submit = pd.DataFrame({ 't_id':test['t_id'], 'probability':y_sub[:,1] })
submit.to_csv('sub_2_svm.csv', index=False)

## Deep Learning Classifier for target prediction

In [117]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout

Using TensorFlow backend.


In [118]:
x_train.shape

(7967, 50)

In [119]:
x_train, x_valid, y_train, y_valid = train_test_split(x_selected, y_selected_bin, test_size=.33, random_state=42)

In [148]:
model = Sequential()

model.add(Dense(512, input_dim=(x_train.shape[1]), init='normal'))

for i in range(1,8):
    model.add(Dense(512, init='normal'))
    model.add(Dropout(.4))

model.add(Dense(2))
model.add(Activation('softmax'))

model.compile(optimizer='adam', loss='categorical_crossentropy', metric='accuracy')

In [149]:
model.fit(x_train, y_train, nb_epoch=500, batch_size=2048, verbose=2, validation_split=0.2)

Train on 6373 samples, validate on 1594 samples
Epoch 1/500
1s - loss: 2.2064 - val_loss: 0.6986
Epoch 2/500
1s - loss: 1.3040 - val_loss: 0.7172
Epoch 3/500
1s - loss: 1.0662 - val_loss: 0.8007
Epoch 4/500
0s - loss: 0.9448 - val_loss: 0.7488
Epoch 5/500
0s - loss: 0.8995 - val_loss: 0.7176
Epoch 6/500
1s - loss: 0.8477 - val_loss: 0.7162
Epoch 7/500
1s - loss: 0.8237 - val_loss: 0.6994
Epoch 8/500
1s - loss: 0.8148 - val_loss: 0.6947
Epoch 9/500
1s - loss: 0.7780 - val_loss: 0.6963
Epoch 10/500
1s - loss: 0.7850 - val_loss: 0.6937
Epoch 11/500
1s - loss: 0.7763 - val_loss: 0.6938
Epoch 12/500
1s - loss: 0.7574 - val_loss: 0.6943
Epoch 13/500
0s - loss: 0.7487 - val_loss: 0.6949
Epoch 14/500
1s - loss: 0.7633 - val_loss: 0.6944
Epoch 15/500
0s - loss: 0.7559 - val_loss: 0.6941
Epoch 16/500
1s - loss: 0.7466 - val_loss: 0.6942
Epoch 17/500
1s - loss: 0.7490 - val_loss: 0.6949
Epoch 18/500
1s - loss: 0.7487 - val_loss: 0.6934
Epoch 19/500
0s - loss: 0.7480 - val_loss: 0.6952
Epoch 20/50

KeyboardInterrupt: 

In [139]:
y_pred = model.predict(x_valid)

In [140]:
print(roc_auc_score(y_valid, y_pred))

0.525915153729


In [141]:
print(np.max(pred[:,1]))
print(np.min(pred[:,1]))

0.555118
0.413291


In [145]:
y_sub = model.predict(test.drop(['t_id','is_test'], axis=1).values)

In [146]:
submit = pd.DataFrame({ 't_id':test['t_id'], 'probability':y_sub[:,1] })
submit.to_csv('sub_1_nn.csv', index=False)