# Numerai
Tournament 77, 16th Oct 2017

In [3]:
# Extract the tournament archive (training/tournament CSVs plus the example model files) into the working directory
!unzip numerai_datasets.zip

Archive:  numerai_datasets.zip
  inflating: numerai_training_data.csv  
  inflating: example_model.py        
  inflating: example_predictions.csv  
  inflating: numerai_tournament_data.csv  
  inflating: example_model.r         


In [5]:
# Delete the archive once it has been extracted.
# NOTE: after this runs, the unzip cell cannot be re-run until the zip is downloaded again.
!rm numerai_datasets.zip

In [12]:
import pandas as pd
import numpy as np
from sklearn import metrics, preprocessing, linear_model


def numerai_sample():
    """Fit the example logistic-regression model and write its predictions.

    Reads ``numerai_training_data.csv`` and ``numerai_tournament_data.csv``
    from the working directory, fits a ``LogisticRegression`` on every column
    whose name contains "feature", and saves the predicted probability of
    class 1 for each tournament row to ``predictions.csv``.

    Returns:
        Tuple ``(X, Y, x_prediction, ids, model, joined)``: training features,
        training targets, tournament features, tournament ids, the fitted
        model, and the id/probability frame that was written to disk.
    """
    # Fixed seed so repeated runs are reproducible
    np.random.seed(0)

    print("Loading data...")
    train_df = pd.read_csv('numerai_training_data.csv', header=0)
    tournament_df = pd.read_csv('numerai_tournament_data.csv', header=0)

    # Feature columns are the ones with "feature" in their name
    feature_cols = [col for col in train_df.columns if "feature" in col]
    X, Y = train_df[feature_cols], train_df["target"]
    x_prediction, ids = tournament_df[feature_cols], tournament_df["id"]

    print("Training...")
    model = linear_model.LogisticRegression(n_jobs=-1)
    model.fit(X, Y)

    print("Predicting...")
    # predict_proba yields [P(target=0), P(target=1)] per row; keep P(target=1)
    prob_of_one = model.predict_proba(x_prediction)[:, 1]
    probability_df = pd.DataFrame(data={'probability': prob_of_one})
    # Index-aligned join: both frames carry the default row index from read_csv
    joined = pd.DataFrame(ids).join(probability_df)

    print("Writing predictions to predictions.csv")
    joined.to_csv("predictions.csv", index=False)
    # predictions.csv is the file to upload on numer.ai

    return X, Y, x_prediction, ids, model, joined

# Run the baseline once; keep the data, the fitted model and its predictions for later cells
X,Y,x_prediction,ids,model, results = numerai_sample()

# Log loss of the baseline on the same rows it was trained on (in-sample, not a held-out score)
train_pred = model.predict_proba(X)
print(metrics.log_loss(Y,train_pred))

Loading data...
Training...
Predicting...
Writing predictions to predictions.csv
0.691666894301


So the logistic regression model (training log loss 0.6917) is only just better than the $-\ln(0.5) \approx 0.6931$ you get from always predicting 0.5. That's the number to beat.
## Model 1. Dense NN

In [66]:
# First create a validation set: hold out 40% of the dataset (test_size=0.40), with a fixed random_state
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.40, random_state=42)

In [67]:
# Pull the underlying numpy arrays out of the pandas objects; these are what the network is fit on
X_trn, y_trn = X_train.values, y_train.values
X_tst, y_tst = X_test.values, y_test.values

In [68]:
# Size of the training split: (rows, feature columns)
X_train.shape

(321427, 50)

In [85]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, BatchNormalization

# Fully connected binary classifier over the 50 feature columns:
# batch-norm on the inputs -> 100 tanh units -> 50 tanh units -> 1 sigmoid output.
# Dropout between the dense layers is currently disabled.
nn = Sequential()
nn.add(BatchNormalization(input_shape=(50,)))
nn.add(Dense(100, activation='tanh'))
nn.add(Dense(50, activation='tanh'))
nn.add(Dense(1, activation='sigmoid'))

# Binary cross-entropy (log loss) objective, optimised with Adam; accuracy is tracked as well
nn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [86]:
# Print the layer-by-layer architecture and parameter counts
nn.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
batch_normalization_2 (Batch (None, 50)                200       
_________________________________________________________________
dense_48 (Dense)             (None, 100)               5100      
_________________________________________________________________
dense_49 (Dense)             (None, 50)                5050      
_________________________________________________________________
dense_50 (Dense)             (None, 1)                 51        
Total params: 10,401
Trainable params: 10,301
Non-trainable params: 100
_________________________________________________________________


In [87]:
# Train for a single epoch with the optimizer settings from compile(), validating on the held-out split.
# (A learning-rate override, nn.optimizer.lr = 0.01, was tried here and is disabled.)
nn.fit(X_trn, y_trn, validation_data=(X_tst,y_tst), epochs=1, batch_size=64)

Train on 321427 samples, validate on 214286 samples
Epoch 1/1


<keras.callbacks.History at 0x7faa4365e2e8>

In [None]:
from keras import backend as K

# Raise the learning rate for the longer run. Plain attribute assignment
# (nn.optimizer.lr = 0.1) only rebinds the Python attribute to a float; the
# training function built by the earlier fit() keeps using the original
# backend variable, so the new rate would silently be ignored.
# K.set_value updates that variable in place instead.
# NOTE(review): 0.1 is far above Adam's default of 0.001 — confirm it is intended.
K.set_value(nn.optimizer.lr, 0.1)
nn.fit(X_trn, y_trn, validation_data=(X_tst,y_tst), epochs=20, batch_size=64)

Train on 321427 samples, validate on 214286 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20