In [1]:
import pandas as pd 
df = pd.read_csv('numerai_tournament_data.csv')
ids = df['id']
ids

0         n0003aa52cab36c2
1         n000920ed083903f
2         n0038e640522c4a6
3         n004ac94a87dc54b
4         n0052fe97ea0c05f
                ...       
385400    nffe313b9f23c2fa
385401    nffe3cdede8c06ff
385402    nffed6f12819a473
385403    nfff7ae43eb5106b
385404    nfffd768115d3800
Name: id, Length: 385405, dtype: object

In [2]:
df.head()

Unnamed: 0,id,era,data_type,feature_intelligence1,feature_intelligence2,feature_intelligence3,feature_intelligence4,feature_intelligence5,feature_intelligence6,feature_intelligence7,...,feature_wisdom38,feature_wisdom39,feature_wisdom40,feature_wisdom41,feature_wisdom42,feature_wisdom43,feature_wisdom44,feature_wisdom45,feature_wisdom46,target_kazutsugi
0,n0003aa52cab36c2,era121,validation,0.25,0.75,0.5,0.5,0.0,0.75,0.5,...,0.75,0.75,1.0,0.75,0.5,0.5,1.0,0.0,0.0,0.0
1,n000920ed083903f,era121,validation,0.75,0.5,0.75,1.0,0.5,0.0,0.0,...,0.5,0.5,0.75,1.0,0.75,0.5,0.5,0.5,0.5,0.25
2,n0038e640522c4a6,era121,validation,1.0,0.0,0.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.5,0.25,0.0,0.0,0.5,0.5,0.0,1.0
3,n004ac94a87dc54b,era121,validation,0.75,1.0,1.0,0.5,0.0,0.0,0.0,...,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.25,0.25,0.75
4,n0052fe97ea0c05f,era121,validation,0.25,0.5,0.5,0.25,1.0,0.5,0.5,...,0.5,0.75,0.0,0.0,0.75,1.0,0.0,0.25,1.0,1.0


In [5]:
import os
os.environ['KERAS_BACKEND' ] = 'tensorflow'
os.environ['MKL_THREADING_LAYER'] = 'GNU'
import keras as ks
from keras.models import Sequential
from keras.layers import Dense
from keras.callbacks import TensorBoard
import keras
import pandas as pd
import numpy as np
from keras import optimizers
from keras.utils import to_categorical
from keras.models import Model
from keras.layers import Input, Dense
from keras.layers.normalization import BatchNormalization
from keras.layers.core import Dropout, Activation
from sklearn.preprocessing import MinMaxScaler
import time 

TOURNAMENT_NAME = "kazutsugi"
TARGET_NAME = f"target_{TOURNAMENT_NAME}"
PREDICTION_NAME = f"prediction_{TOURNAMENT_NAME}"

BENCHMARK = 0.002
BAND = 0.04


# Submissions are scored by spearman correlation
def score(df):
    # method="first" breaks ties based on order in array
    return np.corrcoef(
        df[TARGET_NAME],
	df[PREDICTION_NAME].rank(pct=True, method="first")
    )[0,1]


# The payout function
def payout(scores):
    return ((scores - BENCHMARK)/BAND).clip(lower=-1, upper=1)
NAME='mlp_katzugi_benchmark'

def main():

    print("# Loading data...")
    # The training data is used to train your model how to predict the targets.
    training_data = pd.read_csv("numerai_training_data.csv").set_index("id")
    # The tournament data is the data that Numerai uses to evaluate your model.
    tournament_data = pd.read_csv("numerai_tournament_data.csv").set_index("id")
    tournament_data.head()
    feature_names = [f for f in training_data.columns if f.startswith("feature")]
    id_names = [i for i in tournament_data.columns if i.startswith("id")]
    print(f"Loaded {len(feature_names)} features") 
#     print(f"Loaded {len(id_names)} ids")
    print(id_names)
#     print(feature_names)

    print("Training model")
    batch_size = 710

    dropout = 0.2

    visible = Input(shape=(310,))
    hidden1 = Dense(3, activation='sigmoid')(visible)
    hidden1 = Dropout(dropout)(hidden1)
    hidden2 = Dense(3, activation='sigmoid')(hidden1)
    hidden3 = Dense(3, activation='sigmoid')(hidden2)
    output = Dense(1, activation='sigmoid')(hidden3)
    model = Model(inputs=visible, outputs=output)
    model.compile(loss='binary_crossentropy',optimizer='rmsprop')
    model.summary()
    tensorboard = TensorBoard(log_dir="logs/{}".format(NAME))
    model.fit(training_data[feature_names], training_data[TARGET_NAME],batch_size=batch_size,epochs=15,validation_split=0.63,callbacks=[tensorboard],shuffle=False)
    print("Generating predictions")
    training_data[PREDICTION_NAME] = model.predict(training_data[feature_names])
    tournament_data[PREDICTION_NAME] = model.predict(tournament_data[feature_names])

    # Check the per-era correlations on the training set
    train_correlations = training_data.groupby("era").apply(score)
    print(f"On training the correlation has mean {train_correlations.mean()} and std {train_correlations.std()}")
    print(f"On training the average per-era payout is {payout(train_correlations).mean()}")
    
    # Check the per-era correlations on the validation set
    validation_data = tournament_data[tournament_data.data_type == "validation"]
    validation_correlations = validation_data.groupby("era").apply(score)
    print(f"On validation the correlation has mean {validation_correlations.mean()} and std {validation_correlations.std()}")
    print(f"On validation the average per-era payout is {payout(validation_correlations).mean()}")
    
    
    tournament_data[PREDICTION_NAME].to_csv(TOURNAMENT_NAME + "_submission.csv")
#     results = tournament_data[PREDICTION_NAME][:, 1]
#     results_df = pd.DataFrame(data={'probability_kazutsugi':results})



if __name__ == '__main__':
    main()


# Loading data...
Loaded 310 features
[]
Training model
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (None, 310)               0         
_________________________________________________________________
dense_9 (Dense)              (None, 3)                 933       
_________________________________________________________________
dropout_3 (Dropout)          (None, 3)                 0         
_________________________________________________________________
dense_10 (Dense)             (None, 3)                 12        
_________________________________________________________________
dense_11 (Dense)             (None, 3)                 12        
_________________________________________________________________
dense_12 (Dense)             (None, 1)                 4         
Total params: 961
Trainable params: 961
Non-trainable params: 0
______________________

  % delta_t_median)


Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Generating predictions
On training the correlation has mean 0.045244572331765716 and std 0.04394125071520474
On training the average per-era payout is 0.6355469731000182
On validation the correlation has mean 0.03131725801628915 and std 0.03140101642811669
On validation the average per-era payout is 0.5548969702887269




In [11]:
import os
os.environ['KERAS_BACKEND' ] = 'tensorflow'
os.environ['MKL_THREADING_LAYER'] = 'GNU'
import keras as ks
from keras.models import Sequential
from keras.layers import Dense
from keras.callbacks import TensorBoard
import keras
import pandas as pd
import numpy as np
from keras import optimizers
from keras.utils import to_categorical
from keras.models import Model
from keras.layers import Input, Dense
from keras.layers.normalization import BatchNormalization
from keras.layers.core import Dropout, Activation
from sklearn.preprocessing import MinMaxScaler
import time 


def main():

    # Set seed for reproducibility
    NAME = "MLP"
    np.random.seed(0)
    print("Loading data...")
    # Load the data from the CSV files
    training_data = pd.read_csv('numerai_training_data.csv', header=0)
    print('original train data shape: {},\t{} \n\n \t:'.format(training_data.shape[0],training_data.shape[1]))
    pd.DataFrame(training_data)
#     prediction_data = pd.read_csv('numerai_tournament_data.csv', header=0)
#     print('original prediction data shape: {},\t{} \n\n \t:'.format(prediction_data.shape[0],prediction_data.shape[1]))
    
#     complete_training_data = pd.concat([training_data, prediction_data])
#     print('total training / valdation shape {}'.format(complete_training_data))
    
#     # Transform the loaded CSV data into numpy arrays

#     features = [f for f in list(training_data) if "feature" in f]
#     print(features)

#     X = training_data[features]
#     mini= MinMaxScaler(feature_range=(0,1)) 
#     X = mini.fit_transform(X)

#     Y = training_data["target_frank"]
#     Y= keras.utils.to_categorical(Y,2) 

#     x_prediction = prediction_data[features]
#     x_prediction = mini.fit_transform(x_prediction)

#     ids = prediction_data["id"]  

#     batch_size = 710

#     dropout = 0.2

#     visible = Input(shape=(50,))
#     hidden1 = Dense(10, activation='relu')(visible)
#     hidden2 = Dense(20, activation='relu')(hidden1)
#     hidden3 = Dense(10, activation='relu')(hidden2)
#     output = Dense(2, activation='sigmoid')(hidden3)
#     model = Model(inputs=visible, outputs=output)
#     model.compile(loss='binary_crossentropy',optimizer='rmsprop')
#     model.summary()
#     tensorboard = TensorBoard(log_dir="logs/{}".format(NAME))
#     model.fit(X,Y,batch_size=batch_size,epochs=10,validation_split=0.33,callbacks=[tensorboard])
    

#     y_prediction = model.predict(x_prediction)
#     evaluate = model.evaluate(x_prediction,y_prediction)
    
#     probabilities = y_prediction[:, 1]
#     print("- probabilities:", probabilities[1:6])

#     # We can see the probability does seem to be good at predicting the
#     # true target correctly.
#     print("- target:", prediction_data['target_frank'][1:6])
#     print("- rounded probability:", [np.round(p) for p in probabilities][1:6])

#     # But overall the accuracy is very low.
#     correct = [
#         np.round(x) == y
#         for (x, y) in zip(probabilities, prediction_data['target_frank'])
#     ]
#     print("- accuracy: ", sum(correct) / float(prediction_data.shape[0]))

#     tournament_corr = np.corrcoef(prediction_data['target_frank'],
#                                   prediction_data['target_elizabeth'])
#     print("- frank vs elizabeth corr:", tournament_corr)
#     # You can see that target_elizabeth is accurate using the frank model as well.
#     correct = [
#         np.round(x) == y
#         for (x, y) in zip(probabilities, prediction_data['target_elizabeth'])
#     ]
#     print("- elizabeth using frank:",
#           sum(correct) / float(prediction_data.shape[0]))

#     # Numerai measures models on logloss instead of accuracy. The lower the logloss the better.
#     # Numerai only pays models with logloss < 0.693 on the live portion of the tournament data.)

#     print("- validation logloss:",
#           model.evaluate(x_prediction,y_prediction))
    
#     results = y_prediction[:, 1]
#     results_df = pd.DataFrame(data={'probability_frank':results})

#     joined = pd.DataFrame(ids).join(results_df)
#     pd.DataFrame(joined[:5])


#     print("Writing predictions to predictions.csv")
#     path = 'predictions_{:},{}'.format(time.strftime("%Y-%m-%d_%Hh%Mm%Ss", time.gmtime()),NAME) + '.csv'
#     print()
#     print("Writing predictions to " + path.strip())
#     joined.to_csv(path,float_format='%.15f', index=False)

if __name__ == '__main__':

    main()

Loading data...
original train data shape: 501808,	314 

 	:


In [1]:
import os
os.environ['KERAS_BACKEND' ] = 'tensorflow'
os.environ['MKL_THREADING_LAYER'] = 'GNU'
import keras as ks
from keras.models import Sequential
from keras.layers import Dense
from keras.callbacks import TensorBoard
import keras
import pandas as pd
import numpy as np
from keras import optimizers
from keras.utils import to_categorical
from keras.models import Model
from keras.layers import Input, Dense
from keras.layers.normalization import BatchNormalization
from keras.layers.core import Dropout, Activation
from sklearn.preprocessing import MinMaxScaler
import time 


def main():

    # Set seed for reproducibility
    NAME = "MLP"
    np.random.seed(0)



    print("Loading data...")

    # Load the data from the CSV files

    training_data = pd.read_csv('numerai_training_data.csv', header=0)
    print('original train data shape: {},\t{} \n\n \t:'.format(training_data.shape[0],training_data.shape[1]))

    prediction_data = pd.read_csv('numerai_tournament_data.csv', header=0)
    print('original prediction data shape: {},\t{} \n\n \t:'.format(prediction_data.shape[0],prediction_data.shape[1]))
    
    complete_training_data = pd.concat([training_data, prediction_data])
    print('total training / valdation shape {}'.format(complete_training_data))
    
    



    # Transform the loaded CSV data into numpy arrays

    features = [f for f in list(training_data) if "feature" in f]
    print(features)

    X = training_data[features]
    mini= MinMaxScaler(feature_range=(0,1)) 
    X = mini.fit_transform(X)
#     X = X.values
    Y = training_data["target_bernie"]
    Y= keras.utils.to_categorical(Y,2) 

    x_prediction = prediction_data[features]
    x_prediction = mini.fit_transform(x_prediction)

    ids = prediction_data["id"]

#     X = X.values

#     Y = to_categorical(Y, num_classes=2)

    

    batch_size = 710

    dropout = 0.2

    

    m_in = Input(shape=(50,))

    m1 = Dense(50,)(m_in)
    m1 = Activation('relu')(m1)
    m1 = BatchNormalization(momentum=.99999,axis=-1)(m1)

    m2 = Dense(100)(m1)
    m2 = Activation('relu')(m2)
    m2 = BatchNormalization(momentum=.999,axis=-1)(m2)
    
    m3 = Dense(25)(m2)
    m3 = Activation('relu')(m3)
    
    m3 = Dense(25)(m3)
    m3 = Dropout(dropout)(m3) 
    m3 = Activation('relu')(m3)
    
    m3 = Dense(25)(m3)
    m3 = Activation('relu')(m3)
    m3 = BatchNormalization(momentum=.99,axis=-1)(m3)
    
    m3 = Dense(100)(m3)
    m3 = Activation('relu')(m3)
    
    m3 = Dense(25)(m3)
    m3 = Activation('relu')(m3) 
    
    m4 = Dense(25)(m3)
    m4 = Activation('relu')(m4) 
    m4 = Dropout(dropout)(m4) 
    m4 = BatchNormalization(momentum=.9,axis=-1)(m4)

    
    
    m5 = Dense(2)(m4)
    m_out = Activation('sigmoid')(m5)

    model = Model(inputs=m_in, outputs=m_out)



    model.compile(loss='binary_crossentropy',optimizer='rmsprop')
    tensorboard = TensorBoard(log_dir="logs/{}".format(NAME))
    model.fit(X,Y,batch_size=batch_size,epochs=10,validation_split=0.33,callbacks=[tensorboard])
    
    

    

    y_prediction = model.predict(x_prediction)
    evaluate = model.evaluate(x_prediction,y_prediction)
    
    probabilities = y_prediction[:, 1]
    print("- probabilities:", probabilities[1:6])

    # We can see the probability does seem to be good at predicting the
    # true target correctly.
    print("- target:", prediction_data['target_bernie'][1:6])
    print("- rounded probability:", [np.round(p) for p in probabilities][1:6])

    # But overall the accuracy is very low.
    correct = [
        np.round(x) == y
        for (x, y) in zip(probabilities, prediction_data['target_bernie'])
    ]
    print("- accuracy: ", sum(correct) / float(prediction_data.shape[0]))

    # The targets for each of the tournaments are very correlated.
    tournament_corr = np.corrcoef(prediction_data['target_bernie'],
                                  prediction_data['target_elizabeth'])
    print("- bernie vs elizabeth corr:", tournament_corr)
    # You can see that target_elizabeth is accurate using the bernie model as well.
    correct = [
        np.round(x) == y
        for (x, y) in zip(probabilities, prediction_data['target_elizabeth'])
    ]
    print("- elizabeth using bernie:",
          sum(correct) / float(prediction_data.shape[0]))

    # Numerai measures models on logloss instead of accuracy. The lower the logloss the better.
    # Numerai only pays models with logloss < 0.693 on the live portion of the tournament data.)

    print("- validation logloss:",
          model.evaluate(x_prediction,y_prediction))

    results = y_prediction[:, 1]

    # -----

    

    results_df = pd.DataFrame(data={'probability_bernie':results})

    joined = pd.DataFrame(ids).join(results_df)
    pd.DataFrame(joined[:5])


    print("Writing predictions to predictions.csv")

    # Save the predictions out to a CSV file
    path = 'predictions_{:}'.format(time.strftime("%Y-%m-%d_%Hh%Mm%Ss", time.gmtime())) + '.csv'
    print()
    print("Writing predictions to " + path.strip())
    # # Save the predictions out to a CSV file
    joined.to_csv(path,float_format='%.15f', index=False)


    # Now you can upload these predictions on numer.ai





if __name__ == '__main__':

    main()

Using TensorFlow backend.


Loading data...
original train data shape: 502732,	60 

 	:
original prediction data shape: 333917,	60 

 	:
total training / valdation shape                       id   era data_type  feature1  feature2  feature3  \
0       n0003126ff2349f6  era1     train   0.54836   0.31077   0.37524   
1       n003d773d29b57ec  era1     train   0.34712   0.40275   0.42747   
2       n0074df2dc6810b6  era1     train   0.50871   0.48639   0.47544   
3       n0090630f530903e  era1     train   0.61363   0.40268   0.53779   
4       n00af19089546fe9  era1     train   0.30704   0.47273   0.54495   
5       n011d2da12b1e735  era1     train   0.52336   0.59136   0.60506   
6       n014149cadeee55d  era1     train   0.30875   0.62510   0.35229   
7       n0148a4dcf539aba  era1     train   0.40632   0.30590   0.43227   
8       n015855690d31908  era1     train   0.48193   0.27060   0.50228   
9       n0169447f4d6a10e  era1     train   0.51191   0.53663   0.42109   
10      n01703ba4eff8fe7  era1     train   0

Train on 336830 samples, validate on 165902 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
- probabilities: [0.5038235  0.47747394 0.52584994 0.50304914 0.508566  ]
- target: 1    1.0
2    0.0
3    0.0
4    0.0
5    1.0
Name: target_bernie, dtype: float64
- rounded probability: [1.0, 0.0, 1.0, 1.0, 1.0]
- accuracy:  0.08481448982831062
- bernie vs elizabeth corr: [[nan nan]
 [nan nan]]
- elizabeth using bernie: 0.08516787105777783
- validation logloss: 0.6917378780136868
Writing predictions to predictions.csv

Writing predictions to predictions_2019-04-04_23h44m30s.csv


In [None]:
    m4 = Dense(25)(m3)
    m4 = Activation('sigmoid')(m4)
    m4 = Dropout(dropout)(m4) 