In [10]:
import os
os.environ['KERAS_BACKEND' ] = 'tensorflow'
os.environ['MKL_THREADING_LAYER'] = 'GNU'
import keras as ks
from keras.models import Sequential
from keras.layers import Dense
from keras.callbacks import TensorBoard
import keras
import pandas as pd
import numpy as np
from keras import optimizers
from keras.utils import to_categorical
from keras.models import Model
from keras.layers import Input, Dense
from keras.layers.normalization import BatchNormalization
from keras.layers.core import Dropout, Activation
from sklearn.preprocessing import MinMaxScaler
import time 


def main():

    # Set seed for reproducibility
    NAME = "MLP"
    np.random.seed(0)
    print("Loading data...")
    # Load the data from the CSV files
    training_data = pd.read_csv('train.csv', header=0)
    print('original train data shape: {},\t{} \n\n \t:'.format(training_data.shape[0],training_data.shape[1]))

    prediction_data = pd.read_csv('test.csv', header=0)
    print('original prediction data shape: {},\t{} \n\n \t:'.format(prediction_data.shape[0],prediction_data.shape[1]))
    
    complete_training_data = pd.concat([training_data, prediction_data])
    print('total training / valdation shape {}'.format(complete_training_data))
    
    # Transform the loaded CSV data into numpy arrays

    X = training_data.drop(["target"], axis=1)
    mini= MinMaxScaler(feature_range=(0,1)) 
    X = mini.fit_transform(X)

    Y = training_data["target"]
    Y= keras.utils.to_categorical(Y,2) 

    x_prediction = prediction_data
    x_prediction = mini.fit_transform(x_prediction)

    ids = prediction_data["id"]  

    batch_size = 710

    dropout = 0.2

    visible = Input(shape=(301,))
    hidden1 = Dense(5, activation='relu')(visible)
    hidden1 = Dense(5, activation='relu')(hidden1)
    hidden1 = Dense(5, activation='relu')(hidden1)
    hidden1 = Dense(5, activation='relu')(hidden1)
    hidden1 = Dense(5, activation='relu')(hidden1)
    hidden1 = Dense(5, activation='relu')(hidden1)
    hidden2 = Dense(5, activation='relu')(hidden1)
    hidden2 = Dense(5, activation='relu')(hidden2)
    hidden2 = Dense(5, activation='relu')(hidden2)
    hidden2 = Dense(5, activation='relu')(hidden2)
    hidden2 = Dense(5, activation='relu')(hidden2)
    hidden2 = Dense(5, activation='relu')(hidden2)
    hidden2 = Dense(5, activation='relu')(hidden2)
    hidden3 = Dense(5, activation='relu')(hidden2)
    hidden3 = Dense(5, activation='relu')(hidden3)
    hidden3 = Dense(5, activation='relu')(hidden3)
    hidden3 = Dense(5, activation='relu')(hidden3)
    merge = keras.layers.concatenate([hidden1,hidden2,hidden3], axis=1)
    hidden4 = Dense(5,activation='relu')(merge)
    output = Dense(2, activation='sigmoid')(hidden4)
    model = Model(inputs=visible, outputs=output)
    model.compile(loss='binary_crossentropy',optimizer='rmsprop')
    model.summary()
    tensorboard = TensorBoard(log_dir="logs/{}".format(NAME))
    model.fit(X,Y,batch_size=batch_size,epochs=10,validation_split=0.33,callbacks=[tensorboard])
    

    y_prediction = model.predict(x_prediction)
    evaluate = model.evaluate(x_prediction,y_prediction)
    
    probabilities = y_prediction[:, 1]
    print("- probabilities:", probabilities[1:6])

    # We can see the probability does seem to be good at predicting the
    # true target correctly.
#     print("- target:", prediction_data['target_bernie'][1:6])
#     print("- rounded probability:", [np.round(p) for p in probabilities][1:6])

#     # But overall the accuracy is very low.
#     correct = [
#         np.round(x) == y
#         for (x, y) in zip(probabilities, prediction_data['target_bernie'])
#     ]
#     print("- accuracy: ", sum(correct) / float(prediction_data.shape[0]))

#     tournament_corr = np.corrcoef(prediction_data['target_bernie'],
#                                   prediction_data['target_elizabeth'])
#     print("- bernie vs elizabeth corr:", tournament_corr)
#     # You can see that target_elizabeth is accurate using the bernie model as well.
#     correct = [
#         np.round(x) == y
#         for (x, y) in zip(probabilities, prediction_data['target_elizabeth'])
#     ]
#     print("- elizabeth using bernie:",
#           sum(correct) / float(prediction_data.shape[0]))

    # Numerai measures models on logloss instead of accuracy. The lower the logloss the better.
    # Numerai only pays models with logloss < 0.693 on the live portion of the tournament data.)

    print("- validation logloss:",
          model.evaluate(x_prediction,y_prediction))
    
    results = y_prediction[:, 1]
    results_df = pd.DataFrame(data={'target':results})

    joined = pd.DataFrame(ids).join(results_df)
    pd.DataFrame(joined[:5])


    print("Writing predictions to predictions.csv")
    path = 'predictions_{:}_{}_1'.format(time.strftime("%Y-%m-%d_%Hh%Mm%Ss", time.gmtime()),NAME) + '.csv'
    print()
    print("Writing predictions to " + path.strip())
    joined.to_csv(path,float_format='%.15f', index=False)

if __name__ == '__main__':

    main()

Loading data...
original train data shape: 250,	302 

 	:
original prediction data shape: 19750,	301 

 	:
total training / valdation shape            0      1     10    100    101    102    103    104    105    106  \
0     -0.098  2.165 -0.912  1.329  0.452 -0.704  2.218 -1.844  0.158 -1.649   
1      1.081 -0.973  2.907 -1.208  0.893  0.379  1.396  0.581 -0.475 -0.056   
2     -0.523 -0.089  0.459 -0.901  2.895  0.651  1.006 -0.587  0.208 -0.106   
3      0.067 -0.021  0.335 -0.546 -1.125 -0.418  0.281 -0.193  0.764  1.282   
4      2.347 -0.831  0.190  0.680  1.515 -0.617 -0.918 -0.243  0.689 -0.465   
5     -0.641 -0.576 -1.369 -1.019  1.358 -0.310 -1.891 -0.700 -2.351 -0.785   
6     -0.490  0.557  1.168  1.314 -0.207  1.033 -0.461  0.821  0.673  0.056   
7      1.252 -1.370  0.626  0.306  0.117  0.174  0.742 -0.556  0.144  1.034   
8      1.410 -1.097 -1.256  1.250  1.508  0.277 -0.364 -1.202 -0.088 -0.512   
9     -1.811  0.566  2.173  0.060  1.888  0.755  2.024 -0.541 -0.026  

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return self.partial_fit(X, y)
  return self.partial_fit(X, y)


__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_6 (InputLayer)            (None, 301)          0                                            
__________________________________________________________________________________________________
dense_79 (Dense)                (None, 5)            1510        input_6[0][0]                    
__________________________________________________________________________________________________
dense_80 (Dense)                (None, 5)            30          dense_79[0][0]                   
__________________________________________________________________________________________________
dense_81 (Dense)                (None, 5)            30          dense_80[0][0]                   
__________________________________________________________________________________________________
dense_82 (

In [1]:
import os
os.environ['KERAS_BACKEND' ] = 'tensorflow'
os.environ['MKL_THREADING_LAYER'] = 'GNU'
import keras as ks
from keras.models import Sequential
from keras.layers import Dense
from keras.callbacks import TensorBoard
import keras
import pandas as pd
import numpy as np
from keras import optimizers
from keras.utils import to_categorical
from keras.models import Model
from keras.layers import Input, Dense
from keras.layers.normalization import BatchNormalization
from keras.layers.core import Dropout, Activation
from sklearn.preprocessing import MinMaxScaler
import time 


def main():

    # Set seed for reproducibility
    NAME = "MLP"
    np.random.seed(0)



    print("Loading data...")

    # Load the data from the CSV files

    training_data = pd.read_csv('numerai_training_data.csv', header=0)
    print('original train data shape: {},\t{} \n\n \t:'.format(training_data.shape[0],training_data.shape[1]))

    prediction_data = pd.read_csv('numerai_tournament_data.csv', header=0)
    print('original prediction data shape: {},\t{} \n\n \t:'.format(prediction_data.shape[0],prediction_data.shape[1]))
    
    complete_training_data = pd.concat([training_data, prediction_data])
    print('total training / valdation shape {}'.format(complete_training_data))
    
    



    # Transform the loaded CSV data into numpy arrays

    features = [f for f in list(training_data) if "feature" in f]
    print(features)

    X = training_data[features]
    mini= MinMaxScaler(feature_range=(0,1)) 
    X = mini.fit_transform(X)
#     X = X.values
    Y = training_data["target"]
    Y= keras.utils.to_categorical(Y,2) 

    x_prediction = prediction_data[features]
    x_prediction = mini.fit_transform(x_prediction)

    ids = prediction_data["id"]

#     X = X.values

#     Y = to_categorical(Y, num_classes=2)

    

    batch_size = 710

    dropout = 0.2

    

    m_in = Input(shape=(50,))

    m1 = Dense(50,)(m_in)
    m1 = Activation('relu')(m1)
    m1 = BatchNormalization(momentum=.99999,axis=-1)(m1)

    m2 = Dense(100)(m1)
    m2 = Activation('relu')(m2)
    m2 = BatchNormalization(momentum=.999,axis=-1)(m2)
    
    m3 = Dense(25)(m2)
    m3 = Activation('relu')(m3)
    
    m3 = Dense(25)(m3)
    m3 = Dropout(dropout)(m3) 
    m3 = Activation('relu')(m3)
    
    m3 = Dense(25)(m3)
    m3 = Activation('relu')(m3)
    m3 = BatchNormalization(momentum=.99,axis=-1)(m3)
    
    m3 = Dense(100)(m3)
    m3 = Activation('relu')(m3)
    
    m3 = Dense(25)(m3)
    m3 = Activation('relu')(m3) 
    
    m4 = Dense(25)(m3)
    m4 = Activation('relu')(m4) 
    m4 = Dropout(dropout)(m4) 
    m4 = BatchNormalization(momentum=.9,axis=-1)(m4)

    
    
    m5 = Dense(2)(m4)
    m_out = Activation('sigmoid')(m5)

    model = Model(inputs=m_in, outputs=m_out)



    model.compile(loss='binary_crossentropy',optimizer='rmsprop')
    tensorboard = TensorBoard(log_dir="logs/{}".format(NAME))
    model.fit(X,Y,batch_size=batch_size,epochs=10,validation_split=0.33,callbacks=[tensorboard])
    
    

    

    y_prediction = model.predict(x_prediction)
    evaluate = model.evaluate(x_prediction,y_prediction)
    
    probabilities = y_prediction[:, 1]
    print("- probabilities:", probabilities[1:6])

    # We can see the probability does seem to be good at predicting the
    # true target correctly.
    print("- target:", prediction_data['target'][1:6])
    print("- rounded probability:", [np.round(p) for p in probabilities][1:6])

    # But overall the accuracy is very low.
    correct = [
        np.round(x) == y
        for (x, y) in zip(probabilities, prediction_data['target'])
    ]
    print("- accuracy: ", sum(correct) / float(prediction_data.shape[0]))

    # The targets for each of the tournaments are very correlated.
    tournament_corr = np.corrcoef(prediction_data['target'],
                                  prediction_data['target_elizabeth'])
    print("- bernie vs elizabeth corr:", tournament_corr)
    # You can see that target_elizabeth is accurate using the bernie model as well.
    correct = [
        np.round(x) == y
        for (x, y) in zip(probabilities, prediction_data['target_elizabeth'])
    ]
    print("- elizabeth using bernie:",
          sum(correct) / float(prediction_data.shape[0]))

    # Numerai measures models on logloss instead of accuracy. The lower the logloss the better.
    # Numerai only pays models with logloss < 0.693 on the live portion of the tournament data.)

    print("- validation logloss:",
          model.evaluate(x_prediction,y_prediction))

    results = y_prediction[:, 1]

    # -----

    

    results_df = pd.DataFrame(data={'probability_bernie':results})

    joined = pd.DataFrame(ids).join(results_df)
    pd.DataFrame(joined[:5])


    print("Writing predictions to predictions.csv")

    # Save the predictions out to a CSV file
    path = 'predictions_{:}'.format(time.strftime("%Y-%m-%d_%Hh%Mm%Ss", time.gmtime())) + '.csv'
    print()
    print("Writing predictions to " + path.strip())
    # # Save the predictions out to a CSV file
    joined.to_csv(path,float_format='%.15f', index=False)


    # Now you can upload these predictions on numer.ai





if __name__ == '__main__':

    main()

Using TensorFlow backend.


Loading data...
original train data shape: 502732,	60 

 	:
original prediction data shape: 333917,	60 

 	:
total training / valdation shape                       id   era data_type  feature1  feature2  feature3  \
0       n0003126ff2349f6  era1     train   0.54836   0.31077   0.37524   
1       n003d773d29b57ec  era1     train   0.34712   0.40275   0.42747   
2       n0074df2dc6810b6  era1     train   0.50871   0.48639   0.47544   
3       n0090630f530903e  era1     train   0.61363   0.40268   0.53779   
4       n00af19089546fe9  era1     train   0.30704   0.47273   0.54495   
5       n011d2da12b1e735  era1     train   0.52336   0.59136   0.60506   
6       n014149cadeee55d  era1     train   0.30875   0.62510   0.35229   
7       n0148a4dcf539aba  era1     train   0.40632   0.30590   0.43227   
8       n015855690d31908  era1     train   0.48193   0.27060   0.50228   
9       n0169447f4d6a10e  era1     train   0.51191   0.53663   0.42109   
10      n01703ba4eff8fe7  era1     train   0

Train on 336830 samples, validate on 165902 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
- probabilities: [0.5038235  0.47747394 0.52584994 0.50304914 0.508566  ]
- target: 1    1.0
2    0.0
3    0.0
4    0.0
5    1.0
Name: target_bernie, dtype: float64
- rounded probability: [1.0, 0.0, 1.0, 1.0, 1.0]
- accuracy:  0.08481448982831062
- bernie vs elizabeth corr: [[nan nan]
 [nan nan]]
- elizabeth using bernie: 0.08516787105777783
- validation logloss: 0.6917378780136868
Writing predictions to predictions.csv

Writing predictions to predictions_2019-04-04_23h44m30s.csv


In [None]:
    m4 = Dense(25)(m3)
    m4 = Activation('sigmoid')(m4)
    m4 = Dropout(dropout)(m4) 