In [1]:
import os
os.environ['KERAS_BACKEND' ] = 'tensorflow'
os.environ['MKL_THREADING_LAYER'] = 'GNU'
import keras as ks
from keras.models import Sequential
from keras.layers import Dense
from keras.callbacks import TensorBoard
import keras
import pandas as pd
import numpy as np
from keras import optimizers
from keras.utils import to_categorical
from keras.models import Model
from keras.layers import Input, Dense,Dropout
from keras.layers.normalization import BatchNormalization
from keras.layers.core import Dropout, Activation
from sklearn.preprocessing import MinMaxScaler
from keras.layers.recurrent import LSTM
import time 
from keras.layers import Input, Dense, LSTM, MaxPooling1D, Conv1D
from keras.models import Model
from keras.layers.merge import concatenate
from keras.layers.recurrent import LSTM
from keras.layers.wrappers import TimeDistributed
def main():

    # Set seed for reproducibility
    NAME = "Shared_input_layer" #this is the name for the Tensor Graph
    np.random.seed(0)
    print("Loading data...")
    # Load the data from the CSV files
    training_data = pd.read_csv('numerai_training_data.csv', header=0)
    
    print('original train data shape: {},\t{} \n\n \t:'.format(training_data.shape[0],
                                                               training_data.shape[1]))  #Training Data

    prediction_data = pd.read_csv('numerai_tournament_data.csv', header=0)
    print('original prediction data shape: {},\t{} \n\n \t:'.format(prediction_data.shape[0],
                                                                    prediction_data.shape[1])) # Test Data
    
    complete_training_data = pd.concat([training_data, prediction_data])
    print('total training / valdation shape {}'.format(complete_training_data)) # Concatenated Training/Test Data
    
    # Transform the loaded CSV data into numpy arrays
    features = [f for f in list(training_data) if "feature" in f] #Features for training
    print(features)
    #Determine Labels vs Features
    
    #Scale All features 
    X = training_data[features] 
    mini= MinMaxScaler(feature_range=(0,1)) 
    X = mini.fit_transform(X)
    
    #Define Categorical Variables
    Y = training_data["target_bernie"]
    Y= keras.utils.to_categorical(Y,2) 
    
    #Define Prediction Labels
    x_prediction = prediction_data[features]
    x_prediction = mini.fit_transform(x_prediction)
    
    #Id's for prediction Labels 
    ids = prediction_data["id"]  
    #Define Model
    batch_size = 710
    dropout = 0.666666
     
    visible = Input(shape=(50,))
    m1 = Dense(6, activation='sigmoid')(visible)
    m1 = Dense(6, activation='sigmoid')(m1)
    m1 = Dropout(dropout)(m1)
    
    m2 = Dense(6, activation='sigmoid')(visible)
    m2 = Dense(6, activation='sigmoid')(m2)
    
    m3 = Dense(6, activation='sigmoid')(visible)
    m3 = Dense(6, activation='sigmoid')(m3)
    m3 = Dropout(dropout)(m3)
    
    merge = concatenate([m1,m2,m3],axis=1)
    
    output = Dense(2, activation='sigmoid')(merge)
    model = Model(inputs=visible, outputs=output)
    model.compile(loss='binary_crossentropy',optimizer='rmsprop')
    model.summary()
    
    tensorboard = TensorBoard(log_dir="logs/{}".format(NAME))
    model.fit(X,Y,batch_size=batch_size,epochs=30,
              validation_split=0.4,
              callbacks=[tensorboard])
    
    y_prediction = model.predict(x_prediction)

    evaluate = model.evaluate(x_prediction,y_prediction)
    
    eras = prediction_data.era.unique()
    count = 0
    count_consistent = 0
    better_than_random_era_count = 0
    for era in eras:
        count += 1
        current_valid_data = prediction_data[prediction_data.era==era]
        features = [f for f in list(complete_training_data) if "feature" in f]
        X_valid = current_valid_data[features]
        Y_valid = current_valid_data["target_bernie"]
        loss = evaluate
        if (loss < -np.log(.5)):
            consistent = True
            count_consistent += 1
            better_than_random_era_count += 1
        else:
            consistent = False
        print("{}: loss - {} consistent: {} - better than random {}".format(era, loss, consistent,better_than_random_era_count))
    print ("Consistency: {}".format(count_consistent/count))
        
    
    probabilities = y_prediction[:, 1]
    print("- probabilities:", probabilities[1:6])

    # We can see the probability does seem to be good at predicting the
    # true target correctly.
    print("- target:", prediction_data['target_bernie'][1:6])
    print("- rounded probability:", [np.round(p) for p in probabilities][1:6])

    # But overall the accuracy is very low.
    correct = [
        np.round(x) == y
        for (x, y) in zip(probabilities, prediction_data['target_bernie'])
    ]
    print("- accuracy: ", np.sum(correct) / np.float(prediction_data.shape[0]))

    tournament_corr = np.corrcoef(prediction_data['target_bernie'],
                                  prediction_data['target_ken'])
    print("- bernie vs ken corr:", tournament_corr)
    # You can see that target_ken is accurate using the bernie model as well.
    correct = [
        np.round(x) == y
        for (x, y) in zip(probabilities, prediction_data['target_ken'])
    ]
    print("- ken using bernie:",
          np.sum(correct) / np.float(prediction_data.shape[0]))


 

    print("- validation logloss:",
          model.evaluate(x_prediction,y_prediction))
    
    results = y_prediction[:, 1]
    results_df = pd.DataFrame(data={'probability_bernie':results})

    joined = pd.DataFrame(ids).join(results_df)
    pd.DataFrame(joined[:5])


    print("Writing predictions to predictions.csv")
    path = 'predictions_{:}_{}_1'.format(time.strftime("%Y-%m-%d_%Hh%Mm%Ss", time.gmtime()),NAME) + '.csv'
    print()
    print("Writing predictions to " + path.strip())
    joined.to_csv(path,float_format='%.15f', index=False)

if __name__ == '__main__':

    main()

Using TensorFlow backend.


Loading data...
original train data shape: 502732,	60 

 	:
original prediction data shape: 333925,	60 

 	:
total training / valdation shape                       id   era data_type  feature1  feature2  feature3  \
0       n0003126ff2349f6  era1     train   0.54836   0.31077   0.37524   
1       n003d773d29b57ec  era1     train   0.34712   0.40275   0.42747   
2       n0074df2dc6810b6  era1     train   0.50871   0.48639   0.47544   
3       n0090630f530903e  era1     train   0.61363   0.40268   0.53779   
4       n00af19089546fe9  era1     train   0.30704   0.47273   0.54495   
5       n011d2da12b1e735  era1     train   0.52336   0.59136   0.60506   
6       n014149cadeee55d  era1     train   0.30875   0.62510   0.35229   
7       n0148a4dcf539aba  era1     train   0.40632   0.30590   0.43227   
8       n015855690d31908  era1     train   0.48193   0.27060   0.50228   
9       n0169447f4d6a10e  era1     train   0.51191   0.53663   0.42109   
10      n01703ba4eff8fe7  era1     train   0

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 50)           0                                            
__________________________________________________________________________________________________
dense_1 (Dense)                 (None, 6)            306         input_1[0][0]                    
__________________________________________________________________________________________________
dense_5 (Dense)                 (None, 6)            306         input_1[0][0]                    
__________________________________________________________________________________________________
dense_2 (Dense)      

  data = yaml.load(f.read()) or {}


Instructions for updating:
Use tf.cast instead.
Train on 301639 samples, validate on 201093 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
era121: loss - 0.6924828147092087 consistent: True - better than random 1
era122: loss - 0.6924828147092087 consistent: True - better than random 2
era123: loss - 0.6924828147092087 consistent: True - better than random 3
era124: loss - 0.6924828147092087 consistent: True - better than random 4
era125: loss - 0.6924828147092087 consistent: True - better than random 5
era126: loss - 0.6924828147092087 consistent: True - better than random 6
era127: loss - 0.6924828147092087 consistent: True - better than random 7
era128: loss - 0.69248281470920

era186: loss - 0.6924828147092087 consistent: True - better than random 66
eraX: loss - 0.6924828147092087 consistent: True - better than random 67
Consistency: 1.0
- probabilities: [0.4934881  0.47265428 0.48932266 0.48251784 0.48494077]
- target: 1    1.0
2    0.0
3    0.0
4    0.0
5    1.0
Name: target_charles, dtype: float64
- rounded probability: [0.0, 0.0, 0.0, 0.0, 0.0]
- accuracy:  0.08622295425619525
- charles vs ken corr: [[nan nan]
 [nan nan]]
- ken using charles: 0.0863367522647301
- validation logloss: 0.6924828147092087
Writing predictions to predictions.csv

Writing predictions to predictions_2019-04-12_17h13m20s_Shared_input_layer_1.csv
