<a href="https://colab.research.google.com/github/craigschindler/numerai/blob/main/NeuralNet.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# import dependencies
import numpy as np
import pandas as pd
!pip install numerapi
import numerapi
import sklearn.linear_model
from numpy import loadtxt
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Activation
from tensorflow.keras.layers import Conv1D
from tensorflow.keras.layers import Conv2D

from tensorflow.keras import activations
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from scipy import stats

# visualize
# NOTE: pyplot is imported twice in this group, once as `plt` and once as `pyplot`.
import matplotlib.pyplot as plt
import matplotlib.style as style
import seaborn as sns
from matplotlib import pyplot
from matplotlib.ticker import ScalarFormatter
# Set the seaborn plotting context to "talk" and select matplotlib's
# 'fivethirtyeight' style sheet.
sns.set_context("talk")
style.use('fivethirtyeight')

import graphviz
import pydot

# Colab helpers: `files` is used at the end of the notebook to download the
# predictions CSV, `drive` to mount Google Drive.
from google.colab import files
from google.colab import drive
# Mount Google Drive at /content/drive; the model save/load path used further
# down lives under this mount point.
drive.mount('/content/drive')

## The datasets

### Datasets 
*   `training_data` is used to train your model
*   `tournament_data` is used to evaluate your model

### Column descriptions
*   id: a randomized id that corresponds to a stock 
*   era: a period of time
*   data_type: either `train`, `validation`, `test`, or `live` 
*   feature_*: abstract financial features of the stock 
*   target: abstract measure of stock performance




In [None]:
# Download the training set from the public Numerai bucket (".csv.xz" file).
training_data_url = "https://numerai-public-datasets.s3-us-west-2.amazonaws.com/latest_numerai_training_data.csv.xz"
training_data = pd.read_csv(training_data_url)
# Drop the first three characters of each `era` label and keep the rest as an
# integer era number.
training_data["erano"] = training_data["era"].str[3:].astype(int)

In [None]:
# Download the tournament set, then carve the validation rows out of it.
tournament_data_url = "https://numerai-public-datasets.s3-us-west-2.amazonaws.com/latest_numerai_tournament_data.csv.xz"
tournament_data = pd.read_csv(tournament_data_url)
is_validation_row = tournament_data["data_type"] == "validation"
# Work on a copy so the extra column below is not written into tournament_data.
validation_data = tournament_data[is_validation_row].copy()
# Same era-number extraction as for the training set: skip three characters.
validation_data["erano"] = validation_data["era"].str[3:].astype(int)

In [None]:
# Peek at the last few rows whose data_type is "test".
test_rows = tournament_data[tournament_data["data_type"] == "test"]
test_rows.tail()

In [None]:
# Number of rows whose data_type is "live".
live_rows = tournament_data[tournament_data["data_type"] == "live"]
len(live_rows)

## Train the model


In [None]:
# Boolean mask over the column labels: True where the label starts with 'feature'.
is_feature_column = training_data.columns.str.startswith('feature')
# Keep only the matching labels (still a pandas Index).
feature_cols = training_data.columns[is_feature_column]

In [None]:
# Split each dataset into model inputs (the feature columns) and its target column.
training_features, training_targets = training_data[feature_cols], training_data["target"]
validation_features, validation_targets = validation_data[feature_cols], validation_data["target"]

In [None]:
# Neural-network configuration, input arrays and training callbacks.
#######################################
# Optimiser step size, width and dropout rate of each hidden stage, and the
# batch size / epoch budget passed to model.fit.
hyperparameters = {
    "learning_rate": 0.001,
    "hidden_layer_1_units": 256,
    "hidden_layer_2_units": 128,
    "hidden_layer_3_units": 64,
    "dropout_layer_1_rate": 0.1,
    "dropout_layer_2_rate": 0.1,
    "dropout_layer_3_rate": 0.1,
    "batch_size": 128,
    "epochs": 15,
}

# Plain NumPy views of the DataFrames, keyed by role.
nn_training_data = {
    "examples": training_features.to_numpy(),
    "targets": training_targets.to_numpy(),
}
nn_validation_data = {
    "examples": validation_features.to_numpy(),
    "targets": validation_targets.to_numpy(),
}

# Both callbacks watch the validation loss: early stopping with a patience of
# 4 that restores the best weights, and a learning-rate reduction by a factor
# of 0.1 with a patience of 0.
es = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=4,
    restore_best_weights=True,
)
lr_scheduler = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    mode='min',
    factor=0.1,
    patience=0,
    verbose=1,
)
callbacks = [es, lr_scheduler]
#######################################

In [None]:
# Build, compile and train the network: three Dense -> BatchNormalization ->
# ReLU -> Dropout stages followed by a single linear output unit, trained with
# Adam on mean squared error.

# Read the input width from the training matrix instead of hard-coding it, so
# the model keeps working if the number of feature columns changes.
n_features = nn_training_data["examples"].shape[1]

model = Sequential()
for stage in (1, 2, 3):
    units = hyperparameters[f"hidden_layer_{stage}_units"]
    # Only the first layer is given the input width, as in the original layout.
    input_spec = {"input_dim": n_features} if stage == 1 else {}
    model.add(Dense(units, **input_spec))
    model.add(BatchNormalization())
    model.add(Activation(activations.relu))
    model.add(Dropout(hyperparameters[f"dropout_layer_{stage}_rate"]))
# Regression head: one unit with a linear (identity) activation.
model.add(Dense(1))
model.add(Activation(activations.linear))

opt = tf.keras.optimizers.Adam(learning_rate=hyperparameters["learning_rate"])
model.compile(loss=tf.keras.losses.MeanSquaredError(), optimizer=opt)

# The validation arrays feed the 'val_loss' metric that both callbacks monitor.
history = model.fit(
    nn_training_data["examples"],
    nn_training_data["targets"],
    epochs=hyperparameters["epochs"],
    batch_size=hyperparameters["batch_size"],
    validation_data=(nn_validation_data["examples"], nn_validation_data["targets"]),
    callbacks=callbacks,
)






In [None]:

# Persist / restore the model under the mounted Google Drive.
#model.save('/content/drive/My Drive/Colab Notebooks/mymdl') # uncomment to save the current model
# NOTE(review): the load below is currently ACTIVE, so running this cell rebinds
# `model` to whatever is stored at this path. Comment it out to keep using the
# model built in the previous cell.
model = tf.keras.models.load_model('/content/drive/My Drive/Colab Notebooks/mymdl') # comment out to skip loading

In [None]:
# Plot the per-epoch training and validation loss recorded by model.fit.
# The second curve is 'val_loss', computed on the validation rows, so it is
# labelled 'validation' rather than 'test' (the dataset has a separate "test"
# data_type that is not used here).
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='validation')
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(loc='upper left')
plt.show()
# Leave the raw validation-loss values as the cell output.
history.history['val_loss']

## Calculate Validation Performance


In [None]:
# The models should be scored based on the rank-correlation (spearman) with the target
def spearman_corr(y_true, y_pred):
    """Return the correlation between the targets and the ranked predictions.

    The predictions are converted to percentile ranks (ties are broken by
    order of appearance) and the Pearson correlation between ``y_true`` and
    those ranks is returned. ``y_true`` itself is not ranked, so this is not
    the same as ``scipy.stats.spearmanr`` when the targets are not already
    rank-like.

    Args:
        y_true: Target values. Any array-like (Series, ndarray, list); an
            ``(n, 1)`` array is flattened.
        y_pred: Predicted values with the same number of elements as
            ``y_true``. Any array-like; an ``(n, 1)`` array is flattened.

    Returns:
        The off-diagonal entry of the 2x2 correlation matrix.
    """
    # Flatten both inputs so plain lists, ndarrays and (n, 1) model outputs
    # work as well as pandas Series.
    targets = np.asarray(y_true).ravel()
    predictions = pd.Series(np.asarray(y_pred).ravel())
    ranked_predictions = predictions.rank(pct=True, method="first")
    return np.corrcoef(targets, ranked_predictions)[0, 1]
def pearson_corr(y_true, y_pred):
    """Return the Pearson correlation coefficient of two equal-length sequences.

    Args:
        y_true: Target values.
        y_pred: Predicted values.

    Returns:
        The off-diagonal entry of the 2x2 correlation matrix of the inputs.
    """
    correlation_matrix = np.corrcoef(y_true, y_pred)
    return correlation_matrix[0, 1]

# Score the model on the validation rows, one era at a time.
df = validation_data
df_features = validation_features

# Store the predictions as a column so each one stays next to its era and target.
df["predictions"] = model.predict(df_features.to_numpy())

# One correlation per era; sort=False keeps the eras in order of first appearance.
corr_array = [
    spearman_corr(era_rows["target"], era_rows["predictions"])
    for _, era_rows in df.groupby("erano", sort=False)
]

plt.hist(corr_array)
plt.title("histogram of validation corr by era")
plt.show()

# Summary statistics of the per-era correlations; the "sharpe ratio" reported
# here is their mean divided by their standard deviation.
mean_corr = np.mean(corr_array)
std_corr = np.std(corr_array)
print("sharpe ratio: " + str(mean_corr / std_corr))
print("mean of validation corrs by era: " + str(mean_corr))
print("std of validation corrs by era: " + str(std_corr))

## 4. Generate your first predictions
Now that we have a trained model, we can use it to make predictions on the tournament data.



In [None]:
# Feature columns for every row of the tournament data (all data_type values,
# not only the "live" rows).
live_features = tournament_data.loc[:, feature_cols]

In [None]:
# Run the model over the tournament feature matrix.
live_matrix = live_features.to_numpy()
predictions = model.predict(live_matrix)

In [None]:
# Build the submission frame: the `id` column plus a `prediction` column.
predictions_df = tournament_data[["id"]].copy()
predictions_df["prediction"] = predictions
predictions_df.head()

## 5. Make your first submission
To enter the tournament, we must submit the predictions back to Numerai. We will use the `numerapi` library to do this.

In [None]:
# Get your API keys and model_id from https://numer.ai/submit
# public_id = "<YOUR PUBLIC ID HERE>"
# secret_key = "<YOUR SECRET KEY HERE>"
# model_id = "<YOUR MODEL ID HERE>"
# napi = numerapi.NumerAPI(public_id=public_id, secret_key=secret_key)

In [None]:
# Write the predictions to a CSV file (without the index column) and hand it
# to the browser through Colab's download helper.
submission_path = "predictions.csv"
predictions_df.to_csv(submission_path, index=False)
files.download(submission_path)
# submission_id = napi.upload_predictions("predictions.csv", model_id=model_id)

# Done 🚀
Good job! You just made your first submission on Numerai!

Head back over to https://numer.ai/submit to continue.