In [1]:
# data.csv is a small preview of data-training.csv
# It is okay to use data.csv for debugging purposes, but for the actual training data-training.csv should be used
# data-training.csv was used to produce the HTML of this Jupyter notebook, but is not included in this repository

DATA_LOCATION = '../data-training.csv'

In [2]:
# Save data to dataframe

import pandas as pd
df = pd.read_csv(DATA_LOCATION)

In [3]:
# Fill NaN values with zeroes, as required by the problem statement

df.fillna(0, inplace = True)

In [4]:
# Basic data exploration:
# (1) Print the first lines of the table
df.head()

# (2) Print mean, standard deviation, and other basic statistical info
df.describe()

# (3) Print correlation matrix
df.corr()

Unnamed: 0,askRate0,askRate1,askRate2,askRate3,askRate4,askRate5,askRate6,askRate7,askRate8,askRate9,...,bidSize6,bidSize7,bidSize8,bidSize9,bidSize10,bidSize11,bidSize12,bidSize13,bidSize14,y
askRate0,1.000000,0.978121,0.962060,0.943962,0.923187,0.906472,0.890943,0.877397,0.866211,0.855753,...,-0.177827,-0.176697,-0.186292,-0.171415,-0.191442,-0.185028,-0.197875,-0.194533,-0.195466,-0.011365
askRate1,0.978121,1.000000,0.983577,0.965077,0.943823,0.926730,0.910843,0.896994,0.885555,0.874861,...,-0.174499,-0.172914,-0.182693,-0.167820,-0.188033,-0.181255,-0.193599,-0.190151,-0.191212,-0.010363
askRate2,0.962060,0.983577,1.000000,0.981198,0.959574,0.942187,0.926026,0.911943,0.900311,0.889440,...,-0.171801,-0.170152,-0.179779,-0.165071,-0.185254,-0.178533,-0.190470,-0.186948,-0.188040,-0.010095
askRate3,0.943962,0.965077,0.981198,1.000000,0.977960,0.960242,0.943775,0.929428,0.917574,0.906502,...,-0.168773,-0.167214,-0.176452,-0.161989,-0.181866,-0.175415,-0.187000,-0.183360,-0.184615,-0.009620
askRate4,0.923187,0.943823,0.959574,0.977960,1.000000,0.981890,0.965061,0.950403,0.938289,0.926975,...,-0.165602,-0.163599,-0.172696,-0.158595,-0.179233,-0.171671,-0.182821,-0.179067,-0.180679,-0.009189
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
bidSize11,-0.185028,-0.181255,-0.178533,-0.175415,-0.171671,-0.168778,-0.166217,-0.163877,-0.161992,-0.160149,...,0.026057,0.235840,0.024195,0.269019,0.028040,1.000000,0.035424,0.263080,0.033162,0.005139
bidSize12,-0.197875,-0.193599,-0.190470,-0.187000,-0.182821,-0.179691,-0.176645,-0.173900,-0.171869,-0.169884,...,0.259492,0.059747,0.225001,0.045081,0.249411,0.035424,1.000000,0.028798,0.218764,0.004842
bidSize13,-0.194533,-0.190151,-0.186948,-0.183360,-0.179067,-0.175671,-0.172512,-0.169832,-0.167494,-0.165424,...,0.037640,0.225516,0.049657,0.223095,0.045414,0.263080,0.028798,1.000000,0.027354,0.004210
bidSize14,-0.195466,-0.191212,-0.188040,-0.184615,-0.180679,-0.177632,-0.174627,-0.172032,-0.169906,-0.167873,...,0.171910,0.058765,0.204565,0.037405,0.199062,0.033162,0.218764,0.027354,1.000000,0.001813


In [5]:
# Create new features
df['maxaskRate'] = df[['askRate0','askRate1','askRate2','askRate3','askRate4','askRate5','askRate6','askRate7','askRate8','askRate9','askRate10','askRate11','askRate12','askRate13','askRate14']].max(axis=1)
df['minaskRate'] = df[['askRate0','askRate1','askRate2','askRate3','askRate4','askRate5','askRate6','askRate7','askRate8','askRate9','askRate10','askRate11','askRate12','askRate13','askRate14']].min(axis=1)
df['maxbidRate'] = df[['bidRate0','bidRate1','bidRate2','bidRate3','bidRate4','bidRate5','bidRate6','bidRate7','bidRate8','bidRate9','bidRate10','bidRate11','bidRate12','bidRate13','bidRate14']].max(axis=1)
df['minbidRate'] = df[['bidRate0','bidRate1','bidRate2','bidRate3','bidRate4','bidRate5','bidRate6','bidRate7','bidRate8','bidRate9','bidRate10','bidRate11','bidRate12','bidRate13','bidRate14']].min(axis=1)
df['askBidRatio'] = df['minaskRate'] / df['maxbidRate']
df['totalaskSize'] = df[['askSize0','askSize1','askSize2','askSize3','askSize4','askSize5','askSize6','askSize7','askSize8','askSize9','askSize10','askSize11','askSize12','askSize13','askSize14']].sum(axis=1)
df['totalbidSize'] = df[['bidSize0','bidSize1','bidSize2','bidSize3','bidSize4','bidSize5','bidSize6','bidSize7','bidSize8','bidSize9','bidSize10','bidSize11','bidSize12','bidSize13','bidSize14']].sum(axis=1)
df.fillna(0, inplace = True)

# Create a feature 'average ask rate' ('avgaskRate'), where the average is computed after removing zero entries
# 'askNotEqualZero' is a helper feature that counts the number of non-zero entries, that will be dropped later
df['askNotEqualZero'] = df[['askRate0','askRate1','askRate2','askRate3','askRate4','askRate5','askRate6','askRate7','askRate8','askRate9','askRate10','askRate11','askRate12','askRate13','askRate14']].ne(0).sum(axis=1)
df['avgaskRate'] = (df['askRate0']*df['askSize0']+df['askRate1']*df['askSize1']+df['askRate2']*df['askSize2']+df['askRate3']*df['askSize3']+df['askRate4']*df['askSize4']+df['askRate5']*df['askSize5']+df['askRate6']*df['askSize6']+df['askRate7']*df['askSize7']+df['askRate8']*df['askSize8']+df['askRate9']*df['askSize9']+df['askRate10']*df['askSize10']+df['askRate11']*df['askSize11']+df['askRate12']*df['askSize12']+df['askRate13']*df['askSize13']+df['askRate14']*df['askSize14'])/df['askNotEqualZero']

# Create a feature 'average bid rate' ('avgbidRate'), where the average is computed after removing zero entries
# 'bidNotEqualZero' is a helper feature that counts the number of non-zero entries, that will be dropped later
df['bidNotEqualZero'] = df[['bidRate0','bidRate1','bidRate2','bidRate3','bidRate4','bidRate5','bidRate6','bidRate7','bidRate8','bidRate9','bidRate10','bidRate11','bidRate12','bidRate13','bidRate14']].ne(0).sum(axis=1)
df['avgbidRate'] = (df['bidRate0']*df['bidSize0']+df['bidRate1']*df['bidSize1']+df['bidRate2']*df['bidSize2']+df['bidRate3']*df['bidSize3']+df['bidRate4']*df['bidSize4']+df['bidRate5']*df['bidSize5']+df['bidRate6']*df['bidSize6']+df['bidRate7']*df['bidSize7']+df['bidRate8']*df['bidSize8']+df['bidRate9']*df['bidSize9']+df['bidRate10']*df['bidSize10']+df['bidRate11']*df['bidSize11']+df['bidRate12']*df['bidSize12']+df['bidRate13']*df['bidSize13']+df['bidRate14']*df['bidSize14'])/df['bidNotEqualZero']

# The helper variables are dropped here
df = df.drop(['askNotEqualZero','bidNotEqualZero'], axis=1)

In [6]:
# Prepare data for training:

# (1) Create separate dataframes for input and output variables
X = df.drop('y', axis=1)
Y = df['y']

# (2) Split the dataset into training and test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=1) 

In [7]:
# Training a random forest regressor
# (see https://en.wikipedia.org/wiki/Random_forest and https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html)
from sklearn.ensemble import RandomForestRegressor

n_estimators = 10
max_depth = 7

regr = RandomForestRegressor(n_estimators=n_estimators, n_jobs=4, max_depth=max_depth)

regr.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=7,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=4,
                      oob_score=False, random_state=None, verbose=0,
                      warm_start=False)

In [8]:
# Store the regressor in a file to be used by for later predictions
import pickle
f = open('forest-regressor-10x7.pckl', 'wb')
pickle.dump(regr, f)
f.close()

In [9]:
# Evaluate the model by computing the R2 score on the test set
from sklearn.metrics import r2_score
yhat = regr.predict(X_test)
r2_score(y_test, yhat)

0.032151644142432656