The dataset represents an order book for an asset over a long period of time, with bid and ask orders and their respective volumes; askRate0 is the best ask and bidRate0 is the best bid. The task is to train a model that predicts the future price movement of that asset.

## best submission - xgboost - 0.02025 using ['askSize0','bidSize0','askSize1','bidSize1','askSize2','bidSize2']
### submission 9 - 0.01945 Linear Regression using ['pressure0','pressure1','pressure2']
## submission 10 - 0.02046 Linear Regression using ['pressure0','pressure1','pressure2','pressure3','volumeDiff0','volumeDiff1','volumeDiff2','volumeDiff3']

In [None]:
#imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
pd.options.display.max_columns = None
pd.options.display.max_rows = None
%matplotlib inline

In [None]:
# Load the training order book (original note: depth of up to n = 14,
# 2,999,999 rows, 60 features, about 1 GB).
df = pd.read_csv("data-training.csv")

# Zero-fill missing values in the positional column block 15..29, the same
# block the feature code later sums as the ask-side volume.
_filled_block = df.iloc[:, 15:30].fillna(0)
df.iloc[:, 15:30] = _filled_block

In [None]:
def transform(df_copy, book_depth=15, feature_levels=10):
    """Return a copy of the order book with price-centred rates and volume features.

    Columns are looked up by name (``askRate{i}``, ``askSize{i}``,
    ``bidRate{i}``, ``bidSize{i}``), so the result does not depend on the
    position of those columns in the frame or on extra columns being present.

    Parameters
    ----------
    df_copy : pandas.DataFrame
        One order book snapshot per row. Must contain the four column
        families above for every level ``0 .. book_depth - 1``. The frame is
        not modified.
    book_depth : int, optional
        Number of levels per side (default 15).
    feature_levels : int, optional
        Number of top levels that receive ``pressure``, ``imbalance`` and
        ``volumeDiff`` columns (default 10). Must not exceed ``book_depth``.

    Returns
    -------
    pandas.DataFrame
        Copy of the input where every rate column has the mid price
        subtracted, plus the new columns ``price``, ``totalAskVolume``,
        ``totalBidVolume``, ``totalVolumeDiff``, ``modeAskDepth``,
        ``modeBidDepth`` and the per-level ``pressure{i}``, ``imbalance{i}``,
        ``volumeDiff{i}``.

    Raises
    ------
    ValueError
        If ``feature_levels`` is larger than ``book_depth``.
    """
    if feature_levels > book_depth:
        raise ValueError("feature_levels must not exceed book_depth")

    df = df_copy.copy()
    levels = range(book_depth)
    ask_size_cols = ['askSize' + str(i) for i in levels]
    bid_size_cols = ['bidSize' + str(i) for i in levels]

    # Price related features.
    # (The best-ask/best-bid spread was considered as a feature but is not emitted.)
    df['price'] = (df['askRate0'] + df['bidRate0']) / 2  # mean of best ask and best bid

    # Express every quoted rate relative to the mid price.
    for i in levels:
        df['askRate' + str(i)] -= df['price']
        df['bidRate' + str(i)] -= df['price']

    # Volume related features.
    df['totalAskVolume'] = df[ask_size_cols].sum(axis=1)
    df['totalBidVolume'] = df[bid_size_cols].sum(axis=1)
    df['totalVolumeDiff'] = df['totalAskVolume'] - df['totalBidVolume']

    # Level index holding the largest resting size on each side. Mapping the
    # winning column label back to its level avoids slicing the label string.
    ask_level_of = {name: i for i, name in enumerate(ask_size_cols)}
    bid_level_of = {name: i for i, name in enumerate(bid_size_cols)}
    df['modeAskDepth'] = df[ask_size_cols].idxmax(axis=1).map(ask_level_of)
    df['modeBidDepth'] = df[bid_size_cols].idxmax(axis=1).map(bid_level_of)

    # Per-level balance between the two sides. A level that is empty on both
    # sides yields NaN for the two ratio features (0 / 0).
    for i in range(feature_levels):
        ask = df['askSize' + str(i)]
        bid = df['bidSize' + str(i)]
        level_total = ask + bid
        df['pressure' + str(i)] = ask / level_total
        df['imbalance' + str(i)] = (ask - bid) / level_total
        df['volumeDiff' + str(i)] = ask - bid

    return df
    

    #ideas - vectorize bid/asks
# Build the engineered feature frame; transform() works on a copy, so the raw `df` is left as loaded.
df_copy = transform(df)

In [None]:
def _side_price_moments(df, side, total_volume, depth=15):
    """Return (mean, variance, std, skew) of one side's rates, weighted by size.

    ``side`` is ``'ask'`` or ``'bid'``. Levels with a missing rate or size
    contribute nothing to any of the sums.
    """
    rates = [df[side + 'Rate' + str(i)] for i in range(depth)]
    sizes = [df[side + 'Size' + str(i)].fillna(0) for i in range(depth)]

    weighted_sum = 0
    for rate, size in zip(rates, sizes):
        weighted_sum = weighted_sum + rate.fillna(0) * size
    mean = weighted_sum / total_volume

    second_moment = 0
    third_moment = 0
    for rate, size in zip(rates, sizes):
        # A missing level has no deviation from the mean.
        deviation = (rate - mean).fillna(0)
        second_moment = second_moment + np.square(deviation) * size
        third_moment = third_moment + np.power(deviation, 3) * size

    variance = second_moment / total_volume
    std = np.sqrt(variance)
    skew = third_moment.fillna(0) / (np.power(std, 3) * total_volume)
    return mean, variance, std, skew


def trial_features(df_copy):
    """Add size-weighted price moments for each side of the book.

    Parameters
    ----------
    df_copy : pandas.DataFrame
        Must contain ``askRate{i}``, ``askSize{i}``, ``bidRate{i}``,
        ``bidSize{i}`` for ``i`` in ``0..14`` plus ``price``,
        ``totalAskVolume`` and ``totalBidVolume``. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of the input with ``weighted{Ask,Bid}Price``,
        ``{ask,bid}PriceVar``, ``{ask,bid}PriceSkew``, ``{ask,bid}PriceStd``
        and ``{ask,bid}PriceDiff`` appended.

    Notes
    -----
    Fixes relative to the earlier version:

    * the bid-side weighted price is normalised by ``totalBidVolume`` (it was
      divided by ``totalAskVolume``);
    * ``askPriceDiff`` / ``bidPriceDiff`` are computed on, and written to, the
      returned copy; previously they were read from the input frame, which
      does not hold the weighted prices, and written back into it;
    * missing bid levels are handled like missing ask levels instead of
      turning the whole bid-side result into NaN.

    Rows whose total volume is zero, or whose weighted variance is zero, still
    produce NaN/inf in the ratio columns.
    """
    df = df_copy.copy()

    ask_mean, ask_var, ask_std, ask_skew = _side_price_moments(
        df, 'ask', df['totalAskVolume'])
    bid_mean, bid_var, bid_std, bid_skew = _side_price_moments(
        df, 'bid', df['totalBidVolume'])

    # Assigned in the same order the columns were created originally.
    df['weightedAskPrice'] = ask_mean
    df['weightedBidPrice'] = bid_mean
    df['askPriceVar'] = ask_var
    df['bidPriceVar'] = bid_var
    df['askPriceSkew'] = ask_skew
    df['bidPriceSkew'] = bid_skew
    df['askPriceStd'] = ask_std
    df['bidPriceStd'] = bid_std

    # NOTE(review): if the rate columns were already centred on `price`
    # upstream (transform() subtracts it), these differences subtract the
    # price a second time. TODO confirm the intended definition.
    df['askPriceDiff'] = df['weightedAskPrice'] - df['price']
    df['bidPriceDiff'] = df['weightedBidPrice'] - df['price']

    return df

# Rebind df_copy to the frame returned by trial_features (the weighted price-moment features).
df_copy = trial_features(df_copy)

In [None]:
# Correlate every engineered column with the target `y`, then list the 21
# largest correlations by absolute value.
target_corr = df_copy.corrwith(df['y'])
target_corr.abs().sort_values(ascending=False).head(21)

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression,HuberRegressor 
#from sklearn.linear_model import PassiveAggressiveRegressor, ,TheilSenRegressor,SGDRegressor
from sklearn.metrics import r2_score

# Inputs: pressure and raw volume difference for the top four levels.
features = [
    'pressure0', 'pressure1', 'pressure2', 'pressure3',
    'volumeDiff0', 'volumeDiff1', 'volumeDiff2', 'volumeDiff3',
]

# Keep one copy of each distinct (features, target) row.
# (Alternative tried: df_copy2 = df_copy, i.e. no de-duplication.)
df_copy2 = df_copy[features + ['y']].drop_duplicates()
X_train = np.array(df_copy2[features])
y_train = np.array(df_copy2['y'])

# Robust linear fit; the variable keeps its historical name `lr`.
lr = HuberRegressor()

# Two-fold cross-validated R^2, then a fit on everything for the in-sample
# score and the coefficients.
scores = cross_val_score(lr, X_train, y_train, scoring='r2', cv=2)
print("Cross Val score: ", scores.mean())
lr.fit(X_train, y_train)
print("In sample score: ", lr.score(X_train, y_train))
print("Coefs: ", lr.coef_)

# Persisting the fitted model (disabled):
# from sklearn.externals import joblib
# joblib.dump(lr, '../XTXStarterKit-master/python/LinearRegression.pkl')
# joblib.dump(lr, 'LinearRegression.pkl')

# Ordinary least squares on the same design matrix (with intercept) for the
# statsmodels summary table.
import statsmodels.api as sm

design = sm.add_constant(X_train)
model = sm.OLS(y_train, design)
model.fit().summary()

#Huber Loss - Cross Val score:  0.015652126650498255,In sample 0.015881198921535056, drop duplicates 
#features = ['pressure0','pressure1','pressure2','pressure3','volumeDiff0','volumeDiff1','volumeDiff2','volumeDiff3']

In [None]:
from xgboost import XGBRegressor
import xgboost as xgb
from sklearn.model_selection import cross_val_score
from sklearn.metrics import r2_score

# `df_copy` must be the engineered frame here: 'modeAskDepth' is created by
# transform() and does not exist on the raw `df`. The earlier `df_copy = df`
# rebinding made the feature selection below fail with a KeyError (and threw
# away the engineered columns for later cells), so it has been removed.
features = ['askSize0', 'bidSize0', 'askSize1', 'bidSize1', 'askSize2', 'bidSize2', 'modeAskDepth']
X_train = df_copy[features].values
y_train = df_copy['y'].values  # 1-D target, consistent with the other model cells

params = {'max_depth': 2, 'n_estimators': 150, 'learning_rate': 0.05}
xgbr = XGBRegressor(n_jobs=2, objective="reg:squarederror", **params)
print("Cross val score: ", cross_val_score(xgbr, X_train, y_train, cv=3, scoring='r2').mean())

#0.02025 on test set with [askSize0, bidSize0, askSize1, bidSize1,askSize2,bidSize2] 
#0.015488818975168683 cross val two folds
xgbr.fit(X_train, y_train)
xgb.plot_importance(xgbr)

# In-sample predictions are still stored on `df` as before; this relies on
# `df_copy` having the same rows as `df` (transform() only adds columns).
preds = xgbr.predict(X_train)
df['preds'] = preds
print("In sample score: ", r2_score(y_train, preds))

# Persisting the fitted model (disabled):
# from sklearn.externals import joblib
# joblib.dump(xgbr, '../XTXStarterKit-master/python/model.pkl')
# joblib.dump(xgbr, 'model.pkl')




In [None]:
# Bar chart of the first row: columns 0..14 against 15..29, then 30..44
# against 45..59 (presumably ask rate/size and bid rate/size - verify against
# the CSV header).
first_x, first_height = df.iloc[0, 0:15], df.iloc[0, 15:30]
second_x, second_height = df.iloc[0, 30:45], df.iloc[0, 45:60]
plt.bar(first_x, first_height)
plt.bar(second_x, second_height)

In [None]:
%matplotlib notebook
import matplotlib.pyplot as plt
from matplotlib import animation

fig=plt.figure()
fig.set_size_inches(15,7.5)
axes = plt.gca()
axes.set_xlim([1600,1800])
axes.set_ylim([0,150])



#ax.set_xlim(-1600,1800)
#ax.setylim(0,200)

n=10000 #Number of frames


def animate(i):
    fig.clear()
    axes = plt.gca()
    axes.set_xlim([1600,1640])
    axes.set_ylim([0,150])
    y=df.iloc[i,15:30]
    x=df.iloc[i,0:15]
    plt.bar(x,y)
    y=df.iloc[i,45:60]
    x=df.iloc[i,30:45]
    plt.bar(x,y)
    
    

anim=animation.FuncAnimation(fig,animate,repeat=False,blit=False,frames=n,
                             interval=10)


plt.show()

# Writer = animation.writers['ffmpeg']
# writer = Writer(fps=15, metadata=dict(artist='Me'), bitrate=1800)
# anim.save('orderbook.mp4', writer=writer)

In [None]:
from matplotlib.widgets import Slider

In [None]:
# Idea: joint distribution of bids and asks
# Idea: KDE on ... (note left unfinished)

In [None]:
# Idea: try a random forest

In [None]:
# Render tree number 30 of the fitted XGBoost model on a 20x20 inch canvas.
canvas_size = (20, 20)
fig, ax = plt.subplots(figsize=canvas_size)
xgb.plot_tree(xgbr, num_trees=30, ax=ax)

In [None]:
from sklearn.neural_network import MLPRegressor

# Small multilayer perceptron on the two best-level sizes, scored by two-fold
# cross-validated R^2.
mlp_settings = dict(
    hidden_layer_sizes=(100, 3),
    alpha=0.0001,
    learning_rate_init=0.5,
    max_iter=50,
    warm_start=True,
    n_iter_no_change=10,
    batch_size=1000,
)
mlp = MLPRegressor(**mlp_settings)

features = ['askSize0', 'bidSize0']
X_train = np.array(df_copy[features])
y_train = np.array(df_copy['y'])

mlp_scores = cross_val_score(mlp, X_train, y_train, scoring='r2', cv=2)
mlp_scores.mean()

In [None]:
def coeff_determination(y_true, y_pred):
    """Coefficient of determination (R^2) built from Keras backend ops.

    Evaluates ``1 - SS_res / (SS_tot + K.epsilon())``, where ``SS_res`` is the
    summed squared error and ``SS_tot`` the summed squared deviation of
    ``y_true`` from its mean. The epsilon keeps the division defined when
    ``y_true`` is constant.
    """
    residual = y_true - y_pred
    centred = y_true - K.mean(y_true)
    unexplained = K.sum(K.square(residual))
    total = K.sum(K.square(centred))
    return 1 - unexplained / (total + K.epsilon())


import tensorflow as tf
from tensorflow.keras import backend as K

# Width of the flat input rows; the fit/predict cell feeds two columns
# (askSize0, bidSize0).
N_FEATURES = 2
# Placeholder hidden size: the original call passed no `units`, which
# SimpleRNN requires. Tune as needed.
RNN_UNITS = 16

model = tf.keras.Sequential()
# SimpleRNN expects (batch, timesteps, features). The training arrays are
# flat (batch, features), so each row is presented as a one-step sequence.
# NOTE(review): with a single timestep the recurrence is never exercised;
# feed windowed sequences to get real temporal modelling.
model.add(tf.keras.layers.Reshape((1, N_FEATURES), input_shape=(N_FEATURES,)))
model.add(tf.keras.layers.SimpleRNN(RNN_UNITS))
model.add(tf.keras.layers.Dropout(rate=0.2))
model.add(tf.keras.layers.Dense(1))

# tf.keras.optimizers.Adam replaces the TF1-only tf.train.AdamOptimizer; the
# learning rate stays at 0.01.
model.compile(optimizer=tf.keras.optimizers.Adam(0.01),
              loss='mse',
              metrics=[coeff_determination])





In [None]:
#tensorflow export
# Train the Keras model on the two best-level sizes from the raw frame, run a
# one-row smoke prediction, and save the model in HDF5 format.
features = ['askSize0', 'bidSize0']
X_train = np.array(df[features])
y_train = np.array(df['y'])

model.fit(X_train, y_train, epochs=10, batch_size=3000)

# A bare expression in the middle of a cell is evaluated and thrown away, so
# the sample prediction is printed explicitly.
sample_row = np.array([0, 1]).reshape(-1, 2)
print("Sample prediction: ", model.predict(sample_row)[0][0])

model.save('my_model.h5')