In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
pd.options.display.max_rows=100
pd.options.display.max_columns=100

In [None]:
df = pd.read_csv("data-training.csv")

## impute missing values

#fill missing volumes with 0
df.iloc[:,15:30] = df.iloc[:,15:30].fillna(0)

#fill missing askRates with +0.5 of the prior
for i in range(1,15):
    df.loc[df['askRate'+str(i)].isna(),'askRate'+str(i)] = df.loc[df['askRate'+str(i)].isna(),'askRate'+str(i-1)] + 0.5
    
#add price, spread
df['price'] = (df['askRate0'] + df['bidRate0']) / 2
df['spread'] = df['askRate0'] - df['bidRate0']

In [None]:
#price movement for full dataset
df['price'].plot(figsize=(20,10),grid=True)

In [30]:
df.loc[1001,'price']

1619.75

In [49]:
from statsmodels.tsa.arima_model import ARIMAResults

arima = ARIMAResults(df['price'].head(1000),(1,1,0))
arima = arima.fit()
arima.predict(start=1,end=10000)

1        0.002253
2        0.002505
3        0.002505
4        0.002505
5        0.002505
           ...   
9996     0.002253
9997     0.002253
9998     0.002253
9999     0.002253
10000    0.002253
Length: 10000, dtype: float64

In [None]:
df['price'].tail(100).plot(figsize=(20,10),grid=True)

In [None]:
(df.groupby(df.index//10000)['y'].mean()).plot(figsize=(20,10),grid=True)

In [None]:
(df.groupby(df.index//10000)['price'].mean().diff(1)).plot(figsize=(20,10),grid=True)

# Target Variable Analaysis

In [None]:
import plotly.graph_objects as go


fig = go.Figure(data=go.Scattergl(x=list(range(100000)), y=df['y'].values[:100000]))
fig.show()


In [None]:


import plotly.express as px


import plotly.graph_objects as go
fig = go.Figure([go.Bar(x=df['y'].value_counts().sort_index().index, y=df['y'].value_counts().sort_index().values)])
fig.show()

In [None]:
df['y'].describe()

In [None]:
df['y'].autocorr(80)

### Multivariate Analysis 

In [None]:
#correlation heatmap
import seaborn as sns

sns.heatmap(df.corr())

In [None]:
df.corrwith(df['y']).abs().sort_values(ascending=False)

### Volume  Summary Statistics

In [None]:
#average volume at each level
plt.bar(list(range(-15,0)),df.iloc[:,list(range(59,44,-1))].mean().values)
plt.bar(list(range(1,16)),df.iloc[:,list(range(15,30))].mean().values)

In [None]:
import plotly.graph_objects as go


fig = go.Figure(data=go.Scattergl(x=list(range(df.shape[0])), y=df['askSize0'].values))
fig.show()


In [None]:
#total Ask volume 
df['totalAskVolume'] = df.iloc[:,list(range(15,30))].sum(axis=1).values
df['totalAskVolume'].plot()

In [None]:
#total Bid volume
df['totalBidVolume'] = df.iloc[:,list(range(45,60))].sum(axis=1)
df['totalBidVolume'].plot()

In [None]:
#total Ask - total bid
(df['totalAskVolume']-df['totalBidVolume']).plot()

In [None]:
df['askSize0'].plot()

In [None]:
df['bidSize0'].plot()

In [None]:
(df['askSize0']-df['bidSize0']).plot()

In [None]:
#autocorrelation
df['askSize0'].autocorr(100)

In [None]:
#differenced
df['askSize0'].diff(100).plot()

In [None]:
(df['askSize0'].rolling(10).mean()).corr(df['y'])

# Rates / Price

In [None]:
df['price'].plot()

In [None]:
df['price'].rolling(10000).mean().plot()

In [None]:
(df['price'].rolling(10).max()-df['price'].rolling(10).min()).plot()

In [None]:
df['price'].diff(10).plot()

In [None]:
df['price'].diff(10).plot()

In [None]:
df['y'].plot(figsize=(20,10))

In [None]:
from sklearn.decomposition import PCA
pca = PCA(n_components=10)
X = pca.fit_transform(df.iloc[:,list(range(15,25))].values)
print(pca.explained_variance_)
print(pca.explained_variance_ratio_)
print(pca.singular_values_)

In [None]:
df['price'].diff(50).plot()

In [None]:
import plotly.graph_objects as go


fig = go.Figure(data=[go.Scattergl(x=list(range(10000)), y=df['askRate0'].values[:10000]),
                     go.Scattergl(x=list(range(10000)), y=df['bidRate0'].values[:10000])]
               )

fig.show()



In [None]:
import plotly.graph_objects as go


fig = go.Figure(data=[go.Scattergl(x=list(range(10000)), y=df['spread'].values[:10000]),]
               )

fig.show()



In [None]:
df['spread'] >= 0.5

In [None]:
for i in range(15):
    df['orderImbalance'+str(i)] = (df['askSize'+str(i)]-df['bidSize'+str(i)]) / (df['askSize'+str(i)]+df['bidSize'+str(i)])
    df['levelImbalance'+str(i)] = (df['askSize'+str(i)]-df['bidSize'+str(i)])

In [None]:
df['orderImbalance0'].hist(bins=100)

In [None]:
df['levelImbalance0'].hist(bins=100)

In [None]:
np.log(df['askSize0']).hist(bins=100)

In [None]:
np.log(df['bidSize0']).hist(bins=100)

In [None]:
(df['askSize0'] ** (1 /df['askSize0'].mean())).hist(bins=100)

In [None]:
(df['askSize1'] ** (1 /df['askSize1'].mean())).hist(bins=100)

In [None]:
(df['bidSize1'] ** (1 /df['askSize1'].mean())).hist(bins=100)

In [None]:
np.log(df['bidSize1']).hist(bins=100)

In [None]:
for i in range(15):
    df.iloc[:,i] -= df['price']
    df.iloc[:,30+i] -= df['price']

In [None]:
df['askRate0'].plot()
df['bidRate0'].plot()

In [None]:
#spread
(df['askRate0']-df['bidRate0']).plot()

In [None]:
(df['askRate0']-df['bidRate0']).hist(bins=100)

In [None]:
(df['askRate0']-df['askRate1']).value_counts()

In [None]:
df['askRate1'].hist()

In [None]:
from sklearn.manifold import TSNE

tsne = TSNE(n_components=2)
X = tsne.fit_transform(test.values)

In [None]:
np.corrcoef(X[:,1],df.loc[df.iloc[:,list(range(0,15))].drop_duplicates().index,'y'].values.reshape(-1,))

In [None]:
test = df.iloc[:,list(range(15,25))].drop_duplicates()

In [None]:
test.shape

In [None]:
plt.scatter(df.iloc[:,0],df.iloc[:,15])

In [None]:
plt.scatter(df.iloc[:,30],df.iloc[:,45] ** (1 / df.iloc[:,45].mean()))

In [None]:
plt.scatter(df.iloc[:,30],np.log(df.iloc[:,45]))

In [None]:
plt.scatter(df.iloc[:,30],df.iloc[:,45])

In [None]:
(1-df.iloc[:,15] ** (1 / df.iloc[:,15].mean())).hist(bins=100)
((df.iloc[:,45] ** (1 / df.iloc[:,45].mean()))-1).hist(bins=100)

In [None]:
plt.plot((np.dot(df.iloc[:,25:35].values,autoencoder.layers[1].get_weights()[0])+autoencoder.layers[1].get_weights()[1])[:,0])

In [None]:
features = []
for i in range(8):
    features += ['orderImbalance'+str(i)]
# for i in range(4):
#     features += ['levelImbalance'+str(i)]

# for i in range(4):
#     features += ['normAskSize'+str(i)]
# for i in range(4):
#     features += ['normBidSize'+str(i)]
# features += ['priceDiff10','bought','sold']
df2 = df[features+['y']].drop_duplicates().dropna()
X = df2[features].values.reshape(-1,len(features))
y = df2['y'].values.reshape(-1,)

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,shuffle=True,test_size=0.4)

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.models import Model

from tensorflow.keras import backend as K
import tensorflow as tf

def coeff_determination(y_true, y_pred):
    
    SS_res =  K.sum(K.square( y_true-y_pred ))
    SS_tot = K.sum(K.square(y_true) )
    return ( 1 - SS_res/(SS_tot + K.epsilon()) )


inputs = Input(shape=(8,))
x = Dense(8, activation='relu')(inputs)
x = Dense(6, activation='relu')(x)
x = Dense(8, activation='relu')(x)
output = Dense(1)(x)
model = Model(inputs=inputs,outputs=output)



model.compile(optimizer="Adam",loss="mean_squared_error",metrics=[coeff_determination])
model.fit(X_train,y_train,epochs=5,batch_size=256,validation_split=0.2)

In [None]:
for i in range(15):
    df['normAskSize'+str(i)] = df['askSize'+str(i)] ** (1 / df['askSize'+str(i)].mean())
    df['normBidSize'+str(i)] = df['bidSize'+str(i)] ** (1 / df['bidSize'+str(i)].mean())

In [None]:
df['priceDiff10'] = df['price'].diff(10)
df['priceDiff20'] = df['price'].diff(20)
df['priceDiff30'] = df['price'].diff(30)

In [None]:
df['bought'] = ((df['askRate0']+df['price']).diff(1) >= 0)
df['sold'] = ((df['bidRate0']+df['price']).diff(1) <= 0)

In [None]:
df['priceDiff10'] = (df['price']-df['price'].rolling(10).mean())
df['priceDiff20'] = (df['price']-df['price'].rolling(20).mean())
df['priceDiff30'] = (df['price']-df['price'].rolling(30).mean())

In [None]:
features

In [None]:
from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.model_selection import cross_val_score


lr = LinearRegression()
scores = cross_val_score(lr,X,y,cv=3)
scores.mean()

In [None]:
lr.fit(X,y)

In [None]:
lr.score(df.loc[30:,features],df.loc[30:,'y'])

In [None]:
lr.coef_