In [1]:
import os
import pandas as pd
from data_collection import DataCollection

from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [2]:
# Resolve the current working directory (symlinks expanded) and read the
# aggregate output CSV that lives relative to it.
ROOT_DIR = os.path.realpath(os.path.abspath(''))
df_out = pd.read_csv(
    f"{ROOT_DIR}/../data/output/res_20190610.csv_aggregate_output.csv"
)

In [3]:
# Preview the first five rows of the loaded frame.
df_out.head(n=5)

Unnamed: 0,timestamp,price,side,bp0,bq0,bp1,bq1,bp2,bq2,bp3,...,ap0,aq0,ap1,aq1,ap2,aq2,ap3,aq3,ap4,aq4
0,0,9990,b,9990,11,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,9990,b,9990,27,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,9995,a,9990,27,0,0,0,0,0,...,9995,1,0,0,0,0,0,0,0,0
3,0,9990,b,9990,33,0,0,0,0,0,...,9995,1,0,0,0,0,0,0,0,0
4,0,9990,b,9990,34,0,0,0,0,0,...,9995,1,0,0,0,0,0,0,0,0


In this second iteration of the data, I will build a table that reuses the bid/ask spread from the previous feature set, but split by order book depth. <br />

This follows the research in "Investigating Limit Order Book Characteristics for Short Term Price Prediction: a Machine Learning Approach" (see references), which shows that deeper order book levels give stronger indications of an impending price change. <br />

To try this, I will begin with order book level 5 (the `a*4` and `b*4` columns, e.g. `aq4` and `bq4`).

In [4]:
# Targets come from the original frame: a going rate is computed row by row,
# and the label series is then derived from it.
current_price = df_out.apply(DataCollection.calc_going_rate, axis=1)
target = DataCollection.calc_target(current_price)

# Level-5 features. The volume columns are taken as-is from the frame
# (presumably ask/bid quantity at depth index 4 -- TODO confirm naming);
# the spread is computed row by row by the level-5 helper.
volume_a = df_out["aq4"]
volume_b = df_out["bq4"]
spread = df_out.apply(DataCollection.calc_bid_ask_spread_5, axis=1)

In [8]:
# Assemble the level-5 table: timestamp, the three features, and the label.
# Column order follows the keyword order below.
level_5 = pd.DataFrame(
    dict(
        timestamp=df_out["timestamp"],
        spread=spread,
        volume_a=volume_a,
        volume_b=volume_b,
        target=target,
    )
)

In [9]:
# Preview the first five rows of the level-5 table.
level_5.head(n=5)

Unnamed: 0,timestamp,spread,volume_a,volume_b,target
0,0,0,0,0,no_change
1,0,0,0,0,bullish
2,0,0,0,0,no_change
3,0,0,0,0,no_change
4,0,0,0,0,no_change


In [29]:
# Show only the rows where both volume columns are non-zero.
level_5.loc[level_5["volume_a"].ne(0) & level_5["volume_b"].ne(0)]

Unnamed: 0,timestamp,spread,volume_a,volume_b,target
49,45,45,24,19,no_change
50,45,45,24,19,no_change
51,45,45,24,20,no_change
52,45,45,24,20,no_change
53,45,45,24,20,no_change
...,...,...,...,...,...
138808,35991588299,45,13,95,no_change
138809,35991591443,45,13,95,no_change
138810,35994717091,45,14,95,no_change
138811,35998622015,45,14,95,no_change


In [34]:
# Fixed seed so that the train/test split, the fitted forest, and therefore the
# reported metrics are reproducible across "Restart Kernel -> Run All".
SEED = 42

rfc = RandomForestClassifier(random_state=SEED)

# Features are every column except the label; 30% of the rows are held out.
# NOTE(review): the rows appear to be time-ordered and neighbouring rows often
# carry identical feature values, so a shuffled split may leak near-duplicates
# between train and test -- consider a chronological split (shuffle=False).
X_train, X_test, y_train, y_test = train_test_split(
    level_5.drop('target', axis=1),
    level_5['target'],
    test_size=.3,
    random_state=SEED,
)

In [35]:
# Fit the random forest on the training split.
rfc.fit(X=X_train, y=y_train)

In [36]:
# Predict labels for the held-out rows.
rfc_pred = rfc.predict(X=X_test)

In [37]:
# Per-class precision/recall/F1 on the held-out rows; print() keeps the
# report's line breaks intact.
print(classification_report(y_true=y_test, y_pred=rfc_pred))

              precision    recall  f1-score   support

     bearish       0.13      0.04      0.06       566
     bullish       0.17      0.07      0.10       521
   no_change       0.98      0.99      0.98     40557

    accuracy                           0.97     41644
   macro avg       0.42      0.37      0.38     41644
weighted avg       0.95      0.97      0.96     41644



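It seems that the results of this experiment were not so different from the original model: the 0.97 accuracy is dominated by the majority `no_change` class, while the `bearish` and `bullish` classes are rarely detected (recall of 0.04 and 0.07 respectively).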