In [37]:
import os
import pandas as pd
from data_collection import DataCollection


In [38]:
ROOT_DIR = os.path.realpath(os.path.abspath(''))
df_out = pd.read_csv(ROOT_DIR + "/../data/output/res_20190610.csv_aggregate_output.csv")

In [39]:
df_out.head()

Unnamed: 0,timestamp,price,side,bp0,bq0,bp1,bq1,bp2,bq2,bp3,...,ap0,aq0,ap1,aq1,ap2,aq2,ap3,aq3,ap4,aq4
0,0,9990,b,9990,11,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,9990,b,9990,27,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,9995,a,9990,27,0,0,0,0,0,...,9995,1,0,0,0,0,0,0,0,0
3,0,9990,b,9990,33,0,0,0,0,0,...,9995,1,0,0,0,0,0,0,0,0
4,0,9990,b,9990,34,0,0,0,0,0,...,9995,1,0,0,0,0,0,0,0,0


# Calculating our feature set

For more information on implementation, view ./data_collection.py

In [40]:
spread = df_out.apply(DataCollection.calc_bid_ask_spread, 1)
current_price = df_out.apply(DataCollection.calc_going_rate, 1)
vwap_a = df_out.apply(DataCollection.calc_vwap_a, 1)
vwap_b = df_out.apply(DataCollection.calc_vwap_b, 1)
imbalance = df_out.apply(DataCollection.calc_imbal, 1)
volume = df_out.apply(DataCollection.calc_volume, 1)


In [41]:
target = DataCollection.calc_target(current_price)

Raw feature set

In [42]:
feature_set = pd.DataFrame({
    "timestamp": df_out['timestamp'],
    "going_rate": current_price,
    "b/a_spread": spread,
    "ask_vol_weight_avg_price": vwap_a,
    "bid_vol_weight_avg_price": vwap_b,
    "order_imbalance": imbalance,
    "volume": volume,
    "target": target
})

We will remove the first two, and last rows from the feature set. This is because, the first two rows have measures for the going rate. The last row will be removed because the target is arbitrarily set to 'no_change'

In [43]:
feature_set_cleaned = feature_set[3:-1]

In [44]:
feature_set_cleaned.head()

Unnamed: 0,timestamp,going_rate,b/a_spread,ask_vol_weight_avg_price,bid_vol_weight_avg_price,order_imbalance,volume,target
3,0,9992.5,5,9995.0,9990.0,1.0,66,no_change
4,0,9992.5,5,9995.0,9990.0,1.0,68,no_change
5,0,9992.5,5,9995.0,9989.857143,0.5,69,no_change
6,0,9992.5,5,9997.5,9989.857143,1.0,70,no_change
7,0,9992.5,5,9997.5,9989.722222,1.0,72,no_change


To further clean the data, I will remove the order_imbalance column. 
This may be useful in scenarios where there is no maximum order book depth. However, in this case we use a maximum of five. For the majority of the data, this value does not change

In [45]:
feature_set_cleaned = feature_set_cleaned.drop("order_imbalance", axis=1)

# Modeling using the first iteration of features

In [46]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [47]:
logmodel = LogisticRegression()
X_train, X_test, y_train, y_test = train_test_split(feature_set_cleaned.drop('target', axis=1), feature_set_cleaned['target'], test_size=.3)

In [48]:
logmodel.fit(X_train, y_train)

In [49]:
predictions = logmodel.predict(X_test)

In [50]:
from sklearn.metrics import classification_report

In [51]:
print(classification_report(y_test, predictions))

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

     bearish       0.00      0.00      0.00       526
     bullish       0.00      0.00      0.00       562
   no_change       0.97      1.00      0.99     40555

    accuracy                           0.97     41643
   macro avg       0.32      0.33      0.33     41643
weighted avg       0.95      0.97      0.96     41643



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


This model shows zero accuracy when attempting to predict price change. <br />
This could be due to class imbalance of target results, or maybe the model that was used in the first iteration. <br />
For the next iteration, I will try a random forest classifier before modifying the data further

In [52]:
from sklearn.ensemble import RandomForestClassifier

In [53]:
rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)

In [54]:
rfc_pred = rfc.predict(X_test)

In [55]:
print(classification_report(y_test, rfc_pred))

              precision    recall  f1-score   support

     bearish       0.14      0.04      0.06       526
     bullish       0.23      0.07      0.10       562
   no_change       0.98      0.99      0.98     40555

    accuracy                           0.97     41643
   macro avg       0.45      0.37      0.38     41643
weighted avg       0.95      0.97      0.96     41643



It seems that the random forest classifier performed better when determining some form of prediction for bearish and bullish events <br />
The second iteration of data will be continued in ./model_2.py