In [2]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from data_collection import DataCollection
%matplotlib inline


In [3]:
ROOT_DIR = os.path.realpath(os.path.abspath(''))
df_out = pd.read_csv(ROOT_DIR + "/../data/output/res_20190610.csv_aggregate_output.csv")

In [4]:
df_out.head()

Unnamed: 0,timestamp,price,side,bp0,bq0,bp1,bq1,bp2,bq2,bp3,...,ap0,aq0,ap1,aq1,ap2,aq2,ap3,aq3,ap4,aq4
0,0,9990,b,9990,11,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,9990,b,9990,27,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,9995,a,9990,27,0,0,0,0,0,...,9995,1,0,0,0,0,0,0,0,0
3,0,9990,b,9990,33,0,0,0,0,0,...,9995,1,0,0,0,0,0,0,0,0
4,0,9990,b,9990,34,0,0,0,0,0,...,9995,1,0,0,0,0,0,0,0,0


# Calculating our feature set

For more information on implementation, view ./data_collection.py

In [5]:
spread = df_out.apply(DataCollection.calc_bid_ask_spread, 1)
current_price = df_out.apply(DataCollection.calc_going_rate, 1)
vwap_a = df_out.apply(DataCollection.calc_vwap_a, 1)
vwap_b = df_out.apply(DataCollection.calc_vwap_b, 1)
imbalance = df_out.apply(DataCollection.calc_imbal, 1)
volume = df_out.apply(DataCollection.calc_volume, 1)


In [6]:
target = DataCollection.calc_target(current_price)

Raw feature set

In [8]:
feature_set = pd.DataFrame({
    "timestamp": df_out['timestamp'],
    "going_rate": current_price,
    "b/a_spread": spread,
    "ask_vol_weight_avg_price": vwap_a,
    "bid_vol_weight_avg_price": vwap_b,
    "order_imbalance": imbalance,
    "volume": volume,
    "target": target
})

We will remove the first two, and last rows from the feature set. This is because, the first two rows have measures for the going rate. The last row will be removed because the target is arbitrarily set to 'no_change'

In [9]:
feature_set_cleaned = feature_set[3:-1]

In [10]:
feature_set_cleaned.head()

Unnamed: 0,timestamp,going_rate,b/a_spread,ask_vol_weight_avg_price,bid_vol_weight_avg_price,order_imbalance,volume,target
3,0,9992.5,5,9995.0,9990.0,1.0,66,no_change
4,0,9992.5,5,9995.0,9990.0,1.0,68,no_change
5,0,9992.5,5,9995.0,9989.857143,0.5,69,no_change
6,0,9992.5,5,9997.5,9989.857143,1.0,70,no_change
7,0,9992.5,5,9997.5,9989.722222,1.0,72,no_change


As you may notice, many of our target values show "no_change"

It is my intuition to limit these values as the model will have an overload of "no_change" targets to train from

To solve this, I will subsample the entries that result in no price change

In [19]:
feature_subset = pd.concat([feature_set_cleaned[~feature_set_cleaned["target"].eq("no_change")], feature_set_cleaned[feature_set_cleaned["target"].eq("no_change")].sample(3000)])

In [24]:
feature_subset.head()

Unnamed: 0,timestamp,going_rate,b/a_spread,ask_vol_weight_avg_price,bid_vol_weight_avg_price,order_imbalance,volume,target
559,1444,9992.5,5,10011.527778,9978.074074,1.0,270,bullish
564,38202,9995.0,10,10016.888889,9978.161765,1.0,272,bullish
577,53154,9997.5,5,10017.078652,9982.897727,1.0,176,bearish
582,58107,9995.0,10,10015.737705,9978.161765,1.0,272,bullish
647,243518,9997.5,5,10016.271186,9984.016393,1.0,244,bullish


# Modeling using the first iteration of features

In [28]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [30]:
logmodel = LogisticRegression()
X_train, X_test, y_train, y_test = train_test_split(feature_subset.drop('target', axis=1), feature_subset['target'], test_size=.3)

In [31]:
logmodel.fit(X_train, y_train)

In [32]:
predictions = logmodel.predict(X_test)

In [33]:
from sklearn.metrics import classification_report

In [34]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

     bearish       0.00      0.00      0.00       520
     bullish       0.00      0.00      0.00       562
   no_change       0.45      1.00      0.62       890

    accuracy                           0.45      1972
   macro avg       0.15      0.33      0.21      1972
weighted avg       0.20      0.45      0.28      1972



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
