# Setting up

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import talib

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply seaborn's "whitegrid" style to subsequent plots.
sns.set(style = "whitegrid")
# Let pandas display up to 500 columns before truncating wide frames.
pd.set_option('display.max_columns', 500)

In [2]:
# %%time
# path = 'D://Coding//XTX Forecasting Challenge//data-training.csv'
# df = pd.read_csv(path)

In [3]:
%%time
# Load the training data from its Feather copy (the CSV load in the cell above
# is kept commented out).
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
path = 'D://Coding//XTX Forecasting Challenge//data-training.file'
# `use_threads` is a boolean switch (parallel read on/off), not a thread count;
# the previous value 8 only worked because it is truthy.
df = pd.read_feather(path, use_threads=True)
# Store every column as float32 (presumably to reduce memory -- TODO confirm
# the precision loss is acceptable for the rate columns).
df = df.astype('float32')

  labels, = index.labels


Wall time: 8.17 s


# Exploratory Data Analysis

In [None]:
# # Plot of [y] distribution
# fig, ax = plt.subplots(figsize=(15,8))
# sns.kdeplot(df.y, bw=0.01)

In [4]:
%%time
# Basic top-of-book features and a reconstruction of the target.
askPx, bidPx = df['askRate0'], df['bidRate0']
df['spread'] = askPx - bidPx
df['midRate'] = (askPx + bidPx) / 2

# expectedY(t) = midRate(t + horizon) - midRate(t), clipped to [-5, 5].
# The last `horizon` rows have no future value and are left as NaN.
horizon = 87
df['expectedY'] = df['midRate'].diff(horizon).shift(-horizon).clip(-5, 5)

Wall time: 362 ms


In [None]:
# # Figuring out what [y] is
# # y(t) is midRate(t+87) - midRate(t), clipped to [-5, 5]
# sum(df.y == df.expectedY)
# df.loc[df.y != df.expectedY]

# fig, ax = plt.subplots(figsize=(15,8))
# i = 500
# shift = 87
# # plt.plot(df.index[0:i], df.midRate[0:i].diff(shift).shift(-shift+14))
# plt.plot(df.index[-i:], df.midRate[-i:].diff(shift).shift(-shift))
# plt.plot(df.index[-i:], df.y[-i:])
# plt.legend(('midRate', 'y'))

# Feature engineering

### Fill NaNs with zeros

In [5]:
# Replace every NaN in the frame with 0, modifying df in place.
# NOTE(review): if the feature cell above has run, this also zeroes the trailing
# rows of expectedY that shift(-87) left as NaN -- confirm that is intended.
df.fillna(0, inplace=True)

### Basics

In [6]:
# Column names for levels 0 through 14, one list per side (bid/ask) and
# field (Size/Rate); used below to select those column groups from df.
bidSizeList = ['bidSize%d' % level for level in range(15)]
askSizeList = ['askSize%d' % level for level in range(15)]
bidRateList = ['bidRate%d' % level for level in range(15)]
askRateList = ['askRate%d' % level for level in range(15)]

In [16]:
%%time
# Cumulative size per side: total<Side>Vol<k> is the sum of <side>Size0..<side>Sizek
# for k = 1..14, accumulated one level at a time.
for side in ('Bid', 'Ask'):
    sizePrefix = side.lower() + 'Size'
    runningVol = df[sizePrefix + '0']
    for level in range(1, 15):
        runningVol = runningVol + df[sizePrefix + str(level)]
        df['total' + side + 'Vol' + str(level)] = runningVol

# Ratio of cumulative bid size to cumulative ask size at each depth k = 1..14.
for level in range(1, 15):
    depth = str(level)
    df['bidAskRatio' + depth] = df['totalBidVol' + depth] / df['totalAskVol' + depth]

# Combined size over all 15 levels of both sides.
df['totalAvailVol'] = df['totalBidVol14'] + df['totalAskVol14']

Wall time: 1.13 s


In [7]:
%%time
# Size-weighted average rate per side: for each row, sum(rate_k * size_k) over
# the 15 levels divided by the summed size of those levels.
for side, rateCols, sizeCols in (('Bid', bidRateList, bidSizeList),
                                 ('Ask', askRateList, askSizeList)):
    # einsum 'ij,ji->i' is the row-wise dot product of rates with sizes.
    weightedRates = np.einsum('ij,ji->i', df[rateCols], df[sizeCols].T)
    df['vwa' + side] = weightedRates / df[sizeCols].sum(axis=1)

# Distance of each side's weighted average rate from the mid rate, and the
# difference between the two distances.
df['vwaBidDMid'] = df['midRate'] - df['vwaBid']
df['vwaAskDMid'] = df['vwaAsk'] - df['midRate']
df['diff_vwaBidAskDMid'] = df['vwaAskDMid'] - df['vwaBidDMid']

Wall time: 5.23 s


# Ideas from papers

In [8]:
%%time
# Volume Order Imbalance (VOI), built from the change in the top-of-book quote
# between the PREVIOUS row and the current row.
#
# Fixes relative to the earlier version of this cell:
#   * shift(1) (previous row) replaces shift(-1) (next row): comparing against
#     the next row leaks future information into a forecasting feature.
#   * the ask-side branches mirror the bid side instead of copying it.
#
# Per-side contribution (believed to match the usual VOI definition --
# TODO confirm against the source paper):
#   bid: rate fell -> 0, rate unchanged -> size change, rate rose -> current size
#   ask: rate rose -> 0, rate unchanged -> size change, rate fell -> current size
# The first row has no previous quote, so its comparisons are all False and it
# falls through to the default (current size).
#
# I still disagree with cancelled orders..
prevBidRate, prevAskRate = df.bidRate0.shift(1), df.askRate0.shift(1)
prevBidSize, prevAskSize = df.bidSize0.shift(1), df.askSize0.shift(1)

b1, a1 = (df.bidRate0 < prevBidRate), (df.askRate0 > prevAskRate)
b2, a2 = (df.bidRate0 == prevBidRate), (df.askRate0 == prevAskRate)
valsB, valsA = [0, (df.bidSize0 - prevBidSize)], [0, (df.askSize0 - prevAskSize)]
defaultB, defaultA = df.bidSize0, df.askSize0

# np.select takes the first matching condition per row, else the default.
df['deltaVBid'] = np.select([b1, b2], valsB, default=defaultB)
df['deltaVAsk'] = np.select([a1, a2], valsA, default=defaultA)
df['VOI'] = df.deltaVBid - df.deltaVAsk

Wall time: 343 ms


In [None]:
%%time
# Checkpoint the current frame to 'intermediate.file' in Feather format; the
# next cell reads it back.
df.to_feather('intermediate.file')

In [None]:
%%time
# Reload the checkpointed frame from 'intermediate.file', replacing the
# in-memory df.
df = pd.read_feather('intermediate.file')

In [9]:
%%time
# Order Imbalance Ratio: (bid size - ask size) / (bid size + ask size),
# using the level-0 sizes only.
bidQty, askQty = df['bidSize0'], df['askSize0']
df['OIR'] = (bidQty - askQty) / (bidQty + askQty)

# A small OIR suggests that the signal from VOI is weak.

Wall time: 45.1 ms


# Technical analysis (TA-Lib) indicators

In [10]:
%%time
# TA-Lib functions applied to the mid rate, each called with only the series
# (so the library's default parameters apply). Every result is stored in a
# column named after the function, e.g. 'EMA'.
overlapList = [
    talib.DEMA, talib.EMA, talib.HT_TRENDLINE, talib.KAMA, talib.MA,
    talib.SMA, talib.T3, talib.TEMA, talib.TRIMA, talib.WMA,
]
for indicator in overlapList:
    df[indicator.__name__] = indicator(df['midRate'])

Wall time: 1.61 s


In [13]:
%%time
# Momentum-group TA-Lib functions evaluated on the mid rate with their default
# parameters; one output column per function, named after it.
momentumList = [
    talib.APO, talib.CMO, talib.MOM, talib.PPO,
    talib.ROC, talib.ROCR, talib.RSI, talib.TRIX,
]
for indicator in momentumList:
    df[indicator.__name__] = indicator(df['midRate'])

Wall time: 693 ms


In [17]:
%%time
# Not a true OBV: the volume argument passed here is df.totalAvailVol (built
# from the book size columns), with the mid rate as the price series.
volumeIndicatorList = [talib.OBV]
for indicator in volumeIndicatorList:
    df[indicator.__name__] = indicator(df['midRate'], df['totalAvailVol'])

Wall time: 125 ms


In [18]:
%%time
# Cycle-group (HT_*) TA-Lib functions evaluated on the mid rate; one output
# column per function, named after it.
cycleIndicatorList = [
    talib.HT_DCPERIOD,
    talib.HT_DCPHASE,
    talib.HT_TRENDMODE,
]
for indicator in cycleIndicatorList:
    df[indicator.__name__] = indicator(df['midRate'])

Wall time: 5.72 s


In [19]:
%%time
# Statistic-group TA-Lib functions evaluated on the mid rate with their default
# parameters; one output column per function, named after it.
statisticList = [
    talib.LINEARREG,
    talib.LINEARREG_SLOPE,
    talib.STDDEV,
    talib.TSF,
]
for indicator in statisticList:
    df[indicator.__name__] = indicator(df['midRate'])

Wall time: 490 ms


In [20]:
%%time
# TA-Lib MAX and MIN evaluated on the mid rate with their default parameters;
# results are stored in the 'MAX' and 'MIN' columns.
mathOpList = [talib.MAX, talib.MIN]
for indicator in mathOpList:
    df[indicator.__name__] = indicator(df['midRate'])

Wall time: 185 ms


# Feature Selection

# Cross-validation

In [24]:
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from rolling import ExpandingWindowSplit