# Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import ta
import lightgbm as lgb
import xgboost as xgb
# from fastai import *
# from fastai.tabular import *
from tqdm import tqdm_notebook
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LassoLarsCV
from sklearn.linear_model import RidgeCV
from sklearn.preprocessing import StandardScaler
from rolling import RollingWindowSplit
from sklearn.metrics import r2_score as r2d2
from joblib import dump as dumper
from joblib import load as loader
from datetime import datetime, timedelta

%matplotlib inline
# %load_ext line_profiler
# Notebook-wide display configuration: wider pandas previews and the seaborn whitegrid theme.
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 200)
sns.set(style="whitegrid")

In [2]:
# NOTE(review): machine-specific absolute path; consider a configurable data directory.
path = 'D://Coding//XTX Forecasting Challenge//data-training.file'
# `use_threads` is an on/off switch in pandas.read_feather, so pass True rather than a thread count.
# Everything is cast to float32 and missing values are replaced by 0 in one chain (no in-place mutation).
df = pd.read_feather(path, use_threads=True).astype('float32').fillna(0)

In [3]:
# Column names of the 15 order-book levels, one list per field: '<field>0' ... '<field>14'.
askRateList, askSizeList, bidRateList, bidSizeList = (
    [f'{field}{level}' for level in range(15)]
    for field in ('askRate', 'askSize', 'bidRate', 'bidSize')
)

In [4]:
# 80-15-5 train-valid-test split by row position (with the default cut points)
def train_valid_test_split(df, valid_start_frac=0.80, test_start_frac=0.95):
    """Cut `df` into three consecutive, independent pieces by row position.

    Parameters
    ----------
    df : object supporting len(), positional slicing and .copy() on the slices
        (a pandas DataFrame in this notebook).
    valid_start_frac : float, default 0.80
        Fraction of the rows at which the validation part starts.
    test_start_frac : float, default 0.95
        Fraction of the rows at which the test part starts.

    Returns
    -------
    (train_df, valid_df, test_df)
        Copies of rows [0, v), [v, t) and [t, n), where v and t are the two
        fractions of len(df) truncated to integers.

    Raises
    ------
    ValueError
        If the fractions are not ordered within [0, 1].
    """
    if not 0 <= valid_start_frac <= test_start_frac <= 1:
        raise ValueError('expected 0 <= valid_start_frac <= test_start_frac <= 1')
    n_rows = len(df)
    valid_start = int(valid_start_frac * n_rows)
    test_start = int(test_start_frac * n_rows)
    # .copy() detaches each part, so later column assignments do not touch `df`
    train_df = df[:valid_start].copy()
    valid_df = df[valid_start:test_start].copy()
    test_df = df[test_start:].copy()
    return train_df, valid_df, test_df
# Positional split of the loaded frame; each part is an independent copy of its rows.
train_df, valid_df, test_df = train_valid_test_split(df)

In [5]:
# All 60 raw order-book column names: ask rates, ask sizes, bid rates, bid sizes.
core = askRateList + askSizeList + bidRateList + bidSizeList

# Exploratory Data Analysis

In [6]:
# # Figuring out what [y] is
# # y(t) is midRate(t+87) - midRate(t), clipped to [-5, 5]
# df['expectedY'] = df.midRate.diff(87).shift(-87).clip(-5,5)

# Batch prediction

In [7]:
# the public leaderboard set should have 150k observations: they check running time of 10k in 1h and max 15h

In [8]:
def create_limited_features_orig(df: pd.DataFrame):
    """Add order-book and momentum features to `df` in place and return it.

    Expects the level-0..14 rate/size columns named in askRateList,
    askSizeList, bidRateList and bidSizeList. Columns are appended in this
    order: midRate, OIR, totalAskVol, totalBidVol, OIR_total, spread, vwaBid,
    vwaAsk, vwaBidDMid, vwaAskDMid, diff_vwaBidAskDMid, deltaVBid, deltaVAsk,
    VOI, then the tsi* and trix* indicators of midRate.

    The same frame object is returned; no copy is made.
    """
    best_bid, best_ask = df['bidRate0'], df['askRate0']
    top_bid_size, top_ask_size = df['bidSize0'], df['askSize0']

    # --- features computed from a single row ---
    df['midRate'] = (best_bid + best_ask) / 2  # necessary for ohlc
    df['OIR'] = (top_bid_size - top_ask_size) / (top_bid_size + top_ask_size)
    df['totalAskVol'] = df[askSizeList].sum(axis=1)
    df['totalBidVol'] = df[bidSizeList].sum(axis=1)
    df['OIR_total'] = (df['totalBidVol'] - df['totalAskVol']) / (df['totalBidVol'] + df['totalAskVol'])

    df['spread'] = best_ask - best_bid
    # row-wise sum(rate * size) over the 15 levels, divided by the summed size
    bid_rates, bid_sizes = df[bidRateList], df[bidSizeList]
    df['vwaBid'] = np.einsum('ij,ji->i', bid_rates, bid_sizes.T) / bid_sizes.sum(axis=1)
    ask_rates, ask_sizes = df[askRateList], df[askSizeList]
    df['vwaAsk'] = np.einsum('ij,ji->i', ask_rates, ask_sizes.T) / ask_sizes.sum(axis=1)
    df['vwaBidDMid'] = df['midRate'] - df['vwaBid']
    df['vwaAskDMid'] = df['vwaAsk'] - df['midRate']
    df['diff_vwaBidAskDMid'] = df['vwaAskDMid'] - df['vwaBidDMid']

    # --- features comparing each row with the previous one ---
    # Rate fell -> 0; rate unchanged -> change in size; otherwise (rate rose,
    # or first row where the shifted value is NaN) -> current size.
    # NOTE(review): the ask side uses the same conditions as the bid side;
    # some volume-order-imbalance definitions swap the "fell"/"rose" outcomes
    # for the ask. Confirm intended, and change every copy of this logic together.
    prev_bid, prev_ask = best_bid.shift(1), best_ask.shift(1)
    bid_cases = [best_bid < prev_bid, best_bid == prev_bid]
    ask_cases = [best_ask < prev_ask, best_ask == prev_ask]
    bid_outcomes = [0, top_bid_size - top_bid_size.shift(1)]
    ask_outcomes = [0, top_ask_size - top_ask_size.shift(1)]
    df['deltaVBid'] = np.select(bid_cases, bid_outcomes, default=top_bid_size)
    df['deltaVAsk'] = np.select(ask_cases, ask_outcomes, default=top_ask_size)
    df['VOI'] = df['deltaVBid'] - df['deltaVAsk']

    # --- momentum indicators of midRate from the `ta` package ---
    tsi_windows = [87, 261, 348, 435, 522]
    trix_windows = [87, 174, 348, 435, 522]
    for window in tsi_windows:
        df[f'tsi{window}'] = ta.momentum.tsi(df['midRate'], s=window, r=2.25*window)
    for window in trix_windows:
        df[f'trix{window}'] = ta.trend.trix(df['midRate'], n=window)
    return df

In [9]:
# Splitter from the local `rolling` module; `rlcv` is not referenced again in the visible cells.
rlcv = RollingWindowSplit(n_splits=3, compatible=True)

In [10]:
# takes 22s
# The function assigns its new columns onto the frame it receives and returns that same frame,
# so limited_train / limited_valid are train_df / valid_df with the feature columns added.
limited_train = create_limited_features_orig(train_df)
limited_valid = create_limited_features_orig(valid_df)

In [11]:
# Re-applies the seaborn theme already set in the first cell.
sns.set(style = "whitegrid")

In [12]:
def preprocess(limited_df):
    """Turn a feature frame into a (features, target) pair of numpy arrays.

    Features: every column except 'y', with +/-inf and NaN replaced by 0.
    Target: the 'y' column exactly as stored (it is not cleaned).
    """
    finite_or_nan = limited_df.replace([np.inf, -np.inf], np.nan)
    features = finite_or_nan.fillna(0).drop('y', axis=1).values
    target = limited_df.y.values
    return features, target

# takes 10s
# Despite the x_scaled_* names, preprocess() only replaces inf/NaN with 0 and drops 'y'; no scaling is applied.
x_scaled_train, y_train = preprocess(limited_train)
x_scaled_valid, y_valid = preprocess(limited_valid)

In [None]:
# # true best params
# fixed_params = {'boosting_type': 'gbdt', 'nthread': 4, 'random_state': 42, 'metric': 'rmse'}
# best_params = {'subsample': 0.7, 'reg_lambda': 1, 'reg_alpha': 0.5, 'num_leaves': 160, 'min_split_gain': 0.75,
#                'min_data_in_leaf': 50, 'max_depth': 3, 'max_bin': 70, 'learning_rate': 0.03, 'colsample_bytree': 0.9}

In [None]:
# '''0.0330, 0.0235, 0.0230'''
# fixed_params = {'boosting_type': 'gbdt', 'nthread': 4, 'random_state': 42, 'metric': 'rmse'}
# best_params = {'subsample': 0.7, 'reg_lambda': 1, 'reg_alpha': 0.5, 'num_leaves': 80, 'min_split_gain': 0.75,
#                'min_data_in_leaf': 50, 'max_depth': 4, 'max_bin': 70, 'learning_rate': 0.03, 'colsample_bytree': 0.9}

In [None]:
# '''0.0411, 0.0242, 0.0234'''
# Superseded if the next cell runs: it reassigns both dicts and differs only in
# 'min_data_in_leaf' (100 there, 500 here).
fixed_params = {'boosting_type': 'gbdt', 'nthread': 4, 'random_state': 42, 'metric': 'rmse'}
best_params = {'subsample': 0.7, 'reg_lambda': 1, 'reg_alpha': 0.5, 'num_leaves': 40, 'min_split_gain': 0.75,
               'min_data_in_leaf': 500, 'max_depth': 5, 'max_bin': 70, 'learning_rate': 0.03, 'colsample_bytree': 0.9}

In [30]:
# Settings shared by every run, and the tuned hyper-parameters of this run.
fixed_params = {'boosting_type': 'gbdt', 'nthread': 4, 'random_state': 42, 'metric': 'rmse'}
best_params = {'subsample': 0.7, 'reg_lambda': 1, 'reg_alpha': 0.5, 'num_leaves': 40, 'min_split_gain': 0.75,
               'min_data_in_leaf': 100, 'max_depth': 5, 'max_bin': 70, 'learning_rate': 0.03, 'colsample_bytree': 0.9}

# takes 1 min
# eval_set / eval_metric are arguments of fit(), not of the estimator constructor. Passed to the
# constructor they are forwarded as unknown booster parameters and the validation set is never evaluated.
lgbm = lgb.LGBMRegressor(**fixed_params, **best_params).fit(
    x_scaled_train, y_train,
    eval_set=[(x_scaled_valid, y_valid)],
    eval_metric='l2',
)

In [32]:
# lgbm.booster_.save_model('lgbm.txt')
# Reads a booster back from 'lgbm.txt'; with the save call above commented out, that file must already exist.
bst = lgb.Booster(model_file='lgbm.txt')

In [15]:
def score(model, x_scaled_train, x_scaled_valid, y_train, y_valid):
    """Print train / valid / test R^2 for `model` and return the test predictions.

    The train and valid arrays are passed in. The test features are built
    here from the notebook-level `test_df` via create_limited_features_orig
    and preprocess.

    Returns (predictions_test, limited_test): the model output on the test
    features and the feature frame they were computed from.
    """
    train_score = r2d2(y_train, model.predict(x_scaled_train))
    valid_score = r2d2(y_valid, model.predict(x_scaled_valid))

    # create_limited_features_orig writes its columns onto test_df itself
    limited_test = create_limited_features_orig(test_df)
    x_scaled_test, y_test = preprocess(limited_test)
    predictions_test = model.predict(x_scaled_test)
    test_score = r2d2(y_test, predictions_test)

    print(f'{train_score:.4f}, {valid_score:.4f}, {test_score:.4f}')
    return predictions_test, limited_test

In [34]:
# Scores the booster loaded from 'lgbm.txt'; score() prints train, valid and test R^2.
predictions_test, limited_test = score(bst, x_scaled_train, x_scaled_valid, y_train, y_valid)

0.0411, 0.0242, 0.0234


In [None]:
# Scores the estimator fitted in this session; the bare string below is a note of earlier results.
'''0.0244, 0.0229, 0.0225 original'''
predictions_test, limited_test = score(lgbm, x_scaled_train, x_scaled_valid, y_train, y_valid)

In [None]:
# dumper(lasso, 'lasso.joblib')
# dumper(ridge, 'ridge.joblib')
# lasso = loader('lasso.joblib')
# ridge = loader('ridge.joblib')

In [None]:
# Overlay the first `limit` test predictions on the corresponding target values.
fig, ax = plt.subplots(1, 1, figsize=(15, 8))
limit = 1000
ax.plot(predictions_test[:limit])
ax.plot(limited_test.y.values[:limit])
ax.legend(['predictions', 'y'])

In [None]:
# Feature names (everything except the target) and the fitted model's importances.
columns = limited_train.columns.drop('y')
a = lgbm.feature_importances_
a

In [None]:
'''You should chuck the variables that meet this condition!'''
# NOTE(review): if importances are never negative, `a > -1` keeps every column; confirm the intended threshold.
columns.values[a > -1]

In [None]:
'''These are the variables that contribute to the lgbm!'''
# boolean mask: importance strictly above 20
columns.values[a > 20]

# Fast.ai

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from fastai import *
from fastai.tabular import *
from tqdm import tqdm_notebook
from sklearn.metrics import r2_score as r2d2
from joblib import dump as dumper
from joblib import load as loader

%matplotlib inline
# %load_ext line_profiler
# Same display configuration as the first cell of the notebook, repeated for this section.
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 200)
sns.set(style="whitegrid")

In [None]:
# NOTE(review): machine-specific absolute path; consider a configurable data directory.
path = 'D://Coding//XTX Forecasting Challenge//data-training.file'
# `use_threads` is an on/off switch in pandas.read_feather, so pass True rather than a thread count.
# Everything is cast to float32 and missing values are replaced by 0 in one chain (no in-place mutation).
df = pd.read_feather(path, use_threads=True).astype('float32').fillna(0)

In [None]:
# Column names of the 15 order-book levels, one list per field: '<field>0' ... '<field>14'.
askRateList = [f'askRate{level}' for level in range(15)]
askSizeList = [f'askSize{level}' for level in range(15)]
bidRateList = [f'bidRate{level}' for level in range(15)]
bidSizeList = [f'bidSize{level}' for level in range(15)]

In [None]:
# 80-15-5 train-valid-test split by row position (with the default cut points)
def train_valid_test_split(df, valid_start_frac=0.80, test_start_frac=0.95):
    """Cut `df` into three consecutive, independent pieces by row position.

    Parameters
    ----------
    df : object supporting len(), positional slicing and .copy() on the slices
        (a pandas DataFrame in this notebook).
    valid_start_frac : float, default 0.80
        Fraction of the rows at which the validation part starts.
    test_start_frac : float, default 0.95
        Fraction of the rows at which the test part starts.

    Returns
    -------
    (train_df, valid_df, test_df)
        Copies of rows [0, v), [v, t) and [t, n), where v and t are the two
        fractions of len(df) truncated to integers.

    Raises
    ------
    ValueError
        If the fractions are not ordered within [0, 1].
    """
    if not 0 <= valid_start_frac <= test_start_frac <= 1:
        raise ValueError('expected 0 <= valid_start_frac <= test_start_frac <= 1')
    n_rows = len(df)
    valid_start = int(valid_start_frac * n_rows)
    test_start = int(test_start_frac * n_rows)
    # .copy() detaches each part, so later column assignments do not touch `df`
    train_df = df[:valid_start].copy()
    valid_df = df[valid_start:test_start].copy()
    test_df = df[test_start:].copy()
    return train_df, valid_df, test_df
# Positional split of the reloaded frame; each part is an independent copy of its rows.
train_df, valid_df, test_df = train_valid_test_split(df)

In [None]:
def create_limited_features_orig(df: pd.DataFrame):
    """Add order-book features to `df` in place and return the same frame.

    This redefines the function of the same name from the "Batch prediction"
    section; this version stops after VOI and adds no tsi/trix columns.

    Columns are appended in this order: midRate, OIR, totalAskVol,
    totalBidVol, OIR_total, spread, vwaBid, vwaAsk, vwaBidDMid, vwaAskDMid,
    diff_vwaBidAskDMid, deltaVBid, deltaVAsk, VOI.
    """
    best_bid, best_ask = df['bidRate0'], df['askRate0']
    top_bid_size, top_ask_size = df['bidSize0'], df['askSize0']

    # --- features computed from a single row ---
    df['midRate'] = (best_bid + best_ask) / 2  # necessary for ohlc
    df['OIR'] = (top_bid_size - top_ask_size) / (top_bid_size + top_ask_size)
    df['totalAskVol'] = df[askSizeList].sum(axis=1)
    df['totalBidVol'] = df[bidSizeList].sum(axis=1)
    df['OIR_total'] = (df['totalBidVol'] - df['totalAskVol']) / (df['totalBidVol'] + df['totalAskVol'])

    df['spread'] = best_ask - best_bid
    # row-wise sum(rate * size) over the 15 levels, divided by the summed size
    bid_rates, bid_sizes = df[bidRateList], df[bidSizeList]
    df['vwaBid'] = np.einsum('ij,ji->i', bid_rates, bid_sizes.T) / bid_sizes.sum(axis=1)
    ask_rates, ask_sizes = df[askRateList], df[askSizeList]
    df['vwaAsk'] = np.einsum('ij,ji->i', ask_rates, ask_sizes.T) / ask_sizes.sum(axis=1)
    df['vwaBidDMid'] = df['midRate'] - df['vwaBid']
    df['vwaAskDMid'] = df['vwaAsk'] - df['midRate']
    df['diff_vwaBidAskDMid'] = df['vwaAskDMid'] - df['vwaBidDMid']

    # --- features comparing each row with the previous one ---
    # Rate fell -> 0; rate unchanged -> change in size; otherwise (rate rose,
    # or first row where the shifted value is NaN) -> current size.
    prev_bid, prev_ask = best_bid.shift(1), best_ask.shift(1)
    bid_cases = [best_bid < prev_bid, best_bid == prev_bid]
    ask_cases = [best_ask < prev_ask, best_ask == prev_ask]
    bid_outcomes = [0, top_bid_size - top_bid_size.shift(1)]
    ask_outcomes = [0, top_ask_size - top_ask_size.shift(1)]
    df['deltaVBid'] = np.select(bid_cases, bid_outcomes, default=top_bid_size)
    df['deltaVAsk'] = np.select(ask_cases, ask_outcomes, default=top_ask_size)
    df['VOI'] = df['deltaVBid'] - df['deltaVAsk']
    return df

In [None]:
# takes 22s
# The function writes its columns onto the frame it receives and returns that same frame.
limited_train = create_limited_features_orig(train_df)
limited_valid = create_limited_features_orig(valid_df)

In [None]:
# All 60 raw order-book column names: ask rates, ask sizes, bid rates, bid sizes.
core = askRateList + askSizeList + bidRateList + bidSizeList

In [None]:
# Keep only the first 2,000,000 rows (positional slice).
limited_train = limited_train.iloc[:int(2e6)]

In [None]:
# Target column name and the preprocessing steps handed to TabularList in the next cell.
dep_var = 'y'
procs = [FillMissing, Normalize]

In [None]:
# takes 7s on 500k rows, 30s on full df
# Validation rows are the second half of limited_train by position (see valid_idx below).
# NOTE(review): cont_names is taken from `df` (the frame as loaded), not from limited_train, so the
# columns added by create_limited_features_orig are not listed as inputs; confirm this is intended.
test = TabularList.from_df(test_df, procs=procs)
data = (TabularList.from_df(df = limited_train, path='.', cont_names = df.columns.drop('y'), procs=procs)
                            .split_by_idx(valid_idx=range(int(0.5*len(limited_train)),len(limited_train)))
                            .label_from_df(cols=dep_var)
                            .add_test(test, label=0)
                            .databunch())

In [None]:
# Serialises the data bunch with joblib; the commented line is the matching reload.
dumper(data, 'data.joblib')
# data = loader('data.joblib')

In [None]:
# Learner built on `data`; reports mean absolute error and R^2 as metrics.
learn = tabular_learner(data, layers=[500,500,500], metrics=[mean_absolute_error, r2_score],
                        ps=[0.5,0.5,0.5])

In [None]:
# The trailing semicolon suppresses the cell output.
learn.model;

In [None]:
# Learning-rate search with an upper bound of 10.
learn.lr_find(end_lr=1e1)

In [None]:
# Re-applies the seaborn theme set in this section's import cell.
sns.set(style = "whitegrid")

In [None]:
# Plot what the recorder captured during the lr_find run above.
learn.recorder.plot()

In [None]:
# model above has already diverged, we will restart.

In [None]:
# 9m for 1 cycle
# One cycle with 1e-5 as the learning-rate argument and wd=1.
learn.fit_one_cycle(1, 1e-5, wd=1)

In [None]:
# Learning-rate schedule of the run above, with momentums shown as well.
learn.recorder.plot_lr(show_moms=True)

In [None]:
# Saved through the learner's own save(), not joblib, despite the '.joblib' in the name.
learn.save('new_fastai.joblib')

In [None]:
# Loss curves recorded during training.
learn.recorder.plot_losses()

In [None]:
# Same call as the previous cell (duplicate).
learn.recorder.plot_losses()

In [None]:
# this takes 20s
# Display a sample of the learner's results.
learn.show_results()

In [None]:
# this takes 30s
# Predictions for the test items attached with add_test(); note this binds the notebook-level name `y`.
preds, y = learn.get_preds(DatasetType.Test)

In [None]:
# R^2 (sklearn r2_score) of the test predictions against test_df.y.
r2d2(test_df.y.values, preds)

In [None]:
# Same call as the previous cell (duplicate).
r2d2(test_df.y.values, preds)

In [None]:
# Overlay the first 10,000 target values and fastai predictions on one figure.
plt.subplots(1,1,figsize=(15,8))
plt.plot(test_df.y.values[:10000])
plt.plot(preds[:10000])
# legend() takes the labels as ONE sequence; two positional strings are read as (handles, labels).
plt.legend(['y', 'preds'])

# Submission testing

In [17]:
def get_next_data_as_df(test_df, iteration, length):
    """Return row `iteration` of the first `length` rows as a one-row frame.

    Only the first 60 values of that row are kept. The result is built from
    the bare values, so its columns are the default integer labels 0..59.
    Raises IndexError when `iteration` is outside the first `length` rows.
    """
    window = test_df.head(length)
    book_values = window.iloc[iteration][:60].values
    return pd.DataFrame([book_values])

In [18]:
def create_limited_features(df):
    """Label a raw 60-value book row and add the cross-sectional features.

    `df` is expected to have 60 columns ordered as ask rates, ask sizes,
    bid rates, bid sizes (15 levels each). Its columns are renamed in place,
    14 feature columns are appended, and the same frame is returned, so a
    (1, 60) input comes back as (1, 74).

    deltaVBid / deltaVAsk / VOI compare each row with shift(1). On a
    single-row frame the shifted values are NaN, both conditions are False,
    and the columns fall back to the level-0 sizes; they only carry their
    intended meaning when computed over a multi-row history.
    """
    df.columns = askRateList + askSizeList + bidRateList + bidSizeList
    best_bid, best_ask = df['bidRate0'], df['askRate0']
    top_bid_size, top_ask_size = df['bidSize0'], df['askSize0']

    # --- features computed from a single row ---
    df['midRate'] = (best_bid + best_ask) / 2  # necessary for ohlc
    df['OIR'] = (top_bid_size - top_ask_size) / (top_bid_size + top_ask_size)
    df['totalAskVol'] = df[askSizeList].sum(axis=1)
    df['totalBidVol'] = df[bidSizeList].sum(axis=1)
    df['OIR_total'] = (df['totalBidVol'] - df['totalAskVol']) / (df['totalBidVol'] + df['totalAskVol'])

    df['spread'] = best_ask - best_bid
    # row-wise sum(rate * size) over the 15 levels, divided by the summed size
    bid_rates, bid_sizes = df[bidRateList], df[bidSizeList]
    df['vwaBid'] = np.einsum('ij,ji->i', bid_rates, bid_sizes.T) / bid_sizes.sum(axis=1)
    ask_rates, ask_sizes = df[askRateList], df[askSizeList]
    df['vwaAsk'] = np.einsum('ij,ji->i', ask_rates, ask_sizes.T) / ask_sizes.sum(axis=1)
    df['vwaBidDMid'] = df['midRate'] - df['vwaBid']
    df['vwaAskDMid'] = df['vwaAsk'] - df['midRate']
    df['diff_vwaBidAskDMid'] = df['vwaAskDMid'] - df['vwaBidDMid']

    # --- previous-row comparisons (see the docstring for the single-row case) ---
    prev_bid, prev_ask = best_bid.shift(1), best_ask.shift(1)
    bid_cases = [best_bid < prev_bid, best_bid == prev_bid]
    ask_cases = [best_ask < prev_ask, best_ask == prev_ask]
    bid_outcomes = [0, top_bid_size - top_bid_size.shift(1)]
    ask_outcomes = [0, top_ask_size - top_ask_size.shift(1)]
    df['deltaVBid'] = np.select(bid_cases, bid_outcomes, default=top_bid_size)
    df['deltaVAsk'] = np.select(ask_cases, ask_outcomes, default=top_ask_size)
    df['VOI'] = df['deltaVBid'] - df['deltaVAsk']
    return df

In [19]:
def append_to_df(massive_df, row):
    """Return a new frame with `row` stacked underneath `massive_df`.

    Columns present in only one of the two frames are kept (missing cells
    become NaN) and the existing column order is preserved (sort=False).
    Row labels are kept as they are. Neither input is modified.
    """
    # Nothing accumulated yet: the result is just the new row(s).
    if massive_df.shape == (0, 0):
        return row.copy()
    # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat is its replacement.
    return pd.concat([massive_df, row], sort=False)

In [20]:
def add_time_features(df, massive_df_length):
    """Add the features that need row history, then keep only the newest rows.

    All features that use shift/diff must come here.

    Parameters
    ----------
    df : DataFrame of accumulated rows, oldest first, with a 'midRate' column.
    massive_df_length : int
        Number of trailing rows to return.

    Returns
    -------
    The last `massive_df_length` rows of `df`, with the tsi*/trix* columns and,
    when the level-0 rate/size columns are present, deltaVBid / deltaVAsk / VOI
    recomputed over the whole history. Columns are assigned on `df` itself.
    """
    # deltaVBid / deltaVAsk / VOI compare each row with the previous one. Computed one row
    # at a time there is no previous row, so they collapse to the raw level-0 sizes.
    # Recompute them here over the history, with the same rule the batch feature builder
    # uses: rate fell -> 0, rate unchanged -> change in size, otherwise -> current size.
    if {'bidRate0', 'askRate0', 'bidSize0', 'askSize0'}.issubset(df.columns):
        b1, a1 = (df.bidRate0 < df.bidRate0.shift(1)), (df.askRate0 < df.askRate0.shift(1))
        b2, a2 = (df.bidRate0 == df.bidRate0.shift(1)), (df.askRate0 == df.askRate0.shift(1))
        valsB, valsA = [0, (df.bidSize0 - df.bidSize0.shift(1))], [0, (df.askSize0 - df.askSize0.shift(1))]
        # assigning to an existing column keeps its position, so the feature order is unchanged
        df['deltaVBid'] = np.select([b1, b2], valsB, default=df.bidSize0)
        df['deltaVAsk'] = np.select([a1, a2], valsA, default=df.askSize0)
        df['VOI'] = df.deltaVBid - df.deltaVAsk

    # momentum indicators of midRate from the `ta` package, one column per window
    tsi = [87, 261, 348, 435, 522]
    trix = [87, 174, 348, 435, 522]
    for t in tsi:
        df['tsi' + str(t)] = ta.momentum.tsi(df.midRate, s=t, r=2.25*t)
    for t in trix:
        df['trix' + str(t)] = ta.trend.trix(df.midRate, n=t)
    return df[-massive_df_length:]

In [21]:
def get_prediction(data, model=None):
    """Predict from one row of features and clip the result to [-5, 5].

    Parameters
    ----------
    data : DataFrame holding the feature row(s) to score.
    model : object with a ``predict(X)`` method, optional
        Defaults to the notebook-level ``lgbm`` estimator.

    Returns
    -------
    The first prediction, clipped to [-5, 5].
    """
    if model is None:
        model = lgbm
    # Same cleaning as preprocess(): +/-inf and NaN become 0, so the model is fed
    # values prepared the same way as the ones it was trained on.
    X = data.replace([np.inf, -np.inf], np.nan).fillna(0).values
    return np.clip(model.predict(np.atleast_2d(X)), -5, 5)[0]

In [22]:
def true_rlcvscore(test_df, length):
    """Replay the submission pipeline row by row on the first `length` rows of `test_df`.

    For each row: build the single-row features, append them to a rolling
    history, refresh the time features, and predict from the newest row.

    Returns
    -------
    (predictions, score, log_data, true_values)
        predictions : list with one prediction per row
        score       : R^2 of the predictions against test_df.y.head(length) (also printed)
        log_data    : DataFrame of the exact feature rows fed to the model (debug aid)
        true_values : test_df.y.head(length)
    """
    # NOTE(review): presumably sized for the longest indicator window (3 * 522 - 1); confirm.
    massive_df_length = 1565
    massive_df, predictions = pd.DataFrame(), []
    log_rows = []  # for debug
    for iteration in tqdm_notebook(range(length)):
        base_row = get_next_data_as_df(test_df, iteration, length)
        row = create_limited_features(base_row)
        massive_df = append_to_df(massive_df, row)
        massive_df = add_time_features(massive_df, massive_df_length)
        data = pd.DataFrame([massive_df.iloc[-1]])
        log_rows.append(data.copy())  # for debug
        predictions.append(get_prediction(data))
    # Build the debug frame once: DataFrame.append was removed in pandas 2.0, and growing
    # a frame row by row re-copies it on every iteration.
    log_data = pd.concat(log_rows, sort=False) if log_rows else pd.DataFrame()
    true_values = test_df.y.head(length)
    score = r2d2(true_values, predictions)
    print(f'{score:.4f}')
    return predictions, score, log_data, true_values

In [23]:
# Throughput notes kept as bare strings; the last one is echoed as the cell output.
''' 7.93 it/s takes 8 hours'''
''' 16.00 it/s should take 4h'''
''' about 9 it/s should take 7h'''

' about 9 it/s should take 7h'

In [None]:
# takes 2m30s for 1000
# NOTE(review): the commented call below passes three arguments, but true_rlcvscore accepts (test_df, length).
# massive_d = true_rlcvscore(test_df, lgbm, 1000)
test_predictions, test_score, log_data, true_values = true_rlcvscore(test_df, 2000)

HBox(children=(IntProgress(value=0, max=2000), HTML(value='')))

In [None]:
# Displays the debug frame of feature rows returned by true_rlcvscore.
log_data

In [None]:
# Overlay the replayed predictions on the true target values.
fig, ax = plt.subplots(1, 1, figsize=(15, 8))
ax.plot(test_predictions)
ax.plot(true_values.values)
ax.legend(['predictions', 'y'])

In [None]:
# %lprun -f true_rlcvscore test_predictions, test_score = true_rlcvscore(test_df, 100)