# Imports

In [None]:
''' try using ta lib but change/adjust for different values of n'''

In [None]:
''' instead of appending, we shift(-1) and then update the last row in place'''

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import ta
import lightgbm as lgb
# from fastai import *
# from fastai.tabular import *
from tqdm import tqdm_notebook
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from rolling import RollingWindowSplit
from sklearn.metrics import r2_score as r2d2
from joblib import dump, load
from datetime import datetime, timedelta

%matplotlib inline
# %load_ext line_profiler
sns.set(style = "whitegrid")
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 200)

In [2]:
# Load the training set, cast every column to float32 and zero-fill missing values.
# NOTE(review): absolute, machine-specific path - point this at your local copy of the data.
path = 'D://Coding//XTX Forecasting Challenge//data-training.file'
# `use_threads` is an on/off switch rather than a worker count, so pass a boolean.
df = pd.read_feather(path, use_threads=True).astype('float32').fillna(0)

In [3]:
# Column names for order-book levels 0-14: one list per side (ask/bid) and field (Rate/Size).
askRateList, askSizeList, bidRateList, bidSizeList = (
    [f'{prefix}{level}' for level in range(15)]
    for prefix in ('askRate', 'askSize', 'bidRate', 'bidSize')
)

In [4]:
# All 60 raw order-book column names, in ask-rate, ask-size, bid-rate, bid-size order.
core = askRateList + askSizeList + bidRateList + bidSizeList

# Exploratory Data Analysis

In [5]:
# # Figuring out what [y] is
# # y(t) is midRate(t+87) - midRate(t), clipped to [-5, 5]
# df['expectedY'] = df.midRate.diff(87).shift(-87).clip(-5,5)

# Batch prediction

In [6]:
# the public leaderboard set should have 150k observations: they check running time of 10k in 1h and max 15h

In [7]:
# Positional 80/15/5 train / validation / test split (row order is preserved, no shuffling)
def train_valid_test_split(df, valid_start_frac=0.80, test_start_frac=0.95):
    """Split ``df`` by row position into train, validation and test frames.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to split; rows are taken in their existing order.
    valid_start_frac : float, default 0.80
        Fraction of rows after which the validation slice starts.
    test_start_frac : float, default 0.95
        Fraction of rows after which the test slice starts.

    Returns
    -------
    (train_df, valid_df, test_df)
        Three independent copies, so later in-place feature engineering on one
        of them does not touch ``df`` or the other two.

    Raises
    ------
    ValueError
        If the fractions are not ordered as 0 <= valid <= test <= 1.
    """
    if not 0 <= valid_start_frac <= test_start_frac <= 1:
        raise ValueError('expected 0 <= valid_start_frac <= test_start_frac <= 1')
    n_rows = len(df)
    valid_start = int(valid_start_frac * n_rows)
    test_start = int(test_start_frac * n_rows)
    train_df = df[:valid_start].copy()
    valid_df = df[valid_start:test_start].copy()
    test_df = df[test_start:].copy()
    return train_df, valid_df, test_df
# Materialise the three partitions of the full frame used throughout the rest of the notebook.
train_df, valid_df, test_df = train_valid_test_split(df)

In [8]:
def create_limited_features_orig(df: pd.DataFrame, nums: list):
    """Add order-book and momentum/trend feature columns to ``df`` and return it.

    The frame is modified IN PLACE: every feature is written as a column on the
    object that was passed in, and that same object is returned. Calling this
    repeatedly on one frame with different ``nums`` therefore accumulates the
    time-feature columns of every call.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the columns listed in ``askRateList``, ``askSizeList``,
        ``bidRateList`` and ``bidSizeList``.
    nums : list
        Window lengths. For each value one ``rsi``, ``tsi``, ``macd``,
        ``macd_diff``, ``ema``, ``trix`` and ``dpo`` column is added, suffixed
        with the window length. An empty list adds no time features.

    Returns
    -------
    pd.DataFrame
        ``df`` itself, with the new columns attached.
    """
    # limited features
    df['midRate'] = (df.bidRate0 + df.askRate0) / 2 # necessary for ohlc
    df['bidAskVol'] = df.bidSize0 + df.askSize0 # necessary for ohlc
    df['OIR'] = (df.bidSize0 - df.askSize0) / df.bidAskVol
    df['totalAskVol'] = df[askSizeList].sum(axis=1)
    df['totalBidVol'] = df[bidSizeList].sum(axis=1)
    df['OIR_total'] = (df.totalBidVol - df.totalAskVol)/(df.totalBidVol + df.totalAskVol)

    df['spread'] = df.askRate0 - df.bidRate0
    # Volume-weighted average price per side: the einsum is a row-wise dot product
    # of rates and sizes, divided by the per-row total size computed above.
    df['vwaBid'] = np.einsum('ij,ji->i', df[bidRateList], df[bidSizeList].T) / df.totalBidVol
    df['vwaAsk'] = np.einsum('ij,ji->i', df[askRateList], df[askSizeList].T) / df.totalAskVol
    df['vwaBidDMid'] = df.midRate - df.vwaBid
    df['vwaAskDMid'] = df.vwaAsk - df.midRate
    df['diff_vwaBidAskDMid'] = df.vwaAskDMid - df.vwaBidDMid

    # time features
    # NOTE(review): the keyword names below (n, s, r, n_fast, n_slow) must match the
    # installed `ta` release - confirm before upgrading the library.
    for num in nums:
        suffix = str(num)
        slow = int(num*2.5)
        df['rsi' + suffix] = ta.momentum.rsi(df.midRate, n=num)
        df['tsi' + suffix] = ta.momentum.tsi(df.midRate, s=num, r=2*num)
        df['macd' + suffix] = ta.trend.macd(df.midRate, n_fast=num, n_slow=slow)
        df['macd_diff' + suffix] = ta.trend.macd_diff(df.midRate, n_fast=num, n_slow=slow)
        df['ema' + suffix] = ta.trend.ema_indicator(df.midRate, n=num)
        df['trix' + suffix] = ta.trend.trix(df.midRate, n=num)
        df['dpo' + suffix] = ta.trend.dpo(df.midRate, n=num)
    return df

In [9]:
# Cross-validation splitter from the project-local `rolling` module; it is only
# referenced as `cv=rlcv` in the commented-out RandomizedSearchCV cell further down.
rlcv = RollingWindowSplit(n_splits=3, compatible=True)

In [10]:
# takes 15s min
# Build features with windows num, 2*num, ..., 9*num for the train and validation frames.
num = 87
limited_train, limited_valid = (
    create_limited_features_orig(split_frame, [*np.arange(num, num*10, num)])
    for split_frame in (train_df, valid_df)
)

In [11]:
def preprocess(limited_df):
    """Turn a feature frame into ``(X, y)`` numpy arrays.

    Every column except ``y`` becomes a feature; +/-inf and NaN feature values
    are replaced by 0. The target is returned exactly as stored in ``y``.
    """
    features = limited_df.drop('y', axis=1)
    features = features.replace([np.inf, -np.inf], np.nan).fillna(0)
    return features.values, limited_df.y.values

In [12]:
# takes 10s
# NOTE: despite the `x_scaled_*` names, preprocess() applies no scaling - it only
# zero-fills NaN/inf and splits off the `y` column.
x_scaled_train, y_train = preprocess(limited_train)
x_scaled_valid, y_valid = preprocess(limited_valid)

In [13]:
# Settings shared by every LightGBM model trained in this notebook.
fixed_params = dict(boosting_type='gbdt', nthread=4, random_state=42, metric='rmse')

# Candidate values sampled by the (commented-out) randomized hyper-parameter search.
grid_params = dict(
    learning_rate=[0.03, 0.04, 0.05, 0.1],
    num_leaves=[*range(120, 241, 20), 300, 400, 500],
    max_bin=list(range(60, 101, 10)),
    max_depth=list(range(7)),
    colsample_bytree=[0.7, 0.8, 0.9, 1],
    subsample=[0.3, 0.5, 0.7, 0.9],
    min_split_gain=[0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.99],
    min_data_in_leaf=list(range(30, 101, 10)),
    reg_alpha=[0.1, 0.3, 0.5, 0.7, 1],
    reg_lambda=[0.1, 1, 3, 5],
)

In [14]:
# ''' RandomSearch for optimal LGBM parameters '''
# clf = lgb.LGBMRegressor(**fixed_params, eval_set=(x_scaled_valid, y_valid), eval_metric='l2')
# grid = RandomizedSearchCV(clf, grid_params, verbose=1, cv=rlcv, n_jobs = -1, n_iter=100)
# grid.fit(x_scaled_train, y_train)

In [15]:
# true best params
best_params = dict(
    subsample=0.7,
    reg_lambda=1,
    reg_alpha=0.5,
    num_leaves=200,
    min_split_gain=0.75,
    min_data_in_leaf=50,
    max_depth=3,
    max_bin=70,
    learning_rate=0.03,
    colsample_bytree=0.9,
)

In [16]:
# takes 1 min
# eval_set / eval_metric are arguments of fit(), not of the estimator constructor.
# Passed to the constructor they are treated as extra model parameters, so the
# validation data was never evaluated during training.
lgbm = lgb.LGBMRegressor(**fixed_params, **best_params)
lgbm.fit(x_scaled_train, y_train, eval_set=[(x_scaled_valid, y_valid)], eval_metric='l2')

In [17]:
# Persist the fitted model to 'lgbm.joblib' (path is relative to the working directory).
dump(lgbm, 'lgbm.joblib')
# lgbm = load('lgbm110.joblib')

['lgbm.joblib']

In [18]:
def score(model, number_list, x_scaled_train, x_scaled_valid, y_train, y_valid, test_data=None):
    """Print train / validation / test scores for ``model`` and return the test predictions.

    Parameters
    ----------
    model
        Fitted estimator exposing ``score`` and ``predict``.
    number_list : list
        Window lengths forwarded to ``create_limited_features_orig`` when the
        test features are built.
    x_scaled_train, y_train, x_scaled_valid, y_valid
        Arrays as returned by ``preprocess``.
    test_data : pd.DataFrame, optional
        Hold-out frame to score on. Defaults to the notebook-level ``test_df``,
        which is what this function always used before the parameter existed.

    Returns
    -------
    (predictions_test, limited_test)
        Test predictions and the feature-augmented test frame.

    Notes
    -----
    ``create_limited_features_orig`` writes its columns into the frame it is
    given, so the test frame is modified in place by every call.
    """
    if test_data is None:
        test_data = test_df  # notebook-level hold-out frame

    train_score = model.score(x_scaled_train, y_train)
    valid_score = r2d2(y_valid, model.predict(x_scaled_valid))

    limited_test = create_limited_features_orig(test_data, number_list)
    x_scaled_test, y_test = preprocess(limited_test)
    predictions_test = model.predict(x_scaled_test)
    test_score = r2d2(y_test, predictions_test)

    print(f'{train_score:.4f}, {valid_score:.4f}, {test_score:.4f}')
    return predictions_test, limited_test

In [19]:
# Score the fitted model; the printed triple is train, validation, test (in that order).
predictions_test, limited_test = score(lgbm, [*np.arange(num,num*10,num)], x_scaled_train, x_scaled_valid, y_train, y_valid)

0.0249, 0.0216, 0.0230


In [None]:
# Scratch record of earlier runs, as "train, valid, test" score triples.
# NOTE(review): "for 10 / 15 / 20" presumably refers to the upper multiplier in
# np.arange(num, num*K, num) - confirm.
'''
0.0249, 0.0216, 0.0230 for 10
0.0252, 0.0218, 0.0201 for 15
0.0257, 0.0218, 0.0205 for 20
'''
predictions_test, limited_test = score(lgbm, [*np.arange(num,num*10,num)], x_scaled_train, x_scaled_valid, y_train, y_valid)

In [None]:
# NOTE(review): passing [] only means no NEW time features are added. score() builds
# features on the global test_df in place, so any time-feature columns added by an
# earlier score() call are still present in the frame that gets scored here.
predictions_test, limited_test = score(lgbm, [], x_scaled_train, x_scaled_valid, y_train, y_valid)

In [None]:
# One candidate window list (base, 2*base, ..., 9*base) per base window 51, 61, ..., 141.
nums = [[*np.arange(num,num*10,num)] for num in range(51,151,10)]
def test_various_windows(nums):
    """Fit and score one LightGBM model per window list in ``nums``.

    For every list the train/validation features are rebuilt, a model is fitted
    with the notebook-level ``fixed_params`` / ``best_params``, and ``score``
    prints the train / validation / test scores. Nothing is returned.

    NOTE(review): ``create_limited_features_orig`` adds columns to ``train_df``
    and ``valid_df`` in place, so columns created by earlier iterations (and by
    earlier cells) are still present when later window lists are evaluated.
    """
    for number_list in tqdm_notebook(nums):
        train = create_limited_features_orig(train_df, number_list)
        valid = create_limited_features_orig(valid_df, number_list)
        x_scaled_train, y_train = preprocess(train)
        x_scaled_valid, y_valid = preprocess(valid)
        # eval_set / eval_metric belong to fit(), not to the estimator constructor.
        model = lgb.LGBMRegressor(**fixed_params, **best_params)
        model.fit(x_scaled_train, y_train, eval_set=[(x_scaled_valid, y_valid)], eval_metric='l2')
        print(f'Trying {number_list[0]} to {number_list[-1]}:', end='\t')
        score(model, number_list, x_scaled_train, x_scaled_valid, y_train, y_valid)
test_various_windows(nums)

In [None]:
sns.set(style='whitegrid')

In [None]:
# Overlay the first `limit` test predictions on the corresponding true targets.
fig, ax = plt.subplots(1, 1, figsize=(15, 8))
limit = 1000
ax.plot(predictions_test[:limit])
ax.plot(limited_test.y.values[:limit])
ax.legend(['predictions', 'y'])

In [None]:
# Per-feature importance array of the fitted model, echoed as the cell output.
a = lgbm.feature_importances_; a

In [None]:
# Feature names, taken from the training frame minus the target column.
# NOTE(review): limited_train is the frame that feature engineering mutates in place;
# if more columns were added after `lgbm` was fitted, this no longer lines up with `a`.
columns = limited_train.columns.drop('y')

In [None]:
'''You should chuck the variables that meet this condition!'''
# Boolean mask: features whose importance is below 10.
columns.values[a < 10]

In [None]:
'''These are the variables that contribute to the lgbm!'''
# Boolean mask: features whose importance is above 1.
columns.values[a > 1]

# Fast.ai

In [None]:
# NOTE(review): FillMissing / Normalize are not imported anywhere visible - the fastai
# imports in the import cell are commented out - so unless they are defined elsewhere
# this cell raises NameError on a fresh kernel.
dep_var = 'y'  # name of the target column
procs = [FillMissing, Normalize]  # preprocessing steps handed to TabularList.from_df below

In [None]:
# fillmissing replaces with median // fill with zero could be better

In [None]:
# use a subset of training data
# NOTE: rebinds train_df to its first 500,000 rows; cells that run after this one
# see only the truncated frame.
train_df = train_df[:int(5e5)]

In [None]:
# takes 7s on 500k rows, 30s on full df
# Build the fastai data bunch: the second half of train_df (by position) is held out
# for validation, `y` is the label, and test_df is attached as the test set.
# NOTE(review): cont_names comes from the original `df`, not from train_df, so any
# engineered columns present on train_df are not listed as continuous features.
test = TabularList.from_df(test_df, procs=procs)
data = (TabularList.from_df(df = train_df, path='.', cont_names = df.columns.drop('y'), procs=procs)
                            .split_by_idx(valid_idx=range(int(0.50*len(train_df)),int(len(train_df))))
                            .label_from_df(cols=dep_var)
                            .add_test(test, label=0)
                            .databunch())

In [None]:
learn = tabular_learner(data, layers=[500,200], metrics=[mean_absolute_error, r2_score], ps=[0.001,0.01])

In [None]:
learn.model

In [None]:
learn.lr_find(end_lr=1e1)

In [None]:
sns.set(style = "whitegrid")

In [None]:
learn.recorder.plot()

In [None]:
# model above has already diverged, we will restart.

In [None]:
# create pytorch resnet model






In [None]:
# do some learning







In [None]:
# 2m for 1 cycle
learn.fit_one_cycle(1, 5e-4, wd=0.01)

In [None]:
learn.recorder.plot_lr(show_moms=True)

In [None]:
learn.save('new_fastai')

In [None]:
learn.recorder.plot_losses()

In [None]:
learn.show_results()

In [None]:
Learner.predict??

In [None]:
df.iloc[:100].values.shape

In [None]:
learn.predict(df.iloc[int(8.1e5)])

In [None]:
df.y.iloc[int(8.1e5)]

In [None]:
preds = learn.get_preds()

In [None]:
Learner.get_preds??

# Submission testing

In [None]:
def get_next_data_as_df(test_df, iteration, length):
    """Return row ``iteration`` of the first ``length`` rows as a one-row frame.

    Only the first 60 values of the row are kept, and the result has default
    integer column labels. Raises IndexError when ``iteration`` falls outside
    the first ``length`` rows.
    """
    candidate_rows = test_df.head(length)
    book_row = candidate_rows.iloc[iteration]
    return pd.DataFrame([book_row[:60].values])

In [None]:
def create_limited_features(df):
    """Add the cross-sectional order-book features to a raw one-row frame.

    Takes a DataFrame ``df`` of shape (1, 60) holding the raw order-book values
    in ask-rate, ask-size, bid-rate, bid-size order, labels its columns, and
    adds 12 cross-sectional features (midRate, bidAskVol, OIR, totalAskVol,
    totalBidVol, OIR_total, spread, vwaBid, vwaAsk, vwaBidDMid, vwaAskDMid,
    diff_vwaBidAskDMid). No time-window features are added.

    Returns the same DataFrame, now of shape (1, 72); ``df`` is modified in place.
    """
    df.columns = [*askRateList, *askSizeList, *bidRateList, *bidSizeList]
    # Reuse the batch feature builder with no time windows so the submission path
    # and the batch path cannot drift apart (same formulas, same column order).
    return create_limited_features_orig(df, [])

In [None]:
# ''' Appends to [massive_df]=(many, >70) the DataFrame row [row]=(1,70). '''
# def append_to_df(massive_df, row):
#     try: row.index = [massive_df.index[-1] + timedelta(minutes=1)]
#     except IndexError: row.index = [datetime(1970,1,1)]
#     return massive_df.append(row, sort=False)

In [None]:
# ''' Adds time-dependent features. All features that use shift/diff must come here. '''
# def add_time_features(df, num):
#     return df[-num:]

In [None]:
def get_prediction(data, model):
    """Make one prediction from a feature frame, bounded to [-5, 5].

    +/-inf and NaN feature values are replaced by 0 before predicting, which is
    the same cleaning ``preprocess`` applies to the training features; leaving
    NaN in here would feed the model inputs it was not trained on.

    Parameters
    ----------
    data : pd.DataFrame
        Feature frame; every column is used as a feature.
    model
        Fitted estimator exposing ``predict``.

    Returns
    -------
    The first element of the clipped prediction array.
    """
    X = data.replace([np.inf, -np.inf], np.nan).fillna(0).values
    return np.clip(model.predict(np.atleast_2d(X)), -5, 5)[0]

In [None]:
def true_rlcvscore(test_df, model, length):
    """Run the row-by-row submission pipeline on the first ``length`` rows of ``test_df``.

    Each row is fetched with ``get_next_data_as_df``, expanded with
    ``create_limited_features`` and scored with ``get_prediction``. The R^2 of
    the predictions against ``test_df.y`` is printed.

    Returns
    -------
    (predictions, score, log_data, true_values)
        The list of predictions, their R^2 score, a frame with the feature row
        used for every prediction (debug aid), and the true targets.
    """
    predictions = []
    logged_rows = []  # for debug
    for iteration in tqdm_notebook(range(length)):
        base_row = get_next_data_as_df(test_df, iteration, length)
        data = create_limited_features(base_row)
        logged_rows.append(data.copy())  # for debug: snapshot taken before predicting
        predictions.append(get_prediction(data, model))
    # Concatenate once at the end: DataFrame.append inside the loop re-copied the whole
    # log on every iteration and no longer exists in pandas 2.x.
    log_data = pd.concat(logged_rows, sort=False) if logged_rows else pd.DataFrame()
    true_values = test_df.y.head(length)
    r2 = r2d2(true_values, predictions)  # named r2 so the notebook's score() is not shadowed
    print(f'{r2:.4f}')
    return predictions, r2, log_data, true_values

In [None]:
# takes 2m30s for 1000
test_predictions, test_score, log_data, true_values = true_rlcvscore(test_df, lgbm, 1000)

In [None]:
# NOTE(review): placeholder cell - both plt.plot() calls have no data, so nothing is
# drawn yet; the legend labels suggest it is meant to compare batch predictions with
# the line-by-line ones.
plt.subplots(1,1,figsize=(15,8))
plt.plot()
plt.plot()
plt.legend(['batch','line-by-line'], fontsize=20)

In [None]:
''' The most important thing now is to reconcile the dataframes in submission and in batch prediction. '''

In [None]:
# Line-by-line predictions against the true targets for the same rows.
fig, ax = plt.subplots(1, 1, figsize=(15, 8))
ax.plot(test_predictions)
ax.plot(true_values.values)
ax.legend(['predictions', 'y'])

In [None]:
# %lprun -f true_rlcvscore test_predictions, test_score = true_rlcvscore(test_df, 100)