In [1]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor

from optiver import Directories
from optiver.bench import rmspe
from optiver import utils

dirs = Directories("../..")

In [2]:
def preprocess_df(order_book, trade_book):
    df = pd.DataFrame({"past_vol1": utils.realized_volatility1(order_book), "past_vol2": utils.realized_volatility2(order_book)})
    
    wap1 = utils.wap1(order_book)
    wap2 = utils.wap2(order_book)
    lr1 = utils.lr1(order_book)
    lr2 = utils.lr2(order_book)
    bas1 = utils.bas1(order_book)
    bas2 = utils.bas2(order_book)
    
    def concat_with_aggs(df, series, name):
        return pd.concat([df, utils.agg_df(utils.group_bucket(series), name)], axis="columns")
    
    df = concat_with_aggs(df, wap1, "wap1")
    df = concat_with_aggs(df, lr1, "lr1")
    df = concat_with_aggs(df, bas1, "bas1")

    df = concat_with_aggs(df, wap2, "wap2")
    df = concat_with_aggs(df, lr2, "lr2")
    df = concat_with_aggs(df, bas2, "bas2")
    
    exchange = utils.exchange(trade_book.fillna(0.))
    
    df = concat_with_aggs(df, utils.exchange(trade_book), "exchange")
    df["exchange_total"] = utils.group_bucket(exchange).sum()
    
    return df

In [3]:
# processed_df = utils.preprocess_dfs(dirs.processed / "book_train", dirs.processed / "trade_train", preprocess_df, tqdm=tqdm)

# processed_df.to_hdf(dirs.processed / "processed.h5", key="processed")

processed_df = pd.read_hdf(dirs.processed / "processed.h5")

In [4]:
targets = pd.read_hdf(dirs.processed / "targets_train.h5")

In [5]:
processed_df

Unnamed: 0_level_0,Unnamed: 1_level_0,past_vol1,past_vol2,wap1_mean,wap1_std,wap1_min,wap1_med,wap1_max,lr1_mean,lr1_std,lr1_min,...,bas2_std,bas2_min,bas2_med,bas2_max,exchange_mean,exchange_std,exchange_min,exchange_med,exchange_max,exchange_total
stock_id,time_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
0,5,0.004499,0.006999,1.003725,0.000693,1.001434,1.003923,1.004920,7.613599e-06,0.000260,-0.000896,...,0.000213,0.000670,0.001134,0.001701,79.753480,118.778412,1.002715,19.565587,500.592485,3190.139181
0,11,0.001204,0.002476,1.000239,0.000262,0.999700,1.000232,1.000834,1.810239e-06,0.000086,-0.000476,...,0.000200,0.000301,0.000652,0.001105,42.978448,77.834931,0.999724,3.000075,280.020061,1289.353432
0,16,0.002369,0.004801,0.999542,0.000864,0.997224,0.999818,1.000878,-1.109201e-05,0.000173,-0.000783,...,0.000295,0.000575,0.001055,0.001917,86.344357,113.450182,0.999928,14.006367,390.443560,2158.608928
0,31,0.002574,0.003637,0.998832,0.000757,0.997447,0.998657,1.000412,-2.376661e-05,0.000236,-0.001296,...,0.000366,0.000648,0.001159,0.002041,130.640370,144.598253,4.991672,69.886646,449.087539,1959.605547
0,62,0.001894,0.003257,0.999619,0.000258,0.999231,0.999586,1.000159,-1.057099e-08,0.000144,-0.000750,...,0.000185,0.000373,0.000700,0.001166,81.375204,117.861344,0.999231,5.498310,340.804756,1790.254496
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
126,32750,0.003293,0.004712,1.000025,0.000971,0.998281,0.999979,1.002136,-1.203937e-05,0.000225,-0.000772,...,0.000216,0.000413,0.001116,0.001946,59.501952,108.076742,0.998884,3.999835,521.188063,2201.572227
126,32751,0.003691,0.005876,0.999582,0.000486,0.998251,0.999611,1.000736,-1.706835e-06,0.000210,-0.001228,...,0.000227,0.000458,0.001176,0.001995,69.428057,102.742016,0.998563,16.977240,528.275877,2568.838117
126,32753,0.004104,0.004991,1.002476,0.001264,1.000633,1.002376,1.006165,1.998029e-05,0.000275,-0.001061,...,0.000233,0.000515,0.000930,0.001617,54.135549,71.181618,1.000655,20.027573,238.169663,2327.828627
126,32763,0.003661,0.005362,1.001809,0.000456,1.000562,1.001788,1.002963,6.429922e-07,0.000184,-0.000607,...,0.000204,0.000329,0.000789,0.001577,117.584943,141.268679,1.001218,100.171185,739.503342,9406.795437


In [6]:
stock_id = 58

all_x = processed_df.loc[stock_id]
y = targets.loc[stock_id]

y_val_pd = y.sample(frac=0.2, random_state=69).sort_index()
val_index = y_val_pd.index
y_train_pd = y.drop(val_index).sort_index()
train_index = y_train_pd.index

all_x_train = all_x.loc[train_index]
all_x_val = all_x.loc[val_index]

In [7]:
# linear_features = [
#     "past_vol1",
#     "past_vol2",
#     "wap1_std",
#     "wap2_std",
#     "lr1_std",
#     "lr1_min",
#     "lr1_max",
#     "lr2_std",
#     "lr2_min",
#     "lr2_max",
#     "bas1_mean",
#     "bas1_std",
#     "bas1_med",
#     "bas1_max",
#     "bas2_mean",
#     "bas2_std",
#     "bas2_med",
#     "bas2_max",
# ]

linear_features = ["past_vol1", "past_vol2", "wap1_std", "wap2_std", "lr1_std", "lr2_std", "lr1_min", "lr2_min", "lr1_max", "lr2_max"]

linear_trade_features = linear_features + ["exchange_total"]

tree_features = linear_trade_features + ["bas1_min", "bas2_min", "exchange_mean", "exchange_std", "exchange_min", "exchange_med"]

# x_train_pd = all_x_train[tree_features].fillna(0.)
# x_val_pd = all_x_val[tree_features].fillna(0.)
x_train_pd = all_x_train.fillna(0.)
x_val_pd = all_x_val.fillna(0.)

In [8]:
x_train = x_train_pd.to_numpy()
x_val = x_val_pd.to_numpy()

y_train = y_train_pd.to_numpy()
y_val = y_val_pd.to_numpy()

past_vol_train = all_x_train[["past_vol1"]].to_numpy()
past_vol_val = all_x_val[["past_vol1"]].to_numpy()

In [9]:
past_baseline = rmspe(past_vol_val[:, 0], y_val)

past_regressor = LinearRegression()

past_regressor.fit(past_vol_train, y_train)

past_linear_baseline = rmspe(past_regressor.predict(past_vol_val), y_val)

past_baseline, past_linear_baseline

(0.3818742052974043, 0.3613903252515755)

In [10]:
lb_regressor = AdaBoostRegressor(LinearRegression(), n_estimators=50, learning_rate=1e-5, random_state=0)

lb_regressor.fit(x_train, y_train)

linear_boost_baseline = rmspe(lb_regressor.predict(x_val), y_val)

linear_boost_baseline

0.3182137827017773

In [11]:
# regressor = LinearRegression()
# regressor = Ridge(alpha=0.00000001)
# regressor = Lasso(alpha=0.000000000001)
# regressor = RandomForestRegressor(oob_score=True, ccp_alpha=3e-8)
tree = DecisionTreeRegressor(max_depth=10, ccp_alpha=2.9e-9)
# tree = LinearRegression()
regressor = AdaBoostRegressor(tree, n_estimators=50, learning_rate=1e-5, random_state=0)

regressor.fit(x_train, y_train)

train_predictions = regressor.predict(x_train)
val_predictions = regressor.predict(x_val)

In [12]:
# def center(data, mean, std):
#     data = data - mean
#     data = data / std
    
#     return data


# train_mean = np.mean(x_train, axis=0)
# train_std = np.std(x_train, axis=0)

# centered_train = center(x_train, train_mean, train_std)

# pca_vals, pca_vecs = np.linalg.eig(np.cov(centered_train.T))
# pca_vals, pca_vecs = np.real(pca_vals), np.real(pca_vecs)

# sort_indices = np.flip(np.argsort(pca_vals))

In [13]:
# np.sum(pca_vals[sort_indices][:10]) / np.sum(pca_vals)

In [14]:
# pca_vecs.shape

In [15]:
# components = pca_vecs[:, sort_indices[:18]]

In [16]:
# centered_val = center(x_val, train_mean, train_std)

# pca_train, pca_val = np.dot(centered_train, components), np.dot(centered_val, components)

# regressor = LinearRegression()

# regressor.fit(pca_train, y_train)

# train_predictions = regressor.predict(pca_train)
# val_predictions = regressor.predict(pca_val)

In [17]:
print("Training Error:", rmspe(train_predictions, y_train))
print("Validation Error:", rmspe(val_predictions, y_val))

Training Error: 0.2588301415042433
Validation Error: 0.2756526326545608


Stock 31 scores:

Linear least squares: 40.739%

Ridge regression fails to lower

Lasso regression fails to lower (maybe)

PCA Least Squares: 40.737%

Least squares tree features: 40.26%

Least squares all features: 39.86% (So much for EDA)

Linear Adaboost: 39.481% (n_estimators=50, learning_rate=1e-5, random_state=0)

Decision Tree Adaboost: 37.384% (same boost params, max_depth=10, ccp_alpha=2.9e-9)

In [43]:
def grade_df(df):
    display(df)
    df = df.fillna(0.)
    
    grade_tqdm.update()
    
    val_df = df.sample(frac=0.2).sort_index()
    train_df = df.drop(val_df.index).sort_index()
    
    y_val = val_df["target"]
    y_train = train_df["target"]
    
    x_val = val_df.drop("target", axis="columns")
    x_train = train_df.drop("target", axis="columns")
    
    y_val = y_val.to_numpy()
    x_val = x_val.to_numpy()
    y_train = y_train.to_numpy()
    x_train = x_train.to_numpy()
    
    regressor_ = AdaBoostRegressor(DecisionTreeRegressor(max_depth=10, ccp_alpha=2.9e-9), n_estimators=50, learning_rate=1e-5, random_state=0)
    # regressor_ = ()
    
    regressor_.fit(x_train, y_train)
    
    result = pd.DataFrame({"actual": val_df["target"].to_numpy(), "prediction": regressor_.predict(x_val)}, index=val_df.index)
    
    return result

In [49]:
all_x_train.index.get_level_values("time_id")[0]

5

In [44]:
grade_tqdm = tqdm(total=92)

all_df = processed_df.copy()

all_df["target"] = targets

grades = all_df.groupby(level="stock_id").apply(grade_df)

  0%|          | 0/92 [00:00<?, ?it/s]

Unnamed: 0_level_0,Unnamed: 1_level_0,past_vol1,past_vol2,wap1_mean,wap1_std,wap1_min,wap1_med,wap1_max,lr1_mean,lr1_std,lr1_min,...,bas2_min,bas2_med,bas2_max,exchange_mean,exchange_std,exchange_min,exchange_med,exchange_max,exchange_total,target
stock_id,time_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
0,5,0.004499,0.006999,1.003725,0.000693,1.001434,1.003923,1.004920,7.613599e-06,0.000260,-0.000896,...,0.000670,0.001134,0.001701,79.753480,118.778412,1.002715,19.565587,500.592485,3190.139181,0.004136
0,11,0.001204,0.002476,1.000239,0.000262,0.999700,1.000232,1.000834,1.810239e-06,0.000086,-0.000476,...,0.000301,0.000652,0.001105,42.978448,77.834931,0.999724,3.000075,280.020061,1289.353432,0.001445
0,16,0.002369,0.004801,0.999542,0.000864,0.997224,0.999818,1.000878,-1.109201e-05,0.000173,-0.000783,...,0.000575,0.001055,0.001917,86.344357,113.450182,0.999928,14.006367,390.443560,2158.608928,0.002168
0,31,0.002574,0.003637,0.998832,0.000757,0.997447,0.998657,1.000412,-2.376661e-05,0.000236,-0.001296,...,0.000648,0.001159,0.002041,130.640370,144.598253,4.991672,69.886646,449.087539,1959.605547,0.002195
0,62,0.001894,0.003257,0.999619,0.000258,0.999231,0.999586,1.000159,-1.057099e-08,0.000144,-0.000750,...,0.000373,0.000700,0.001166,81.375204,117.861344,0.999231,5.498310,340.804756,1790.254496,0.001747
0,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,32750,0.002269,0.004290,0.999886,0.000591,0.998626,1.000110,1.001020,-6.506376e-06,0.000187,-0.001137,...,0.000513,0.000977,0.001901,83.677055,139.249289,0.999230,2.001232,515.917174,2091.926372,0.003517
0,32753,0.002206,0.002847,1.000310,0.000551,0.999331,1.000268,1.001504,1.966770e-06,0.000154,-0.000818,...,0.000501,0.000802,0.001151,162.452553,218.533538,0.999474,41.040072,772.733384,4548.671493,0.001190
0,32758,0.002913,0.003266,0.999552,0.000743,0.998057,0.999538,1.000697,8.895445e-06,0.000213,-0.000684,...,0.000252,0.000731,0.001867,117.987861,141.223285,0.998791,99.904272,600.434758,4247.563002,0.004264
0,32763,0.003046,0.005105,1.002357,0.000356,1.001473,1.002313,1.003089,1.698933e-06,0.000174,-0.000778,...,0.000384,0.000719,0.001536,60.838147,81.259971,1.002043,20.043750,383.800644,3224.421796,0.004352


Unnamed: 0_level_0,Unnamed: 1_level_0,past_vol1,past_vol2,wap1_mean,wap1_std,wap1_min,wap1_med,wap1_max,lr1_mean,lr1_std,lr1_min,...,bas2_min,bas2_med,bas2_max,exchange_mean,exchange_std,exchange_min,exchange_med,exchange_max,exchange_total,target
stock_id,time_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,5,0.006245,0.007602,1.003585,0.001174,1.000780,1.003407,1.006438,4.079605e-06,0.000261,-0.001143,...,0.000522,0.000915,0.001438,106.848494,236.712823,1.002132,28.595152,1876.184924,10043.758425,0.006340
1,11,0.002305,0.003033,1.001760,0.000356,1.000975,1.001758,1.002759,1.854058e-07,0.000120,-0.000606,...,0.000467,0.000668,0.001069,84.175033,114.985807,1.001405,20.048826,409.705996,4208.751643,0.002099
1,16,0.002517,0.002973,1.000990,0.000848,0.999428,1.001001,1.002701,2.382088e-06,0.000134,-0.000726,...,0.000285,0.000477,0.000954,139.813550,118.054030,1.000906,100.696093,500.595868,11464.711093,0.002456
1,31,0.003570,0.004298,0.997455,0.000640,0.996246,0.997619,0.998530,-1.052943e-05,0.000274,-0.001160,...,0.000655,0.001019,0.002259,63.087119,88.959993,0.996663,6.977653,344.200179,2081.874914,0.002807
1,62,0.002903,0.004508,1.000682,0.000360,0.999615,1.000675,1.001593,3.151094e-06,0.000193,-0.000835,...,0.000498,0.001066,0.001776,46.519408,58.246531,1.000320,12.020049,200.163531,1162.985203,0.004312
1,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1,32750,0.003319,0.005620,0.999631,0.000852,0.998140,0.999490,1.001811,-5.296619e-06,0.000210,-0.000769,...,0.000317,0.000872,0.001661,106.528299,157.786634,0.998536,22.964736,613.677359,5219.886642,0.003009
1,32751,0.003723,0.004996,1.000142,0.000396,0.999400,1.000165,1.000915,-3.007318e-06,0.000213,-0.000911,...,0.000403,0.000874,0.001411,66.305759,95.810751,0.999463,10.001343,388.935380,3248.982168,0.003741
1,32753,0.010829,0.012168,1.007503,0.006260,0.995010,1.010537,1.016151,2.896253e-05,0.000487,-0.002068,...,0.000348,0.001120,0.002621,417.959850,529.104076,0.996865,222.787068,3507.524755,76486.652579,0.012414
1,32758,0.003135,0.004268,1.000854,0.000564,0.999554,1.001075,1.001705,5.514881e-06,0.000178,-0.000741,...,0.000625,0.000893,0.001339,86.182921,67.534891,1.000714,100.017858,320.267906,2240.755934,0.002868


Unnamed: 0_level_0,Unnamed: 1_level_0,past_vol1,past_vol2,wap1_mean,wap1_std,wap1_min,wap1_med,wap1_max,lr1_mean,lr1_std,lr1_min,...,bas2_min,bas2_med,bas2_max,exchange_mean,exchange_std,exchange_min,exchange_med,exchange_max,exchange_total,target
stock_id,time_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2,11,0.000750,0.001128,1.001544,0.000317,1.000708,1.001582,1.002044,1.570964e-06,0.000037,-0.000219,...,0.000233,0.000233,0.000388,188.299672,218.152219,1.000972,100.198221,1044.627673,14687.374452,0.000806
2,16,0.001237,0.001681,0.999809,0.000244,0.999314,0.999795,1.000464,3.026574e-08,0.000054,-0.000267,...,0.000253,0.000253,0.000506,146.868958,116.992704,0.999494,100.000000,592.315456,13952.550992,0.001581
2,31,0.001638,0.002230,0.999640,0.000477,0.998734,0.999524,1.000827,-2.810536e-06,0.000083,-0.000235,...,0.000217,0.000362,0.000579,129.259774,113.418779,0.999096,99.967468,580.952176,10211.522130,0.001599
2,62,0.001204,0.001372,0.999478,0.000291,0.998762,0.999484,1.000005,-4.049953e-07,0.000061,-0.000335,...,0.000219,0.000292,0.000438,123.163258,103.032109,0.999417,99.952629,395.826538,5911.836388,0.001503
2,72,0.002872,0.003389,1.000645,0.000417,0.999700,1.000639,1.001746,2.563817e-07,0.000118,-0.000421,...,0.000234,0.000313,0.000547,226.412939,243.230671,0.999883,131.037354,1311.240880,58188.125300,0.002503
2,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2,32750,0.001568,0.001982,1.000759,0.000512,0.999652,1.000958,1.001526,2.226962e-08,0.000070,-0.000287,...,0.000213,0.000284,0.000426,184.211232,199.790060,1.000958,109.519645,1185.134514,19526.390554,0.001598
2,32751,0.001541,0.001930,1.000166,0.000392,0.999197,1.000240,1.000721,3.004848e-07,0.000066,-0.000423,...,0.000227,0.000303,0.000455,140.337971,131.746974,0.999205,100.071907,628.308961,18384.274259,0.001662
2,32753,0.000870,0.001116,1.000348,0.000164,1.000078,1.000335,1.000750,-6.996340e-07,0.000040,-0.000241,...,0.000233,0.000233,0.000388,240.660574,305.559933,1.000233,154.583948,1556.420876,21178.130528,0.000925
2,32763,0.001719,0.002598,1.000246,0.000683,0.999086,1.000259,1.001967,-2.528261e-07,0.000079,-0.000343,...,0.000258,0.000345,0.000862,189.760685,202.467910,1.000000,114.970258,1293.000000,29223.145509,0.004879


KeyboardInterrupt: 

In [42]:
rmspe(grades["prediction"], grades["actual"])

0.28141979055427147

In [34]:
rmspe(grades.loc[58].loc[58]["prediction"], grades.loc[58].loc[58]["actual"])

0.25138128262616377