In [2]:
import pandas as pd
import numpy as np 
import glob 
import warnings 
from collections import Counter
warnings.filterwarnings("ignore")
import plotly.express as px 
import seaborn as sns 
import matplotlib.pyplot as plt 
import lightgbm as lgbm 
from sklearn.model_selection import StratifiedKFold, train_test_split
import math
import os 
import random
import torch 
import pyarrow as pa
import pyarrow.parquet as pq
import torch.nn as nn
#from transformers import AdamW
from torch.utils.data import Dataset , DataLoader
from colorama import Fore , Style
r__=Fore.RED
g__=Fore.GREEN
st__=Style.RESET_ALL

In [257]:
def wap1(row):
    denom = row.ask_size1 + row.bid_size1
    return ((row.bid_price1 * row.ask_size1 + row.ask_price1 * row.bid_size1)/denom)
    
def wap2(row):
    denom = row.ask_size2 + row.bid_size2
    return ((row.bid_price2 * row.ask_size2 + row.ask_price2 * row.bid_size2)/denom)

def log_avg_wap(row):
    return np.log((row.wap1 + row.wap2)/2)

def log_return(list_prices):
    return np.log(list_prices).diff()
def realized_volatility(series_log_return):
    return np.sqrt(np.sum(series_log_return**2))
def custom_loss(ytrue,ypred) :
    squared_residual = (ytrue-ypred)**2/ytrue
    grad = squared_residual
    hess = np.ones(len(ytrue))
    
    return grad,hess

def rmspe(y_true, y_pred):
    return  (np.sqrt(np.mean(np.square((y_true - y_pred) / y_true))))
def feval_RMSPE(preds, train_data):
    labels = train_data.get_label()
    return 'RMSPE', round(rmspe(y_true = labels, y_pred = preds),5), False
def custom_rmspe_valid(y_true, y_pred):
    residual = (y_true - y_pred).astype("float")
    residual = residual ** 2 / y_true
    residual = np.mean(residual)
    return "eval_RMSPE", math.sqrt(residual), False
def simple_volatility(series_prix):
    mx = np.max(series_prix)
    mn = np.min(series_prix)
    moy = np.mean(series_prix)
    vol = (moy-mn)/(mx-mn)
    return vol
def count_unique(series):
    return len(np.unique(series))

## 1. Importing the Data


In [258]:

train = pd.read_parquet("target_data/target_train.parquet")
train

Unnamed: 0,stock_id,time_id,target
0,0,5,0.004136
1,0,11,0.001445
2,0,16,0.002168
3,0,31,0.002195
4,0,62,0.001747
...,...,...,...
428162,126,25653,0.003977
428163,126,25654,0.001674
428164,126,25668,0.006922
428165,126,25680,0.002081


In [259]:
book_train_stock_id_0 = pd.read_parquet("stock_book_train/stock_0_train.parquet")
book_train_stock_id_0_t_id5 = book_train_stock_id_0[book_train_stock_id_0["time_id"]==5]
#book_train_stock_id_0_t_id5

In [260]:
trade_train_stock_id_0 = pd.read_parquet("stock_trade_train/stock_0_train.parquet")

In [261]:
trade_train_stock_id_0 

Unnamed: 0,time_id,seconds_in_bucket,price,size,order_count
0,5,21,1.002301,326,12
1,5,46,1.002778,128,4
2,5,50,1.002818,55,1
3,5,57,1.003155,121,5
4,5,68,1.003646,4,1
...,...,...,...,...,...
99253,25683,473,1.002081,106,5
99254,25683,512,1.001077,100,3
99255,25683,529,1.001027,2,1
99256,25683,566,0.999374,2,2


## 2 i) Feature Engineering (TRADE DATA)


In [262]:
#Feature 1 -> Relative volume by 10min bucket

#1. Finding average size (volume) across all time_id's

avg_trade_volume_stock_id_0 = trade_train_stock_id_0.groupby(['time_id']).sum()["size"].mean() #this groups by time id, gets the total sum of the size, then it gets the mean across all time buckets 
 
#2. Get size (volume) of time_id = 5
trade_volume_stock_id_0_time_id_5 = trade_train_stock_id_0[trade_train_stock_id_0['time_id'] == 5]["size"].sum() #gives me the volume of time bucket 5

#3. Compute relative volume
rel_trade_volume_stock_id_0_time_id_5 = trade_volume_stock_id_0_time_id_5/avg_trade_volume_stock_id_0
rel_trade_volume_stock_id_0_time_id_5

0.9872250681883579

In [263]:
#Feature 2 -> Relative price range 

#1. Find min and max of trade price for all seperate time_id. 
min_trade_price_stock_id_0 = trade_train_stock_id_0.groupby(['time_id']).min()
max_trade_price_stock_id_0 = trade_train_stock_id_0.groupby(['time_id']).max()
#2. Find range of trade price and median of trade price for all seperate time_id. 
range_trade_price_stock_id_0 = max_trade_price_stock_id_0 - min_trade_price_stock_id_0 
median_trade_price_stock_id_0 = trade_train_stock_id_0.groupby(['time_id']).median()
#3. Use median to compute how much percent below our minimum is for all seperate time_id. 
lower_percent_range_relative_to_median = (median_trade_price_stock_id_0 - min_trade_price_stock_id_0)/median_trade_price_stock_id_0
#4. Use median to compute how much percent above our maximum is for all seperate time_id. 
upper_percent_range_relative_to_median = (max_trade_price_stock_id_0 - median_trade_price_stock_id_0)/median_trade_price_stock_id_0
#5 Add both values to get total percent range. E.g. 3% below median and 5% above median = 8% total range for all seperate time_id.
total_percent_range = upper_percent_range_relative_to_median + lower_percent_range_relative_to_median

#6. Compute the average percent range across all time id's
avg_total_percent_range = total_percent_range["price"].mean()

#7. Get total percent range for time id 5
min_price_stock_id_0_time_id_5 = trade_train_stock_id_0[trade_train_stock_id_0['time_id'] == 5]["price"].min()
max_price_stock_id_0_time_id_5 = trade_train_stock_id_0[trade_train_stock_id_0['time_id'] == 5]["price"].max()

range_stock_id_0_time_id_5 = max_price_stock_id_0_time_id_5 - min_price_stock_id_0_time_id_5 #gives me the range of price for time bucket 5
median_stock_id_0_time_id_5 = trade_train_stock_id_0[trade_train_stock_id_0['time_id'] == 5]["price"].median()

lower_percent_range_relative_to_median_stock_id_0_time_id_5 = (median_stock_id_0_time_id_5 - min_price_stock_id_0_time_id_5)/median_stock_id_0_time_id_5

upper_percent_range_relative_to_median_stock_id_0_time_id_5= (max_price_stock_id_0_time_id_5 - median_stock_id_0_time_id_5)/median_stock_id_0_time_id_5

total_percent_range_stock_id_0_time_id_5 = lower_percent_range_relative_to_median_stock_id_0_time_id_5 + upper_percent_range_relative_to_median_stock_id_0_time_id_5

#8. Compute relative percent trading range to average
rel_total_percent_range_stock_id_0_time_id_5 = total_percent_range_stock_id_0_time_id_5/avg_total_percent_range
rel_total_percent_range_stock_id_0_time_id_5

0.70972

In [265]:
#Feature 3 -> Time between execution --> size/second 

## 2 ii) Feature Engineering (BOOK DATA)


In [266]:
###Feature Engineering for ORDER BOOK data a###
book_train_stock_id_0

Unnamed: 0,time_id,seconds_in_bucket,bid_price1,ask_price1,bid_price2,ask_price2,bid_size1,ask_size1,bid_size2,ask_size2
0,5,0,1.001422,1.002301,1.001370,1.002353,3,226,2,100
1,5,1,1.001422,1.002301,1.001370,1.002353,3,100,2,100
2,5,5,1.001422,1.002301,1.001370,1.002405,3,100,2,100
3,5,6,1.001422,1.002301,1.001370,1.002405,3,126,2,100
4,5,7,1.001422,1.002301,1.001370,1.002405,3,126,2,100
...,...,...,...,...,...,...,...,...,...,...
736183,25683,582,0.998323,1.001277,0.998172,1.001778,1,106,100,25
736184,25683,584,0.998773,1.001277,0.998323,1.001778,25,106,1,25
736185,25683,586,0.998923,1.001277,0.998773,1.001778,2,6,25,25
736186,25683,595,0.998923,1.001277,0.998773,1.001778,2,4,25,25


In [267]:
# Compute the first weighted averaged price for each seconds_in_bucket and time ID
 
#book_train_stock_id_0["wap1"] = book_train_stock_id_0.apply(wap1,axis=1)
denom1 = book_train_stock_id_0["ask_size1"] + book_train_stock_id_0["bid_size1"]
volprice1 = book_train_stock_id_0["bid_price1"] * book_train_stock_id_0["ask_size1"] + book_train_stock_id_0["ask_price1"] * book_train_stock_id_0["bid_size1"]
book_train_stock_id_0["wap1"] = volprice1/denom1

In [268]:
# Compute the second weighted averaged price for each seconds_in_bucket and time ID 

#book_train_stock_id_0.loc[:, "wap2"] = book_train_stock_id_0.apply(wap2,axis=1)
denom2 = book_train_stock_id_0["ask_size2"] + book_train_stock_id_0["bid_size2"]
volprice2 = book_train_stock_id_0["bid_price2"] * book_train_stock_id_0["ask_size2"] + book_train_stock_id_0["ask_price2"] * book_train_stock_id_0["bid_size2"]
book_train_stock_id_0["wap2"] = volprice2/denom2

In [269]:
# Compute the avg weighted price using both wap1 and wap2 for each seconds_in_bucket and time ID 

#book_train_stock_id_0.loc[:, "log_avg_wap"] = book_train_stock_id_0.apply(log_avg_wap,axis=1)
book_train_stock_id_0["avg_wap"] = (book_train_stock_id_0["wap1"] + book_train_stock_id_0["wap2"])/2

In [270]:
#Theory, the bigger the spread the higher the volatility!!!
#Getting spread ratio's of 1 and 2
spread_ratio_1 = book_train_stock_id_0["ask_price1"]/book_train_stock_id_0["bid_price1"]
book_train_stock_id_0["spread_ratio_1"] = spread_ratio_1
spread_ratio_2 = book_train_stock_id_0["ask_price2"]/book_train_stock_id_0["bid_price2"]
book_train_stock_id_0["spread_ratio_2"] = spread_ratio_2

In [251]:
#Compute volume imbalance as an average ratio (supply/demand) per time_id
total_bid_size = book_train_stock_id_0["bid_size1"] + book_train_stock_id_0["bid_size2"]
total_ask_size = book_train_stock_id_0["ask_size1"] + book_train_stock_id_0["ask_size2"]
book_train_stock_id_0["vol_imbalance"] = total_ask_size/total_bid_size
#Finding the average volume imbalance by time_id
vol_imbalance = book_train_stock_id_0.groupby(['time_id']).mean()["vol_imbalance"]
book_train_stock_id_0.groupby(['time_id']).mean()

#how much it deviates from a ratio of 1:1 means its more significant a ratio of 10 is the same significance as a ratio of 0.1? So maybe we standardize it and say if ratio <1 do 1/ratio... so ratios like 0.1 can be changed to 10!!!


Unnamed: 0_level_0,seconds_in_bucket,bid_price1,ask_price1,bid_price2,ask_price2,bid_size1,ask_size1,bid_size2,ask_size2,vol_imbalance
time_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
5,293.241722,1.003314,1.004169,1.003139,1.004320,78.264901,74.579470,80.880795,89.771523,10.392625
11,332.590000,1.000011,1.000406,0.999870,1.000541,149.965000,71.145000,95.445000,94.895000,1.303403
16,253.712766,0.999204,0.999929,0.999007,1.000127,96.132979,131.037234,114.526596,74.654255,2.440987
31,268.791667,0.998445,0.999304,0.998255,0.999413,114.458333,120.800000,68.783333,131.225000,2.360404
62,292.909091,0.999407,0.999804,0.999216,0.999913,119.823864,88.477273,87.840909,47.079545,1.840490
...,...,...,...,...,...,...,...,...,...,...
25653,334.255639,0.998724,0.999497,0.998610,0.999707,63.067669,126.180451,28.278195,96.045113,39.953396
25654,291.412935,1.001351,1.001822,1.001223,1.002018,243.044776,66.860697,159.457711,72.313433,0.561149
25668,315.901899,0.996259,0.996981,0.996093,0.997210,66.433544,86.743671,53.768987,73.598101,21.204706
25680,323.354545,0.999198,0.999477,0.999055,0.999563,34.350000,93.604545,75.554545,63.727273,5.239328


## 3 Data Exploration (e.g. correlation)


In [271]:
#Exploring previous realized vol against target next 10 min for stock id 0
target_train = pd.read_parquet("target_data/target_train.parquet")
target_train_stock_id_0 = target_train[target_train["stock_id"] == 0]
target_train_stock_id_0

Unnamed: 0,stock_id,time_id,target
0,0,5,0.004136
1,0,11,0.001445
2,0,16,0.002168
3,0,31,0.002195
4,0,62,0.001747
...,...,...,...
3060,0,25653,0.002747
3061,0,25654,0.002667
3062,0,25668,0.004750
3063,0,25680,0.001235


In [272]:
book_train_stock_id_0.loc[:, 'log_return'] = log_return(book_train_stock_id_0["avg_wap"])
#just extract all rows that aren't null...
book_train_stock_id_0 = book_train_stock_id_0[~book_train_stock_id_0['log_return'].isnull()]


In [273]:
book_train_stock_id_0_time_id_5 = book_train_stock_id_0[book_train_stock_id_0['time_id']==5]
book_train_stock_id_0_time_id_5

Unnamed: 0,time_id,seconds_in_bucket,bid_price1,ask_price1,bid_price2,ask_price2,bid_size1,ask_size1,bid_size2,ask_size2,wap1,wap2,avg_wap,spread_ratio_1,spread_ratio_2,log_return
1,5,1,1.001422,1.002301,1.001370,1.002353,3,100,2,100,1.001448,1.001390,1.001419,1.000878,1.000981,7.034724e-06
2,5,5,1.001422,1.002301,1.001370,1.002405,3,100,2,100,1.001448,1.001391,1.001419,1.000878,1.001033,5.065051e-07
3,5,6,1.001422,1.002301,1.001370,1.002405,3,126,2,100,1.001443,1.001391,1.001417,1.000878,1.001033,-2.576881e-06
4,5,7,1.001422,1.002301,1.001370,1.002405,3,126,2,100,1.001443,1.001391,1.001417,1.000878,1.001033,0.000000e+00
5,5,11,1.001422,1.002301,1.001370,1.002405,3,100,2,100,1.001448,1.001391,1.001419,1.000878,1.001033,2.576881e-06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
297,5,585,1.003129,1.003749,1.003025,1.003801,100,3,26,3,1.003731,1.003721,1.003726,1.000619,1.000773,4.482279e-04
298,5,586,1.003129,1.003749,1.002612,1.003801,100,3,2,3,1.003731,1.003087,1.003409,1.000619,1.001186,-3.155930e-04
299,5,587,1.003129,1.003749,1.003025,1.003801,100,3,26,3,1.003731,1.003721,1.003726,1.000619,1.000773,3.155930e-04
300,5,588,1.003129,1.003749,1.002612,1.003801,100,3,2,3,1.003731,1.003087,1.003409,1.000619,1.001186,-3.155930e-04


In [274]:
fig = px.line(book_train_stock_id_0_time_id_5, x="seconds_in_bucket", y="log_return", title='Log return of stock_id_0, time_id_5')
fig.show()

In [276]:
realized_vol = realized_volatility(book_train_stock_id_0_time_id_5['log_return'])
realized_vol

0.0041146791172599435

In [118]:
target_train_stock_id_0

Unnamed: 0,stock_id,time_id,target
0,0,5,0.004136
1,0,11,0.001445
2,0,16,0.002168
3,0,31,0.002195
4,0,62,0.001747
...,...,...,...
3060,0,25653,0.002747
3061,0,25654,0.002667
3062,0,25668,0.004750
3063,0,25680,0.001235


In [132]:
 book_train_stock_id_0

Unnamed: 0,time_id,seconds_in_bucket,bid_price1,ask_price1,bid_price2,ask_price2,bid_size1,ask_size1,bid_size2,ask_size2,wap1,wap2,avg_wap,vol_imbalance,log_return
1,5,1,1.001422,1.002301,1.001370,1.002353,3,100,2,100,1.001448,1.001390,1.001419,40.000000,7.034724e-06
2,5,5,1.001422,1.002301,1.001370,1.002405,3,100,2,100,1.001448,1.001391,1.001419,40.000000,5.065051e-07
3,5,6,1.001422,1.002301,1.001370,1.002405,3,126,2,100,1.001443,1.001391,1.001417,45.200000,-2.576881e-06
4,5,7,1.001422,1.002301,1.001370,1.002405,3,126,2,100,1.001443,1.001391,1.001417,45.200000,0.000000e+00
5,5,11,1.001422,1.002301,1.001370,1.002405,3,100,2,100,1.001448,1.001391,1.001419,40.000000,2.576881e-06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
736183,25683,582,0.998323,1.001277,0.998172,1.001778,1,106,100,25,0.998350,1.001057,0.999703,1.297030,-5.362708e-07
736184,25683,584,0.998773,1.001277,0.998323,1.001778,25,106,1,25,0.999251,0.998455,0.998853,5.038462,-8.507011e-04
736185,25683,586,0.998923,1.001277,0.998773,1.001778,2,6,25,25,0.999512,1.000275,0.999894,1.148148,1.040993e-03
736186,25683,595,0.998923,1.001277,0.998773,1.001778,2,4,25,25,0.999708,1.000275,0.999992,1.074074,9.806272e-05


In [277]:
#Is previous volatility related to next 10 min vol???!!!


realized_vol_id_0 = book_train_stock_id_0.groupby("time_id")["log_return"].apply(realized_volatility)
realized_vol_id_0
vol_id_0_df = pd.DataFrame(columns=[target_train_stock_id_0["target"], realized_vol_id_0])
realized_vol_id_0
#target_train_stock_id_0["target"]
vol_id_0_df = vol_id_0_df.T
vol_id_0_df
vol_id_0_df.reset_index(inplace=True)
vol_id_0_df = vol_id_0_df.rename(columns={"target": "Target Volatility", "log_return": "Realized Volatility"})
vol_id_0_df

Unnamed: 0,Target Volatility,Realized Volatility
0,0.004136,0.004115
1,0.001445,0.004048
2,0.002168,0.002876
3,0.002195,0.003486
4,0.001747,0.003098
...,...,...
3060,0.002747,0.002566
3061,0.002667,0.003456
3062,0.004750,0.007960
3063,0.001235,0.004866


In [278]:
fig = px.scatter(vol_id_0_df, x='Realized Volatility', y='Target Volatility', title='Target Volatility vs Realized Volatility')
fig.update_yaxes(nticks=20)
fig.update_xaxes(nticks=20)
fig.update_layout(xaxis_range=[0, 0.03])

fig.show()

In [279]:
vol_id_0_df.corr()
#It appears that we have a strong positive correlation between realized volatility and target volatility of the prev 10 min? Is this random? Lets check the other 10 min intervals!!!!!
#WAP_AVG to calculate realized vol of first 10 mins
#WAP1 might be better? 

Unnamed: 0,Target Volatility,Realized Volatility
Target Volatility,1.0,0.608736
Realized Volatility,0.608736,1.0


In [280]:
vol_id_0_df_shifted = vol_id_0_df.copy()
vol_id_0_df_shifted["Realized Volatility"] = vol_id_0_df_shifted["Realized Volatility"].shift(1)
vol_id_0_df_shifted.corr()
#Clearly future time series data has no correlation with previous time series data... Only the prveious 10 min and next 10 min are correlaated. Past data has no effect on future 

Unnamed: 0,Target Volatility,Realized Volatility
Target Volatility,1.0,-0.033637
Realized Volatility,-0.033637,1.0


In [5]:
trade_train_stock_id_0 = pd.read_parquet("stock_trade_train/stock_50_train.parquet")
trade_train_stock_id_0

Unnamed: 0,time_id,seconds_in_bucket,price,size,order_count
0,5,4,1.000876,10,1
1,5,6,1.000613,102,2
2,5,7,1.000533,1,1
3,5,9,1.000686,200,3
4,5,16,1.000743,237,5
...,...,...,...,...,...
784837,25683,593,0.985198,600,7
784838,25683,594,0.985198,294,2
784839,25683,596,0.985027,1275,23
784840,25683,597,0.985429,300,3
