In [None]:
import pandas as pd
import numpy as np 
import glob
import os
import matplotlib.pyplot as plt 
import lightgbm as lgbm
import warnings
import seaborn as sns


### Data Preprocessing

In [None]:
# Split the target table (train.csv) into a train part and a test part using
# a single time_id cut-off, then persist both parts as parquet files.
train = pd.read_csv("optiver_raw_data/train.csv")

# Cut-off value: whatever sits in row 18384, column 1 (zero-based) of train.csv.
# NOTE(review): the variable name implies this row marks the first ~80% of the
# time_ids; the row index is hard-coded, so confirm it against the data.
time_id_of_first_80_percent = train.iloc[18384, 1]

# Collect the per-stock slices in lists and concatenate once at the end;
# growing a DataFrame with pd.concat inside the loop copies it every iteration.
train_parts = []
test_parts = []
for i in range(127):
    stock_i = train[train["stock_id"] == i]

    # time_ids at or below the cut-off go to train, strictly greater go to test.
    stock_train = stock_i[stock_i['time_id'] <= time_id_of_first_80_percent]
    stock_test = stock_i[stock_i['time_id'] > time_id_of_first_80_percent]

    train_parts.append(stock_train)
    test_parts.append(stock_test)

df_train = pd.concat(train_parts)
df_test = pd.concat(test_parts)

# Make sure the output directory exists so the cell works on a fresh checkout.
os.makedirs('target_data', exist_ok=True)
df_train.to_parquet('target_data/target_train.parquet')
df_test.to_parquet('target_data/target_test.parquet')

# Split every stock's raw trade file at the same time_id cut-off used for the
# targets and write one train file and one test file per stock.
# NOTE(review): book_predictors() reads 'stock_book_<split>/stock_<id>_<split>.parquet',
# which nothing in this cell produces - confirm those files are created elsewhere.
os.makedirs('stock_trade_train', exist_ok=True)
os.makedirs('stock_trade_test', exist_ok=True)

for i in range(127):
    filename = f"optiver_raw_data/trade_train.parquet/stock_id={i}"
    # Stock ids without a raw trade partition are skipped.
    if not os.path.exists(filename):
        continue
    trade_current_stock = pd.read_parquet(filename)

    in_train = trade_current_stock['time_id'] <= time_id_of_first_80_percent
    stock_train = trade_current_stock[in_train]
    stock_train.to_parquet(f'stock_trade_train/stock_{i}_train.parquet')

    in_test = trade_current_stock['time_id'] > time_id_of_first_80_percent
    stock_test = trade_current_stock[in_test]
    stock_test.to_parquet(f'stock_trade_test/stock_{i}_test.parquet')

### Helper Functions

In [None]:
def wap(df):
    """Weighted average price computed from the level-1 quotes in ``df``.

    Each side's price is weighted by the size on the opposite side and the
    sum is divided by the combined level-1 size.
    """
    bid_leg = df['bid_price1'] * df['ask_size1']
    ask_leg = df['ask_price1'] * df['bid_size1']
    depth = df['bid_size1'] + df['ask_size1']
    return (bid_leg + ask_leg) / depth

def wap2(df):
    """Weighted average price computed from the level-2 quotes in ``df``.

    Same formula as the level-1 version, applied to the ``*2`` columns.
    """
    bid_leg = df['bid_price2'] * df['ask_size2']
    ask_leg = df['ask_price2'] * df['bid_size2']
    depth = df['bid_size2'] + df['ask_size2']
    return (bid_leg + ask_leg) / depth

def log_return(list_stock_prices):
    """Return the difference between consecutive log prices.

    The input must offer ``.diff()`` after ``np.log`` is applied to it (for
    example a pandas Series); the first element of the result has no
    predecessor and is therefore NaN.
    """
    log_prices = np.log(list_stock_prices)
    return log_prices.diff()

def realized_volatility(returns):
    """Square root of the sum of the squared ``returns``."""
    squared = returns ** 2
    total = np.sum(squared)
    return np.sqrt(total)

def count_unique(series):
    """Number of distinct values in ``series``."""
    distinct = np.unique(series)
    return distinct.size

def isValidStock(i):
    """Tell whether the train-split trade file for stock ``i`` exists on disk."""
    return os.path.exists(f"stock_trade_train/stock_{i}_train.parquet")

def RMSPEMetric():
    """Create the custom evaluation callable that is passed as ``feval``.

    The returned function takes the predictions and a dataset object exposing
    ``get_label()`` and yields the tuple ``('RMSPE', value, False)``, where
    ``value`` is the root mean squared percentage error and the trailing
    ``False`` is the "higher is better" flag.
    """

    def RMSPE(y_hat, dtrain):
        y = dtrain.get_label()
        relative_error = (y - y_hat) / y
        mean_squared = np.sum(relative_error ** 2) / len(y)
        score = float(np.sqrt(mean_squared))
        return 'RMSPE', score, False

    return RMSPE

def RMSPE(y_hat, y):
    """Root mean squared percentage error of predictions ``y_hat`` against ``y``."""
    relative_error = (y - y_hat) / y
    mean_squared = np.sum(relative_error ** 2) / len(y)
    return float(np.sqrt(mean_squared))

### Getting Dataset

In [None]:
def book_predictors(stock_id, train_or_test):
    """Build one row of order-book features per ``time_id`` for a single stock.

    Parameters
    ----------
    stock_id : value interpolated into the file name via ``str()``.
    train_or_test : str
        Split name used in both the directory and the file name
        (``stock_book_<split>/stock_<id>_<split>.parquet``).

    Returns
    -------
    pandas.DataFrame
        A ``time_id`` column followed by one ``<feature>_<aggregation>`` column
        per entry of the aggregation dictionary below.
    """
    stock_data = pd.read_parquet(
        'stock_book_' + train_or_test + '/stock_' + str(stock_id) + '_' + train_or_test + '.parquet'
    )

    # NOTE: a filter keeping only rows with seconds_in_bucket >= 300 was tried
    # here and is disabled; every row of each bucket is used.

    stock_data['wap'] = wap(stock_data)
    # transform() always returns a result aligned with the original rows.
    # groupby(...).apply() prepends the group key to the index on pandas >= 2.0,
    # which makes the column assignment fail.
    stock_data['log_return'] = stock_data.groupby('time_id')['wap'].transform(log_return)
    stock_data['wap2'] = wap2(stock_data)
    stock_data['log_return2'] = stock_data.groupby('time_id')['wap2'].transform(log_return)
    stock_data['wap_offset'] = abs(stock_data['wap'] - stock_data['wap2'])
    # Level-1 spread divided by the level-1 mid price.
    stock_data['price_spread'] = (stock_data['ask_price1'] - stock_data['bid_price1']) / ((stock_data['ask_price1'] + stock_data['bid_price1']) / 2)
    stock_data['bid_spread'] = stock_data['bid_price1'] - stock_data['bid_price2']
    stock_data['ask_spread'] = stock_data['ask_price1'] - stock_data['ask_price2']
    stock_data['total_volume'] = (stock_data['ask_size1'] + stock_data['ask_size2']) + (stock_data['bid_size1'] + stock_data['bid_size2'])
    stock_data['volume_imbalance'] = abs((stock_data['ask_size1'] + stock_data['ask_size2']) - (stock_data['bid_size1'] + stock_data['bid_size2']))

    # Per-time_id aggregations. 'mean' yields the same column label as np.mean
    # and avoids pandas' deprecation warning for NumPy callables in agg().
    create_feature_dict = {
            'log_return':[realized_volatility],
            'log_return2':[realized_volatility],
            'wap_offset':['mean'],
            'price_spread':['mean'],
            'bid_spread':['mean'],
            'ask_spread':['mean'],
            'volume_imbalance':['mean'],
            'total_volume':['mean'],
            'wap':['mean'],
    }

    result = stock_data.groupby(['time_id']).agg(create_feature_dict).reset_index()
    # Flatten the two-level (column, aggregation) labels into 'column_aggregation';
    # strip('_') removes the trailing underscore left on 'time_id'.
    result.columns = result.columns.map('_'.join).str.strip('_')

    return result

def trade_predictors(stock_id, train_or_test):
    """Build one row of trade features per ``time_id`` for a single stock.

    Parameters
    ----------
    stock_id : value interpolated into the file name via ``str()`` and stored
        in the ``stock_id`` column of the result.
    train_or_test : str
        Split name used in both the directory and the file name
        (``stock_trade_<split>/stock_<id>_<split>.parquet``).

    Returns
    -------
    pandas.DataFrame
        ``time_id``, the aggregated columns (labelled ``<column>_<aggregation>``),
        the derived ratio columns and ``stock_id``.
    """
    stock_data = pd.read_parquet('stock_trade_' + train_or_test + '/stock_' + str(stock_id) + '_' + train_or_test + '.parquet')

    # transform() keeps the result aligned with the original rows.
    # groupby(...).apply() prepends the group key to the index on pandas >= 2.0,
    # which makes the column assignment fail.
    stock_data['log_return'] = stock_data.groupby('time_id')['price'].transform(log_return)

    # Three copies of the price column, one per aggregation applied below.
    stock_data['price_max'] = stock_data["price"]
    stock_data['price_min'] = stock_data["price"]
    stock_data['price_median'] = stock_data["price"]

    # The aggregation callables are left untouched on purpose: the output column
    # labels are derived from them and must not change.
    aggregate_dictionary = {
        'log_return':[realized_volatility],
        'seconds_in_bucket':[count_unique],
        'size':[np.sum],
        'order_count':[np.sum], 
        'price_max':[np.max],
        'price_min':[np.min],
        'price_median':[np.median],
    }
    # From here until the flattening step the columns are two-level
    # (source column, aggregation name), so result["size"] etc. select a
    # one-column sub-frame rather than a Series.
    result = stock_data.groupby('time_id').agg(aggregate_dictionary).reset_index()
    result["size_per_order"] = result["size"] / result["order_count"] 

    # Traded size relative to the mean traded size over all time_ids in this file.
    avg_trade_volume = result["size"].mean()
    result["rel_trade_volume"] = result["size"] / avg_trade_volume
    # .values drops the labels so max and min, whose aggregation labels differ,
    # are subtracted positionally instead of being aligned by label.
    result['current_percent_range'] = (result["price_max"].values - result["price_min"].values) / result["price_median"]
    avg_percent_range = result['current_percent_range'].mean()
    result['rel_percent_range'] = result['current_percent_range'] /  avg_percent_range
    result['stock_id'] = stock_id

    # Flatten the two-level labels into 'column_aggregation'; strip('_') removes
    # the trailing underscore on columns whose second level is empty.
    result.columns = result.columns.map('_'.join).str.strip('_')
    return result

def target(stock_id, train_or_test):
    """Load the target table of the given split and keep one stock's rows.

    Reads ``target_data/target_<split>.parquet``, selects the rows whose
    ``stock_id`` equals ``stock_id`` and returns them without that column.
    """
    all_targets = pd.read_parquet('target_data/target_' + train_or_test + '.parquet')
    rows_for_stock = all_targets[all_targets['stock_id'] == stock_id]
    return rows_for_stock.drop(columns=['stock_id'])

def generate_data(stock_id, train_or_test):
    """Join targets, book features and trade features for one stock and split.

    Both feature tables are left-joined onto the target rows by ``time_id``;
    rows containing any missing value after the joins are dropped.
    """
    labels = target(stock_id, train_or_test)
    book_features = book_predictors(stock_id, train_or_test)
    trade_features = trade_predictors(stock_id, train_or_test)

    with_book = labels.merge(book_features, on='time_id', how='left')
    with_trades = with_book.merge(trade_features, on='time_id', how='left')
    return with_trades.dropna()

def generate_train_and_test(stock_id):
    """Return ``(X_train, X_test, y_train, y_test)`` for a single stock.

    The feature frames are the generated data without the ``target`` and
    ``time_id`` columns; the label objects are the ``target`` columns.
    """
    non_feature_columns = ['target', 'time_id']
    data_by_split = {split: generate_data(stock_id, split) for split in ('train', 'test')}

    features = {split: frame.drop(columns=non_feature_columns)
                for split, frame in data_by_split.items()}
    labels = {split: frame['target'] for split, frame in data_by_split.items()}

    return features['train'], features['test'], labels['train'], labels['test']

### Getting combined Dataframes

In [None]:
# Assemble the model inputs by stacking the per-stock feature and label sets.
# NOTE: this silences every warning for the remainder of the session.
warnings.filterwarnings("ignore")

# Gather the per-stock pieces in lists and concatenate once afterwards;
# calling pd.concat on the growing result inside the loop copies it each time.
X_train_parts, X_test_parts = [], []
y_train_parts, y_test_parts = [], []

for i in range(127):
    # Only stocks that have a train-split trade file are processed.
    if not isValidStock(i):
        continue

    stock_X_train, stock_X_test, stock_y_train, stock_y_test = generate_train_and_test(i)

    X_train_parts.append(stock_X_train)
    X_test_parts.append(stock_X_test)
    y_train_parts.append(stock_y_train)
    y_test_parts.append(stock_y_test)

if not X_train_parts:
    raise FileNotFoundError(
        "No 'stock_trade_train/stock_<id>_train.parquet' file found for stock ids 0-126"
    )

# ignore_index=True gives each result a fresh 0..n-1 index.
X_train = pd.concat(X_train_parts, ignore_index=True)
X_test = pd.concat(X_test_parts, ignore_index=True)
# The labels are kept as one-column DataFrames whose column is labelled 0,
# because the training cell reads them as y_train[0] / y_test[0].
y_train = pd.concat(y_train_parts, ignore_index=True).to_frame(name=0)
y_test = pd.concat(y_test_parts, ignore_index=True).to_frame(name=0)

### Implementing LGBM

In [None]:
# Train a LightGBM regressor on the stacked features.
params_lgbm = {
        'boosting_type': 'goss',
        'learning_rate': 0.001,
        'objective': 'rmse',
        'n_jobs': 8,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'verbose': -1,
    }

# Weighting every sample by 1 / y**2 turns the weighted squared error
# w * (y - y_hat)**2 into the squared percentage error ((y - y_hat) / y)**2.
train_weights = 1 / (y_train[0] ** 2)
lgbm_train_data = lgbm.Dataset(X_train, label=y_train[0], weight=train_weights)

test_weights = 1 / (y_test[0] ** 2)
lgbm_test_data = lgbm.Dataset(X_test, label=y_test[0], weight=test_weights)

# NOTE(review): the test split doubles as the validation set that drives early
# stopping, so the recorded validation scores are optimistic - confirm this is
# intended.
evals = {}
rounds = 100000
# Logging period and early stopping are passed as callbacks: the
# `verbose_eval` and `early_stopping_rounds` keyword arguments were removed
# from lgbm.train() in LightGBM 4.0.
model = lgbm.train(
    params_lgbm,
    lgbm_train_data,
    rounds,
    valid_sets=[lgbm_test_data],
    feval=RMSPEMetric(),
    categorical_feature=['stock_id'],
    callbacks=[
        lgbm.log_evaluation(period=1000),
        lgbm.early_stopping(stopping_rounds=3000),
        lgbm.record_evaluation(evals),
    ],
)

### RMSPE over time

In [None]:
# Plot the validation RMSPE recorded at every boosting round.
# The custom feval reports its values under the name 'RMSPE'; the previously
# plotted 'l2' series is a different metric from the one named in the title
# and axis label. Direct indexing (instead of .get) fails loudly if the
# series is missing.
rmspe_history = evals['valid_0']['RMSPE']

plt.figure(figsize=(20, 8))
sns.set(font_scale = 2)
sns.lineplot(data=rmspe_history)
plt.title('RMSPE vs. Iterations')
plt.xlabel("Iteration")
plt.ylabel("RMSPE")
plt.show()

### Calculating feature importance

In [None]:
# Bar chart of the model's feature importances, largest first (at most 100 bars).
importance_columns = {'Value': model.feature_importance(), 'Feature': X_train.columns}
feature_imp = pd.DataFrame(importance_columns)
top_features = feature_imp.sort_values(by="Value", ascending=False).iloc[:100]

plt.figure(figsize=(20, 12))
sns.set(font_scale = 2)
sns.barplot(x="Value", y="Feature", data=top_features)
plt.title('LightGBM Feature Importance')
plt.show()