In [185]:
import pandas as pd
import numpy as np 
import glob
import os
import matplotlib.pyplot as plt 
import lightgbm as lgbm 


### Helper Functions

In [186]:
def wap(df):
    """Return the level-1 weighted average price for each row of ``df``.

    Each side's price is weighted by the size quoted on the *opposite* side,
    and the weighted sum is divided by the combined level-1 size.

    Parameters
    ----------
    df : DataFrame-like
        Anything indexable by the column labels ``bid_price1``,
        ``ask_price1``, ``bid_size1`` and ``ask_size1``; the callers in this
        notebook pass a pandas DataFrame.

    Returns
    -------
    The element-wise result of the arithmetic; for a DataFrame input this is
    a Series aligned with ``df``'s index.
    """
    bid_px, ask_px = df['bid_price1'], df['ask_price1']
    bid_sz, ask_sz = df['bid_size1'], df['ask_size1']

    cross_weighted = bid_px * ask_sz + ask_px * bid_sz
    depth = bid_sz + ask_sz
    return cross_weighted / depth

def wap2(df):
    """Return the level-2 weighted average price for each row of ``df``.

    Same formula as the level-1 version, applied to the second quoted level:
    each side's price is weighted by the size on the opposite side and the
    sum is divided by the combined level-2 size.

    Parameters
    ----------
    df : DataFrame-like
        Anything indexable by the column labels ``bid_price2``,
        ``ask_price2``, ``bid_size2`` and ``ask_size2``; the callers in this
        notebook pass a pandas DataFrame.

    Returns
    -------
    The element-wise result of the arithmetic; for a DataFrame input this is
    a Series aligned with ``df``'s index.
    """
    bid_px, ask_px = df['bid_price2'], df['ask_price2']
    bid_sz, ask_sz = df['bid_size2'], df['ask_size2']

    cross_weighted = bid_px * ask_sz + ask_px * bid_sz
    depth = bid_sz + ask_sz
    return cross_weighted / depth

def log_return(list_stock_prices):
    """Return the first difference of the natural log of a price series.

    Parameters
    ----------
    list_stock_prices : pandas.Series
        Prices in sequence. Despite the parameter name, the value must be a
        pandas object, because ``.diff()`` is called on the result of
        ``np.log``.

    Returns
    -------
    pandas.Series
        Same index as the input; the first element is NaN because it has no
        predecessor to difference against.
    """
    log_prices = np.log(list_stock_prices)
    return log_prices.diff()

def realized_volatility(returns):
    """Return the square root of the sum of squared ``returns``.

    Parameters
    ----------
    returns : array-like or pandas.Series
        Return values. When a pandas Series is passed, ``np.sum`` delegates
        to ``Series.sum``, which skips NaN entries (such as the leading NaN
        left by a ``diff``).

    Returns
    -------
    A scalar: ``sqrt(sum(returns ** 2))``.
    """
    squared = returns ** 2
    sum_of_squares = np.sum(squared)
    return np.sqrt(sum_of_squares)

def count_unique(series):
    """Return how many distinct values ``series`` contains.

    Parameters
    ----------
    series : array-like
        Values to de-duplicate with ``np.unique``.

    Returns
    -------
    int
        Length of the de-duplicated array.
    """
    distinct_values = np.unique(series)
    return len(distinct_values)

### Getting Dataset

In [187]:
def book_predictors(stock_id, train_or_test):
    """Build per-``time_id`` order-book features for one stock.

    Reads ``stock_book_<train_or_test>/stock_<stock_id>_<train_or_test>.parquet``
    (relative to the working directory), derives row-level quantities from the
    two quoted price levels, and aggregates them per ``time_id``.

    Parameters
    ----------
    stock_id
        Identifier interpolated (via ``str``) into the parquet file name.
    train_or_test : str
        Split name used in both the directory and the file name; this
        notebook passes ``'train'`` and ``'test'``.

    Returns
    -------
    pandas.DataFrame
        One row per ``time_id``. Columns are two-level
        ``(feature, aggregation)`` labels, e.g.
        ``('log_return', 'realized_volatility')`` and ``('wap', 'mean')``;
        ``time_id`` is restored as the column ``('time_id', '')``.

    Notes
    -----
    * Per-group log returns use ``groupby(...).transform`` rather than
      ``apply``. Since pandas 2.0, ``apply`` prepends the group key to the
      result index, so its output can no longer be assigned back as a column;
      ``transform`` always returns a result indexed like the input.
    * Aggregations are named by string (``'mean'``) instead of ``np.mean``.
      pandas >= 2.1 warns that NumPy callables passed to ``agg`` will stop
      being mapped to the groupby method; the column labels are unchanged.
    """
    stock_data = pd.read_parquet('stock_book_' + train_or_test + '/stock_' + str(stock_id) + '_' + train_or_test + '.parquet')

    # Weighted average price per book level and its log return. The groupby
    # keeps the diff inside each time_id, so the first row of every time_id
    # has a NaN return instead of a jump from the previous bucket.
    stock_data['wap'] = wap(stock_data)
    stock_data['log_return'] = stock_data.groupby('time_id')['wap'].transform(log_return)
    stock_data['wap2'] = wap2(stock_data)
    stock_data['log_return2'] = stock_data.groupby('time_id')['wap2'].transform(log_return)

    # Shared intermediates for the spread and volume features below.
    ask_size = stock_data['ask_size1'] + stock_data['ask_size2']
    bid_size = stock_data['bid_size1'] + stock_data['bid_size2']
    mid_price = (stock_data['ask_price1'] + stock_data['bid_price1']) / 2

    stock_data['wap_offset'] = abs(stock_data['wap'] - stock_data['wap2'])
    # Level-1 quoted spread relative to the level-1 mid price.
    stock_data['price_spread'] = (stock_data['ask_price1'] - stock_data['bid_price1']) / mid_price
    stock_data['bid_spread'] = stock_data['bid_price1'] - stock_data['bid_price2']
    # NOTE(review): level 1 minus level 2, same operand order as bid_spread;
    # if level-2 asks sit above level-1 asks this is negative - confirm the
    # sign is intended.
    stock_data['ask_spread'] = stock_data['ask_price1'] - stock_data['ask_price2']
    stock_data['total_volume'] = ask_size + bid_size
    stock_data['volume_imbalance'] = abs(ask_size - bid_size)

    create_feature_dict = {
        'log_return': [realized_volatility],
        'log_return2': [realized_volatility],
        'wap_offset': ['mean'],
        'price_spread': ['mean'],
        'bid_spread': ['mean'],
        'ask_spread': ['mean'],
        'volume_imbalance': ['mean'],
        'total_volume': ['mean'],
        'wap': ['mean'],
    }

    return stock_data.groupby(['time_id']).agg(create_feature_dict).reset_index()

def trade_predictors(stock_id, train_or_test):
    """Build per-``time_id`` trade features for one stock.

    Reads ``stock_trade_<train_or_test>/stock_<stock_id>_<train_or_test>.parquet``
    (relative to the working directory), computes the log return of ``price``
    within each ``time_id``, and aggregates per ``time_id``.

    Parameters
    ----------
    stock_id
        Identifier interpolated (via ``str``) into the parquet file name.
    train_or_test : str
        Split name used in both the directory and the file name; this
        notebook passes ``'train'`` and ``'test'``.

    Returns
    -------
    pandas.DataFrame
        One row per ``time_id`` with two-level ``(feature, aggregation)``
        column labels: ``('log_return', 'realized_volatility')``,
        ``('seconds_in_bucket', 'count_unique')``, ``('size', 'sum')`` and
        ``('order_count', 'mean')``; ``time_id`` is restored as the column
        ``('time_id', '')``.

    Notes
    -----
    * Per-group log returns use ``groupby(...).transform`` rather than
      ``apply``. Since pandas 2.0, ``apply`` prepends the group key to the
      result index, so its output can no longer be assigned back as a column;
      ``transform`` always returns a result indexed like the input.
    * ``'sum'`` / ``'mean'`` replace ``np.sum`` / ``np.mean``. pandas >= 2.1
      warns that NumPy callables passed to ``agg`` will stop being mapped to
      the groupby method; the column labels are unchanged.
    """
    stock_data = pd.read_parquet('stock_trade_' + train_or_test + '/stock_' + str(stock_id) + '_' + train_or_test + '.parquet')

    # Differenced inside each time_id so a return never spans two buckets.
    stock_data['log_return'] = stock_data.groupby('time_id')['price'].transform(log_return)

    aggregate_dictionary = {
        'log_return': [realized_volatility],
        'seconds_in_bucket': [count_unique],
        'size': ['sum'],
        'order_count': ['mean'],
    }

    return stock_data.groupby('time_id').agg(aggregate_dictionary).reset_index()

def target(stock_id, train_or_test):
    """Load the target table for one split and keep a single stock's rows.

    Reads ``target_data/target_<train_or_test>.parquet`` (relative to the
    working directory), keeps the rows whose ``stock_id`` column equals
    ``stock_id``, and drops that column from the result.

    Parameters
    ----------
    stock_id
        Value compared against the ``stock_id`` column.
    train_or_test : str
        Split name interpolated into the file name.

    Returns
    -------
    pandas.DataFrame
        The matching rows with their original index labels and every column
        except ``stock_id``.
    """
    all_targets = pd.read_parquet('target_data/target_' + train_or_test + '.parquet')
    belongs_to_stock = all_targets['stock_id'] == stock_id
    return all_targets.loc[belongs_to_stock].drop(columns=['stock_id'])

def _flatten_feature_columns(features, prefix, key='time_id'):
    """Return a copy of ``features`` with single-level, ``prefix``-qualified column labels."""
    flat = features.copy()
    labels = []
    for column in flat.columns:
        # Multi-level labels arrive as tuples such as ('wap', 'mean') or
        # ('time_id', ''); empty levels are dropped before joining.
        parts = column if isinstance(column, tuple) else (column,)
        name = '_'.join(str(part) for part in parts if str(part) != '')
        labels.append(name if name == key else prefix + '_' + name)
    flat.columns = labels
    return flat


def generate_data(stock_id, train_or_test):
    """Assemble the modelling frame for one stock and one data split.

    Left-joins the order-book features and then the trade features onto the
    stock's target rows, matching on ``time_id``.

    Parameters
    ----------
    stock_id
        Stock identifier forwarded to ``target``, ``book_predictors`` and
        ``trade_predictors``.
    train_or_test : str
        Split name forwarded to the same three functions.

    Returns
    -------
    pandas.DataFrame
        One row per target row, in the target's order. Columns are the
        target's own columns followed by the book features and the trade
        features. Feature labels are flattened to single-level strings of the
        form ``book_<feature>_<aggregation>`` /
        ``trade_<feature>_<aggregation>`` (for example
        ``book_log_return_realized_volatility``).

    Notes
    -----
    The feature builders return two-level column labels while the target
    frame has single-level labels. pandas >= 2.0 refuses to merge frames
    whose column indexes have different numbers of levels (``MergeError``),
    and both feature frames contain a ``('log_return',
    'realized_volatility')`` column that would collide. Flattening and
    prefixing the feature labels before merging resolves both problems.
    NOTE(review): any later cell that addressed the merged columns by their
    previous tuple labels needs to use the flattened names instead.
    """
    targets = target(stock_id, train_or_test)
    book = _flatten_feature_columns(book_predictors(stock_id, train_or_test), 'book')
    trade = _flatten_feature_columns(trade_predictors(stock_id, train_or_test), 'trade')

    result = pd.merge(targets, book, on='time_id', how='left')
    result = pd.merge(result, trade, on='time_id', how='left')
    return result

### Setting up the training and test sets

In [188]:
# Build the merged target + feature frames for stock 0, one per data split.
train_0 = generate_data(stock_id=0, train_or_test='train')
test_0 = generate_data(stock_id=0, train_or_test='test')
