In [1]:
import pandas as pd
import numpy as np

from itertools import groupby
import lightgbm as lgb
import gc
from itertools import combinations
import plotly.express as px
from sklearn.metrics import mean_absolute_error
import numba
from numba import jit, njit, prange
import time

import warnings
warnings.filterwarnings("ignore")
from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

In [2]:
def reduce_mem_usage(df, verbose=0):
    """
    Iterate through all numeric columns of a dataframe and modify the data type
    to reduce memory usage.
    """

    start_mem = df.memory_usage().sum() / 1024**2

    for col in df.columns:
        col_type = df[col].dtype

        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == "int":
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float32)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float32)

    if verbose:
        print(f"Memory usage of dataframe is {start_mem:.2f} MB")
        end_mem = df.memory_usage().sum() / 1024**2
        print(f"Memory usage after optimization is: {end_mem:.2f} MB")
        decrease = 100 * (start_mem - end_mem) / start_mem
        print(f"Decreased by {decrease:.2f}%")

    return df

In [3]:
train = pd.read_csv('../data/train.csv')
# revealed_targets = pd.read_csv('../data/revealed_targets.csv')
# test = pd.read_csv('../data/test.csv')
# sample_submission = pd.read_csv('../data/sample_submission.csv')
train.dropna(subset=['target'], inplace=True)
train['bucket'] = train['date_id']//97 # used for cv

# try the stock info
# stock_info = pd.read_csv('../data/sectornames_and_marketcap.csv')[['stock_id', 'sectorname']]

# for col in ['sectorname']:
#     le = LabelEncoder()
#     stock_info[col] = le.fit_transform(stock_info[col])

# train = train.merge(stock_info, on='stock_id', how='left')

train = reduce_mem_usage(train)

In [4]:
features = ['seconds_in_bucket', 'imbalance_size', 'far_price', 'near_price', 'bid_price', 'ask_size', 'global_median_size', 
            'global_median_bid_size', 'global_median_ask_size', 'global_median_imbalance_size', 'global_median_matched_size', 
            'global_mean_imbalance_buy_sell_flag', 'global_std_size', 'global_ptp_size', 'global_median_price', 'global_std_price', 
            'global_std_wap', 'global_ptp_price', 'imbalance', 'imbalance_global_ratio', 'price_pressure', 'depth_pressure', 
            'reference_price_far_price_imb', 'reference_price_near_price_imb', 'reference_price_ask_price_imb', 
            'reference_price_bid_price_imb', 'reference_price_wap_imb', 'reference_price_mid_price_imb', 'far_price_near_price_imb', 
            'far_price_mid_price_imb', 'near_price_ask_price_imb', 'near_price_bid_price_imb', 'near_price_wap_imb', 
            'near_price_mid_price_imb', 'wap_mid_price_imb', 'matched_size_bid_size_imb', 'matched_size_ask_size_imb', 
            'matched_size_imbalance_size_imb', 'matched_size_mid_size_imb', 'bid_size_ask_size_imb', 'bid_size_imbalance_size_imb', 
            'ask_size_imbalance_size_imb', 'imbalance_size_mid_size_imb', 'imbalance_momentum', 'log_return', 'norm_wap', 
            'norm_imbalance_buy_sell_flag', 'norm_log_return', 'ask_price_bid_price_wap_imb2', 'ask_price_wap_reference_price_imb2', 
            'ask_price_reference_price_mid_price_imb2', 'bid_price_wap_reference_price_imb2', 'bid_price_reference_price_mid_price_imb2', 
            'matched_size_bid_size_ask_size_imb2', 'matched_size_bid_size_imbalance_size_imb2', 'matched_size_bid_size_mid_size_imb2', 
            'matched_size_ask_size_imbalance_size_imb2', 'matched_size_ask_size_mid_size_imb2', 'matched_size_imbalance_size_mid_size_imb2', 
            'bid_size_ask_size_imbalance_size_imb2', 'all_sizes_mean', 'all_prices_std', 'all_sizes_std', 'all_prices_skew', 'all_sizes_skew', 
            'all_prices_kurt', 'all_sizes_kurt', 'all_sizes_max', 'wap_rank', 'imbalance_buy_sell_flag_rank', 'seconds', 'minute', 
            'matched_size_first', 'matched_size_first_ratio', 'imbalance_size_first', 'imbalance_size_first_ratio', 'ask_size_first', 
            'ask_size_first_ratio', 'bid_size_first', 'imbalance_buy_sell_flag_cumsum', 'imbalance_buy_sell_flag_cummean', 'rsi_cumsum', 
            'rsi_cummean', 'wap_mid_price_imb_rank', 'imbalance_global_ratio_rank', 'matched_global_ratio_rank', 'ask_size_global_ratio_rank', 
            'market_imbalance_buy_sell_flag_rank', 'imbalance_buy_sell_flag_cumsum_rank', 'matched_size_ret_1', 'matched_size_ret_2', 
            'matched_size_ret_10', 'imbalance_size_ret_1', 'imbalance_size_ret_2', 'imbalance_size_ret_3', 'imbalance_size_ret_6', 
            'imbalance_size_ret_10', 'reference_price_ret_1', 'reference_price_ret_2', 'reference_price_ret_3', 'reference_price_ret_10', 
            'wap_ret_1', 'ask_price_ret_1', 'bid_price_ret_1', 'bid_price_ret_10', 'ask_size_ret_1', 'bid_size_ret_1', 'bid_size_ret_2', 
            'bid_size_ret_3', 'wap_rolling_std_3', 'wap_rolling_std_10', 'imbalance_buy_sell_flag_rolling_mean_3', 
            'imbalance_buy_sell_flag_rolling_std_3', 'imbalance_buy_sell_flag_rolling_mean_6', 'imbalance_buy_sell_flag_rolling_mean_10', 
            'imbalance_buy_sell_flag_rolling_std_10', 'imbalance_size_rolling_mean_3', 'imbalance_size_rolling_mean_10', 
            'matched_size_rolling_std_10', 'norm_wap_rolling_mean_6', 'norm_wap_rolling_mean_10', 'rsi_rolling_mean_3', 'rsi_rolling_mean_10', 
            'matched_size_ema', 'imbalance_size_ema', 'reference_price_wap_imb_shift_3', 'reference_price_wap_imb_shift_4', 
            'reference_price_wap_imb_shift_5', 'reference_price_wap_imb_shift_6', 'norm_wap_shift_1', 'norm_wap_shift_3', 'norm_wap_shift_5', 
            'imbalance_buy_sell_flag_shift_1', 'imbalance_buy_sell_flag_shift_2', 'imbalance_buy_sell_flag_shift_3', 'wap_rank_shift_1', 
            'wap_rank_shift_3', 'wap_rank_shift_4', 'wap_rank_shift_5', 'imbalance_buy_sell_flag_rank_shift_1', 
            'imbalance_buy_sell_flag_rank_shift_2', 'imbalance_buy_sell_flag_rank_shift_4', 'imbalance_buy_sell_flag_rank_shift_6', 
            'norm_log_return_shift_1', 'norm_log_return_shift_2', 'norm_log_return_shift_3', 'norm_log_return_shift_4', 
            'norm_log_return_shift_5', 'norm_log_return_shift_6', 'shifted_1_imbalance_buy_sell_flag', 
            'shifted_1_imbalance_buy_sell_flag_rank', 'shifted_1_target', 'shifted_2_imbalance_buy_sell_flag_rank', 
            'shifted_2_reference_price_wap_imb', 'shifted_2_target', 'shifted_1_imbalance_buy_sell_flag_cumsum', 'shifted_1_rsi_cumsum', 
            'shifted_2_rsi_cumsum', 'shifted_1_imbalance_cumsum']



In [5]:
def match_weights(df):
    weights = [
    0.004, 0.001, 0.002, 0.006, 0.004, 0.004, 0.002, 0.006, 0.006, 0.002, 0.002, 0.008,
    0.006, 0.002, 0.008, 0.006, 0.002, 0.006, 0.004, 0.002, 0.004, 0.001, 0.006, 0.004,
    0.002, 0.002, 0.004, 0.002, 0.004, 0.004, 0.001, 0.001, 0.002, 0.002, 0.006, 0.004,
    0.004, 0.004, 0.006, 0.002, 0.002, 0.04 , 0.002, 0.002, 0.004, 0.04 , 0.002, 0.001,
    0.006, 0.004, 0.004, 0.006, 0.001, 0.004, 0.004, 0.002, 0.006, 0.004, 0.006, 0.004,
    0.006, 0.004, 0.002, 0.001, 0.002, 0.004, 0.002, 0.008, 0.004, 0.004, 0.002, 0.004,
    0.006, 0.002, 0.004, 0.004, 0.002, 0.004, 0.004, 0.004, 0.001, 0.002, 0.002, 0.008,
    0.02 , 0.004, 0.006, 0.002, 0.02 , 0.002, 0.002, 0.006, 0.004, 0.002, 0.001, 0.02,
    0.006, 0.001, 0.002, 0.004, 0.001, 0.002, 0.006, 0.006, 0.004, 0.006, 0.001, 0.002,
    0.004, 0.006, 0.006, 0.001, 0.04 , 0.006, 0.002, 0.004, 0.002, 0.002, 0.006, 0.002,
    0.002, 0.004, 0.006, 0.006, 0.002, 0.002, 0.008, 0.006, 0.004, 0.002, 0.006, 0.002,
    0.004, 0.006, 0.002, 0.004, 0.001, 0.004, 0.002, 0.004, 0.008, 0.006, 0.008, 0.002,
    0.004, 0.002, 0.001, 0.004, 0.004, 0.004, 0.006, 0.008, 0.004, 0.001, 0.001, 0.002,
    0.006, 0.004, 0.001, 0.002, 0.006, 0.004, 0.006, 0.008, 0.002, 0.002, 0.004, 0.002,
    0.04 , 0.002, 0.002, 0.004, 0.002, 0.002, 0.006, 0.02 , 0.004, 0.002, 0.006, 0.02,
    0.001, 0.002, 0.006, 0.004, 0.006, 0.004, 0.004, 0.004, 0.004, 0.002, 0.004, 0.04,
    0.002, 0.008, 0.002, 0.004, 0.001, 0.004, 0.006, 0.004,
    ]
    match_stock = {}
    for idx, w in enumerate(weights):
        match_stock[idx] = w
    
    df['weight_type'] = df['stock_id'].map(match_stock)
    return df

# cutoff info on seconds_in_bucket
def cut_off(seconds_in_bucket):
    if seconds_in_bucket <= 300:
        return 1
    elif seconds_in_bucket <= 480:
        return 2
    else:
        return 3

train = match_weights(train)
train['cut_off_time'] = train['seconds_in_bucket'].apply(cut_off)

stock_pca = pd.read_csv('../data/principal_components.csv')[['stock_id', 'PC1', 'PC2', 'PC3']]

train = train.merge(stock_pca, on=['stock_id'], how='left')

In [6]:
# third step
def global_features(df):
    """
    TODO: size/median_sizes; useful features' ratio;
    first_sizes and last_sizes looks wired -> can try to remove them; If they are useful, can consider to 
    use daily level to create the difference between them;
    imbalance_momentum can be update to use multiple windows -> check the difference with the rolling features;
    referene_prices over the middle of bid price and ask price;
    If 5 mins features are useful, then we can try to replicate the useful features we have before, same to the cutoff;
    Whether independent features are better than sum features -> test
    """
    global_stock_id_feats = {
        # size related features
        "median_size": df.groupby("stock_id")["bid_size"].median() + df.groupby("stock_id")["ask_size"].median(),
        "median_bid_size": df.groupby("stock_id")['bid_size'].median(),
        "median_ask_size": df.groupby("stock_id")['ask_size'].median(),
        "median_imbalance_size": df.groupby("stock_id")['imbalance_size'].median(),
        "median_matched_size": df.groupby("stock_id")['matched_size'].median(),
        "mean_imbalance_buy_sell_flag": df.groupby("stock_id")['imbalance_buy_sell_flag'].mean(),
        "std_wap": df.groupby("stock_id")["wap"].std(),
        
        "std_size": df.groupby("stock_id")["bid_size"].std() + df.groupby("stock_id")["ask_size"].std(),
        "max_sizes": df.groupby('stock_id')['bid_size'].max() + df.groupby('stock_id')['ask_size'].max(),
        "ptp_size": df.groupby("stock_id")["bid_size"].max() - df.groupby("stock_id")["bid_size"].min(),
        "mean_sizes": df.groupby('stock_id')['bid_size'].mean() + df.groupby('stock_id')['ask_size'].mean(),
        "ptp_IQR": df.groupby("stock_id")["bid_size"].quantile(0.75) - df.groupby("stock_id")["ask_size"].quantile(0.25),
        
        # price related features
        "median_price": df.groupby("stock_id")["bid_price"].median() + df.groupby("stock_id")["ask_price"].median(),
        "std_price": df.groupby("stock_id")["bid_price"].std() + df.groupby("stock_id")["ask_price"].std(),
        "ptp_price": df.groupby("stock_id")["bid_price"].max() - df.groupby("stock_id")["ask_price"].min(),
        
        # 5 mins behavior
        "last5min_medvol": df[df.seconds_in_bucket>=300].groupby("stock_id")['bid_size'].median() + df[df.seconds_in_bucket>=300].groupby("stock_id")['ask_size'].median(),
        "first5min_medvol": df[df.seconds_in_bucket<300].groupby("stock_id")['bid_size'].median() + df[df.seconds_in_bucket<300].groupby("stock_id")['ask_size'].median(),
        
        # cut-off related features
        "cutoff_mean_sizes": df.groupby(['stock_id', 'cut_off_time'])['bid_size'].mean() + df.groupby(['stock_id', 'cut_off_time'])['ask_size'].mean(),
        "cutoff_imb_ratios": df.groupby(['stock_id', 'cut_off_time'])['matched_size'].mean() / df.groupby(['stock_id', 'cut_off_time'])['imbalance_size'].mean(),
    }

    return global_stock_id_feats

In [7]:
@njit(parallel=True)
def compute_triplet_imbalance(df_values, comb_indices):
    num_rows = df_values.shape[0]
    num_combinations = len(comb_indices)
    imbalance_features = np.empty((num_rows, num_combinations))

    for i in prange(num_combinations):
        a, b, c = comb_indices[i]
        for j in range(num_rows):
            max_val = max(df_values[j, a], df_values[j, b], df_values[j, c])
            min_val = min(df_values[j, a], df_values[j, b], df_values[j, c])
            mid_val = df_values[j, a] + df_values[j, b] + df_values[j, c] - min_val - max_val
            if mid_val == min_val:  # Prevent division by zero
                imbalance_features[j, i] = np.nan
            else:
                imbalance_features[j, i] = (max_val - mid_val) / (mid_val - min_val)

    return imbalance_features


def calculate_triplet_imbalance_numba(price, df):
    # Convert DataFrame to numpy array for Numba compatibility
    df_values = df[price].values
    comb_indices = [(price.index(a), price.index(b), price.index(c)) for a, b, c in combinations(price, 3)]

    # Calculate the triplet imbalance
    features_array = compute_triplet_imbalance(df_values, comb_indices)

    # Create a DataFrame from the results
    columns = [f"{a}_{b}_{c}_imb2" for a, b, c in combinations(price, 3)]
    features = pd.DataFrame(features_array, columns=columns)

    return features

In [8]:
# first step
def feature_eng(df):
    df["dow"] = df["date_id"] % 5
    df["seconds"] = df["seconds_in_bucket"] % 60
    df["minute"] = df["seconds_in_bucket"] // 60   
        
    df["imbalance"] = df.eval('imbalance_size * imbalance_buy_sell_flag')
    df["market_imbalance_buy_sell_flag"] = (df['bid_size'] > df['ask_size']).astype(int)
    df["mid_size"] = df.eval("(ask_size + bid_size)/2")
    df['high_volume'] = np.where(2*df['mid_size'] > df['global_median_size'], 1, 0)
    df['volume_global_ratio'] = df.eval("mid_size/global_median_size")
    df['bid_size_global_ratio'] = df.eval("bid_size/global_median_bid_size")
    df['ask_size_global_ratio'] = df.eval("ask_size/global_median_ask_size")
    df['imbalance_global_ratio'] = df.eval("imbalance_size/global_median_imbalance_size")
    df['matched_global_ratio'] = df.eval("matched_size/global_median_matched_size")
    df['bid_ask_volume_diff'] = df.eval("ask_size - bid_size")
    df["liquidity_imbalance"] = df['bid_ask_volume_diff']/df["mid_size"]
    df["size_imbalance"] = df.eval("bid_size / ask_size")
    df["mid_price"] = df.eval("(ask_price + bid_price) / 2")
    df["price_spread"] = df["ask_price"] - df["bid_price"]
    df["spread_percentage"] = df["price_spread"]/df["mid_price"]
    df["matched_imbalance"] = df.eval("(imbalance_size-matched_size)/(matched_size+imbalance_size)")
    df["imbalance_momentum"] = df.groupby(['stock_id'])['imbalance_size'].diff(periods=1) / df['matched_size']
    df['imbalance_ratio'] = df['imbalance_size'] / (1+df['matched_size'])
    
    df["spread_intensity"] = df.groupby(['stock_id'])['price_spread'].diff()
    df["spread_intensity1"] = df["spread_intensity"] / df['wap']
    df['price_pressure'] = df['imbalance_size'] * df["price_spread"]
    df['market_urgency'] = df['price_spread'] * df['liquidity_imbalance']
    #供需市场的差额
    df['depth_pressure'] = df['bid_ask_volume_diff'] * (df['far_price'] - df['near_price'])
    
    df["matched_momentum"] = df.groupby(['stock_id'])['matched_size'].diff(periods=1) / df['imbalance_size']  
    
    prices = ["reference_price", "far_price", "near_price", "ask_price", "bid_price", "wap", "mid_price"]
    sizes = ["matched_size", "bid_size", "ask_size", "imbalance_size", "mid_size"] 
    norm_features = ['wap', 'imbalance_buy_sell_flag', 'log_return', 'wap_mid_price_imb']
    
    for func in ["mean", "std", "skew", "kurt", "max"]:
        df[f"all_prices_{func}"] = df[prices].agg(func, axis=1)
        df[f"all_sizes_{func}"] = df[sizes].agg(func, axis=1)
    
    df['near_ratio'] = df['near_price'] / df['reference_price']
    df['far_ratio'] = df['far_price'] / df['reference_price']
    df['near_size'] = df['near_ratio'] * df['matched_size']
    df['far_size'] = df['far_ratio'] * df['matched_size']

    # use the triplet method
    for c in combinations(prices, 2):
        df[f"{c[0]}_{c[1]}_imb"] = df.eval(f"({c[0]} - {c[1]})/({c[0]} + {c[1]})")
    for c in combinations(sizes, 2):
        df[f"{c[0]}_{c[1]}_imb"] = df.eval(f"({c[0]} - {c[1]})/({c[0]} + {c[1]})")
    
    df['log_wap'] = np.log(df['wap'])
    df['auction_direction_alignment'] = (df.groupby(['stock_id', 'date_id'])['wap'].diff() * df['imbalance_buy_sell_flag'] > 0).astype(int)
    df['market_direction_alignment'] = (df.groupby(['stock_id', 'date_id'])['wap'].diff() * df['market_imbalance_buy_sell_flag'] > 0).astype(int)
    df['log_return'] = df.groupby(['stock_id', 'date_id'])['log_wap'].diff()
    
    for c in norm_features:
            df[f'norm_{c}'] = df.groupby("time_id")[c].transform(lambda x: x - x.mean())

    for c in [['ask_price', 'bid_price', 'wap', 'reference_price'], sizes]:
        triplet_feature = calculate_triplet_imbalance_numba(c, df)
        df[triplet_feature.columns] = triplet_feature.values
        
    rank_df = df.groupby('time_id')[['wap', 'imbalance_buy_sell_flag']].rank(pct=True)
    rank_df.columns = [f'{c}_rank' for c in rank_df.columns]
    df = pd.concat([df, rank_df], axis=1)
        
    for col in ['matched_size', 'imbalance_size', 'reference_price', 'imbalance_buy_sell_flag']:
        for window in [1, 2, 3, 10]:
            df[f"{col}_shift_{window}"] = df.groupby(['stock_id', 'date_id'])[col].shift(window)
            df[f"{col}_ret_{window}"] = df.groupby(['stock_id', 'date_id'])[col].pct_change(window)
            df[f"wt_{col}_ret_{window}"] = df.groupby(['weight_type', 'date_id'])[col].pct_change(window)
            
    for col in ['ask_price', 'bid_price', 'ask_size', 'bid_size', 'market_urgency', 'imbalance_momentum', 'size_imbalance']:
        for window in [1, 2, 3, 10]:
            df[f"{col}_diff_{window}"] = df.groupby(['stock_id', 'date_id'])[col].diff(window)
    
    # one hot encoding
    df['cut_off_time_2'] = 0
    df['cut_off_time_3'] = 0
    df.loc[df["cut_off_time"] == 2, "cut_off_time_2"] = 1
    df.loc[df["cut_off_time"] == 3, "cut_off_time_3"] = 1
    # df.drop(columns=['cut_off_time'], inplace=True)
    df['far_near_diff'] = df['far_price'] - df['near_price']
    
    gc.collect()
    
    return df 
    
#     # useful for the last fold
#     df['all_prices_mean_std'] = df['all_prices_mean']/(1+df['all_prices_std'])
#     df['all_sizes_mean_std'] = df['all_sizes_mean']/(1+df['all_sizes_std'])
#     df['all_prices_skew_kurt'] = df['all_prices_skew']/(1+df['all_prices_kurt'])
#     df['all_sizes_skew_kurt'] = df['all_sizes_skew']/(1+df['all_sizes_kurt'])


In [9]:
def generate_all_features(df):
    cols = [c for c in df.columns if c not in ["row_id"]]
    df = df[cols]

    global_stock_id_feats = global_features(df)
    for key, value in global_stock_id_feats.items():
        if 'cut_off' not in key:
            df[f"global_{key}"] = df["stock_id"].map(value.to_dict())
        else:
            df[f"global_{key}"] = df["stock_id", 'cut_off_time'].map(value.to_dict())
            
    df = feature_eng(df)

    gc.collect()

    # Select and return the generated features
    feature_name = [i for i in df.columns if
                    i not in ["row_id", "target", "time_id", "date_id", "index_return", "stock_return", "target_norm",
                              "currently_scored"]]

    return df[feature_name]

In [10]:
train.sort_values(by=['stock_id', 'date_id', 'seconds_in_bucket'], ascending=True, inplace=True)
train = train[:100000]

time_id = train.time_id.values
date_id = train.date_id.values
time_index = time_id % 55
stock_id = train.stock_id.values
y = train['target']

train = generate_all_features(train)
train['target'] = y

ntime = 55
ndate = len(np.unique(date_id))
nstock = 200
nfeatures = train.shape[1]

In [11]:
extra_features = ['upper_matched_size', 'lower_matched_size', 'upper_wap', 'lower_wap', 'upper_reference_price',\
                  'lower_reference_price']

X = np.zeros((ndate, ntime, nstock, nfeatures+len(extra_features))) * np.nan
feature_dict = {}
for i, c in enumerate(train.columns.tolist()):
    feature_dict[c] = i
    
X[date_id, time_index, stock_id, :nfeatures] = train.values

In [16]:
train['time_id'] = time_id
train['date_id'] = date_id
train_l = []

for (t), frame in train.groupby("time_id"):
    stock = frame.stock_id.values
    date = frame.date_id.values[0]
    time = t % 55
    # generate_features(X, date, time_idx, stock_idx, feature_dict)

In [55]:
def generate_features(X, date, time, stock, feature_dict):
    res = pd.DataFrame(columns=list(feature_dict.keys()))
    res[list(feature_dict.keys())] = X[date, time, :, :nfeatures]
    for i, col in enumerate(['matched_size', 'wap', 'reference_price']):
        res[f'cum_{col}_std'] = np.std(X[date, :time + 1, :, feature_dict[col]], axis=0)
        X[date, time, :, nfeatures+2*i] = X[date, time, :, feature_dict[col]] + res[f'cum_{col}_std']
        X[date, time, :, nfeatures+2*i+1] = X[date, time, :, feature_dict[col]] - res[f'cum_{col}_std']
        if (time - 7 + 1 < 0):
            res[f'rolling_upper_{col}_max'] = np.nan
            res[f'rolling_upper_{col}_min'] = np.nan
            res[f'rti_{col}'] = np.nan
        else:
            res[f'rolling_upper_{col}_max'] = np.max(X[date, time - 7 + 1:time + 1, :, nfeatures+2*i], axis=0)
            res[f'rolling_upper_{col}_min'] = np.min(X[date, time - 7 + 1:time + 1, :, nfeatures+2*i+1], axis=0)
            res[f'rti_{col}'] = (res[col] - res[f'rolling_upper_{col}_min'])/(res[f'rolling_upper_{col}_max'] - res[f'rolling_upper_{col}_min'])
    res.drop(columns=['cum_wap_std'], inplace=True)
    
    return

In [None]:
res = pd.DataFrame(columns=list(feature_dict.keys()))
res[list(feature_dict.keys())] = X[date, time, :, :nfeatures]
first_features = ['matched_size', 'imbalance_size', 'imbalance_buy_sell_flag', 'ask_size', 'bid_size']
rank_features = ['imbalance_buy_sell_flag', 'wap', 'wap_mid_price_imb',
                 'volume_global_ratio', 'imbalance_global_ratio', 'matched_global_ratio', 'bid_size_global_ratio',
                 'ask_size_global_ratio', 'market_imbalance_buy_sell_flag']

for f in first_features:
    c_first = X[date, 0, :, feature_dict[f]]
    c_curr = X[date, time, :, feature_dict[f]]
    try:
        first_ratio = c_curr / c_first
    except:
        first_ratio = np.nan
    if(f"{f}_first" in features):
        res[f"{f}_first"] = c_first
    if (f"{f}_first_ratio" in features):
        res[f"{f}_first_ratio"] = first_ratio

## Cumulative Features
cumulative_features = ['imbalance_buy_sell_flag','rsi']
for f in cumulative_features:
    cumsum = np.sum(np.nan_to_num(X[date, :time + 1, :, feature_dict[f]]), axis=0)
    try:
        cummean = cumsum / X[date, time, :, feature_dict['seconds_in_bucket']]
    except:
        cummean = np.nan
    if(f'{f}_cumsum' in features):
        res[f'{f}_cumsum'] = cumsum
    if(f'{f}_cummean' in features):
        res[f'{f}_cummean'] = cummean
        
for f in rank_features:
    if(f"{f}_rank" in features):
        c_curr = X[date, time, :, feature_dict[f]]
        res[f"{f}_rank"] = pd.Series(c_curr).rank(pct=True).values
        
## Additional Rank Features
res[f"imbalance_buy_sell_flag_cumsum_rank"] = res['imbalance_buy_sell_flag_cumsum'].rank(pct=True).values

## Percent Change Features
for f in ['matched_size', 'imbalance_size', 'reference_price', 'wap', 'ask_price', 'bid_price', 'ask_size','bid_size']:
    for window in [1, 2, 3, 6, 10]:
        if(f"{f}_ret_{window}" in features):
            try:
                pct_change = (X[date, time, :, feature_dict[f]] / X[date, time - window, :, feature_dict[f]] - 1)
            except:
                pct_change = np.nan
            res[f"{f}_ret_{window}"] = pct_change
            if (time - window < 0):
                res[f"{f}_ret_{window}"] = np.nan
        
        # last 10 sec wap and the ratio between wap and the mean of previous wap in the same weight type
        if f == 'wap' and window==1:
            group_keys = X[date, time-window, :, feature_dict['weight_type']]

            mean_values = np.zeros_like(group_keys, dtype=float)  # Initialize an array to store mean values

            unique_keys = np.unique(group_keys)  # Get unique group keys

            for key in unique_keys:
                mask = (group_keys == key)  # Create a mask to filter values based on the current key
                mean_val = np.mean(X[date, time-window, :, feature_dict[f]][mask])  # Calculate mean of column 4 based on the mask
                mean_values[mask] = mean_val  # Assign mean value to corresponding positions in the array
            res['previous_wap'] = X[date, time - window, :, feature_dict[f]]
            res['previous_wt_wap_mean'] = mean_values
            res['wap_percent_ind'] = X[date, time, :, feature_dict[f]]/mean_values
        
        # last 10 sec median match size and im size for each weight type and the ratio between im size and match size
        if window==1 and (f=='imbalance_size' or f=='matched_size'):
            group_keys = X[date, time-window, :, feature_dict['weight_type']]

            median_values = np.zeros_like(group_keys, dtype=float)  # Initialize an array to store mean values

            unique_keys = np.unique(group_keys)  # Get unique group keys

            for key in unique_keys:
                mask = (group_keys == key)  # Create a mask to filter values based on the current key
                median_val = np.median(X[date, time-window, :, feature_dict[f]][mask])  # Calculate mean of column 4 based on the mask
                median_values[mask] = median_val  # Assign mean value to corresponding positions in the array
            res[f'previous_wt_{f}_median'] = median_values
res['previous_imbalance_ratio_percent_ind'] =  X[date, time, :, feature_dict['imbalance_size']] / (1+X[date, time, :, feature_dict['matched_size']])/ res['previous_wt_imbalance_size_median'] / (1+res['previous_wt_matched_size_median'])            
res.drop(columns=['previous_wt_imbalance_size_median', 'previous_wt_matched_size_median'], inplace=True)     
            
## Rolling Means
for f in ['wap', 'imbalance_buy_sell_flag', 'imbalance_size', 'matched_size', 'norm_wap','auction_direction_alignment','rsi']:
    if(f"{f}_rolling_mean_{window}" in features):
        for window in [3, 6, 10]:
            mean = np.mean(X[date, time - window + 1:time + 1, :, feature_dict[f]], axis=0)
            res[f"{f}_rolling_mean_{window}"] = mean
            if (time - window + 1 < 0):
                res[f"{f}_rolling_mean_{window}"] = np.nan

## Rolling Standard Deviations
for f in ['wap', 'imbalance_buy_sell_flag', 'imbalance_size', 'matched_size', 'norm_wap']:
    if(f"{f}_rolling_std_{window}" in features):
        for window in [3, 6, 10]:
            std = np.std(X[date, time - window + 1:time + 1, :, feature_dict[f]], axis=0)
            res[f"{f}_rolling_std_{window}"] = std
            if (time - window + 1 < 0):
                res[f"{f}_rolling_std_{window}"] = np.nan
                
## EMA
alpha = 0.285
beta = 1 - alpha
for f in ['matched_size', 'wap', 'imbalance_size','norm_wap','reference_price']:
    if(f"{f}_ema" in features):
        ema = X[date, time, :, feature_dict[f]]*alpha + \
              X[date, time-1, :, feature_dict[f]]*alpha*beta + \
              X[date, time-2, :, feature_dict[f]]*alpha*beta**2 + \
              X[date, time-3, :, feature_dict[f]]*alpha*beta**3 + \
              X[date, time-4, :, feature_dict[f]]*alpha*beta**4 + \
              X[date, time-5, :, feature_dict[f]]*alpha*beta**5 + \
              X[date, time-6, :, feature_dict[f]]*alpha*beta**6
        res[f"{f}_ema"] = ema
        if (time < 6):
            res[f"{f}_ema"] = np.nan
            
for f in ['wap_mid_price_imb', 'reference_price_wap_imb', 'norm_wap', 'imbalance_buy_sell_flag', 'wap_rank','imbalance_buy_sell_flag_rank','norm_log_return']:
    for window in [1, 2, 3, 4, 5, 6]:
        if(f"{f}_shift_{window}" in features):
            lag = X[date, time - window, :, feature_dict[f]]
            res[f"{f}_shift_{window}"] = lag
            if (time - window < 0):
                res[f"{f}_shift_{window}"] = np.nan

shift_features = ['imbalance_size', 'imbalance_buy_sell_flag', 'wap_rank', 'imbalance_buy_sell_flag_rank','reference_price_wap_imb', 'target']
for shift_idx in [1, 2]:
    for f in shift_features:
        if(f"shifted_{shift_idx}_{f}" in features):
            shift = X[date - shift_idx, time, :, feature_dict[f]].copy()
            res[f"shifted_{shift_idx}_{f}"] = shift
            if (date - shift_idx < 0):
                res[f"shifted_{shift_idx}_{f}"] = np.nan

# Handling edge case cumsum features
for f in ['imbalance_buy_sell_flag','rsi','imbalance']:
    for shift_idx in [1, 2]:
        if(f"shifted_{shift_idx}_{f}_cumsum" in features):
            cumsum = np.sum(np.nan_to_num(X[date - shift_idx, :time + 1, :, feature_dict[f]]), axis=0)
            res[f"shifted_{shift_idx}_{f}_cumsum"] = cumsum
            if (date - shift_idx < 0):
                res[f"shifted_{shift_idx}_{f}_cumsum"] = np.nan                
        
# adjusted rti
for i, col in enumerate(['matched_size', 'wap', 'reference_price']):
    res[f'cum_{col}_std'] = np.std(X[date, :time + 1, :, feature_dict[col]], axis=0)
    X[date, time, :, nfeatures+2*i] = X[date, time, :, feature_dict[col]] + res[f'cum_{col}_std']
    X[date, time, :, nfeatures+2*i+1] = X[date, time, :, feature_dict[col]] - res[f'cum_{col}_std']
    if (time - 7 + 1 < 0):
        res[f'rolling_upper_{col}_max'] = np.nan
        res[f'rolling_upper_{col}_min'] = np.nan
        res[f'rti_{col}'] = np.nan
    else:
        res[f'rolling_upper_{col}_max'] = np.max(X[date, time - 7 + 1:time + 1, :, nfeatures+2*i], axis=0)
        res[f'rolling_upper_{col}_min'] = np.min(X[date, time - 7 + 1:time + 1, :, nfeatures+2*i+1], axis=0)
        res[f'rti_{col}'] = (res[col] - res[f'rolling_upper_{col}_min'])/(res[f'rolling_upper_{col}_max'] - res[f'rolling_upper_{col}_min'])
res.drop(columns=['cum_wap_std'], inplace=True)





# 1mins level wap and industries related features -> try to lower the difficulties on predicting the target
df['previous_target'] = df['1min_wap_ratio'] - df['1min_iwap_ratio']
df[f'cum_previous_target_mean'] = df.groupby(['stock_id', 'date_id'])['previous_target'].expanding().mean().reset_index(level=[0,1], drop=True)
df[f'cum_previous_target_std'] = df.groupby(['stock_id', 'date_id'])['previous_target'].expanding().std().reset_index(level=[0,1], drop=True)
df[f'cum_previous_target_skew'] = df.groupby(['stock_id', 'date_id'])['previous_target'].expanding().skew().reset_index(level=[0, 1], drop=True)
df[f'cum_previous_target_kurtosis'] = df.groupby(['stock_id', 'date_id'])['previous_target'].expanding().kurt().reset_index(level=[0, 1], drop=True)

# cumulative min, max and min max standard of the reference price, matched_size, imbalance_size
for col in ['reference_price', 'matched_size', 'imbalance_size']:
    df[f'cum_{col}_min'] = df.groupby(['stock_id', 'date_id'])[col].expanding().min().reset_index(level=[0,1], drop=True)
    df[f'cum_{col}_max'] = df.groupby(['stock_id', 'date_id'])[col].expanding().max().reset_index(level=[0,1], drop=True)
    df[f'cum_{col}_max_min'] = (df[col] - df[f'cum_{col}_min'])/(1+df[f'cum_{col}_max'] - df[f'cum_{col}_min'])


    
        
### Heavy Ops ###


        
    # drop at the end
    # df.drop(columns=['cum_reference_price_min', 'cum_imbalance_size_min'], inplace=True)
        
    # pct_change for reference_price and wap and get the skew and kurt
    for col in ['reference_price', 'wap',]:
        df[f'{col}_pct_change'] = df.groupby(['stock_id', 'date_id'])[col].pct_change()
        df[f'cum_{col}_skew'] = df.groupby(['stock_id', 'date_id'])[f'{col}_pct_change'].expanding().skew().reset_index(level=[0, 1], drop=True)
        df[f'cum_{col}_kurtosis'] = df.groupby(['stock_id', 'date_id'])[f'{col}_pct_change'].expanding().kurt().reset_index(level=[0, 1], drop=True)
        
        # vol features
        df[f'abs_return'] = df[f'{col}_pct_change'].abs()
        df[f'{col}_illiq'] = (df['abs_return'] / df['matched_size']).rolling(10).mean()
        
        df['neg_retrun_flag'] = np.where(df[f'{col}_pct_change']<0, 1, 0)
        df['neg_retrun_flag'] = df['neg_retrun_flag'] * df['abs_return']
        df[f'{col}_negative_returns_illiq'] = df['neg_retrun_flag'].rolling(10).sum() / (1+(df['imbalance_size']/df['matched_size']).rolling(10).sum())
        
    # shrink flag
    for col in ['matched_size', 'imbalance_size',]:
        for window in [7, 14]:
            df[f'{col}_{window}_shrink_vol'] = df.groupby(['stock_id', 'date_id'])[col].rolling(window=window).std().reset_index(level=[0,1], drop=True)
            df[f'{col}_{window}_shrink_vol'] = np.where(df[col]<2*df[f'{col}_{window}_shrink_vol'], 1, 0)
    
    df.drop(columns=['previous_im_size', 'previous_m_size', 'previous_wt_im_median', 'previous_wt_m_median', \
                    'upper_matched_size', 'lower_matched_size', 'upper_wap', 'lower_wap', \
                    'upper_reference_price', 'lower_reference_price', 'buy_site_flag', 'sell_site_flag', \
                    'buy_flag_diff', 'sell_flag_diff', 'buy_site_diff_sum', 'sell_site_diff_sum', \
                    'buy_flag', 'sell_flag', 'buy_time', 'sell_time', 'buy_time_start', 'buy_time_end', \
                    'sell_time_start', 'sell_time_end', 'buy_time_diff', 'sell_time_diff', \
                    'cum_wap_skew', 'reference_price_diff', \
                    'cum_wap_std', 'cum_reference_price_min', 'cum_imbalance_size_min', 'index_wap', 'reference_price_pct_change',\
                    'abs_return', 'neg_retrun_flag', 'previous_1_min',\
                    ], inplace=True)

In [33]:
res = pd.DataFrame(columns=list(feature_dict.keys()))
res[list(feature_dict.keys())] = X[date, time, :, :nfeatures]
first_features = ['matched_size', 'imbalance_size', 'imbalance_buy_sell_flag', 'ask_size', 'bid_size']
rank_features = ['imbalance_buy_sell_flag', 'wap', 'wap_mid_price_imb',
                 'volume_global_ratio', 'imbalance_global_ratio', 'matched_global_ratio', 'bid_size_global_ratio',
                 'ask_size_global_ratio', 'market_imbalance_buy_sell_flag']

for f in ['matched_size', 'imbalance_size', 'reference_price', 'wap', 'ask_price', 'bid_price', 'ask_size','bid_size']:
    for window in [1, 2, 3, 6, 10]:
        if(f"{f}_ret_{window}" in features):
            try:
                pct_change = (X[date, time, :, feature_dict[f]] / X[date, time - window, :, feature_dict[f]] - 1)
            except:
                pct_change = np.nan
            res[f"{f}_ret_{window}"] = pct_change
            if (time - window < 0):
                res[f"{f}_ret_{window}"] = np.nan
        
        # last 10 sec wap and the ratio between wap and the mean of previous wap in the same weight type
        if f == 'wap' and window==1:
            group_keys = X[date, time-window, :, feature_dict['weight_type']]

            mean_values = np.zeros_like(group_keys, dtype=float)  # Initialize an array to store mean values

            unique_keys = np.unique(group_keys)  # Get unique group keys

            for key in unique_keys:
                mask = (group_keys == key)  # Create a mask to filter values based on the current key
                mean_val = np.mean(X[date, time-window, :, feature_dict[f]][mask])  # Calculate mean of column 4 based on the mask
                mean_values[mask] = mean_val  # Assign mean value to corresponding positions in the array
            res['previous_wap'] = X[date, time - window, :, feature_dict[f]]
            res['previous_wt_wap_mean'] = mean_values
            res['wap_percent_ind'] = X[date, time, :, feature_dict[f]]/mean_values
        
        # last 10 sec median match size and im size for each weight type and the ratio between im size and match size
        if window==1 and (f=='imbalance_size' or f=='matched_size'):
            group_keys = X[date, time-window, :, feature_dict['weight_type']]

            median_values = np.zeros_like(group_keys, dtype=float)  # Initialize an array to store mean values

            unique_keys = np.unique(group_keys)  # Get unique group keys

            for key in unique_keys:
                mask = (group_keys == key)  # Create a mask to filter values based on the current key
                median_val = np.median(X[date, time-window, :, feature_dict[f]][mask])  # Calculate mean of column 4 based on the mask
                median_values[mask] = median_val  # Assign mean value to corresponding positions in the array
            res[f'previous_wt_{f}_median'] = median_values
res['previous_imbalance_ratio_percent_ind'] =  X[date, time, :, feature_dict['imbalance_size']] / (1+X[date, time, :, feature_dict['matched_size']])/ res['previous_wt_imbalance_size_median'] / (1+res['previous_wt_matched_size_median'])            
res.drop(columns=['previous_wt_imbalance_size_median', 'previous_wt_matched_size_median'], inplace=True)

In [34]:
res[['previous_imbalance_ratio_percent_ind']]

Unnamed: 0,previous_imbalance_ratio_percent_ind
0,5.754543e-16
1,2.478304e-15
2,8.143392e-15
3,
4,
...,...
195,
196,
197,
198,
