In [4]:
import gc  # Garbage collection for memory management
import os  # Operating system-related functions
import time  # Time-related functions
import warnings  # Handling warnings
from itertools import combinations  # For creating combinations of elements
from warnings import simplefilter  # Simplifying warning handling

# 📦 Importing machine learning libraries
import joblib  # For saving and loading models
import numpy as np  # Numerical operations
import pandas as pd  # Data manipulation and analysis
from sklearn.metrics import mean_absolute_error  # Metric for evaluation
from sklearn.model_selection import KFold, TimeSeriesSplit  # Cross-validation techniques
import os
import lightgbm as lgb

# 🤐 Disable warnings to keep the code clean
warnings.filterwarnings("ignore")
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)
import time
import polars as pl
import gc
print(pl.__version__)



0.20.21


In [1]:
import catboost as cbt

In [2]:
# Directory prefix for the saved artifacts loaded by later cells
# (feature manifest JSON and the per-fold XGBoost model files).
# Kaggle location, kept for reference:
# INPUT_PATH = "/kaggle/input/optiver/"
INPUT_PATH = "../save/"

In [3]:
import json
# Load the feature manifest and keep the list stored under 'selected_feas'.
with open(f"{INPUT_PATH}xgb3_feas_v7_157.json") as f:
    feas_dict = json.load(f)
selected_feas = feas_dict['selected_feas']
# Sanity check: number of selected features.
print(len(selected_feas))

157


In [5]:
import xgboost as xgb
import catboost as cbt
import lightgbm as lgb

In [6]:
# Load the five saved XGBoost regressors (file suffixes k1..k5) from INPUT_PATH.
xgb_model_list = []
for k in [1,2,3,4,5]:
    # An empty regressor is created first; load_model restores the saved state from JSON.
    model=xgb.XGBRegressor()
    model.load_model(f'{INPUT_PATH}xgb3_v7_k{k}_weight15_debug.json') 
    xgb_model_list.append(model)

In [7]:
xgb_model_list

[XGBRegressor(base_score='-6.9737434E-2', booster='gbtree', callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None,
              feature_types=['float', 'float', 'float', 'float', 'float', 'int',
                             'float', 'float', 'float', 'float', 'float',
                             'float', 'float', 'float', 'floa...
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=None, n_jobs=None,
              num_parallel_tree=None, objective='reg:absoluteerror', ...),
 XGBRegressor(

In [8]:
# Disabled: loading a precomputed scale dictionary from disk (scale_dict is
# rebuilt from the training frame in a later cell instead).
# with open(f"{INPUT_PATH}scale_dict_median.json") as f:
#     scale_dict = json.load(f)

# Project-local helpers (not part of this notebook).
from load_data import load_data_from_csv
from data_generator.data_generator import TimeSeriesKFoldDataGenerator
from data_preprocessor.data_preprocessor import ReduceMemUsageDataPreprocessor

# Root directory handed to the CSV loader.
DATA_PATH = '..'
df_train, df_test, revealed_targets, sample_submission = load_data_from_csv(DATA_PATH)
print(df_train.columns)

# Fold generator configured with 5 folds and a 0.1 test-set ratio.
# NOTE(review): exact split semantics live in TimeSeriesKFoldDataGenerator — confirm there.
time_series_k_fold_data_generator = TimeSeriesKFoldDataGenerator(n_fold=5, test_set_ratio=0.1)

Index(['stock_id', 'date_id', 'seconds_in_bucket', 'imbalance_size',
       'imbalance_buy_sell_flag', 'reference_price', 'matched_size',
       'far_price', 'near_price', 'bid_price', 'bid_size', 'ask_price',
       'ask_size', 'wap', 'target', 'time_id', 'row_id'],
      dtype='object')


In [9]:
# Produce the per-fold train/eval frames from df_train; the third value is
# presumably the number of train/eval pairs (confirm in the generator).
train_dfs, eval_dfs, num_train_eval_sets = time_series_k_fold_data_generator.generate(df_train)

In [10]:
# Work on a deep copy of the last fold's training frame so train_dfs can be freed later.
train = train_dfs[-1].copy(deep=True)
print(train.columns)

Index(['stock_id', 'date_id', 'seconds_in_bucket', 'imbalance_size',
       'imbalance_buy_sell_flag', 'reference_price', 'matched_size',
       'far_price', 'near_price', 'bid_price', 'bid_size', 'ask_price',
       'ask_size', 'wap', 'target', 'time_id', 'row_id'],
      dtype='object')


In [11]:
# Run the project's memory-reduction preprocessor on `train`, leaving the key
# columns (stock_id, date_id, seconds_in_bucket) out of the conversion.
# NOTE(review): presumably downcasts numeric dtypes — confirm in ReduceMemUsageDataPreprocessor.
train = ReduceMemUsageDataPreprocessor(exclude_columns=['stock_id','date_id','seconds_in_bucket']).apply(train)

In [12]:
train.shape

(4714182, 17)

In [13]:
# Drop the list of per-fold training frames; `train` already holds its own deep copy.
del train_dfs
# eval_dfs is intentionally kept: a later cell builds `eval` from eval_dfs[-1].
# del eval_dfs

In [14]:
# Size columns whose per-stock medians are computed into scale_dict below.
size_col = ['imbalance_size','matched_size','bid_size','ask_size']

In [15]:
# scale_dict = train[size_col].median().to_dict()

In [16]:
# Per-stock medians of the size columns, shaped {column: {stock_id: median}}.
# generate_features_no_hist_polars reads this global to build the scale_* features.
scale_dict = train.groupby("stock_id")[size_col].median().to_dict()

In [17]:
# scale_dict

In [18]:
def generate_features_no_hist_polars(df, target_feas):
    """Build the model feature frame for a block of order-book rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw rows. The code reads stock_id, date_id, seconds_in_bucket, the
        price/size columns and a ``weight`` column. NOTE: ``df`` is modified in
        place before the polars conversion (``scale_*``, ``auc_bid_size`` and
        ``auc_ask_size`` columns are added to the caller's frame).
    target_feas : polars.DataFrame
        Left-joined on (stock_id, date_id, seconds_in_bucket). It is expected to
        supply the ``rolling_mean_*_target_second`` / ``rolling_std_*_target_second``
        columns that are listed in the returned feature list but not computed
        here (presumably the output of get_target_feathers — confirm at the call site).

    Returns
    -------
    (pandas.DataFrame, list[str])
        The frame restricted to ``['stock_id', 'date_id'] + feas_list`` and the
        feature-name list itself.

    Notes
    -----
    Reads the module-level ``scale_dict`` ({column: {stock_id: median}}).
    ``feas_list`` is reassigned twice part-way through the function, so names
    appended before those points are discarded; many intermediate columns are
    computed only as inputs to later features and are dropped at the return.
    """
    size_col = ['imbalance_size','matched_size','bid_size','ask_size']
    for _ in size_col:
        tmp_map = scale_dict[_].copy()
        # Normalise keys to int (they would be strings if scale_dict came from JSON).
        tmp_map = {int(k):v for k,v in tmp_map.items()}
        df[f"{_}_stock_median"] = df['stock_id'].map(tmp_map)
        # Size relative to the stock's median size.
        df[f"scale_{_}"] = df[_] / df[f"{_}_stock_median"]
        del df[f"{_}_stock_median"]
    #buy-side imbalance; 1
    #sell-side imbalance; -1
    #no imbalance; 0
    # Auction-side sizes: matched size plus the imbalance on whichever side carries it.
    df['auc_bid_size'] = df['matched_size']
    df['auc_ask_size'] = df['matched_size']
    df.loc[df['imbalance_buy_sell_flag']==1,'auc_bid_size'] += df.loc[df['imbalance_buy_sell_flag']==1,'imbalance_size']
    df.loc[df['imbalance_buy_sell_flag']==-1,'auc_ask_size'] += df.loc[df['imbalance_buy_sell_flag']==-1,'imbalance_size']
    # Add an ask_size - bid_size feature, then apply rolling windows to it
    df = pl.from_pandas(df)
    feas_list = ['stock_id','seconds_in_bucket','imbalance_size','imbalance_buy_sell_flag',
               'reference_price','matched_size','far_price','near_price','bid_price','bid_size',
                'ask_price','ask_size','wap','scale_imbalance_size','scale_matched_size','scale_bid_size','scale_ask_size'
                 ,'auc_bid_size','auc_ask_size']
    # Basic features
    df = df.with_columns([
        # Stage 1
        (pl.col('ask_size') * pl.col('ask_price')).alias("ask_money"),
        (pl.col('bid_size') * pl.col('bid_price')).alias("bid_money"),
        (pl.col('ask_size') + pl.col("auc_ask_size")).alias("ask_size_all"),
        (pl.col('bid_size') + pl.col("auc_bid_size")).alias("bid_size_all"),
        (pl.col('ask_size') + pl.col("auc_ask_size") + pl.col('bid_size') + pl.col("auc_bid_size")).alias("volumn_size_all"),
        (pl.col('reference_price') * pl.col('auc_ask_size')).alias("ask_auc_money"),
        (pl.col('reference_price') * pl.col('auc_bid_size')).alias("bid_auc_money"),
        (pl.col('ask_size') * pl.col('ask_price') + pl.col('bid_size') * pl.col('bid_price')).alias("volumn_money"),
        (pl.col('ask_size') + pl.col('bid_size')).alias('volume_cont'),
        (pl.col('ask_size') - pl.col('bid_size')).alias('diff_ask_bid_size'),
        (pl.col('imbalance_size') + 2 * pl.col('matched_size')).alias('volumn_auc'),
        ((pl.col('imbalance_size') + 2 * pl.col('matched_size')) * pl.col("reference_price")).alias('volumn_auc_money'),
        ((pl.col('ask_price') + pl.col('bid_price'))/2).alias('mid_price'),
        ((pl.col('near_price') + pl.col('far_price'))/2).alias('mid_price_near_far'),
        (pl.col('ask_price') - pl.col('bid_price')).alias('price_diff_ask_bid'),
        (pl.col('ask_price') / pl.col('bid_price')).alias('price_div_ask_bid'),
        (pl.col('imbalance_buy_sell_flag') * pl.col('scale_imbalance_size')).alias('flag_scale_imbalance_size'),
        (pl.col('imbalance_buy_sell_flag') * pl.col('imbalance_size')).alias('flag_imbalance_size'),
        (pl.col('imbalance_size') / pl.col('matched_size') * pl.col('imbalance_buy_sell_flag')).alias("div_flag_imbalance_size_2_balance"),
        ((pl.col('ask_price') - pl.col('bid_price')) * pl.col('imbalance_size')).alias('price_pressure'),
        ((pl.col('ask_price') - pl.col('bid_price')) * pl.col('imbalance_size') * pl.col('imbalance_buy_sell_flag')).alias('price_pressure_v2'),
        ((pl.col("ask_size") - pl.col("bid_size")) / (pl.col("far_price") - pl.col("near_price"))).alias("depth_pressure"),
        (pl.col("bid_size") / pl.col("ask_size")).alias("div_bid_size_ask_size"),
    ])
    feas_list.extend(['ask_money', 'bid_money', 'ask_auc_money','bid_auc_money',"ask_size_all","bid_size_all","volumn_size_all",
                      'volumn_money','volume_cont',"volumn_auc","volumn_auc_money","mid_price",
                      'mid_price_near_far','price_diff_ask_bid',"price_div_ask_bid","flag_imbalance_size","div_flag_imbalance_size_2_balance",
                     "price_pressure","price_pressure_v2","depth_pressure","flag_scale_imbalance_size","diff_ask_bid_size"])        

    # Various ratios
    # (author's note: only a negligible improvement)
    add_cols = []
    for col1, col2 in [
        ("imbalance_size","bid_size"),
        ("imbalance_size","ask_size"),
        ("matched_size","bid_size"),
        ("matched_size","ask_size"),
        ("imbalance_size","volume_cont"),
        ("matched_size","volume_cont"),
        ("auc_bid_size","bid_size"),
        ("auc_ask_size","ask_size"),
        ("bid_auc_money","bid_money"),
        ("ask_auc_money","ask_money"),
    ]:
        add_cols.append((pl.col(col1) / pl.col(col2)).alias(f"div_{col1}_2_{col2}"))
        feas_list.append(f"div_{col1}_2_{col2}")        
    df = df.with_columns(add_cols)

    # Stage 2: imbalance features
    # Everything except the price-related pairs
    # ask/bid without the auction part; price-based imbalances are built separately below
    add_cols = []
    for pair1,pair2 in [
        ('ask_size','bid_size'),
        ('ask_money','bid_money'),
        ('volumn_money','volumn_auc_money'),
        ('volume_cont','volumn_auc'),
        ('imbalance_size','matched_size'),
        ('auc_ask_size','auc_bid_size'),
        ("ask_size_all",'bid_size_all')
    ]:
        col_imb = f"imb1_{pair1}_{pair2}"
        add_cols.extend([
            # Normalised difference (a - b) / (a + b).
            ((pl.col(pair1) - pl.col(pair2)) / (pl.col(pair1) + pl.col(pair2))).alias(col_imb),
        ])
        feas_list.extend([col_imb])
    df = df.with_columns(add_cols)
    
    # imb1 on the price side: same normalised difference for every pair of price columns
    fea_append_list = []
    prices = ["reference_price", "far_price", "near_price", "ask_price", "bid_price", "wap","mid_price"]
    for c in combinations(prices, 2):
        fea_append_list.append(((pl.col(c[0]) - pl.col(c[1])) / (pl.col(c[0]) + pl.col(c[1]))).alias(f"imb1_{c[0]}_{c[1]}"))
        # fea_append_list.append((pl.col(c[0]) - pl.col(c[1])).alias(f"diff_{c[0]}_{c[1]}"))
        feas_list.extend([f"imb1_{c[0]}_{c[1]}"])
    df = df.with_columns(fea_append_list)
    
    
    # Imbalance features combined by multiplication
    df = df.with_columns([
        ((pl.col("imb1_ask_size_bid_size") + 2) * (pl.col("imb1_ask_price_bid_price") + 2) * (pl.col("imb1_auc_ask_size_auc_bid_size")+2)).alias("market_urgency_v2"),
        (pl.col('price_diff_ask_bid') * (pl.col('imb1_ask_size_bid_size'))).alias('market_urgency'),
        (pl.col('imb1_ask_price_bid_price') * (pl.col('imb1_ask_size_bid_size'))).alias('market_urgency_v3'),
    ])
    feas_list.extend([f"market_urgency_v3",'market_urgency','market_urgency_v2'])
    
    # NOTE: feas_list is reassigned here, discarding every name accumulated above.
    feas_list = ['imb1_wap_mid_price', 'imb1_ask_money_bid_money', 'imb1_volume_cont_volumn_auc', 'imb1_reference_price_ask_price', 
                 'imb1_reference_price_mid_price', 'seconds_in_bucket', 'div_flag_imbalance_size_2_balance', 'ask_price', 
                 'imb1_reference_price_bid_price', 'scale_matched_size', 'imb1_near_price_wap', 'volumn_auc_money', 'imb1_far_price_wap', 
                 'bid_size', 'scale_bid_size', 'bid_size_all']
    # Separator: rolling mean/std per (stock_id, date_id) for a handful of columns
    add_cols = []
    for col in ["bid_auc_money","imb1_reference_price_wap","bid_size_all",
                "imb1_auc_ask_size_auc_bid_size","div_flag_imbalance_size_2_balance",
                "imb1_ask_size_all_bid_size_all","flag_imbalance_size","imb1_reference_price_mid_price"]:
        for window in [3,6,18,36,60]:
            add_cols.append(pl.col(col).rolling_mean(window_size=window,min_periods=1).over('stock_id','date_id').alias(f'rolling{window}_mean_{col}'))
            add_cols.append(pl.col(col).rolling_std(window_size=window,min_periods=1).over('stock_id','date_id').alias(f'rolling{window}_std_{col}'))
            feas_list.extend([f'rolling{window}_mean_{col}',f'rolling{window}_std_{col}'])
    # NOTE: feas_list is reassigned again; this literal is the effective starting list,
    # and only names appended after this point are added to it.
    feas_list = ['imb1_wap_mid_price', 'imb1_ask_money_bid_money', 'imb1_volume_cont_volumn_auc', 
                     'imb1_reference_price_ask_price', 'imb1_reference_price_mid_price', 
                     'seconds_in_bucket', 'div_flag_imbalance_size_2_balance', 'ask_price', 
                     'imb1_reference_price_bid_price', 'scale_matched_size', 'imb1_near_price_wap', 
                     'volumn_auc_money', 'imb1_far_price_wap', 'bid_size', 'scale_bid_size', 'bid_size_all', 
                     'rolling18_mean_imb1_auc_ask_size_auc_bid_size', 'rolling3_mean_div_flag_imbalance_size_2_balance', 
                     'rolling60_std_div_flag_imbalance_size_2_balance', 'rolling36_mean_flag_imbalance_size', 
                     'rolling3_std_imb1_auc_ask_size_auc_bid_size', 'rolling18_mean_imb1_ask_size_all_bid_size_all', 
                     'rolling6_mean_div_flag_imbalance_size_2_balance', 'rolling6_std_imb1_auc_ask_size_auc_bid_size', 
                     'rolling3_mean_imb1_auc_ask_size_auc_bid_size', 'rolling60_std_imb1_auc_ask_size_auc_bid_size', 
                     'rolling6_std_bid_size_all', 'rolling3_std_bid_size_all', 'rolling3_mean_bid_size_all', 
                     'rolling18_std_bid_auc_money', 'rolling36_mean_bid_auc_money',"rolling60_mean_imb1_reference_price_wap",
                    'rolling18_mean_imb1_reference_price_wap', 'rolling3_mean_imb1_reference_price_mid_price']
    df = df.with_columns(add_cols)
    
#     for col in ["flag_imbalance_size","imb1_reference_price_wap","imb1_reference_price_mid_price","mid_price","imb1_far_price_wap",
#                'matched_size', 'reference_price', 'imbalance_buy_sell_flag']:
#         add_cols = []
#         for window_size in [1,2,4,6,12]:
#             add_cols.append(pl.col(col).shift(window_size).over('stock_id','date_id').alias(f'shift{window_size}_{col}'))
#             add_cols.append((pl.col(col) / pl.col(col).shift(window_size).over('stock_id','date_id')).alias(f'div_shift{window_size}_{col}'))
#             add_cols.append((pl.col(col) - pl.col(col).shift(window_size).over('stock_id','date_id')).alias(f'diff_shift{window_size}_{col}'))
#             feas_list.extend([f'shift{window_size}_{col}',f'div_shift{window_size}_{col}',f'diff_shift{window_size}_{col}'])
#         df = df.with_columns(add_cols)
    ### Miscellaneous
    df = df.with_columns([
        pl.col("flag_imbalance_size").diff().over('stock_id','date_id').alias("imbalance_momentum_unscaled"),
        pl.col("price_diff_ask_bid").diff().over('stock_id','date_id').alias("spread_intensity"),
    ])
    feas_list.extend(["imbalance_momentum_unscaled","spread_intensity"])
    df = df.with_columns([
        (pl.col("imbalance_momentum_unscaled")/pl.col("matched_size")).alias("imbalance_momentum")
    ])
    feas_list.extend(["imbalance_momentum"])

    #Calculate diff features for specific columns
    add_cols = []
    for col in ['ask_price',
 'bid_price',
 'imb1_reference_price_near_price',
 'bid_size',
 'scale_bid_size',
 'mid_price',
 'ask_size',
 'price_div_ask_bid',
 'div_bid_size_ask_size',
 'market_urgency',
 'wap',
 'imbalance_momentum']:
        for window in [1, 2, 3, 10]:
            add_cols.append((pl.col(col).diff(window).over('stock_id','date_id')).alias(f"{col}_diff_{window}"))
            feas_list.append(f"{col}_diff_{window}")
    df = df.with_columns(add_cols)
    
    ### "target mock" series: a target-like value rebuilt from wap moves, then lagged
    for mock_period in [1,3,12,6]:
    
        # wap `mock_period` rows ahead within the same stock/day ...
        df = df.with_columns([
            pl.col("wap").shift(-mock_period).over("stock_id","date_id").alias(f"wap_shift_n{mock_period}")
        ])
        # ... as a ratio to the current wap.
        df = df.with_columns([
            (pl.col(f"wap_shift_n{mock_period}")/pl.col("wap")).alias("target_single")
        ])

        # Zero the weight wherever target_single is missing so those rows do not
        # contribute to the weighted index below (done via a pandas round-trip).
        tmp_df = df.select(pl.col("target_single"),pl.col("weight")).to_pandas()
        tmp_df.loc[tmp_df["target_single"].isna(),"weight"] = 0
        df = df.with_columns([
            pl.lit(np.array(tmp_df["weight"])).alias("weight_tmp")
        ])

        # Weighted sum of target_single over weight sum, per (date_id, seconds_in_bucket).
        df = df.with_columns([
            (((pl.col("weight_tmp") * pl.col("target_single")).sum().over("date_id","seconds_in_bucket")) / ((pl.col("weight_tmp")).sum().over("date_id","seconds_in_bucket"))).alias("index_target_mock")
        ])

        df = df.with_columns([
            ((pl.col("target_single") - pl.col("index_target_mock"))*10000).alias("target_mock")
        ])

        # Shift back by the same period, undoing the forward shift applied to wap above.
        df = df.with_columns([
            pl.col("target_mock").shift(mock_period).over("stock_id","date_id").alias(f"target_mock_shift{mock_period}"),
            #pl.col("index_target_mock").shift(mock_period).over("stock_id","date_id").alias(f"index_target_mock_shift{mock_period}"),
            #pl.col("target_single").shift(mock_period).over("stock_id","date_id").alias(f"target_single_shift{mock_period}")
        ])
    # df.drop_in_place("wap_shift_6")
    # df.drop_in_place("target_single_shift6")
    # df.drop_in_place("indexwap_shift6")
    # add_cols_new = []
    add_cols = []
    for col in ['target_mock_shift6','target_mock_shift1','target_mock_shift3','target_mock_shift12']:
        for window in [1, 3,6,12,24,48]:
            add_cols.append(pl.col(col).rolling_mean(window_size=window,min_periods=1).over('stock_id','date_id').alias(f'rolling{window}_mean_{col}'))
            #add_cols.append(pl.col(col).rolling_std(window_size=window,min_periods=1).over('stock_id','date_id').alias(f'rolling{window}_std_{col}'))
            # add_cols_new.extend([f'rolling{window}_mean_{col}'])
    df = df.with_columns(add_cols)
    # Only this subset of the rolling target-mock means is kept as features.
    keep_cols_new = ['rolling48_mean_target_mock_shift3', 'rolling48_mean_target_mock_shift1', 'rolling48_mean_target_mock_shift12',
'rolling1_mean_target_mock_shift6', 'rolling24_mean_target_mock_shift6','rolling24_mean_target_mock_shift12',]
    feas_list.extend(keep_cols_new)
    
    # Lag / ratio-to-lag / difference-to-lag features; all are computed, a subset is kept.
    add_cols = []
    for col in ["imb1_auc_ask_size_auc_bid_size","flag_imbalance_size","price_pressure_v2","scale_matched_size"]:
        for window_size in [1,2,3,6,12]:
            add_cols.append(pl.col(col).shift(window_size).over('stock_id','date_id').alias(f'shift{window_size}_{col}'))
            add_cols.append((pl.col(col) / pl.col(col).shift(window_size).over('stock_id','date_id')).alias(f'div_shift{window_size}_{col}'))
            add_cols.append((pl.col(col) - pl.col(col).shift(window_size).over('stock_id','date_id')).alias(f'diff_shift{window_size}_{col}'))
            #feas_list.extend([f'shift{window_size}_{col}',f'div_shift{window_size}_{col}',f'diff_shift{window_size}_{col}'])
    feas_list.extend(['div_shift6_imb1_auc_ask_size_auc_bid_size',
 'diff_shift6_price_pressure_v2',
 'shift1_price_pressure_v2',
 'div_shift3_flag_imbalance_size',
 'div_shift12_imb1_auc_ask_size_auc_bid_size',
 'div_shift3_scale_matched_size',
 'diff_shift6_flag_imbalance_size',
 'shift12_imb1_auc_ask_size_auc_bid_size',
 'div_shift12_price_pressure_v2',
 'shift6_flag_imbalance_size',
 'diff_shift3_imb1_auc_ask_size_auc_bid_size',
 'div_shift12_flag_imbalance_size',
 'shift12_flag_imbalance_size'])
    df = df.with_columns(add_cols)
    
    # "global_*": weight-weighted average of a feature across all rows sharing
    # the same (date_id, seconds_in_bucket).
    add_cols = []
    for col in ['imb1_ask_price_mid_price',
 'market_urgency',
 'market_urgency_diff_1',
 'imb1_ask_money_bid_money',
 'rolling18_mean_imb1_ask_size_all_bid_size_all',
 'rolling18_mean_imb1_auc_ask_size_auc_bid_size',
 'rolling18_mean_imb1_reference_price_wap',
 'ask_price_diff_3',
 'diff_shift1_price_pressure_v2',
 'diff_shift12_scale_matched_size',
 'diff_shift1_flag_imbalance_size',
 'imb1_ask_size_bid_size',
 'imb1_bid_price_mid_price',
 'rolling48_mean_target_mock_shift6']:
        add_cols.append((((pl.col(col) * pl.col("weight")).sum().over("date_id","seconds_in_bucket"))/(((pl.col("weight")).sum().over("date_id","seconds_in_bucket")))).alias(f"global_{col}"))
        feas_list.append(f"global_{col}")
    df = df.with_columns(add_cols)
    
    
    # MACD-style features: EWMs at several spans, differences of adjacent spans (dif),
    # a span-9 EWM of each difference (dea), and dif - dea (macd).
    rsi_cols = ["mid_price_near_far","imb1_reference_price_wap","near_price",]
    add_cols = []
    for col in rsi_cols:
        for window_size in [3,6,12,24,48]:
            add_cols.append(pl.col(col).ewm_mean(span=window_size, adjust=False).over('stock_id','date_id').alias(f"rolling_ewm_{window_size}_{col}"))
            #feas_list.append(f"rolling_ewm_{window_size}_{col}")
    df = df.with_columns(add_cols)
    
    add_cols = []
    for col in rsi_cols:
        for w1,w2 in zip((3,6,12,24),(6,12,24,48)):
            add_cols.append((pl.col(f"rolling_ewm_{w1}_{col}") - pl.col(f"rolling_ewm_{w2}_{col}")).alias(f"dif_{col}_{w1}_{w2}"))
            #feas_list.append(f"dif_{col}_{w1}_{w2}")
    df = df.with_columns(add_cols)
    
    add_cols = []
    for col in rsi_cols:
        for w1,w2 in zip((3,6,12,24),(6,12,24,48)):
            add_cols.append(pl.col(f"dif_{col}_{w1}_{w2}").ewm_mean(span=9, adjust=False).over('stock_id','date_id').alias(f"dea_{col}_{w1}_{w2}"))
            #feas_list.append(f"dea_{col}_{w1}_{w2}")
    df = df.with_columns(add_cols)
    
    add_cols = []
    for col in rsi_cols:
        for w1,w2 in zip((3,6,12,24),(6,12,24,48)):
            add_cols.append((pl.col(f"dif_{col}_{w1}_{w2}") - pl.col(f"dea_{col}_{w1}_{w2}")).alias(f"macd_{col}_{w1}_{w2}"))
            #feas_list.append(f"macd_{col}_{w1}_{w2}")
    
    # Only this subset of the EWM / dif / dea / macd columns is kept as features.
    feas_list.extend(['macd_imb1_reference_price_wap_12_24',
 'dif_imb1_reference_price_wap_3_6',
 'macd_mid_price_near_far_12_24',
 'dif_near_price_3_6',
 'macd_near_price_24_48',
 'dea_imb1_reference_price_wap_12_24',
 'macd_near_price_12_24',
 'rolling_ewm_24_imb1_reference_price_wap',
 'dif_near_price_6_12',
 'dea_mid_price_near_far_6_12',
 'dea_near_price_24_48',
 'rolling_ewm_12_imb1_reference_price_wap',
 'dif_imb1_reference_price_wap_12_24'])
    df = df.with_columns(add_cols)
    
    # Historical-target rolling statistics: only the NAMES are registered here;
    # the columns themselves are not computed in this function and must arrive
    # through the join with target_feas below.
    #add_cols = []
    new_add_cols = []
    for col in ["target"]:
        # 176 1,2,3,5,10,15,20,25,30
        # [1,2,3,5,10,15,20,25,30,35,40,45,60] 5.8704926 157
        # [1,2,3,5,10,15,20,30,45,60] 5.8708683137
        for window_size in [1,2,3,5,10,15,20,25,30,35,40,45,60]:
            #add_cols.append(pl.col(col).shift(1).rolling_mean(window_size=window_size,min_periods=1).over('stock_id','seconds_in_bucket').alias(f'rolling_mean_{window_size}_{col}_second'))
            #add_cols.append(pl.col(col).shift(1).rolling_std(window_size=window_size,min_periods=1).over('stock_id','seconds_in_bucket').alias(f'rolling_std_{window_size}_{col}_second'))

            
            feas_list.extend([f'rolling_mean_{window_size}_{col}_second',f'rolling_std_{window_size}_{col}_second',])
            new_add_cols.extend([f'rolling_mean_{window_size}_{col}_second',f'rolling_std_{window_size}_{col}_second',])
    #df = df.with_columns(add_cols)

    # print(df.dtypes)
    # print(target_feas.dtypes)
    
    # NOTE(review): the join keys must have matching dtypes on both sides (the
    # commented dtype prints above suggest this was an issue) — confirm at the call site.
    df = df.join(target_feas,how='left',on=['stock_id','date_id','seconds_in_bucket'])

    
    
    
    # Return only the identifier columns plus the selected features.
    keep_cols = ['stock_id','date_id']
    keep_all = keep_cols + feas_list 
    return df.to_pandas()[keep_all], feas_list
    



In [19]:
# Per-stock weights, listed in stock_id order (position i belongs to stock_id i).
weights = [
    0.004, 0.001, 0.002, 0.006, 0.004, 0.004, 0.002, 0.006, 0.006, 0.002, 0.002, 0.008,
    0.006, 0.002, 0.008, 0.006, 0.002, 0.006, 0.004, 0.002, 0.004, 0.001, 0.006, 0.004,
    0.002, 0.002, 0.004, 0.002, 0.004, 0.004, 0.001, 0.001, 0.002, 0.002, 0.006, 0.004,
    0.004, 0.004, 0.006, 0.002, 0.002, 0.04 , 0.002, 0.002, 0.004, 0.04 , 0.002, 0.001,
    0.006, 0.004, 0.004, 0.006, 0.001, 0.004, 0.004, 0.002, 0.006, 0.004, 0.006, 0.004,
    0.006, 0.004, 0.002, 0.001, 0.002, 0.004, 0.002, 0.008, 0.004, 0.004, 0.002, 0.004,
    0.006, 0.002, 0.004, 0.004, 0.002, 0.004, 0.004, 0.004, 0.001, 0.002, 0.002, 0.008,
    0.02 , 0.004, 0.006, 0.002, 0.02 , 0.002, 0.002, 0.006, 0.004, 0.002, 0.001, 0.02,
    0.006, 0.001, 0.002, 0.004, 0.001, 0.002, 0.006, 0.006, 0.004, 0.006, 0.001, 0.002,
    0.004, 0.006, 0.006, 0.001, 0.04 , 0.006, 0.002, 0.004, 0.002, 0.002, 0.006, 0.002,
    0.002, 0.004, 0.006, 0.006, 0.002, 0.002, 0.008, 0.006, 0.004, 0.002, 0.006, 0.002,
    0.004, 0.006, 0.002, 0.004, 0.001, 0.004, 0.002, 0.004, 0.008, 0.006, 0.008, 0.002,
    0.004, 0.002, 0.001, 0.004, 0.004, 0.004, 0.006, 0.008, 0.004, 0.001, 0.001, 0.002,
    0.006, 0.004, 0.001, 0.002, 0.006, 0.004, 0.006, 0.008, 0.002, 0.002, 0.004, 0.002,
    0.04 , 0.002, 0.002, 0.004, 0.002, 0.002, 0.006, 0.02 , 0.004, 0.002, 0.006, 0.02,
    0.001, 0.002, 0.006, 0.004, 0.006, 0.004, 0.004, 0.004, 0.004, 0.002, 0.004, 0.04,
    0.002, 0.008, 0.002, 0.004, 0.001, 0.004, 0.006, 0.004,
]
# Re-key the list as {stock_id: weight} so it can be used with Series.map.
weights = dict(enumerate(weights))

In [20]:
def handle_format(test_df):
    """Cast the raw test columns to the dtypes the feature pipeline expects.

    Parameters
    ----------
    test_df : pandas.DataFrame
        Must contain the identifier, flag, price and size columns listed
        below; any other columns are passed through untouched.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The converted frame and an empty frame. The empty frame stands in for
        the previous day's revealed targets, whose parsing is disabled.

    Raises
    ------
    KeyError
        If one of the expected columns is missing.

    Notes
    -----
    The cast is done with a single ``astype`` call, which returns a new frame.
    The caller's ``test_df`` is therefore left unmodified; previously the
    columns were overwritten one by one on the passed-in object, which mutated
    the caller's data and was chained assignment when that object was a slice
    of a larger frame.
    """
    int_columns = ('stock_id', 'date_id', 'seconds_in_bucket', 'imbalance_buy_sell_flag')
    float_columns = ('imbalance_size', 'reference_price', 'matched_size', 'far_price',
                     'near_price', 'bid_price', 'bid_size', 'ask_price', 'ask_size', 'wap')

    dtype_by_column = {name: np.int64 for name in int_columns}
    dtype_by_column.update({name: np.float64 for name in float_columns})

    # Revealed-target handling is currently switched off; callers still unpack
    # two values, so an empty frame is returned in its place.
    target_df = pd.DataFrame()

    test_df = test_df.astype(dtype_by_column)
    return test_df, target_df

def get_target_feathers(df, date_id, second, test_df_mock):
    """Compute lagged rolling statistics of ``target`` for one time bucket.

    Parameters
    ----------
    df : pandas.DataFrame
        Historical targets with stock_id, date_id, seconds_in_bucket, target.
        It is not modified.
    date_id : int
        The day features are wanted for; only history strictly before it is used.
    second : int
        The seconds_in_bucket value to select from the history.
    test_df_mock : pandas.DataFrame
        Rows for ``date_id`` appended after the history so that they receive
        feature values. NOTE(review): presumably carries the same columns as
        ``df`` with a placeholder target — confirm at the call site.

    Returns
    -------
    polars.DataFrame
        The rows of ``date_id`` with ``rolling_mean_{w}_target_second`` and
        ``rolling_std_{w}_target_second`` columns, and ``target`` removed.
    """
    # Boolean-mask selection already yields a new frame, so the caller's data is
    # never touched and no upfront copy of the full history is needed.
    history = df[(df['seconds_in_bucket'] == second) & (df['date_id'] < date_id)]  # strict '<' guards against duplicated rows for date_id
    combined = pd.concat([history, test_df_mock], ignore_index=True).sort_values(['date_id','seconds_in_bucket','stock_id'])
    frame = pl.from_pandas(combined)

    add_cols = []
    for col in ["target"]:
        # shift(1) drops the current row's own target, so every window only
        # sees earlier rows of the same (stock_id, seconds_in_bucket) group.
        lagged = pl.col(col).shift(1)
        for window_size in [1,2,3,5,10,15,20,25,30,35,40,45,60]:
            add_cols.append(lagged.rolling_mean(window_size=window_size,min_periods=1).over('stock_id','seconds_in_bucket').alias(f'rolling_mean_{window_size}_{col}_second'))
            add_cols.append(lagged.rolling_std(window_size=window_size,min_periods=1).over('stock_id','seconds_in_bucket').alias(f'rolling_std_{window_size}_{col}_second'))

    frame = frame.with_columns(add_cols)
    # Keep only the requested day; the history was needed just to fill the windows.
    frame = frame.filter(pl.col("date_id")==date_id)
    frame.drop_in_place('target')
    return frame

In [21]:
def reduce_mem_usage(df,exclude_columns = [], verbose=True):
    """Downcast numeric columns of ``df`` in place and return the same frame.

    Columns named in ``exclude_columns`` and columns whose dtype is not one of
    int16/int32/int64/float16/float32/float64 are left untouched. Integer
    columns get the first of int8/int16/int32/int64 whose range strictly
    contains the column's min and max (unchanged if none does). Float columns
    get float16 or float32 under the same strict-range test, else float64.
    With ``verbose`` the resulting memory usage and elapsed time are printed.
    """
    import time
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Candidate target dtypes, narrowest first.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    started_at = time.time()
    mem_before = df.memory_usage().sum() / 1024 ** 2

    for name in df.columns:
        if name in exclude_columns:
            continue
        current = df[name].dtypes
        if current not in numerics:
            continue

        lo = df[name].min()
        hi = df[name].max()
        if str(current)[:3] == 'int':
            chosen = next(
                (t for t in int_candidates if lo > np.iinfo(t).min and hi < np.iinfo(t).max),
                None,
            )
            # No candidate fits strictly: keep the column as it is.
            if chosen is not None:
                df[name] = df[name].astype(chosen)
        else:
            # Falls back to float64 when neither narrower type fits (including NaN bounds).
            chosen = next(
                (t for t in float_candidates if lo > np.finfo(t).min and hi < np.finfo(t).max),
                np.float64,
            )
            df[name] = df[name].astype(chosen)

    mem_after = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(
            mem_after, 100 * (mem_before - mem_after) / mem_before))
        print('reduce memory use:',round(time.time() - started_at,1))
    return df

In [22]:
# Key columns plus the target, taken from the full training frame.
target_df = df_train[["stock_id", "date_id", "seconds_in_bucket", "target"]]

In [23]:
# target_df = pd.read_feather("/kaggle/input/optiver/train_labels.fer")
# Keep only rows with date_id >= 400.
target_df = target_df[target_df['date_id'] >= 400].reset_index(drop=True)
gc.collect()
# Rows with date_id >= 478, kept to provide a header (column layout) for the
# frame of labels saved for retraining.
target_df_save_retrain = target_df[target_df['date_id'] >= 478].reset_index(drop=True)


In [24]:
def retrain_xgb(train_feas, feas_list, seed):
    """Fit a fresh XGBoost regressor with recency-weighted samples.

    Parameters
    ----------
    train_feas : pandas.DataFrame
        Must contain ``date_id``, ``target`` and every column in ``feas_list``.
    feas_list : list[str]
        Feature columns passed to the model.
    seed : int
        Value used for ``random_state``.

    Returns
    -------
    xgboost.XGBRegressor
        The fitted model.

    Notes
    -----
    Rows whose ``date_id`` lies within 45 of the latest one get weight 1.5,
    all others 1.0. The maximum and the weight vector are computed with
    vectorised NumPy calls; the built-in ``max`` would walk the array element
    by element in Python, which is needlessly slow on multi-million-row frames.
    """
    params = {
        'random_state': seed,
        'learning_rate':0.01,
        'n_estimators':3200,
        'n_jobs':-1,
        'objective':'reg:absoluteerror',
        "device": "gpu",
        'max_depth': 10,
         'min_child_weight': 8.860379669551103,
         'subsample': 0.7711820080525443,
         'colsample_bytree': 0.5348780216605801,
         'reg_alpha': 0.12854342791716195,
         'reg_lambda': 0.39326076062073634,
         'gamma': 0.24378704040107024
    }
    date_ids = np.array(train_feas["date_id"])
    max_date = date_ids.max()
    # Up-weight the most recent dates (within 45 of the latest date_id).
    weights_date = np.where(date_ids >= (max_date - 45), 1.5, 1.0)
    # Free the temporary array before the memory-hungry fit.
    del date_ids
    gc.collect()

    clf = xgb.XGBRegressor(**params)
    clf.fit(train_feas[feas_list],train_feas['target'],sample_weight=weights_date)
    gc.collect()
    return clf


In [25]:
# Deep copy of the last fold's evaluation frame.
# NOTE(review): the name `eval` shadows Python's built-in eval() for the rest of
# the notebook; later cells reference this name, so a rename must be done everywhere at once.
eval = eval_dfs[-1].copy(deep=True)
print(eval.columns)

Index(['stock_id', 'date_id', 'seconds_in_bucket', 'imbalance_size',
       'imbalance_buy_sell_flag', 'reference_price', 'matched_size',
       'far_price', 'near_price', 'bid_price', 'bid_size', 'ask_price',
       'ask_size', 'wap', 'target', 'time_id', 'row_id'],
      dtype='object')


In [26]:
# Same memory-reduction step that was applied to `train`, with the key columns excluded.
eval = ReduceMemUsageDataPreprocessor(exclude_columns=['stock_id','date_id','seconds_in_bucket']).apply(eval)
# train = ReduceMemUsageDataPreprocessor(exclude_columns=['stock_id','date_id','seconds_in_bucket']).apply(train)

In [27]:
eval.shape

(523798, 17)

In [28]:
import sys

In [29]:
print(sys.getsizeof(eval))

76606852


In [30]:
eval

Unnamed: 0,stock_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,bid_size,ask_price,ask_size,wap,target,time_id,row_id
4714182,2,433,210,2.974876e+06,-1,0.999189,5558122.50,,,0.999098,11508.120117,0.999640,7528.620117,0.999426,-14.910102,23836,433_210_2
4714183,3,433,210,2.732643e+07,-1,1.000747,40090504.00,,,1.000697,12566.400391,1.000900,5302.529785,1.000840,-1.419783,23836,433_210_3
4714184,4,433,210,6.274368e+06,-1,0.999576,13820789.00,,,0.999402,34546.000000,0.999923,3456.399902,0.999876,-1.860261,23836,433_210_4
4714185,5,433,210,2.661069e+06,1,0.999996,3859620.75,,,0.999568,54561.000000,0.999996,15745.500000,0.999900,-11.050105,23836,433_210_5
4714186,6,433,210,1.798153e+06,-1,0.999050,6075542.50,,,0.998724,1008.719971,0.999496,43072.000000,0.998741,10.850430,23836,433_210_6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5237975,195,480,540,2.440723e+06,-1,1.000317,28280362.00,0.999734,0.999734,1.000317,32257.039062,1.000434,319862.406250,1.000328,2.310276,26454,480_540_195
5237976,196,480,540,3.495105e+05,-1,1.000643,9187699.00,1.000129,1.000386,1.000643,205108.406250,1.000900,93393.070312,1.000819,-8.220077,26454,480_540_196
5237977,197,480,540,0.000000e+00,0,0.995789,12725436.00,0.995789,0.995789,0.995789,16790.660156,0.995883,180038.312500,0.995797,1.169443,26454,480_540_197
5237978,198,480,540,1.000899e+06,1,0.999210,94773272.00,0.999210,0.999210,0.998970,125631.718750,0.999210,669893.000000,0.999008,-1.540184,26454,480_540_198


In [31]:
# Distinct evaluation dates in ascending order; the simulation loop below iterates over them.
eval_date_ids = eval["date_id"].unique()
eval_date_ids = sorted(eval_date_ids)
# The loop body only rebinds test_df, so after the loop test_df holds the rows
# of the last date in eval_date_ids.
for eval_test_id in eval_date_ids:
    test_df = eval[eval["date_id"] == eval_test_id]

In [32]:
import time
# import optiver2023
# env = optiver2023.make_env()  # Setting up the environment for the competition
# iter_test = env.iter_test()   # Getting the iterator for the test set

# Offline replay of the inference loop: instead of pulling batches from the
# competition environment (commented out above), iterate over the dates in
# `eval`, build features, predict with the pre-trained XGBoost models and keep
# the per-row predictions in `overall_test_df`.
# NOTE(review): each iteration feeds a whole date_id at once, whereas the
# commented-out `iter_test` loop presumably delivered one time bucket per call;
# confirm that the feature functions do not look ahead within a day.

counter = 0                   # Number of batches processed so far
qps = []                      # Elapsed wall-clock seconds per batch (despite the name)
y_min, y_max = -64, 64        # Not referenced in this cell
target_df_back_days = 68      # Only referenced by the commented-out retraining code

hist_df = pd.DataFrame()
max_date = -1

# Buffers for the labels / features of newly seen days (used for retraining)
save_labels_df_list = []
save_feas_df_list = []
train_data_saving_flag = True
retrain_flag = False # Now serves as the flag for "the first retraining has finished"
is_first_scored = False
scored_count = 0
ready_to_second_retrain_flag = False

overall_test_df = pd.DataFrame()
# Per-batch result frames; concatenated once after the loop instead of growing
# `overall_test_df` with a concat per iteration (which copies all prior rows
# every time).
scored_frames = []

for eval_test_id in eval_date_ids:
    # Explicit copy: columns are added to `test_df` below, and writing to a
    # boolean-mask slice of `eval` is the chained-assignment pattern pandas
    # warns about.
    test_df = eval[eval["date_id"] == eval_test_id].copy()

    print('counter:', counter)
    # Normalize the format; target_df_tmp holds the previous day's labels
    # test_df, target_df_tmp =  handle_format(test_df)
    target_df_tmp = pd.DataFrame()
    
    # current_is_score = list(test_df['currently_scored'])[0]
    
    # if (not is_first_scored) and current_is_score:
        # is_first_scored = True
    # Hard-coded: every batch is treated as scored in this offline run, so the
    # `else` branch further down never executes.
    current_is_score = True

    # test_df = test_df.drop('currently_scored', axis=1)
    test_df["weight"] = test_df["stock_id"].map(weights)
    current_date = test_df['date_id'].max()
    current_second = test_df['seconds_in_bucket'].min()
    
    if current_is_score and current_date!=max_date:
        scored_count +=1
    if scored_count >= 30 and retrain_flag:
        ready_to_second_retrain_flag = True

    # Only from date 481 on are rows appended to target_df; this data only arrives on a new date
    # if current_date >=482 and current_date!=max_date:
    #     target_df = pd.concat([target_df,target_df_tmp],ignore_index=True).sort_values(['date_id','seconds_in_bucket','stock_id'])
    #     target_df = target_df[target_df['date_id']>=(current_date-target_df_back_days)].reset_index(drop=True) # keeping only the most recent 80 days is enough
    #     gc.collect()

    #     # (retrain) store the labels of the new data
    #     if len(target_df_tmp) > 0 and (train_data_saving_flag):
    #         save_labels_df_list.append(target_df_tmp)
            
    #     # First retraining
    #     if (current_date >=560) and (is_first_scored) and (not retrain_flag) and (not ready_to_second_retrain_flag):
    #         # Build the features
    #         retrain_feas = pd.concat(save_feas_df_list,ignore_index=True)
    #         # Delete this variable
    #         del save_feas_df_list
    #         save_feas_df_list = []
    #         gc.collect()
    #         # Feature function
    #         retrain_feas = reduce_mem_usage(retrain_feas, exclude_columns=['stock_id','date_id','seconds_in_bucket'])
    #         gc.collect()
    #         # train on days 0-500
    #         labels_df_retrain = pd.concat(save_labels_df_list,ignore_index=True)
    #         # Header rows
    #         labels_df_retrain = pd.concat([target_df_save_retrain, labels_df_retrain],ignore_index=True).sort_values(['date_id','seconds_in_bucket','stock_id'])
    #         labels_df_retrain = labels_df_retrain[~labels_df_retrain['target'].isna()]
    #         # Join the labels
    #         retrain_feas = retrain_feas.merge(labels_df_retrain, how='inner',on=['stock_id','date_id','seconds_in_bucket'])
    #         # Free memory
    #         del save_labels_df_list
    #         save_labels_df_list = []                
    #         del labels_df_retrain
    #         # del target_df_save_retrain
    #         gc.collect()
    #         # Combine
    #         retrain_feas = pd.concat([pd.read_feather("/kaggle/input/optiver/train_480_feas_drop60.fer"), retrain_feas], ignore_index=True)
    #         gc.collect()
    #         retrain_feas = reduce_mem_usage(retrain_feas, exclude_columns=['stock_id','date_id','seconds_in_bucket'])
    #         gc.collect()
    #         new_xgb_models_list = []
    #         # Need to add seeds
    #         for seed_ in [47,1103,2023]:
    #             gc.collect()
    #             new_xgb_models_list.append(retrain_xgb(retrain_feas, selected_feas, seed_))
    #         gc.collect()
    #         # Clean up old data
    #         # del retrain_feas
    #         retrain_feas = retrain_feas[retrain_feas['date_id']>=90].reset_index(drop=True)
    #         gc.collect()
    #         retrain_flag = True
    #         #train_data_saving_flag = False
    #         scored_count = 0
            
    #     # Second retraining
    #     if (current_date > 565) and (retrain_flag) and (ready_to_second_retrain_flag) and (train_data_saving_flag):
    #         # Build the features
    #         retrain_feas_2 = pd.concat(save_feas_df_list,ignore_index=True)
    #         # Delete this variable
    #         del save_feas_df_list
    #         save_feas_df_list = []
    #         gc.collect()
    #         retrain_feas_2 = reduce_mem_usage(retrain_feas_2, exclude_columns=['stock_id','date_id','seconds_in_bucket'])
    #         gc.collect()
    #         # train on days 0-500
    #         labels_df_retrain = pd.concat(save_labels_df_list,ignore_index=True)
    #         # Header rows
    #         labels_df_retrain = pd.concat([target_df_save_retrain, labels_df_retrain],ignore_index=True).sort_values(['date_id','seconds_in_bucket','stock_id'])
    #         labels_df_retrain = labels_df_retrain[~labels_df_retrain['target'].isna()]
    #         # Join the labels
    #         retrain_feas_2 = retrain_feas_2.merge(labels_df_retrain, how='inner',on=['stock_id','date_id','seconds_in_bucket'])
    #         # Free memory
    #         del save_labels_df_list
    #         save_labels_df_list = []                
    #         del labels_df_retrain
    #         del target_df_save_retrain
    #         gc.collect()
    #         # Combine
    #         retrain_feas = pd.concat([retrain_feas, retrain_feas_2], ignore_index=True)
    #         del retrain_feas_2
    #         gc.collect()
    #         retrain_feas = reduce_mem_usage(retrain_feas)
    #         gc.collect()
    #         new_xgb_models_list = []
    #         # Need to add seeds
    #         for seed_ in [47,1103,2023]:
    #             gc.collect()
    #             new_xgb_models_list.append(retrain_xgb(retrain_feas, selected_feas, seed_))
    #         # Clean up old data
    #         del retrain_feas
    #         gc.collect()
    #         #retrain_flag = True
    #         train_data_saving_flag = False     


    # Build the target-related features
    test_df_mock = test_df[['stock_id','date_id','seconds_in_bucket']].copy()
    current_target_feas_polars = get_target_feathers(target_df,current_date,current_second,test_df_mock) # polars



    now_time = time.time()    # Current time for performance measurement
    # A new date starts a fresh intraday history.
    if current_date != max_date:
        hist_df = pd.DataFrame()  

    hist_df = pd.concat([hist_df,test_df],ignore_index=True)


    pred_df, _ = generate_features_no_hist_polars(hist_df,current_target_feas_polars)
    # Left-join so that pred_df has exactly one row per row of the current batch, in batch order.
    pred_df = test_df[['stock_id','date_id','seconds_in_bucket']].merge(pred_df,how='left',on=['stock_id','date_id','seconds_in_bucket'])

    # Handle NaNs / clip: missing values become a large negative sentinel,
    # then every model feature is clipped to [-9e9, 9e9].
    pred_df = pred_df.fillna(-9e10)
    for fea_name in selected_feas:
        pred_df[fea_name] = pred_df[fea_name].clip(lower=-9e9,upper=9e9)

    #(retrain)
    if train_data_saving_flag and current_date >= 481:
        save_feas_df_list.append(pred_df)

    if current_is_score:
        xgb_pred_list = []

        # `retrain_flag` is never set to True in this cell (the retraining code
        # above is commented out), so the short-circuit keeps the undefined
        # `new_xgb_models_list` from being evaluated and the pre-trained
        # `xgb_model_list` is always used.
        if retrain_flag and len(new_xgb_models_list) > 0:
            for model in new_xgb_models_list:
                xgb_pred_list.append(model.predict(pred_df[selected_feas]))
        else:
            for model in xgb_model_list:
                xgb_pred_list.append(model.predict(pred_df[selected_feas]))


        # Post-processing: average the models, then subtract the weighted mean
        # prediction of each (date_id, seconds_in_bucket) group.
        # (`lgb_predictions` holds the XGBoost ensemble mean; the name is kept as is.)
        lgb_predictions = np.mean(xgb_pred_list,axis=0)
        test_df['pred'] = lgb_predictions
        test_df['w_pred'] = test_df['weight'] * test_df['pred']
        bucket_groups = test_df.groupby(["date_id","seconds_in_bucket"])
        test_df["post_num"] = bucket_groups['w_pred'].transform('sum') / bucket_groups['weight'].transform('sum')
        test_df['pred'] = test_df['pred'] - test_df['post_num']


        test_df['submission'] = list(test_df['pred'])
    else:
        test_df['submission'] = 0


    # Use the environment to make predictions
    # env.predict(sample_prediction_df)

    max_date = test_df['date_id'].max()

    counter += 1
    qps.append(time.time() - now_time)
    gc.collect()
    if counter % 10 == 0:
        # The printed value is the mean number of seconds per batch, not a rate.
        print(f"{counter} queries per second: {np.mean(qps)}")
#     except:
#         sample_prediction_df['target'] = 0
#         env.predict(sample_prediction_df)
#         print("error********************************************")

    scored_frames.append(test_df)

# Single concatenation of all batches; the original index labels are preserved.
# `pd.concat` raises on an empty list, hence the guard.
overall_test_df = pd.concat(scored_frames) if scored_frames else pd.DataFrame()

# NOTE(review): the 1.146 factor presumably converts seconds per competition
# batch into total hours; here one iteration covers a whole date_id, so treat
# the printed estimate as indicative only. TODO confirm.
time_cost = 1.146 * np.mean(qps)
print(f"The code will take approximately {np.round(time_cost, 2)} hours to reason about")


counter: 0
counter: 1
counter: 2
counter: 3
counter: 4
counter: 5
counter: 6
counter: 7
counter: 8
counter: 9
10 queries per second: 7.147119379043579
counter: 10
counter: 11
counter: 12
counter: 13
counter: 14
counter: 15
counter: 16
counter: 17
counter: 18
counter: 19
20 queries per second: 7.2657750248909
counter: 20
counter: 21
counter: 22
counter: 23
counter: 24
counter: 25
counter: 26
counter: 27
counter: 28
counter: 29
30 queries per second: 7.299528169631958
counter: 30
counter: 31
counter: 32
counter: 33
counter: 34
counter: 35
counter: 36
counter: 37
counter: 38
counter: 39
40 queries per second: 7.2906905055046085
counter: 40
counter: 41
counter: 42
counter: 43
counter: 44
counter: 45
counter: 46
counter: 47
The code will take approximately 8.35 hours to reason about


In [33]:
# Display the concatenated per-date frames, including the weight / pred / w_pred /
# post_num / submission columns added by the inference loop.
overall_test_df

Unnamed: 0,stock_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,...,ask_size,wap,target,time_id,row_id,weight,pred,w_pred,post_num,submission
4714182,2,433,210,2.974876e+06,-1,0.999189,5558122.50,,,0.999098,...,7528.620117,0.999426,-14.910102,23836,433_210_2,0.002,-2.230313,-0.004385,0.037825,-2.230313
4714183,3,433,210,2.732643e+07,-1,1.000747,40090504.00,,,1.000697,...,5302.529785,1.000840,-1.419783,23836,433_210_3,0.006,-1.609180,-0.009428,0.037825,-1.609180
4714184,4,433,210,6.274368e+06,-1,0.999576,13820789.00,,,0.999402,...,3456.399902,0.999876,-1.860261,23836,433_210_4,0.004,-3.089912,-0.012208,0.037825,-3.089912
4714185,5,433,210,2.661069e+06,1,0.999996,3859620.75,,,0.999568,...,15745.500000,0.999900,-11.050105,23836,433_210_5,0.004,0.954841,0.003971,0.037825,0.954841
4714186,6,433,210,1.798153e+06,-1,0.999050,6075542.50,,,0.998724,...,43072.000000,0.998741,10.850430,23836,433_210_6,0.002,1.928775,0.003933,0.037825,1.928775
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5237975,195,480,540,2.440723e+06,-1,1.000317,28280362.00,0.999734,0.999734,1.000317,...,319862.406250,1.000328,2.310276,26454,480_540_195,0.004,-2.348969,-0.008314,0.270381,-2.348969
5237976,196,480,540,3.495105e+05,-1,1.000643,9187699.00,1.000129,1.000386,1.000643,...,93393.070312,1.000819,-8.220077,26454,480_540_196,0.001,-1.970338,-0.001700,0.270381,-1.970338
5237977,197,480,540,0.000000e+00,0,0.995789,12725436.00,0.995789,0.995789,0.995789,...,180038.312500,0.995797,1.169443,26454,480_540_197,0.004,0.329774,0.002401,0.270381,0.329774
5237978,198,480,540,1.000899e+06,1,0.999210,94773272.00,0.999210,0.999210,0.998970,...,669893.000000,0.999008,-1.540184,26454,480_540_198,0.006,0.930229,0.007204,0.270381,0.930229


In [34]:
# `test_df` still refers to the frame from the last loop iteration; show its final predictions.
test_df['submission']

5226980    0.289991
5226981    1.497331
5226982   -0.048941
5226983    0.464082
5226984   -1.204913
             ...   
5237975   -2.348969
5237976   -1.970338
5237977    0.329774
5237978    0.930229
5237979   -4.464337
Name: submission, Length: 11000, dtype: float64

In [35]:
# Display the frame left over from the last loop iteration, with all added columns.
test_df

Unnamed: 0,stock_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,...,ask_size,wap,target,time_id,row_id,weight,pred,w_pred,post_num,submission
5226980,0,480,0,5.372506e+06,1,0.999894,18358364.00,,,0.999894,...,11974.200195,1.000000,-1.279712,26400,480_0_0,0.004,0.289991,0.000600,-0.139875,0.289991
5226981,1,480,0,0.000000e+00,0,1.000230,1463335.75,,,0.999966,...,19947.900391,1.000000,-4.460216,26400,480_0_1,0.001,1.497331,0.001357,-0.139875,1.497331
5226982,2,480,0,2.668167e+06,-1,1.000514,3178828.25,,,0.999999,...,63279.859375,1.000000,5.899668,26400,480_0_2,0.002,-0.048941,-0.000378,-0.139875,-0.048941
5226983,3,480,0,1.009214e+07,1,0.999887,26401518.00,,,0.999839,...,20659.000000,1.000000,1.430511,26400,480_0_3,0.006,0.464082,0.001945,-0.139875,0.464082
5226984,4,480,0,5.413386e+06,1,1.000016,14120504.00,,,0.999358,...,2006.180054,1.000000,-1.220107,26400,480_0_4,0.004,-1.204913,-0.005379,-0.139875,-1.204913
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5237975,195,480,540,2.440723e+06,-1,1.000317,28280362.00,0.999734,0.999734,1.000317,...,319862.406250,1.000328,2.310276,26454,480_540_195,0.004,-2.348969,-0.008314,0.270381,-2.348969
5237976,196,480,540,3.495105e+05,-1,1.000643,9187699.00,1.000129,1.000386,1.000643,...,93393.070312,1.000819,-8.220077,26454,480_540_196,0.001,-1.970338,-0.001700,0.270381,-1.970338
5237977,197,480,540,0.000000e+00,0,0.995789,12725436.00,0.995789,0.995789,0.995789,...,180038.312500,0.995797,1.169443,26454,480_540_197,0.004,0.329774,0.002401,0.270381,0.329774
5237978,198,480,540,1.000899e+06,1,0.999210,94773272.00,0.999210,0.999210,0.998970,...,669893.000000,0.999008,-1.540184,26454,480_540_198,0.006,0.930229,0.007204,0.270381,0.930229


In [36]:
# Re-display the evaluation frame to compare its columns with the per-batch copies
# that received the prediction columns.
eval

Unnamed: 0,stock_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,bid_size,ask_price,ask_size,wap,target,time_id,row_id
4714182,2,433,210,2.974876e+06,-1,0.999189,5558122.50,,,0.999098,11508.120117,0.999640,7528.620117,0.999426,-14.910102,23836,433_210_2
4714183,3,433,210,2.732643e+07,-1,1.000747,40090504.00,,,1.000697,12566.400391,1.000900,5302.529785,1.000840,-1.419783,23836,433_210_3
4714184,4,433,210,6.274368e+06,-1,0.999576,13820789.00,,,0.999402,34546.000000,0.999923,3456.399902,0.999876,-1.860261,23836,433_210_4
4714185,5,433,210,2.661069e+06,1,0.999996,3859620.75,,,0.999568,54561.000000,0.999996,15745.500000,0.999900,-11.050105,23836,433_210_5
4714186,6,433,210,1.798153e+06,-1,0.999050,6075542.50,,,0.998724,1008.719971,0.999496,43072.000000,0.998741,10.850430,23836,433_210_6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5237975,195,480,540,2.440723e+06,-1,1.000317,28280362.00,0.999734,0.999734,1.000317,32257.039062,1.000434,319862.406250,1.000328,2.310276,26454,480_540_195
5237976,196,480,540,3.495105e+05,-1,1.000643,9187699.00,1.000129,1.000386,1.000643,205108.406250,1.000900,93393.070312,1.000819,-8.220077,26454,480_540_196
5237977,197,480,540,0.000000e+00,0,0.995789,12725436.00,0.995789,0.995789,0.995789,16790.660156,0.995883,180038.312500,0.995797,1.169443,26454,480_540_197
5237978,198,480,540,1.000899e+06,1,0.999210,94773272.00,0.999210,0.999210,0.998970,125631.718750,0.999210,669893.000000,0.999008,-1.540184,26454,480_540_198


In [37]:
# pred_df.shape

In [38]:
# -4.576492

In [39]:
from data_preprocessor.feature_engineering import RemoveRecordsByStockDateIdPreprocessor

Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x14b63be7f100>
Traceback (most recent call last):
  File "/userhome/cs2/tsangsyf/anaconda3_2/lib/python3.11/site-packages/threadpoolctl.py", line 400, in match_module_callback
    self._make_module_from_path(filepath)
  File "/userhome/cs2/tsangsyf/anaconda3_2/lib/python3.11/site-packages/threadpoolctl.py", line 515, in _make_module_from_path
    module = module_class(filepath, prefix, user_api, internal_api)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/userhome/cs2/tsangsyf/anaconda3_2/lib/python3.11/site-packages/threadpoolctl.py", line 606, in __init__
    self.version = self.get_version()
                   ^^^^^^^^^^^^^^^^^^
  File "/userhome/cs2/tsangsyf/anaconda3_2/lib/python3.11/site-packages/threadpoolctl.py", line 646, in get_version
    config = get_config().split()
             ^^^^^^^^^^^^^^^^

In [40]:
# Rows without a ground-truth target; these cannot be scored.
overall_test_df[overall_test_df['target'].isnull()]

Unnamed: 0,stock_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,...,ask_size,wap,target,time_id,row_id,weight,pred,w_pred,post_num,submission
4764999,19,438,0,,-1,,,,,,...,0.0,,,24090,438_0_19,0.002,4.691122,0.009247,-0.067666,4.691122


In [41]:
# Rows without a prediction; an empty result means every row received a submission value.
overall_test_df[overall_test_df['submission'].isnull()]

Unnamed: 0,stock_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,...,ask_size,wap,target,time_id,row_id,weight,pred,w_pred,post_num,submission


In [45]:
# Remove rows that have no ground-truth target so the error metric can be computed.
# Filtering on the null condition instead of a hard-coded index label keeps this
# cell idempotent (re-running it does not raise KeyError) and independent of
# which rows happen to be missing a target.
overall_test_df = overall_test_df.dropna(subset=['target'])

In [46]:
# Verify that no rows with a missing target remain (expected: empty frame).
overall_test_df[overall_test_df['target'].isnull()]

Unnamed: 0,stock_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,...,ask_size,wap,target,time_id,row_id,weight,pred,w_pred,post_num,submission


In [47]:
from sklearn.metrics import mean_absolute_error

# Mean absolute error of the post-processed predictions against the ground-truth
# target, over all evaluated rows.
mae = mean_absolute_error(
    y_true=overall_test_df['target'],
    y_pred=overall_test_df['submission'],
)

In [48]:
# Show the resulting mean absolute error.
mae

5.827889864343183