In [1]:
import os
import joblib
import polars as pl
import xgboost as xgb
import numpy as np
import pandas as pd
import pyarrow.parquet as pq


In [2]:
# Paths and constants
input_path = '../data/jane_street_data'


def read_selected_data(input_path, num_partitions=1):
    """Load the first ``num_partitions`` training partitions into one frame.

    Partitions are read from
    ``{input_path}/train.parquet/partition_id={i}/part-0.parquet`` for
    ``i`` in ``0 .. num_partitions - 1`` and concatenated in that order.

    Parameters
    ----------
    input_path : str
        Root data directory that contains ``train.parquet``.
    num_partitions : int, default 1
        How many partitions to load, starting at ``partition_id=0``.

    Returns
    -------
    The concatenation (via ``pl.concat``) of the collected partitions.

    Raises
    ------
    ValueError
        If ``num_partitions`` is smaller than 1 (there would be nothing to
        concatenate).
    """
    if num_partitions < 1:
        raise ValueError(f"num_partitions must be >= 1, got {num_partitions}")

    # Relative path of each requested partition file inside train.parquet.
    partition_files = [
        f"partition_id={i}/part-0.parquet" for i in range(num_partitions)
    ]

    # Scan lazily, then materialise each partition before concatenating.
    frames = [
        pl.scan_parquet(f'{input_path}/train.parquet/{file_name}').collect()
        for file_name in partition_files
    ]

    return pl.concat(frames)

In [3]:
# Load the selected training partition(s) from disk into a single frame.
full_df = read_selected_data(input_path)

In [4]:
# Forward-fill nulls: each null takes the nearest preceding non-null value in
# its column; nulls that precede a column's first non-null value stay null.
# `full_df` itself is left untouched, so this cell can be re-run safely.
# NOTE(review): the fill runs over the whole frame in row order with no
# grouping — if rows belonging to different entities are interleaved, values
# will carry across them; confirm this is intended.
df = full_df.fill_null(strategy='forward')

In [5]:
# Feature columns are named feature_00 ... feature_78.
feature_names = [f"feature_{i:02d}" for i in range(79)]

# Number of most recent distinct date_id values held out for validation.
num_valid_dates = 70

# unique() makes no promise about the order of its result, so sort explicitly:
# the positional slices below only mean "latest dates" on an ascending array.
dates = df['date_id'].unique().sort().to_numpy()

# Guard against a degenerate split (0 would make `dates[-0:]` select every
# date for validation and leave the training split empty).
assert 0 < num_valid_dates < len(dates), (
    f"need 0 < num_valid_dates < {len(dates)} distinct dates, got {num_valid_dates}"
)

valid_dates = dates[-num_valid_dates:]
train_dates = dates[:-num_valid_dates]

In [6]:
def _split_arrays(frame, date_ids, feature_cols,
                  target_col='responder_6', weight_col='weight'):
    """Return ``(X, y, w)`` NumPy arrays for the rows whose date_id is in ``date_ids``.

    The frame is filtered once and the three arrays are taken from that single
    subset, instead of re-running the same filter for every array.

    Parameters
    ----------
    frame : polars DataFrame holding ``date_id``, the feature columns, the
        target column and the weight column.
    date_ids : collection of date_id values to keep.
    feature_cols : list of feature column names, in the order wanted in ``X``.
    target_col : name of the target column (flattened to 1-D).
    weight_col : name of the sample-weight column (flattened to 1-D).
    """
    subset = frame.filter(pl.col('date_id').is_in(date_ids))
    features = subset.select(feature_cols).to_numpy()
    target = subset.select(target_col).to_numpy().ravel()
    weights = subset.select(weight_col).to_numpy().ravel()
    return features, target, weights


# Validation split: rows on the held-out dates.
X_valid, y_valid, w_valid = _split_arrays(df, valid_dates, feature_names)

# Training split: rows on all remaining dates.
X_train, y_train, w_train = _split_arrays(df, train_dates, feature_names)

In [7]:
def r2_xgb(y_true, y_pred, sample_weight=None):
    """Negated weighted, zero-baseline R² for use as an XGBoost eval metric.

    Computes ``1 - avg_w((y_pred - y_true)^2) / avg_w(y_true^2)`` and returns
    its negation, so that a better fit yields a *lower* value (the direction
    in which early stopping looks for improvement on a custom metric).
    Note the denominator uses ``y_true^2`` directly, i.e. the baseline is a
    constant prediction of zero, not the mean of ``y_true``.

    Parameters
    ----------
    y_true : array-like
        Ground-truth targets.
    y_pred : array-like
        Predictions, same shape as ``y_true``.
    sample_weight : array-like, optional
        Per-sample weights; uniform weights are used when omitted.

    Returns
    -------
    float
        ``-R²``. ``-1.0`` is a perfect fit, ``0.0`` matches predicting zero.

    Raises
    ------
    ZeroDivisionError
        Propagated from ``np.average`` when the weights sum to zero
        (including empty input).
    """
    # Coerce to float64 arrays: accepts plain sequences and keeps the
    # accumulation in double precision even when callers pass float32 data.
    y_true = np.asarray(y_true, dtype=np.float64)
    y_pred = np.asarray(y_pred, dtype=np.float64)
    if sample_weight is None:
        sample_weight = np.ones_like(y_true)
    else:
        sample_weight = np.asarray(sample_weight, dtype=np.float64)

    residual = np.average((y_pred - y_true) ** 2, weights=sample_weight)
    baseline = np.average(y_true ** 2, weights=sample_weight)

    # The tiny epsilon avoids a division by zero when every target is zero.
    r2 = 1 - residual / (baseline + 1e-38)
    return float(-r2)

In [17]:
# Configure the XGBoost regressor (training itself happens in the fit cell).
# Validation is scored only with the custom `r2_xgb` metric; the built-in
# default metric is switched off.
# NOTE(review): early_stopping_rounds=2 gives very little patience — a couple
# of non-improving rounds ends training; confirm this is intended.
# To train on GPU, add device="cuda" to the parameters below.
xgb_params = dict(
    n_estimators=2000,
    learning_rate=0.1,
    max_depth=6,
    tree_method='hist',
    objective='reg:squarederror',
    eval_metric=r2_xgb,
    disable_default_eval_metric=True,
    early_stopping_rounds=2,
)
model = xgb.XGBRegressor(**xgb_params)

In [18]:
# Fit on the training split, scoring each boosting round on the weighted
# validation split; verbose=2 logs the metric every second round.
validation_sets = [(X_valid, y_valid)]
validation_weights = [w_valid]
model.fit(
    X_train,
    y_train,
    sample_weight=w_train,
    eval_set=validation_sets,
    sample_weight_eval_set=validation_weights,
    verbose=2,
)

[0]	validation_0-r2_xgb:-0.00240
[2]	validation_0-r2_xgb:-0.00510
[4]	validation_0-r2_xgb:-0.00747
[6]	validation_0-r2_xgb:-0.00944
[8]	validation_0-r2_xgb:-0.01077
[10]	validation_0-r2_xgb:-0.01193
[12]	validation_0-r2_xgb:-0.01283
[14]	validation_0-r2_xgb:-0.01324
[16]	validation_0-r2_xgb:-0.01378
[18]	validation_0-r2_xgb:-0.01396
[20]	validation_0-r2_xgb:-0.01396
[22]	validation_0-r2_xgb:-0.01423
[24]	validation_0-r2_xgb:-0.01452
[25]	validation_0-r2_xgb:-0.01442


In [30]:
# Load the test partition from the same data root as the training data.
test_path = f"{input_path}/test.parquet/date_id=0/part-0.parquet"

# Apply the same null handling that the training frame received (forward
# fill), so the model sees features prepared the same way at inference time.
test_pl = pl.scan_parquet(test_path).collect().fill_null(strategy='forward')

# pandas copy is kept for building the submission table (row_id column).
test_df = test_pl.to_pandas()

# Feature matrix in the exact column order the model was trained on.
test = test_df[feature_names].values


In [31]:
# Score the test feature matrix with the fitted model (one value per row).
predictions = model.predict(test)

In [35]:
# Assemble the output table: each test row_id paired with its prediction.
submission_columns = {
    "row_id": test_df['row_id'],
    "responder_6": predictions,
}
output_df = pd.DataFrame(submission_columns)

# Preview the first rows.
output_df.head()

Unnamed: 0,row_id,responder_6
0,0,0.083981
1,1,0.083981
2,2,0.083981
3,3,0.083981
4,4,0.083981
