In [1]:
import pandas as pd
import polars as pl
import numpy as np
import os
import pyarrow as pa
# from tqdm.auto import tqdm
from matplotlib import pyplot as plt
import pickle

from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, Ridge

In [2]:
class CONFIG:
    """Notebook-wide settings: random seed, target column and model input columns."""

    seed = 2025
    target_col = "responder_6"

    # Model inputs: feature_00 .. feature_78 with indices 9, 10, 11 and 61 left
    # out, followed by the lag-1 columns of responder_0 .. responder_8.
    # `data_id` (presumably `date_id` -- confirm) is not included as it is not
    # relevant.
    # The exclusion tuple is written inline on purpose: names bound in a class
    # body are not visible inside a generator expression's filter condition.
    feature_cols = [
        *(f"feature_{i:02d}" for i in range(79) if i not in (9, 10, 11, 61)),
        *(f"responder_{i}_lag_1" for i in range(9)),
    ]
    # encoded_cols = [f'encoded_feature_{i}' for i in range(32)]

    # No column is treated as categorical.
    categorical_cols = []

In [3]:
# Read the training and validation splits (in that order): each parquet file is
# scanned lazily with polars, materialised, and converted to a pandas DataFrame.
train, valid = (
    pl.scan_parquet(parquet_path).collect().to_pandas()
    for parquet_path in (
        "/root/autodl-tmp/jane-street-2024/training.parquet",
        "/root/autodl-tmp/jane-street-2024/validation.parquet",
    )
)
(train.shape, valid.shape)

((25908520, 103), (1341648, 103))

In [4]:
# Leaderboard trick: fold the validation rows into the training frame so the
# final model is fitted on every available row.
# NOTE: this deliberately leaks the validation set into training, so any score
# computed on `valid` after this cell is an in-sample number, not a holdout
# estimate.
# NOTE: the cell rebinds `train`, so it is not idempotent -- running it a second
# time appends `valid` again.
# `ignore_index=True` yields the same fresh 0..n-1 index as a trailing
# `.reset_index(drop=True)` without an additional pass over the combined frame.
train = pd.concat([train, valid], ignore_index=True)
train.shape

(27250168, 103)

In [5]:
def _features_target_weight(frame):
    """Split ``frame`` into (features, target, weight) for model fitting/scoring.

    Features are the ``CONFIG.feature_cols`` columns with gaps forward-filled
    from the previous row and any remaining gaps replaced by 0. The target is
    the ``CONFIG.target_col`` column and the weight is the ``"weight"`` column.
    """
    # NOTE(review): the forward fill follows the frame's raw row order -- confirm
    # that carrying values from one row to the next is intended for this layout.
    features = frame[CONFIG.feature_cols].ffill().fillna(0)
    return features, frame[CONFIG.target_col], frame["weight"]


# Each frame is processed independently, so gaps at the top of `valid` are
# zero-filled rather than filled from `train`.
X_train, y_train, w_train = _features_target_weight(train)
X_valid, y_valid, w_valid = _features_target_weight(valid)

## Model training

In [6]:
def get_model(seed):
    """Build the two unfitted linear baselines used in this notebook.

    Args:
        seed: Random seed forwarded to the ridge model as ``random_state``.
            scikit-learn only consults it for the stochastic ``'sag'`` /
            ``'saga'`` solvers, so with the default solver selection it does
            not change the fit; it makes the model reproducible if one of
            those solvers is switched on.

    Returns:
        A ``(lr_model, rg_model)`` tuple: an ordinary least-squares
        ``LinearRegression`` (coefficients not constrained to be positive) and
        a ``Ridge`` with default regularisation.
    """
    lr_model = LinearRegression(positive=False)
    # Alternative ridge configuration kept for reference:
    # rg_model = Ridge(
    #     alpha=1.0,
    #     solver='saga',
    #     max_iter=1000,
    #     tol=1e-3,
    #     random_state=seed
    # )
    # Previously `seed` was accepted but silently ignored; pass it through so
    # the caller's seed actually reaches the estimator.
    rg_model = Ridge(random_state=seed)
    return lr_model, rg_model

In [7]:
# Create both estimators from the notebook-wide seed; nothing is fitted here.
lr_model, rg_model = get_model(seed=CONFIG.seed)

In [15]:
# Fit the ridge model on the prepared training features and target.
# No sample_weight is passed, so every row counts equally in the fit.
rg_model.fit(X=X_train, y=y_train)

In [16]:
# Score the fitted ridge model on the validation frame with plain R^2.
# NOTE: the validation rows were concatenated into `train` earlier in this
# notebook, so this is an in-sample score rather than a holdout estimate.
# NOTE(review): no sample_weight is used here, whereas the disabled
# LinearRegression evaluation passes sample_weight=w_valid -- confirm which
# variant is intended.
y1_pred_valid = rg_model.predict(X_valid)
valid_score = r2_score(y_true=y_valid, y_pred=y1_pred_valid)
valid_score
# Previously noted scores (presumably from earlier runs):
# 0.004431724548339844
# 0.004270970821380615

0.005071580410003662

In [10]:
# lr_model.fit( X_train, y_train, sample_weight=w_train)

In [11]:
# # evaluation for linear regression
# y_pred_train1 = lr_model.predict(X_train.iloc[:X_train.shape[0]//2])
# y_pred_train2 = lr_model.predict(X_train.iloc[X_train.shape[0]//2:])
# train_score = r2_score(y_train, np.concatenate([y_pred_train1, y_pred_train2], axis=0), sample_weight=w_train )
# train_score
# # 0.00696951150894165

In [12]:
# y_pred_valid = lr_model.predict(X_valid)
# valid_score = r2_score(y_valid, y_pred_valid, sample_weight=w_valid )
# valid_score
# 0.004431962966918945

In [13]:
# result = {
#     "model" : lr_model
# }
# with open("lr_result.pkl", "wb") as fp:
#     pickle.dump(result, fp)

In [17]:
# Persist the fitted ridge model, wrapped in a dict under the "model" key, as a
# pickle file in the current working directory.
result = dict(model=rg_model)
with open("rg_result.pkl", "wb") as pkl_file:
    pickle.dump(result, pkl_file)