In [1]:
import joblib

import pandas as pd
import numpy as np
import polars as pl
from sklearn.preprocessing import StandardScaler

In [2]:
def reduce_mem_usage(df):
    """Downcast numeric columns of ``df`` in place and report the memory saved.

    Only plain NumPy signed-integer and floating-point columns are touched.
    Every other dtype (object, bool, unsigned integers, datetimes, pandas
    extension dtypes such as categoricals or nullable integers) is left
    exactly as it is.

    A column is only ever narrowed, never widened, and only to a dtype whose
    representable range contains both the column minimum and maximum
    (bounds inclusive). Columns whose values fit no narrower dtype keep
    their original dtype.

    Note that the check is on *range* only: narrowing floats to float16 or
    float32 can still round values, because those types carry fewer
    significant digits.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print(f"Memory usage of dataframe is {start_mem:.2f} MB")

    # Candidate targets, narrowest first. The widest type of each family is
    # omitted because a column can never be narrowed to it.
    signed_ints = (np.int8, np.int16, np.int32)
    floats = (np.float16, np.float32)

    for col in df.columns:
        col_type = df[col].dtype

        # pandas extension dtypes are not np.dtype instances; leave them alone.
        if not isinstance(col_type, np.dtype):
            continue

        if col_type.kind == "i":
            candidates, limits_of = signed_ints, np.iinfo
        elif col_type.kind == "f":
            candidates, limits_of = floats, np.finfo
        else:
            # bool, unsigned, datetime, object, ...: casting these to a float
            # type would corrupt or fail on them.
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if np.isnan(c_min) or np.isnan(c_max):
            # Empty or all-NaN column: there is no range to test. Float
            # columns wider than float32 are still narrowed to float32;
            # integer columns are left unchanged.
            if col_type.kind == "f" and col_type.itemsize > 4:
                df[col] = df[col].astype(np.float32)
            continue

        for candidate in candidates:
            # Stop as soon as the candidate would not be narrower.
            if np.dtype(candidate).itemsize >= col_type.itemsize:
                break
            limits = limits_of(candidate)
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    print(f"Memory usage after optimization is: {end_mem:.2f} MB")
    # Guard the percentage against a frame that reports zero bytes.
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print(f"Decreased by {decrease:.2f}%")

    return df

In [3]:
# Column that the exported y_* files contain.
target = "responder_6"

# Root directory that every data path in this notebook is built from.
# NOTE(review): absolute, machine-specific path — change it before running
# on another machine.
main = "C:/Users/edmun/OneDrive/Desktop/Personal-Projects/Kaggle/Jane Street Real Time Market Data Forecasting/"

# Rows whose date_id is below this value are dropped after loading.
skip_dates = 500

# Names of the 79 raw feature columns: feature_00 ... feature_78.
feature_names = ["feature_{:02d}".format(idx) for idx in range(79)]

# Derived time-of-day columns that are added to the frame after loading.
exogeneous_features = [
    "sin_time_id",
    "cos_time_id",
    "sin_time_id_halfday",
    "cos_time_id_halfday",
]

In [4]:
# Read the ten training partitions one at a time. Each partition has its
# missing values replaced by 0, is indexed by date_id, and is downcast
# before being collected.
partition_frames = []
for partition_id in range(10):
    partition_path = (
        f"{main}/data/train.parquet/partition_id={partition_id}/part-0.parquet"
    )
    partition = pd.read_parquet(partition_path).fillna(0).set_index("date_id")
    partition_frames.append(reduce_mem_usage(partition))

full = pd.concat(partition_frames)
# Drop the per-partition references so only the concatenated frame stays alive.
del partition_frames, partition

# Sine/cosine encodings of time_id with periods 967 and 483.
time_phase = 2 * np.pi * full["time_id"]
for suffix, period in (("", 967), ("_halfday", 483)):
    full[f"sin_time_id{suffix}"] = np.sin(time_phase / period)
    full[f"cos_time_id{suffix}"] = np.cos(time_phase / period)
del time_phase

# Keep only rows whose date_id (the index) is at least skip_dates.
full = full.loc[full.index >= skip_dates]

Memory usage of dataframe is 654.51 MB
Memory usage after optimization is: 335.60 MB
Decreased by 48.73%
Memory usage of dataframe is 944.04 MB
Memory usage after optimization is: 484.06 MB
Decreased by 48.73%
Memory usage of dataframe is 1022.35 MB
Memory usage after optimization is: 524.21 MB
Decreased by 48.73%
Memory usage of dataframe is 1352.24 MB
Memory usage after optimization is: 693.36 MB
Decreased by 48.73%
Memory usage of dataframe is 1690.96 MB
Memory usage after optimization is: 867.04 MB
Decreased by 48.73%
Memory usage of dataframe is 1800.46 MB
Memory usage after optimization is: 923.18 MB
Decreased by 48.73%
Memory usage of dataframe is 2088.53 MB
Memory usage after optimization is: 1070.89 MB
Decreased by 48.73%
Memory usage of dataframe is 2132.85 MB
Memory usage after optimization is: 1093.61 MB
Decreased by 48.73%
Memory usage of dataframe is 2067.02 MB
Memory usage after optimization is: 1059.86 MB
Decreased by 48.73%
Memory usage of dataframe is 2112.32 MB
Memor

In [5]:
# Cross-validation layout: the number of folds, and how many of the final
# dates are held out for validation.
N_fold = 5
num_valid_dates = 100

# Distinct index values (date_id) in the order they first appear in `full`.
# NOTE(review): "last" below means last in this order — presumably the
# partitions are stored in ascending date order; confirm.
dates = full.index.unique()

# Training gets everything except the final `num_valid_dates` dates;
# validation gets exactly those final `num_valid_dates` dates.
train_dates, valid_dates = dates[:-num_valid_dates], dates[-num_valid_dates:]

In [6]:
# Carve the loaded frame into three single-purpose frames that share its
# index: model inputs, the target column, and the `weight` column.
X = full.loc[:, [*feature_names, *exogeneous_features]]
Y = full.loc[:, [target]]
weight = full.loc[:, ["weight"]]

# The combined frame is no longer needed once the three pieces exist.
del full

In [8]:
def export_train():
    """Write the validation split and every training fold to parquet files.

    Reads the notebook-level ``X``, ``Y`` and ``weight`` frames (all indexed
    by date) together with ``valid_dates``, ``train_dates``, ``N_fold`` and
    ``main``.

    Files are written under ``{main}/data/training_data``; the directory is
    created if it does not exist yet:

    * ``X_valid`` / ``y_valid`` / ``w_valid`` — rows whose date is in
      ``valid_dates``.
    * ``X_train_{fold}`` / ``y_train_{fold}`` / ``w_train_{fold}`` for each
      ``fold`` in ``range(N_fold)`` — rows whose date is in ``train_dates``,
      excluding every date whose position in ``train_dates`` is congruent to
      ``fold`` modulo ``N_fold``.

    Returns nothing. Existing files with the same names are overwritten by
    ``to_parquet``.
    """
    # Local import keeps this cell runnable without touching the import cell.
    import os

    out_dir = f"{main}/data/training_data"
    # Without this, the first to_parquet call fails on a fresh checkout.
    os.makedirs(out_dir, exist_ok=True)

    def _write_split(keep_dates, suffix):
        """Write the X/y/w rows whose index is in ``keep_dates`` as ``*_{suffix}``."""
        for prefix, frame in (("X", X), ("y", Y), ("w", weight)):
            subset = frame[frame.index.isin(keep_dates)]
            subset.to_parquet(f"{out_dir}/{prefix}_{suffix}.parquet")
            # Release each subset before building the next one so only one
            # filtered copy is held in memory at a time.
            del subset

    _write_split(valid_dates, "valid")

    for fold in range(N_fold):
        # Leave out every N_fold-th training date, offset by the fold number.
        selected_dates = [
            date for ii, date in enumerate(train_dates) if ii % N_fold != fold
        ]
        _write_split(selected_dates, f"train_{fold}")

In [None]:
# Write the validation split and all N_fold training splits to parquet.
export_train()