In [3]:
import numpy as np
import pandas as pd 
from typing import List
from pathlib import Path
import polars as pl
from scipy.stats import pearsonr
from sklearn.metrics import r2_score
import lightgbm as lgb
import os

# List every file found under the Kaggle mount point.
for root, _subdirs, files in os.walk('/kaggle/'):
    for name in files:
        print(os.path.join(root, name))

KAGGLE = False  # define paths accordingly
SUBMISSION = False  # use smaller datasets during dev

# Dataset directory for the environment selected by KAGGLE.
crypto_folder = (
    Path("/kaggle/input/drw-crypto-market-prediction")
    if KAGGLE
    else Path("../raw_data/crypto")
)

- TODO: add a train / test split
- TODO: report scores on the test split
- TODO: retry linear regression
- TODO: try combining linear regression with the LightGBM model

In [None]:

def _scan_crypto_parquet(filename: str, rename_index: bool) -> pl.LazyFrame:
    """Lazily scan a parquet file from `crypto_folder`, optionally exposing its index as `timestamp`."""
    index_col = "__index_level_0__"
    lazy = pl.scan_parquet(crypto_folder / filename)
    # Only rename when the index column is really there, so the same code works
    # for files that already carry a `timestamp` column.
    if rename_index and index_col in lazy.collect_schema().names():
        lazy = lazy.with_columns(pl.col(index_col).alias("timestamp")).drop([index_col])
    return lazy


def get_clean_crypto_data(train: bool = True) -> pl.LazyFrame:
    """
    Load and clean crypto data, returning either train or test set.

    The columns to keep are always decided on the training set, so both sets
    end up with the same features:

    1. columns whose training variance is zero (or undefined) are dropped;
    2. of the remaining ones, columns whose training values contain an
       infinite value are dropped.

    The training file's `__index_level_0__` column, when present, is exposed
    as `timestamp`.

    Args:
        train: If True, return training set. If False, return test set.

    Returns:
        Lazy frame over the requested file restricted to the clean training
        columns (followed by `timestamp`) that exist in that file.
    """
    crypto_lazy = _scan_crypto_parquet(
        "train.parquet" if train else "test.parquet", rename_index=train
    )
    # The reference frame must exist before any statistic is computed on it;
    # reuse the frame we already have when the training set was requested.
    train_lazy = crypto_lazy if train else _scan_crypto_parquet("train.parquet", rename_index=True)

    # Per-column variance on the training set (single-row result).
    variances = train_lazy.select(pl.exclude(["timestamp"]).var()).collect()
    variance_by_col = dict(zip(variances.columns, variances.row(0)))

    cols_no_var = [c for c, v in variance_by_col.items() if v is not None and v == 0.0]
    # A null variance means it could not be computed; such columns are dropped too.
    cols_with_var = [c for c, v in variance_by_col.items() if v is not None and v != 0.0]

    if cols_no_var:
        print(f"Columns with no variance : {cols_no_var}")
    else:
        print("All columns have variance in the train set")

    # Remove columns with infinite values (check on training set).
    finite_cols: List[str] = []
    if cols_with_var:
        inf_flags = train_lazy.select(
            pl.col(cols_with_var).abs().max().is_infinite()
        ).collect()
        contains_infinite = dict(zip(inf_flags.columns, inf_flags.row(0)))
        # Keep only columns positively known to be finite (flag is exactly False).
        finite_cols = [c for c in cols_with_var if contains_infinite[c] is False]

    # Filter clean columns based on what's available in the requested dataset.
    available_columns = set(crypto_lazy.collect_schema().names())
    final_columns = [c for c in finite_cols + ["timestamp"] if c in available_columns]

    return crypto_lazy.select(final_columns)

In [None]:
# Non-feature columns shared by the train and test frames.
_shared_stats_columns = ["bid_qty", "ask_qty", "buy_qty", "sell_qty", "volume", "label"]

# Columns left untouched by the diff features; the two sets differ only in their row key.
stats_columns = ["timestamp", *_shared_stats_columns]
stats_columns_test = ["ID", *_shared_stats_columns]

# Columns removed before building the model input matrices.
X_exclude = ["timestamp", "label"]
X_test_exclude = ["ID", "label"]

# Cleaned training set, as a lazy frame.
crypto_lazy_clean = get_clean_crypto_data(train=True)

In [None]:
def get_diff_features(df: pl.LazyFrame, stats_columns: List[str], suffix: str = "") -> pl.LazyFrame:
    """
    Compute first-order differences of every column not listed in `stats_columns`.

    The leading null produced by `diff` (and any other null in a differenced
    column) is filled with the next non-null value. Columns listed in
    `stats_columns` are passed through untouched.

    Args:
        df: Input lazy frame.
        stats_columns: Columns that must not be differenced.
        suffix: When empty (default), each differenced column replaces the
            original one. Otherwise the differences are appended as new
            columns named `<column><suffix>` and the originals are kept.

    Returns:
        Lazy frame with the same rows, in the same order, as `df`.
    """
    diffs = pl.exclude(stats_columns).diff().fill_null(strategy="backward")
    if suffix:
        diffs = diffs.name.suffix(suffix)
    return df.with_columns(diffs)


# Append the `<column>_diff` features directly instead of joining the frame with
# a differenced copy of itself: no row can be dropped or duplicated by key
# matching, and the row order is preserved.
# NOTE: re-running this cell on the already augmented frame would also
# difference the `_diff` columns.
crypto_lazy_clean = get_diff_features(crypto_lazy_clean, stats_columns, suffix="_diff")

In [None]:
# Materialise the lazy plan once and derive both arrays from that single frame:
# features and target are then guaranteed to come from the same rows in the
# same order, and the plan is not executed twice.
train_frame = crypto_lazy_clean.collect()
X = train_frame.select(pl.exclude(X_exclude)).to_numpy()
y = train_frame.get_column("label").to_numpy()

# Only the numpy arrays are needed from here on.
del train_frame

In [None]:
# Per-row weights growing with the row position: the first row gets
# 1/sqrt(len(X)) and the last row gets 1.
# NOTE(review): this favours recent observations only if X is in chronological order — confirm.
sample_weight = np.flip(1.0 / np.sqrt(np.arange(1, len(X) + 1)))

# Per-sample weights must be given to `fit`; as a constructor keyword they are
# not treated as sample weights. The vector has exactly one entry per row of X.
lgb_model = lgb.LGBMRegressor(random_state=42)
lgb_model.fit(X, y, sample_weight=sample_weight)

In [None]:
crypto_lazy_test = get_clean_crypto_data(train=False)

# Number of test rows; reused for the sanity check below and by later cells.
n = crypto_lazy_test.select(pl.len()).collect().item()

# create unique row identifier: 1-based position of each row, built as a lazy
# expression rather than from a Python range literal.
crypto_lazy_test = crypto_lazy_test.with_columns(
    pl.int_range(1, pl.len() + 1).alias("ID")
)

# Adding a column does not change the row count, so no extra collect is needed.
print(n)

# Append the `<column>_diff` features in place. Unlike joining the frame with a
# differenced copy of itself, this cannot drop, duplicate or reorder rows.
crypto_lazy_test = crypto_lazy_test.with_columns(
    pl.exclude(stats_columns_test)
    .diff()
    .fill_null(strategy="backward")
    .name.suffix("_diff")
)

# The feature step must leave the number of rows untouched.
assert n == crypto_lazy_test.select(pl.len()).collect().item()

In [None]:
# Test feature matrix: every column except the row id and the label.
X_test = (
    crypto_lazy_test
    .select(pl.exclude(X_test_exclude))
    .collect()
    .to_numpy()
)

# Show both shapes side by side.
X_test.shape, X.shape

In [None]:
# Drop the references to the training arrays now that the model is fitted.
del X, y

In [None]:
# Predict the target for every row of the test feature matrix with the fitted LightGBM model.
y_hat_lgb_test = lgb_model.predict(X_test)

# The feature matrix is not used after prediction; drop the reference.
del X_test

In [None]:
# Attach the predictions as an explicitly named column.
# `ID` was assigned when the test frame was built and has travelled with its
# row since then, so it is deliberately not regenerated here: renumbering by
# position would mislabel rows if the frame's row order differed from the
# order in which the ids were first assigned.
crypto_lazy_test = crypto_lazy_test.with_columns(
    pl.Series("prediction", y_hat_lgb_test)
)
crypto_lazy_test.head(5).collect()

In [None]:
# Materialise only the two submission columns and write them to submission.csv.
(
    crypto_lazy_test
    .select("ID", "prediction")
    .collect()
    .write_csv(Path("submission.csv"))
)