In [1]:
import numpy as np
import pandas as pd
from typing import List
from pathlib import Path
import polars as pl
from scipy.stats import pearsonr
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
import lightgbm as lgb
import os

# Debug aid: list every file reachable under the Kaggle root directory.
for folder, _subdirs, file_names in os.walk("/kaggle/"):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# Run configuration.
# KAGGLE: selects the Kaggle input directory instead of the local data folder.
KAGGLE = True
# SUBMISSION: when False, the training cell holds out 20% of the rows for
# validation; when True, the models are fitted on every labelled row.
SUBMISSION = False

crypto_folder = (
    Path("/kaggle/input/drw-crypto-market-prediction")
    if KAGGLE
    else Path("../raw_data/crypto")
)

/kaggle/src/script.ipynb
/kaggle/lib/kaggle/gcp.py
/kaggle/input/drw-crypto-market-prediction/sample_submission.csv
/kaggle/input/drw-crypto-market-prediction/train.parquet
/kaggle/input/drw-crypto-market-prediction/test.parquet
/kaggle/huggingface/hf_scan.py
/kaggle/nbdev/export.sh
/kaggle/nbdev/settings.ini
/kaggle/working/__notebook__.ipynb


- train / test split 
- test scores
- retry linear regression
- try adding linear regression to lgb model

In [2]:
def _index_as_timestamp(frame: pl.LazyFrame) -> pl.LazyFrame:
    """Expose the parquet index column `__index_level_0__` under the name `timestamp`."""
    return frame.with_columns(
        pl.col("__index_level_0__").alias("timestamp")
    ).drop(["__index_level_0__"])


def _group_columns_by_flag(flags: pl.LazyFrame, flag_name: str) -> pd.Series:
    """
    Group the column names of a one-row boolean frame by the value they hold.

    Args:
        flags: Lazy frame whose first row holds one boolean per column.
        flag_name: Name given to the boolean column in the intermediate table.

    Returns:
        pandas Series indexed by flag value whose entries are arrays of column
        names. A flag value taken by no column is absent from the index, so
        looking it up raises KeyError.
    """
    return (
        flags.first()
        .collect()
        .to_pandas()
        .T.rename(columns={0: flag_name})
        .reset_index()
        .rename(columns={"index": "column_name"})
        .groupby(flag_name)["column_name"]
        .unique()
    )


def get_clean_crypto_data(train: bool = True) -> pl.LazyFrame:
    """
    Load and clean crypto data, returning either train or test set.

    The column filters (zero variance, infinite values) are always computed on
    the training parquet, so the train and test frames keep the same feature
    columns. A short summary of the filtering is printed.

    Args:
        train: If True, return training set. If False, return test set.

    Returns:
        Lazy frame restricted to the columns that have variance and no infinite
        values in the training set, followed by `timestamp` and/or `ID` when
        those columns exist in the loaded frame.

    Raises:
        KeyError: If every training column has zero variance, or every
            remaining column contains infinite values.
    """

    filename = "train.parquet" if train else "test.parquet"

    # load data
    crypto_lazy = pl.scan_parquet(crypto_folder / filename)

    if train and KAGGLE:
        crypto_lazy = _index_as_timestamp(crypto_lazy)
    # NOTE(review): outside Kaggle the train file is presumably stored with a
    # `timestamp` column already — confirm against the local data.

    if not train:
        # Test rows are identified by a synthetic 1-based ID column.
        crypto_lazy = crypto_lazy.with_columns(
            ID=range(1, crypto_lazy.select(pl.len()).collect().item() + 1)
        )

    # Captured after the identifier column is in place, so that the "removed"
    # count below compares like with like (it used to report -1 for the test
    # set because the added ID column was not counted).
    available_columns = crypto_lazy.collect_schema().names()

    # The filters are derived from the training set (for consistency).
    train_lazy = pl.scan_parquet(crypto_folder / "train.parquet")
    if KAGGLE:
        train_lazy = _index_as_timestamp(train_lazy)

    # Remove columns with zero variance in the training set
    crypto_var = train_lazy.select(pl.exclude(["timestamp"]).var())
    crypto_var_cols = _group_columns_by_flag(
        crypto_var.select(pl.all() == 0.0), "is_variance_null"
    )

    crypto_cols_with_var = crypto_var_cols[False]

    try:
        cols_no_var = crypto_var_cols[True]
        print(f"Columns with no variance : {cols_no_var}")
    except KeyError:
        print("All columns have variance in the train set")

    train_lazy = train_lazy.select(
        ["timestamp"] + [pl.col(c) for c in crypto_cols_with_var]
    )

    # Remove columns with infinite values (check on training set)
    current_columns = train_lazy.collect_schema().names()
    contains_infinite_cols = _group_columns_by_flag(
        train_lazy.select(pl.exclude("timestamp").abs().max().is_infinite()),
        "contains_infinite",
    )

    try:
        cols_with_inf_vals = contains_infinite_cols[True]
        print(f"Columns with infinite values : {cols_with_inf_vals}")
    except KeyError:
        print("No columns with infinite values")

    # A set gives constant-time membership tests instead of scanning the
    # array of names once per column.
    finite_columns = set(contains_infinite_cols[False])

    # Keep the training-set column order, then the identifier columns; only
    # names actually present in the loaded frame survive.
    clean_columns = [c for c in current_columns if c in finite_columns] + [
        "timestamp",
        "ID",
    ]
    final_columns = [c for c in clean_columns if c in available_columns]
    n_removed = len(available_columns) - len(final_columns)
    print(f"Eventually {len(final_columns)}, removed {n_removed}")

    return crypto_lazy.select(final_columns)

In [3]:
# Order-book / trade quantity columns shared by the train and test layouts.
_qty_columns = ["bid_qty", "ask_qty", "buy_qty", "sell_qty", "volume"]

# Non-feature ("stats") columns: row identifier, quantities and the label.
# The train set is keyed by `timestamp`, the test set by `ID`.
stats_columns = ["timestamp", *_qty_columns, "label"]
stats_columns_test = ["ID", *_qty_columns, "label"]

# Columns left out of the model feature matrices.
X_exclude = ["timestamp", "label"]
X_test_exclude = ["ID", "label"]

# Lazily load the cleaned training set; the call prints a summary of the
# columns it filtered out.
crypto_lazy_clean = get_clean_crypto_data(train=True)

All columns have variance in the train set
No columns with infinite values
Eventually 787, removed 0


In [4]:
def get_diff_features(df: pl.LazyFrame, stats_columns: List[str]) -> pl.LazyFrame:
    """
    Replace every non-stats column by its first difference.

    Args:
        df: Input lazy frame; rows are differenced in their existing order.
        stats_columns: Columns to leave untouched (not differenced).

    Returns:
        Lazy frame with the same columns as `df`, where each column outside
        `stats_columns` holds row-to-row differences. Nulls (including the
        first row left empty by the difference) are filled backward; note that
        the fill applies to every column, not only the differenced ones.
    """
    # The previous with_row_index() / select(pl.exclude("index")) pair added a
    # column only to drop it again, so it is omitted.
    return df.with_columns(pl.exclude(stats_columns).diff()).fill_null(
        strategy="backward"
    )


# join level with diff values
# crypto_lazy_clean = crypto_lazy_clean.join(
#     get_diff_features(crypto_lazy_clean, stats_columns),
#     on=stats_columns,
#     how="inner",
#     suffix="_diff",
# )

In [5]:
# Feature matrix: every column except the identifier and the label.
features_lazy = crypto_lazy_clean.select(pl.exclude(X_exclude))
X = features_lazy.collect().to_numpy()

# Target vector: the single `label` column flattened to one dimension.
label_lazy = crypto_lazy_clean.select(pl.col("label"))
y = label_lazy.collect().to_numpy()[:, 0]

In [6]:
# test with the linear regression

# crypto_lazy_clean.select(pl.exclude(stats_columns).mean()).collect()

# from sklearn.linear_model import Lasso


In [7]:
from sklearn.ensemble import RandomForestRegressor

In [8]:
if SUBMISSION:
    # Submission run: fit on every labelled row, nothing is held out.
    X_train, y_train = X, y
else:
    # Development run: keep 20% of the rows aside for validation.
    # TODO: question the shuffle — are timestamps really independent draws?
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, shuffle=True, random_state=42
    )

# Only the split arrays are used from here on; drop the full-array names.
del X, y

# Weight given to the first-stage prediction in the two-stage blend.
lr = 0.5

# First stage. The variable is named `lin` but holds a random forest; the name
# is kept because the prediction cells below refer to it.
rf_params = dict(
    n_estimators=80,
    max_depth=10,
    min_samples_split=100,
    min_samples_leaf=50,
    max_features="sqrt",
    max_samples=0.5,
    n_jobs=-1,
    random_state=41,
)
lin = RandomForestRegressor(**rf_params)
lin.fit(X_train, y_train)

# In-sample fit quality of the first stage alone.
y_train_lin = lin.predict(X_train)

train_lin_r2 = r2_score(y_train, y_train_lin)
print(f"R2 train lin: {train_lin_r2}")
train_lin_pearson = pearsonr(y_train, y_train_lin)
print(f"Pearson train lin : {train_lin_pearson}")

# Target for the second stage: what is left of the label once the scaled
# first-stage prediction has been subtracted.
y_train_res = y_train - lr * y_train_lin


# Second stage: LightGBM with default hyper-parameters, fitted on the
# residual target left by the first stage.
lgb_model = lgb.LGBMRegressor(n_jobs=-1, random_state=42)
lgb_model.fit(X_train, y_train_res)

y_train_hat = lgb_model.predict(X_train)

# Final prediction = residual model + scaled first-stage prediction.
y_train_blend = y_train_hat + lr * y_train_lin

print(f"R2 train : {r2_score(y_train, y_train_blend)}")
print(f"Pearson train : {pearsonr(y_train, y_train_blend)}")

if not SUBMISSION:
    # Hold-out scores of the first stage on its own.
    y_test_lin = lin.predict(X_test)
    test_lin_r2 = r2_score(y_test, y_test_lin)
    print(f"R2 test lin : {test_lin_r2}")
    test_lin_pearson = pearsonr(y_test, y_test_lin)
    print(f"Pearson test lin : {test_lin_pearson}")

    # Hold-out scores of the blended two-stage prediction.
    y_test_hat = lgb_model.predict(X_test)
    y_test_blend = y_test_hat + lr * y_test_lin
    print(f"R2 test : {r2_score(y_test, y_test_blend)}")
    print(f"Pearson test : {pearsonr(y_test, y_test_blend)}")

R2 train lin: 0.19989628780087498
Pearson train lin : PearsonRResult(statistic=0.5685028870623896, pvalue=0.0)
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 7.157776 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 200155
[LightGBM] [Info] Number of data points in the train set: 420708, number of used features: 785
[LightGBM] [Info] Start training from score 0.019267
R2 train : 0.5927372516597694
Pearson train : PearsonRResult(statistic=0.8245744740773004, pvalue=0.0)
R2 test lin : 0.19319237832814762
Pearson test lin : PearsonRResult(statistic=0.565625419890334, pvalue=0.0)
R2 test : 0.5794029519679587
Pearson test : PearsonRResult(statistic=0.8158649812269481, pvalue=0.0)


In [9]:
# Cleaned competition test set. get_clean_crypto_data(train=False) already
# attaches the 1-based `ID` row identifier, so it is not rebuilt here.
crypto_lazy_test = get_clean_crypto_data(train=False)

# Row count of the test set; reused below when the predictions are attached.
n = crypto_lazy_test.select(pl.len()).collect().item()
print(n)

# crypto_lazy_test = crypto_lazy_test.join(
#     get_diff_features(crypto_lazy_test, stats_columns_test),
#     on=stats_columns_test,
#     how="inner",
#     suffix="_diff",
# )

# crypto_lazy_test = get_diff_features(crypto_lazy_test, stats_columns_test)

# Guard for the (currently disabled) feature steps above: they must not change
# the number of rows.
assert n == crypto_lazy_test.select(pl.len()).collect().item()

All columns have variance in the train set
No columns with infinite values
Eventually 787, removed -1
538150


In [10]:
# Feature matrix of the competition test set (identifier and label removed).
# Note: this rebinds `X_test`, which held the validation split during training.
test_features_lazy = crypto_lazy_test.select(pl.exclude(X_test_exclude))
X_test = test_features_lazy.collect().to_numpy()
X_test.shape

(538150, 785)

In [11]:
# First-stage (`lin`, a random forest) and second-stage (LightGBM) predictions
# on the competition test features; they are blended in the next cell.
y_lin_test = lin.predict(X_test)
y_hat_lgb_test = lgb_model.predict(X_test)

# Release the test feature matrix; only the predictions are used below.
del X_test

In [12]:
# Blend the two stages the same way as during training:
# residual model output + scaled first-stage prediction.
test_prediction = y_hat_lgb_test + lr * y_lin_test

crypto_lazy_test = crypto_lazy_test.with_columns(
    ID=range(1, n + 1),
    prediction=test_prediction,
)
crypto_lazy_test.head(5).collect()

bid_qty,ask_qty,buy_qty,sell_qty,volume,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,X11,X12,X13,X14,X15,X16,X17,X18,X19,X20,X21,X22,X23,X24,X25,X26,X27,X28,X29,X30,X31,X32,…,X747,X748,X749,X750,X751,X752,X753,X754,X755,X756,X757,X758,X759,X760,X761,X762,X763,X764,X765,X766,X767,X768,X769,X770,X771,X772,X773,X774,X775,X776,X777,X778,X779,X780,label,ID,prediction
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i64,i64,f64
0.317,8.102,13.164,10.272,23.436,-0.341229,0.041851,-0.020094,-0.206221,-0.297124,-0.222267,-0.197837,-0.203911,-0.655944,0.007174,-0.329925,-0.596616,-0.625749,-0.323394,-0.263399,-0.305465,0.483037,0.086237,0.229379,0.433586,0.526652,0.422905,0.108377,0.063136,0.47397,-0.069006,0.234467,0.447948,0.234189,0.016555,0.011871,0.136422,…,0.007922,-0.005892,-0.01263,-0.873517,-0.840068,0.005786,0.083554,0.015444,-0.012708,0.381417,0.529008,0.902084,0.355764,0.368828,1.600727,0.093405,-0.465116,-0.516132,-0.653997,-0.725988,-0.436268,-0.238371,-0.032994,-1.497419,-0.525974,-0.147911,-0.043417,1.521787,1.548965,1.495735,1.16673,0.281056,-0.187831,-0.599553,0,1,0.285343
2.608,2.111,123.562,40.163,163.725,-1.029564,-1.382505,-1.214935,-1.020241,-0.960397,-1.048605,-1.100512,-1.125502,-0.948648,-1.382813,-1.155675,-0.938154,-0.884723,-1.07428,-1.173479,-1.18582,0.525631,-1.659673,-1.297677,-1.141278,-1.209288,-1.362582,-0.945602,-0.444541,2.148573,-0.847346,-0.580285,-0.427531,-0.449724,-0.339681,0.256421,0.868983,…,1.483452,0.17478,0.346611,-0.12789,0.619577,-0.099618,0.955552,-0.193293,0.806112,-1.245615,0.247692,-1.49016,-0.112356,1.387892,-0.891127,1.882082,-0.532749,-0.567907,-0.58218,-0.574999,-0.016773,0.376866,0.609505,-0.266302,-0.340076,-0.126703,-0.07709,-0.703054,-0.716951,-0.721292,-0.674619,-0.639318,-0.736268,-0.86222,0,2,0.70901
2.768,10.787,126.137,118.266,244.403,-2.59409,-5.486158,-4.744466,-3.930152,-3.275324,-2.795483,-2.697029,-2.631111,-0.930428,-4.773587,-3.225123,-2.140452,-1.375326,-0.993945,-1.053773,-1.062467,1.31847,-2.916145,-0.322479,0.51916,0.77634,0.483633,0.278665,0.351723,0.763206,-2.738539,-0.18359,0.544262,0.767578,0.475454,0.100709,-0.209634,…,-0.02887,-0.008207,-0.017233,0.004587,0.20286,-0.146217,-0.222135,0.106311,0.258699,-0.424363,0.196806,-1.19279,-0.04576,1.507167,0.072856,0.27568,1.186799,1.23529,0.942472,0.549053,-0.025098,-0.023192,0.397028,-0.41309,2.131345,-0.14775,-0.030627,-0.703514,-0.717525,-0.731701,-0.750998,-0.789366,-0.850941,-1.033131,0,3,-0.57488
0.948,12.157,16.069,31.723,47.792,0.240745,0.997585,1.028965,1.081052,0.811895,0.140567,-0.019137,0.019558,0.123288,-0.036963,0.238274,0.294708,0.174069,-0.19868,-0.276774,-0.192456,-0.69019,-0.296766,-0.15669,-0.207432,-0.272406,-0.306263,-0.334957,-0.319814,-1.055234,-0.506583,-0.333547,-0.30927,-0.248058,-0.257161,-0.377619,-0.552696,…,-0.02887,-0.008207,-0.017233,-0.873517,-0.840068,0.005786,-0.294406,-0.258626,-0.012731,-0.632079,0.136685,0.872713,0.15807,-0.964867,1.152734,-1.204955,-0.586641,-0.650525,-0.707137,-0.808201,-1.097548,-1.229467,-1.14426,-0.812698,0.524579,-0.136737,-0.03338,1.521167,1.551771,1.582833,1.62583,1.762155,1.911924,1.962445,0,4,0.272841
1.084,3.493,32.679,37.327,70.006,0.067189,0.772852,0.772152,0.714846,0.514422,0.099683,-0.085127,-0.163693,-0.961655,0.332133,0.294928,0.132711,-0.241271,-0.942707,-1.209709,-1.300811,2.070281,0.226949,-0.006069,-0.267187,-0.561779,-0.747072,-0.444194,0.172938,2.35154,0.016358,-0.154874,-0.244417,-0.289631,-0.244339,0.038124,0.589887,…,-0.02887,-0.008207,-0.017233,-0.932703,-1.237356,-1.477333,-0.695744,-2.011974,-0.660032,-1.477275,-0.354258,-0.158485,-0.336057,2.372868,0.466208,-0.005992,-0.522653,-0.596118,-0.590908,-0.538846,-0.135403,0.211673,0.325342,-1.323223,-0.523453,-0.218991,-0.004915,-0.703161,-0.7169,-0.714699,-0.652209,-0.623165,-0.699887,-0.640094,0,5,0.140561


In [13]:
# Materialise the two submission columns and write them to the working directory.
submission = crypto_lazy_test.select("ID", "prediction").collect()
submission.write_csv(Path("submission.csv"))