In [1]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from typing import List
from pathlib import Path
import polars as pl
from scipy.stats import pearsonr
from sklearn.metrics import r2_score
import lightgbm as lgb


import os
for dirname, _, filenames in os.walk('/kaggle/'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


crypto_folder = Path("/kaggle/input/drw-crypto-market-prediction")

/kaggle/src/script.ipynb
/kaggle/lib/kaggle/gcp.py
/kaggle/input/drw-crypto-market-prediction/sample_submission.csv
/kaggle/input/drw-crypto-market-prediction/train.parquet
/kaggle/input/drw-crypto-market-prediction/test.parquet
/kaggle/huggingface/hf_scan.py
/kaggle/nbdev/export.sh
/kaggle/nbdev/settings.ini
/kaggle/working/__notebook__.ipynb


In [2]:

def get_clean_crypto_data(train: bool = True) -> pl.LazyFrame:
    """
    Load and clean crypto data, returning either train or test set.
    
    Args:
        train: If True, return training set. If False, return test set.
        
    Returns:
        Cleaned lazy frame with columns that have variance and no infinite values.
    """
    filename = "train.parquet" if train else "test.parquet"
    
    # Load data
    crypto_lazy = pl.scan_parquet(crypto_folder / filename)
    if train:
        crypto_lazy = crypto_lazy.with_columns(pl.col("__index_level_0__").alias("timestamp")).drop(["__index_level_0__"])
    # Get column names and calculate variance on training set (for consistency)
    train_lazy = pl.scan_parquet(crypto_folder / "train.parquet").with_columns(pl.col("__index_level_0__").alias("timestamp")).drop(["__index_level_0__"])

    crypto_var = train_lazy.select(pl.exclude(["timestamp"]).var()).collect()
    
    # Remove columns with zero variance
    crypto_cols = train_lazy.collect_schema().names()
    crypto_var_cols = crypto_var.select(pl.all() == 0.0).to_pandas().T.rename(
        columns={0: "is_variance_null"}
    ).reset_index().rename(columns={"index": "column_name"}).groupby(
        "is_variance_null"
    )["column_name"].unique()
    
    crypto_cols_with_var = crypto_var_cols[False]
    
    # Handle timestamp column presence
    if train:
        crypto_lazy_var = crypto_lazy.select(
            ["timestamp"] + [pl.col(c) for c in crypto_cols_with_var]
        )
    else:
        crypto_lazy_var = crypto_lazy.select(
            [pl.col(c) for c in crypto_cols_with_var if c in crypto_lazy.collect_schema().names()]
        )
    
    # Remove columns with infinite values (check on training set)
    train_lazy_var = train_lazy.select(
        ["timestamp"] + [pl.col(c) for c in crypto_cols_with_var]
    )
    current_columns = train_lazy_var.collect_schema().names()
    contains_infinite_cols = train_lazy_var.select(
        pl.exclude("timestamp").abs().max().is_infinite()
    ).collect().to_pandas().T.rename(
        columns={0: "contains_infinite"}
    ).reset_index().rename(columns={"index": "column_name"}).groupby(
        "contains_infinite"
    )["column_name"].unique()
    
    clean_columns = [c for c in current_columns if c in contains_infinite_cols[False]] + ["timestamp"]    
    # Filter clean columns based on what's available in the current dataset
    available_columns = crypto_lazy.collect_schema().names()
    final_columns = [c for c in clean_columns if c in available_columns]
    
    return crypto_lazy.select(final_columns)

In [3]:
stats_columns = ["timestamp", "bid_qty", "ask_qty", "buy_qty", "sell_qty", "volume", "label"]
stats_columns_test = ["bid_qty", "ask_qty", "buy_qty", "sell_qty", "volume", "label"]
X_exclude = ["timestamp", "label"]

crypto_lazy_clean = get_clean_crypto_data(train=True)

def get_diff_features(df:pl.LazyFrame, stats_columns:List[str]):
    return df.with_columns(pl.exclude(stats_columns).diff()).with_row_index().fill_null(strategy="backward").select(pl.exclude("index"))

crypto_lazy_clean = crypto_lazy_clean.join(
        get_diff_features(crypto_lazy_clean, stats_columns),
        on=stats_columns,
        suffix="_diff"
)


In [4]:
X = crypto_lazy_clean.select(pl.exclude(X_exclude)).collect().to_numpy()
y = crypto_lazy_clean.select(pl.col("label")).collect().to_numpy().T[0]


In [5]:
lgb_model = lgb.LGBMRegressor(random_state=42, weight=np.flip(1. / np.sqrt(np.arange(1, len(X)))))
lgb_model.fit(X, y)

Please use weight argument of the Dataset constructor to pass this parameter.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 16.748068 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 399056
[LightGBM] [Info] Number of data points in the train set: 525886, number of used features: 1565
[LightGBM] [Info] Start training from score 0.036126


In [6]:
crypto_lazy_test = get_clean_crypto_data(train=False)

crypto_lazy_test = crypto_lazy_test.join(
        get_diff_features(crypto_lazy_test, stats_columns_test),
        on=stats_columns_test,
        suffix="_diff"
)

# crypto_lazy_test = get_diff_features(crypto_lazy_test, stats_columns_test)
assert "timestamp" not in crypto_lazy_test.collect_schema().names()

In [7]:
X_test = crypto_lazy_test.select(pl.exclude(["label"])).collect().to_numpy()
X_test.shape, X.shape


((538282, 1565), (525886, 1565))

In [8]:
del X
del y

In [9]:
y_hat_lgb_test = lgb_model.predict(X_test)

del X_test

In [10]:
n = crypto_lazy_test.select(pl.len()).collect().item()
crypto_lazy_test = crypto_lazy_test.with_columns(
    ID=range(1, n+1),
    prediction=y_hat_lgb_test
)
crypto_lazy_test.head(5).collect()

bid_qty,ask_qty,buy_qty,sell_qty,volume,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,X11,X12,X13,X14,X15,X16,X17,X18,X19,X20,X21,X22,X23,X24,X25,X26,X27,X28,X29,X30,X31,X32,…,X746_diff,X747_diff,X748_diff,X749_diff,X750_diff,X751_diff,X752_diff,X753_diff,X754_diff,X755_diff,X756_diff,X757_diff,X758_diff,X759_diff,X760_diff,X761_diff,X762_diff,X763_diff,X764_diff,X765_diff,X766_diff,X767_diff,X768_diff,X769_diff,X770_diff,X771_diff,X772_diff,X773_diff,X774_diff,X775_diff,X776_diff,X777_diff,X778_diff,X779_diff,X780_diff,ID,prediction
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i64,f64
0.317,8.102,13.164,10.272,23.436,-0.341229,0.041851,-0.020094,-0.206221,-0.297124,-0.222267,-0.197837,-0.203911,-0.655944,0.007174,-0.329925,-0.596616,-0.625749,-0.323394,-0.263399,-0.305465,0.483037,0.086237,0.229379,0.433586,0.526652,0.422905,0.108377,0.063136,0.47397,-0.069006,0.234467,0.447948,0.234189,0.016555,0.011871,0.136422,…,0.748208,1.47553,0.180672,0.359242,0.745627,1.459644,-0.105404,0.871998,-0.208738,0.81882,-1.627032,-0.281315,-2.392244,-0.46812,1.019065,-2.491853,1.788676,-0.067634,-0.051776,0.071816,0.150989,0.419495,0.615237,0.642499,1.231117,0.185897,0.021208,-0.033673,-2.22484,-2.265916,-2.217028,-1.841349,-0.920374,-0.548437,-0.262667,1,-0.127187
2.608,2.111,123.562,40.163,163.725,-1.029564,-1.382505,-1.214935,-1.020241,-0.960397,-1.048605,-1.100512,-1.125502,-0.948648,-1.382813,-1.155675,-0.938154,-0.884723,-1.07428,-1.173479,-1.18582,0.525631,-1.659673,-1.297677,-1.141278,-1.209288,-1.362582,-0.945602,-0.444541,2.148573,-0.847346,-0.580285,-0.427531,-0.449724,-0.339681,0.256421,0.868983,…,0.748208,1.47553,0.180672,0.359242,0.745627,1.459644,-0.105404,0.871998,-0.208738,0.81882,-1.627032,-0.281315,-2.392244,-0.46812,1.019065,-2.491853,1.788676,-0.067634,-0.051776,0.071816,0.150989,0.419495,0.615237,0.642499,1.231117,0.185897,0.021208,-0.033673,-2.22484,-2.265916,-2.217028,-1.841349,-0.920374,-0.548437,-0.262667,2,0.24678
2.768,10.787,126.137,118.266,244.403,-2.59409,-5.486158,-4.744466,-3.930152,-3.275324,-2.795483,-2.697029,-2.631111,-0.930428,-4.773587,-3.225123,-2.140452,-1.375326,-0.993945,-1.053773,-1.062467,1.31847,-2.916145,-0.322479,0.51916,0.77634,0.483633,0.278665,0.351723,0.763206,-2.738539,-0.18359,0.544262,0.767578,0.475454,0.100709,-0.209634,…,-0.766864,-1.512322,-0.182986,-0.363845,0.132477,-0.416717,-0.046599,-1.177687,0.299605,-0.547413,0.821252,-0.050887,0.297369,0.066596,0.119275,0.963983,-1.606402,1.719548,1.803197,1.524652,1.124052,-0.008325,-0.400058,-0.212477,-0.146789,2.471421,-0.021047,0.046463,-0.00046,-0.000573,-0.010408,-0.076379,-0.150048,-0.114673,-0.170911,3,-1.220725
0.948,12.157,16.069,31.723,47.792,0.240745,0.997585,1.028965,1.081052,0.811895,0.140567,-0.019137,0.019558,0.123288,-0.036963,0.238274,0.294708,0.174069,-0.19868,-0.276774,-0.192456,-0.69019,-0.296766,-0.15669,-0.207432,-0.272406,-0.306263,-0.334957,-0.319814,-1.055234,-0.506583,-0.333547,-0.30927,-0.248058,-0.257161,-0.377619,-0.552696,…,0.0,0.0,0.0,0.0,-0.878104,-1.042928,0.152003,-0.07227,-0.364937,-0.271429,-0.207717,-0.06012,2.065504,0.203831,-2.472034,1.079878,-1.480635,-1.773439,-1.885815,-1.649609,-1.357253,-1.07245,-1.206275,-1.541288,-0.399608,-1.606766,0.011013,-0.002753,2.224681,2.269296,2.314533,2.376828,2.551521,2.762865,2.995576,4,0.214973
1.084,3.493,32.679,37.327,70.006,0.067189,0.772852,0.772152,0.714846,0.514422,0.099683,-0.085127,-0.163693,-0.961655,0.332133,0.294928,0.132711,-0.241271,-0.942707,-1.209709,-1.300811,2.070281,0.226949,-0.006069,-0.267187,-0.561779,-0.747072,-0.444194,0.172938,2.35154,0.016358,-0.154874,-0.244417,-0.289631,-0.244339,0.038124,0.589887,…,0.0,0.0,0.0,0.0,-0.059186,-0.397288,-1.483119,-0.401339,-1.753348,-0.647301,-0.845195,-0.490943,-1.031199,-0.494127,3.337735,-0.686526,1.198963,0.063987,0.054407,0.116229,0.269355,0.962146,1.441141,1.469601,-0.510524,-1.048032,-0.082253,0.028465,-2.224328,-2.268671,-2.297532,-2.278039,-2.38532,-2.611811,-2.602539,5,0.166043


In [11]:
crypto_lazy_test.select([pl.col("ID"), pl.col("prediction")]).collect().write_csv(Path("submission.csv"))