In [1]:
import pandas as pd
import polars as pl
import numpy as np
import os
import pyarrow as pa
from autogluon.tabular import TabularPredictor, TabularDataset
from autogluon.tabular.version import __version__
import torch
import gc
from sklearn.preprocessing import StandardScaler

import warnings

# Suppress every Python warning for the rest of the session to keep cell output readable.
warnings.filterwarnings(action="ignore")
# Never truncate the column list when a DataFrame is displayed.
pd.set_option("display.max_columns", None)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Report the AutoGluon version and whether PyTorch can see a CUDA device.
cuda_available = torch.cuda.is_available()
print(f"Autogluon Version: {__version__}")
print(f"CUDA Available: {cuda_available}")
if cuda_available:
    # Device index 0 is the first visible GPU.
    print(f"GPU Name: {torch.cuda.get_device_name(0)}")

Autogluon Version: 1.2
CUDA Available: True
GPU Name: NVIDIA GeForce RTX 3090


In [3]:
class CONFIG:
    """Notebook-wide settings: seed, target, input columns and data locations."""

    # Random seed value kept alongside the other settings.
    seed = 2025
    # Column the model is trained to predict.
    target_col = "responder_6"
    # Model inputs: feature_00 .. feature_78 followed by responder_0_lag_1 .. responder_8_lag_1.
    feature_cols = [
        *(f"feature_{i:02d}" for i in range(79)),
        *(f"responder_{i}_lag_1" for i in range(9)),
    ]
    # No column is declared categorical.
    categorical_cols = []
    # Parquet files holding the training and validation splits.
    train_path = "/root/autodl-tmp/jane-street-2024/training.parquet"
    valid_path = "/root/autodl-tmp/jane-street-2024/validation.parquet"

In [4]:
def _parquet_to_pandas(path):
    """Read a parquet file through a polars lazy scan and return it as a pandas DataFrame."""
    return pl.scan_parquet(path).collect().to_pandas()


# Load both splits fully into memory as pandas DataFrames.
train = _parquet_to_pandas(CONFIG.train_path)
valid = _parquet_to_pandas(CONFIG.valid_path)
print(f"Training Shape: {train.shape}, Validation Shape: {valid.shape}")

Training Shape: (25908520, 103), Validation Shape: (1341648, 103)


In [5]:
# Leaderboard trick: append the validation rows to the training data.
# This deliberately leaks the validation set into training, so `valid` can no
# longer serve as an unbiased hold-out for anything fitted on `train`.
# NOTE: re-running this cell appends `valid` again because `train` is rebound.
train = pd.concat([train, valid], ignore_index=True)
train.shape

(27250168, 103)

In [6]:
# Build the modelling frame in one pass:
#   1. keep the id columns, the model inputs, the target and the sample weight;
#   2. drop columns that are entirely null;
#   3. drop columns with at most one distinct non-null value;
#   4. fill remaining nulls from the previous row, then with 0 where no previous value exists.
# NOTE(review): ffill() follows row order only and is not grouped by symbol_id —
# confirm that carrying values across neighbouring rows is intended.
train_tmp = (
    train[["symbol_id", "time_id", *CONFIG.feature_cols, CONFIG.target_col, "weight"]]
    .dropna(axis=1, how='all')
    .pipe(lambda frame: frame.loc[:, frame.nunique() > 1])
    .ffill()
    .fillna(0)
)

In [7]:
# Standardize the model input columns with a StandardScaler fitted on train_tmp.
# Only feature columns still present in train_tmp are scaled: the earlier pruning
# of all-null / constant columns may have removed some of CONFIG.feature_cols, and
# indexing with the full list would then raise a KeyError.
scaler = StandardScaler()
scaled_cols = [col for col in CONFIG.feature_cols if col in train_tmp.columns]
dropped_cols = [col for col in CONFIG.feature_cols if col not in train_tmp.columns]
if dropped_cols:
    # Make the mismatch visible instead of failing or silently ignoring it.
    print(f"Feature columns missing after pruning (not scaled): {dropped_cols}")
if scaled_cols:
    # `scaled_cols` records the exact column order the scaler was fitted on.
    train_tmp[scaled_cols] = scaler.fit_transform(train_tmp[scaled_cols])

In [8]:
# Wrap the prepared frame in AutoGluon's TabularDataset and preview its first rows.
train_automd = TabularDataset(train_tmp)
train_automd.head(n=5)

Unnamed: 0,symbol_id,time_id,feature_00,feature_01,feature_02,feature_03,feature_04,feature_05,feature_06,feature_07,feature_08,feature_09,feature_10,feature_11,feature_12,feature_13,feature_14,feature_15,feature_16,feature_17,feature_18,feature_19,feature_20,feature_21,feature_22,feature_23,feature_24,feature_25,feature_26,feature_27,feature_28,feature_29,feature_30,feature_31,feature_32,feature_33,feature_34,feature_35,feature_36,feature_37,feature_38,feature_39,feature_40,feature_41,feature_42,feature_43,feature_44,feature_45,feature_46,feature_47,feature_48,feature_49,feature_50,feature_51,feature_52,feature_53,feature_54,feature_55,feature_56,feature_57,feature_58,feature_59,feature_60,feature_61,feature_62,feature_63,feature_64,feature_65,feature_66,feature_67,feature_68,feature_69,feature_70,feature_71,feature_72,feature_73,feature_74,feature_75,feature_76,feature_77,feature_78,responder_0_lag_1,responder_1_lag_1,responder_2_lag_1,responder_3_lag_1,responder_4_lag_1,responder_5_lag_1,responder_6_lag_1,responder_7_lag_1,responder_8_lag_1,responder_6,weight
0,0,0,-0.896643,-0.610005,-0.995681,-0.954077,-2.76067,0.288205,1.873081,0.09493,-0.392002,-0.916119,0.645358,-0.558358,-0.687803,0.591037,-0.324128,0.631278,-0.167471,0.603672,-1.993188,-1.127251,1.023338,-0.565472,0.783994,0.714467,-0.454944,-0.406669,1.024027,2.409864,0.918978,0.266725,0.138662,-0.768144,-0.565228,0.031462,-1.699263,-1.820564,-1.942219,-0.165838,-0.594687,-0.077806,2.282816,-0.110153,0.143219,-0.302822,0.20446,-1.748215,1.440617,0.900254,0.359193,0.432366,0.118912,2.8494,0.198372,-0.131024,-1.300356,-0.167255,-0.938577,1.329146,0.030262,3.077584,1.163163,1.223827,0.096157,0.536366,0.096771,-1.450948,-1.371611,-0.826808,0.351071,-0.327754,-1.158452,1.113587,-0.564873,0.029608,0.043557,-0.223579,-0.25648,-0.345467,-0.327794,-0.44544,-1.046363,-0.536402,0.045927,-0.016934,0.20516,0.878791,1.004436,0.89257,0.496563,3.324375
1,1,0,-1.002535,-0.706006,-1.017293,-1.153922,-2.469186,0.271089,2.589536,0.076709,-0.52345,-0.916119,0.645358,-0.558358,-1.149239,0.287937,-0.453126,0.631278,-0.480747,0.603672,-2.014348,-1.252671,0.892914,-0.116104,2.042073,1.164534,-0.395741,-0.178578,0.454699,1.535051,1.708965,-0.273597,-0.323041,-0.113742,-0.565228,0.031462,-0.905948,-1.255449,0.216441,-0.20246,-0.432976,-0.077806,1.016129,-0.110153,0.143219,-0.705592,0.20446,-1.786147,1.418459,0.320197,0.790277,0.576228,0.118912,1.954982,0.198372,-0.131024,1.166803,-0.167255,-1.794623,2.600993,0.030262,1.540443,1.029736,1.223827,-0.122236,-0.557287,-0.424072,-2.040446,-1.481852,-0.357632,0.2391,-0.526793,-0.822562,0.246273,-0.676566,0.029608,0.043557,-0.31811,-0.247092,-0.34512,-0.310614,-0.360236,0.069464,-0.4268,-0.265364,-0.233734,-0.595318,-1.540195,-1.183527,-0.89144,0.529877,4.711303
2,2,0,-1.230939,-1.120955,-1.031138,-1.084398,-2.506928,0.342476,2.920083,0.121616,-0.81501,2.074254,-0.92851,-0.663494,-0.876131,1.827359,-0.23561,0.631278,0.201888,0.603672,-1.71961,-1.829798,-1.11213,-0.581926,0.765638,0.274209,-1.270422,-1.140752,0.558969,1.79541,1.370559,-0.462291,0.07038,-0.360759,-0.565228,0.031462,-1.156181,-1.632147,-1.834713,0.1733,-0.021549,-0.077806,1.397663,-0.110153,0.143219,-2.071595,0.20446,-1.98908,1.013458,-0.467515,-1.626455,-2.462493,0.118912,1.65267,0.198372,-0.131024,-2.159296,-0.167255,-1.459922,1.221402,0.030262,-2.014698,-0.620301,1.223827,0.11893,0.222961,-0.16316,-1.637916,-1.879726,-1.109044,1.527212,-0.166839,-0.891068,1.303491,-0.307766,0.029608,0.043557,0.736153,0.489172,-0.058928,-0.04222,-0.382783,0.094874,-0.399741,-0.145459,-0.121599,0.207478,0.440513,0.418602,0.23063,0.746983,3.028847
3,3,0,-1.225174,-0.253832,-1.08393,-0.742088,-2.597986,0.218405,1.927774,0.069894,-0.381086,-1.215156,-0.613736,-0.960352,-1.061723,0.966756,-0.60459,0.631278,0.08586,0.603672,-1.811111,-1.983444,0.132673,0.987362,0.348089,-0.28552,1.08059,0.640839,-0.334247,-0.211743,-0.136426,-0.084064,-0.852502,0.905843,-0.565228,0.031462,-1.041331,-0.7163,-1.235662,-0.229454,-0.29219,-0.077806,1.570834,-0.110153,0.143219,0.317223,0.20446,-0.813999,1.3396,0.835345,3.895741,1.408002,0.118912,3.170727,0.198372,-0.131024,-0.101458,-0.167255,-0.829015,1.733263,0.030262,4.81656,1.746597,1.223827,-0.075761,0.771788,0.405967,-1.179221,-1.441056,-0.858955,0.90364,-0.40819,-1.124283,0.605557,-0.644534,0.029608,0.043557,3.529042,3.995298,0.624213,0.734843,3.654261,0.3874,0.61184,0.114229,0.128016,-0.016035,-0.353038,-0.390723,-0.383279,0.941218,2.099438
4,4,0,-0.972385,-0.344859,-1.172583,-1.072821,-2.549062,0.139879,0.962981,0.053145,-0.361565,-0.74524,-1.243284,-0.972721,-0.77621,0.169685,-0.652229,0.631278,0.065304,0.603672,-2.84022,-1.003263,-0.38732,1.272878,0.735418,-0.624473,3.802739,2.052129,-1.126798,-0.03714,0.769159,-0.17602,-0.290959,1.75024,-0.565228,0.031462,-1.026004,-2.075983,-3.734874,-0.17781,-0.38651,-0.077806,1.294092,-0.110153,0.143219,-0.748033,0.20446,-1.273917,1.744212,0.062925,0.743748,-0.095948,0.118912,2.894469,0.198372,-0.131024,-0.485311,-0.167255,-1.320898,1.955807,0.030262,3.437934,1.641095,1.223827,-0.010995,1.094658,0.07889,-1.341557,-1.309457,-1.273147,0.126753,-0.543311,-1.151026,0.139384,-0.642059,0.029608,0.043557,5.592292,4.160807,1.217082,0.683053,-0.124827,-0.120259,0.344871,-0.162151,-0.12774,0.047159,-0.523502,-0.452244,-0.441841,0.204584,3.166049


In [9]:
# Constructor arguments for the AutoGluon TabularPredictor.
predictor_kwargs = dict(
    label=CONFIG.target_col,      # column to predict
    eval_metric="r2",
    sample_weight="weight",       # name of the per-row weight column
    weight_evaluation=True,
    problem_type='regression',
    verbosity=4,
)

In [10]:
# Arguments passed to TabularPredictor.fit().
train_kwargs = {
    'train_data': train_automd,
    'presets': "medium_quality",
    'time_limit': 3600*9,  # 9 hours
    # Fit-time resources for the models; this is where the GPU is requested.
    'ag_args_fit': {
        'num_gpus': 1
    },
    'excluded_model_types': ['RF', 'XT', 'KNN', 'GBM', 'FASTAI', 'CAT'],
    # 'num_stack_levels': 1,
    # 'num_bag_folds': 3,
    'hyperparameters': {
        'XGB': {
            # 'extra_trees' was removed here: it is a LightGBM ('GBM') option,
            # not an XGBoost parameter.
            'ag_args': {
                # 'use_gpu' was removed: it is not a recognised ag_args key;
                # the GPU is requested through 'ag_args_fit' above.
                'name_suffix': 'XGB',
            },
        },
        # 'NN_TORCH' is AutoGluon's key for the PyTorch neural network
        # ('NeuralNet' is not a registered model key).
        'NN_TORCH': {
            'num_epochs': 50,  # NN_TORCH names this 'num_epochs', not 'epochs'
            'learning_rate': 1e-3,
            'ag_args': {
                'name_suffix': 'NN',
            },
        },
        # Add more models or tune specific models as needed
    },
    # 'hyperparameter_tune_kwargs': {
    #     'scheduler': 'local',
    #     'searcher': 'random',
    #     'num_trials': 20,
    # },
}

: 

In [11]:
# Train with AutoGluon.
# The predictor is built from predictor_kwargs and then fitted with train_kwargs.
predictor = TabularPredictor(**predictor_kwargs)

print("Starting training...")
# fit() is the last expression of the cell, so its return value is also displayed.
predictor.fit(**train_kwargs)

No path specified. Models will be saved in: "AutogluonModels/ag-20241221_105141"
Verbosity: 4 (Maximum Logging)
AutoGluon Version:  1.2
Python Version:     3.10.16
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #147-Ubuntu SMP Fri Oct 14 17:07:22 UTC 2022
CPU Count:          128
GPU Count:          1
Memory Avail:       690.06 GB / 755.30 GB (91.4%)
Disk Space Avail:   31.79 GB / 50.00 GB (63.6%)
Presets specified: ['medium_quality']
User Specified kwargs:
{'ag_args_fit': {'num_gpus': 1},
 'auto_stack': False,
 'excluded_model_types': ['RF', 'XT', 'KNN', 'GBM', 'FASTAI', 'CAT']}
Full kwargs:
{'_feature_generator_kwargs': None,
 '_save_bag_folds': None,
 'ag_args': None,
 'ag_args_ensemble': None,
 'ag_args_fit': {'num_gpus': 1},
 'auto_stack': False,
 'calibrate': 'auto',
 'delay_bag_sets': False,
 'ds_args': {'clean_up_fits': True,
             'detection_time_frac': 0.25,
             'enable_callbacks': False,
             'enable_ray_logging': True,
      

Starting training...


Values in column 'weight' used as sample weights instead of predictive features. Evaluation will report weighted metrics, so ensure same column exists in test data.
Beginning AutoGluon training ... Time limit = 32400s
AutoGluon will save models to "/root/autodl-tmp/jane-street-2024/AutogluonModels/ag-20241221_105141"
Train Data Rows:    27250168
Train Data Columns: 91
Label Column:       responder_6
Problem Type:       regression
Preprocessing data ...
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    706188.23 MB
	Train Data (Original)  Memory Usage: 9225.66 MB (1.3% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
	Stage 1 Generators:
		Fitting AsTypeFeatureGenerator...
			Original Features (exact raw dtype, raw dtype):
				('float32', 'float') : 88 | ['feature_00', 'feature_01', 'feature_02', 'f

In [None]:
# Display the AutoGluon leaderboard of the fitted predictor.
predictor.leaderboard()