In [1]:
!pip install -qq -U pytabkit

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m352.2/352.2 kB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.7/9.7 MB[0m [31m97.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m102.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m80.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m40.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import os, sys
import pandas as pd, numpy as np, random

from pytabkit import TabM_D_Regressor
from contextlib import contextmanager
from tqdm.auto import tqdm

from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

import warnings

In [3]:
warnings.simplefilter('ignore')
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', 1000)
# pd.options.mode.copy_on_write = True
# plt.style.use("whitegrid")

## -- Set Global Seed --
SEED = 42
np.random.seed(SEED)
random.seed(SEED)

COLOR = '\033[32m'
RESET = '\033[0m'

In [4]:
## -- Load Data --
PATH = "/kaggle/input/playground-series-s5e10/"
train= pd.read_csv(PATH+"train.csv", index_col='id') #.drop('id', axis=1)
test = pd.read_csv(PATH+"test.csv", index_col='id') #.drop('id', axis=1)
submit = pd.read_csv(PATH+"sample_submission.csv")

ORIG_PATH = "/kaggle/input/simulated-roads-accident-data/"
orig = pd.concat([pd.read_csv(ORIG_PATH+"synthetic_road_accidents_100k.csv"),
                  pd.read_csv(ORIG_PATH+"synthetic_road_accidents_10k.csv"),
                  pd.read_csv(ORIG_PATH+"synthetic_road_accidents_2k.csv")],
            ignore_index=True)

TARGET = "accident_risk"
NUMS = test.select_dtypes(include='number').columns.tolist()
CATS = test.select_dtypes(exclude='number').columns.tolist()
FEATURES = NUMS + CATS

for (name, df) in dict(Train=train, Test=test, Original=orig).items():
    print(f"{name} has shape: {df.shape}")

print(f"\nTotal Numerical: {len(NUMS)}")
print(f"Total Categorical: {len(CATS)}")

Train has shape: (517754, 13)
Test has shape: (172585, 12)
Original has shape: (112000, 13)

Total Numerical: 4
Total Categorical: 8


In [5]:
BINARY_COLS = [c for c in test.columns if test[c].nunique() == 2]
for df in [train, test, orig]:
    df['meta'] = (
        0.3 * df["curvature"] + 
        0.2 * (df["lighting"] == "night").astype(int) + 
        0.1 * (df["weather"] != "clear").astype(int) + 
        0.2 * (df["speed_limit"] >= 60).astype(int) + 
        0.1 * (np.array(df["num_reported_accidents"]) > 2).astype(int)
    )
    df['BINARY']=0
    for i in range(len(BINARY_COLS)):
        df['BINARY']+=df[BINARY_COLS[i]].astype(int)*(2**i)

In [6]:
for c in ['curvature', 'speed_limit']:
    for i in range(-3,3):
        train[c+f"_{i}"]=(train[c]*(10**i)%10).astype(np.int8)
        test[c+f"_{i}"]=(test[c]*(10**i)%10).astype(np.int8)
        orig[c+f"_{i}"]=(orig[c]*(10**i)%10).astype(np.int8)
        if train[c+f"_{i}"].nunique()==1:
            train.drop([c+f"_{i}"],axis=1,inplace=True)
            test.drop([c+f"_{i}"],axis=1,inplace=True)
            orig.drop([c+f"_{i}"],axis=1,inplace=True)

In [7]:
STATS = ['mean']  # stats to compute
ORIG = []

print(f"Merging {len(FEATURES)} features: ", end="")

for feat in FEATURES:
    print(f"{feat}, ", end="")

    agg = orig.groupby(feat)[TARGET].agg(STATS)
    new_cols = [f'OTE_{feat}_{s.upper()}' for s in STATS]
    agg.columns = new_cols
    agg = agg.reset_index() 

    train = train.merge(agg, on=feat, how='left')
    test  = test.merge(agg, on=feat, how='left')

    # optionally fill NaNs (choose a sensible fill)
    # e.g., fill count with 0, nunique with 0, mean with global mean
    global_mean = orig[TARGET].mean()
    nan_filler = {
            f'OTE_{feat}_MEAN': global_mean,
            # f'OTE_{feat}_COUNT': 0,
            # f'OTE_{feat}_NUNIQUE': 0,
        }
    train[new_cols] = train[new_cols].fillna(nan_filler)
    test[new_cols] = test[new_cols].fillna(nan_filler)
    # simpler: fill all with -1 (or use per-column logic)
    # train[new_cols] = train[new_cols].fillna(-1)
    # test[new_cols]  = test[new_cols].fillna(-1)

    ORIG.extend(new_cols)

# After all merges, drop columns that are constant in both train and test
to_drop = []
for col in list(ORIG):
    # check column exists (safety) and whether it's constant (nunique < 2) in both datasets
    n_train = train[col].nunique() if col in train.columns else 0
    n_test  = test[col].nunique()  if col in test.columns  else 0
    if (n_train < 2) and (n_test < 2):
        to_drop.append(col)

if to_drop:
    train = train.drop(columns=to_drop)
    test  = test.drop(columns=to_drop)
    ORIG = [c for c in ORIG if c not in to_drop]

print()
print(f"\n##### Total features merged: {len(ORIG)}")

Merging 12 features: num_lanes, curvature, speed_limit, num_reported_accidents, road_type, lighting, weather, road_signs_present, public_road, time_of_day, holiday, school_season, 

##### Total features merged: 12


In [8]:
test.isna().sum()

road_type                          0
num_lanes                          0
curvature                          0
speed_limit                        0
lighting                           0
weather                            0
road_signs_present                 0
public_road                        0
time_of_day                        0
holiday                            0
school_season                      0
num_reported_accidents             0
meta                               0
BINARY                             0
curvature_0                        0
curvature_1                        0
curvature_2                        0
speed_limit_-1                     0
speed_limit_0                      0
OTE_num_lanes_MEAN                 0
OTE_curvature_MEAN                 0
OTE_speed_limit_MEAN               0
OTE_num_reported_accidents_MEAN    0
OTE_road_type_MEAN                 0
OTE_lighting_MEAN                  0
OTE_weather_MEAN                   0
OTE_road_signs_present_MEAN        0
O

In [9]:
FEATURES.append('meta')
FEATURES.append('BINARY')
FEATURES.extend(ORIG)
print(len(FEATURES), 'Features.')

26 Features.


In [10]:
# from pytabkit import TabM_HPO_Regressor

# model = TabM_HPO_Regressor(
#         device='cuda',
#         random_state=42,
#         n_cv=1,
#         # n_refit=0,
#         n_epochs=10, 
#         val_metric_name='rmse',
#         verbosity=2
#     )
    
# model.fit(X, y, cat_col_names=CATS)

In [11]:
@contextmanager
def suppress_stdout():
    with open(os.devnull, "w") as devnull:
        old_stdout = sys.stdout
        sys.stdout = devnull
        try:
            yield
        finally:
            sys.stdout = old_stdout

In [12]:
def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true.values, y_pred))

FOLDS = 7
kf = KFold(n_splits=FOLDS, shuffle=True, random_state=42)

rand_seeds = tuple(np.ceil(np.linspace(SEED, 2005, FOLDS)).astype(int))
rand_seeds

(42, 370, 697, 1024, 1351, 1678, 2005)

In [13]:
FEATURES = [c for c in test.columns]

X = train[FEATURES]
y = train[TARGET]
X_test = test[FEATURES]

FEATURES

['road_type',
 'num_lanes',
 'curvature',
 'speed_limit',
 'lighting',
 'weather',
 'road_signs_present',
 'public_road',
 'time_of_day',
 'holiday',
 'school_season',
 'num_reported_accidents',
 'meta',
 'BINARY',
 'curvature_0',
 'curvature_1',
 'curvature_2',
 'speed_limit_-1',
 'speed_limit_0',
 'OTE_num_lanes_MEAN',
 'OTE_curvature_MEAN',
 'OTE_speed_limit_MEAN',
 'OTE_num_reported_accidents_MEAN',
 'OTE_road_type_MEAN',
 'OTE_lighting_MEAN',
 'OTE_weather_MEAN',
 'OTE_road_signs_present_MEAN',
 'OTE_public_road_MEAN',
 'OTE_time_of_day_MEAN',
 'OTE_holiday_MEAN',
 'OTE_school_season_MEAN']

In [14]:
tabm_params = {
          'batch_size': 'auto',
          'patience': 16,
          'allow_amp': True,
          'arch_type': 'tabm-mini',
          'tabm_k': 32,
          'gradient_clipping_norm': 1.0, 
          'share_training_batches': False,
          'lr': 0.003,
          'weight_decay': 0.024,
          'n_blocks': 3,
          'd_block': 448, 
          'dropout': 0.0, 
          'num_emb_type': 'pwl',
          'd_embedding': 32,
          'num_emb_n_bins': 119,
         }

# tabm_params = {'batch_size': 'auto',
#           'patience': 16,
#           'allow_amp': True,
#           'arch_type': 'tabm-mini',
#           'tabm_k': 32,
#           'gradient_clipping_norm': 1.0, 
#           'share_training_batches': False,
#           'lr': 0.0029993695720154537,
#           'weight_decay': 0.023742083301699905,
#           'n_blocks': 3,
#           'd_block': 448, 
#           'dropout': 0.0, 
#           'num_emb_type': 'pwl',
#           'd_embedding': 32,
#           'num_emb_n_bins': 119,
#          }

In [15]:
oof_preds = np.zeros(len(X))
test_preds = np.zeros(len(test))
fold_scores = []

for fold, (train_idx, val_idx) in tqdm(enumerate(kf.split(X)), total=FOLDS):
    print(f'|- Fold {fold+1}/{FOLDS} -|')
    
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    tabm_params['random_state'] = (42, 370, 697, 1024, 1351, 1678, 2005)[fold]

    with suppress_stdout():
        model = TabM_D_Regressor(**tabm_params)
        model.fit(X_train, y_train, X_val, y_val, cat_col_names=CATS)
    
    oof_preds[val_idx] = model.predict(X_val)
    test_preds += model.predict(X_test) / FOLDS

    score = rmse(y_val, oof_preds[val_idx])
    print(f"{COLOR}Fold {fold+1} RMSE: {score:.6f}{RESET}")
    fold_scores.append(score)

overall_score = rmse(y, oof_preds)
name = f"tabM_" + f"{overall_score:.6f}".split('.')[1]

print(f"|{'-'*50}{COLOR}")
print(f"| Total No.features: {len(X_train.columns)}")
print(f"| Overall OOF  RMSE: {overall_score:.6f}")
print(f"| Average Fold RMSE: {np.mean(fold_scores):.6f} ± {np.std(fold_scores):.6f}{RESET}")

oof_preds.shape, test_preds.shape

  0%|          | 0/7 [00:00<?, ?it/s]

|- Fold 1/7 -|
[32mFold 1 RMSE: 0.055870[0m
|- Fold 2/7 -|
[32mFold 2 RMSE: 0.056371[0m
|- Fold 3/7 -|
[32mFold 3 RMSE: 0.055737[0m
|- Fold 4/7 -|
[32mFold 4 RMSE: 0.056120[0m
|- Fold 5/7 -|
[32mFold 5 RMSE: 0.055903[0m
|- Fold 6/7 -|
[32mFold 6 RMSE: 0.055828[0m
|- Fold 7/7 -|
[32mFold 7 RMSE: 0.055710[0m
|--------------------------------------------------[32m
| Total No.features: 31
| Overall OOF  RMSE: 0.055934
| Average Fold RMSE: 0.055934 ± 0.000217[0m


((517754,), (172585,))

In [16]:
oof_final  = pd.DataFrame(oof_preds, columns=[name])
test_final = pd.DataFrame(test_preds, columns=[name])

In [17]:
## -- Save OOF & PREDICTIONS --
oof_final.to_parquet(f"{name}_oof.parquet", index=False)
test_final.to_parquet(f"{name}_pred.parquet", index=False)

## -- Submission file --
submit[TARGET] = test_final.values
submit.to_csv(f"submit_{name}.csv", index=False)
submit.head()

Unnamed: 0,id,accident_risk
0,517754,0.295315
1,517755,0.119829
2,517756,0.181758
3,517757,0.306133
4,517758,0.395426
