# Autogluon Approach

This notebook is based on the amazing notebook: https://www.kaggle.com/code/quannguyn12/ensemble-with-polars

But we use Autogluon instead for the training.

- We implement the ndcg_at_3 as a regression problem

In [2]:
%%capture
!pip install -U polars
!pip install -U optuna
!pip install -U autogluon

In [34]:
# Autogluon configuration. Automatically detects if we are using an interactive notebook, and use lower defaults when debugging
import os
def is_interactive_session():
    return os.environ.get('KAGGLE_KERNEL_RUN_TYPE','') == 'Interactive'

is_interactive_session()

config = {
    "autogluon_time": 60*60*0.2,
    "autogluon_presets": "best_quality",
    #"reduce_features": 0, # Set to >0 to use only the first n features
    "tail_rows": 0 # Set to >0 to use only the last n rows in the file
    
}

if is_interactive_session():
    print("Interactive session")
    config["autogluon_time"] = 100
    #config["reduce_features"] = 200
    config["autogluon_preset"] = "medium_quality"
    config["tail_rows"] = 2000
    print(config)
else:
    print("running as job")
    print(config)

Interactive session
{'autogluon_time': 100, 'autogluon_presets': 'best_quality', 'tail_rows': 2000, 'autogluon_preset': 'medium_quality'}


In [3]:
import polars as pl
import numpy as np
import matplotlib.pyplot as plt
import time
import xgboost as xgb
import catboost
import lightgbm as lgb
import optuna

RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Load data
train = pl.read_parquet('/kaggle/input/aeroclub-recsys-2025/train.parquet').drop('__index_level_0__')
test = pl.read_parquet('/kaggle/input/aeroclub-recsys-2025/test.parquet').drop('__index_level_0__').with_columns(pl.lit(0, dtype=pl.Int64).alias("selected"))

data_raw = pl.concat((train, test))

## Helpers

In [5]:
def hitrate_at_3(y_true, y_pred, groups):
    df = pl.DataFrame({
        'group': groups,
        'pred': y_pred,
        'true': y_true
    })
    
    return (
        df.filter(pl.col("group").count().over("group") > 10)
        .sort(["group", "pred"], descending=[False, True])
        .group_by("group", maintain_order=True)
        .head(3)
        .group_by("group")
        .agg(pl.col("true").max())
        .select(pl.col("true").mean())
        .item()
    )

## Feature Engineering

In [6]:
df = data_raw.clone()

# More efficient duration to minutes converter
def dur_to_min(col):
    # Extract days and time parts in one pass
    days = col.str.extract(r"^(\d+)\.", 1).cast(pl.Int64).fill_null(0) * 1440
    time_str = pl.when(col.str.contains(r"^\d+\.")).then(col.str.replace(r"^\d+\.", "")).otherwise(col)
    hours = time_str.str.extract(r"^(\d+):", 1).cast(pl.Int64).fill_null(0) * 60
    minutes = time_str.str.extract(r":(\d+):", 1).cast(pl.Int64).fill_null(0)
    return (days + hours + minutes).fill_null(0)

# Process duration columns
dur_cols = ["legs0_duration", "legs1_duration"] + [f"legs{l}_segments{s}_duration" for l in (0, 1) for s in (0, 1)]
dur_exprs = [dur_to_min(pl.col(c)).alias(c) for c in dur_cols if c in df.columns]

# Apply duration transformations first
if dur_exprs:
    df = df.with_columns(dur_exprs)

# Precompute marketing carrier columns check
mc_cols = [f'legs{l}_segments{s}_marketingCarrier_code' for l in (0, 1) for s in range(4)]
mc_exists = [col for col in mc_cols if col in df.columns]

# Combine all initial transformations
df = df.with_columns([
        # Price features
        (pl.col("totalPrice") / (pl.col("taxes") + 1)).alias("price_per_tax"),
        (pl.col("taxes") / (pl.col("totalPrice") + 1)).alias("tax_rate"),
        pl.col("totalPrice").log1p().alias("log_price"),
        
        # Duration features
        (pl.col("legs0_duration").fill_null(0) + pl.col("legs1_duration").fill_null(0)).alias("total_duration"),
        pl.when(pl.col("legs1_duration").fill_null(0) > 0)
            .then(pl.col("legs0_duration") / (pl.col("legs1_duration") + 1))
            .otherwise(1.0).alias("duration_ratio"),
        
        # Trip type
        (pl.col("legs1_duration").is_null() | 
         (pl.col("legs1_duration") == 0) | 
         pl.col("legs1_segments0_departureFrom_airport_iata").is_null()).cast(pl.Int32).alias("is_one_way"),
        
        # Total segments count
        (pl.sum_horizontal(pl.col(col).is_not_null().cast(pl.UInt8) for col in mc_exists) 
         if mc_exists else pl.lit(0)).alias("l0_seg"),
        
        # FF features
        (pl.col("frequentFlyer").fill_null("").str.count_matches("/") + 
         (pl.col("frequentFlyer").fill_null("") != "").cast(pl.Int32)).alias("n_ff_programs"),
        
        # Binary features
        pl.col("corporateTariffCode").is_not_null().cast(pl.Int32).alias("has_corporate_tariff"),
        (pl.col("pricingInfo_isAccessTP") == 1).cast(pl.Int32).alias("has_access_tp"),
        
        # Baggage & fees
        (pl.col("legs0_segments0_baggageAllowance_quantity").fill_null(0) + 
         pl.col("legs1_segments0_baggageAllowance_quantity").fill_null(0)).alias("baggage_total"),
        (pl.col("miniRules0_monetaryAmount").fill_null(0) + 
         pl.col("miniRules1_monetaryAmount").fill_null(0)).alias("total_fees"),
        
        # Routes & carriers
        pl.col("searchRoute").is_in(["MOWLED/LEDMOW", "LEDMOW/MOWLED", "MOWLED", "LEDMOW", "MOWAER/AERMOW"])
            .cast(pl.Int32).alias("is_popular_route"),
        
        # Cabin
        pl.mean_horizontal(["legs0_segments0_cabinClass", "legs1_segments0_cabinClass"]).alias("avg_cabin_class"),
        (pl.col("legs0_segments0_cabinClass").fill_null(0) - 
         pl.col("legs1_segments0_cabinClass").fill_null(0)).alias("cabin_class_diff"),
])

# Segment counts - more efficient
seg_exprs = []
for leg in (0, 1):
    seg_cols = [f"legs{leg}_segments{s}_duration" for s in range(4) if f"legs{leg}_segments{s}_duration" in df.columns]
    if seg_cols:
        seg_exprs.append(
            pl.sum_horizontal(pl.col(c).is_not_null() for c in seg_cols)
                .cast(pl.Int32).alias(f"n_segments_leg{leg}")
        )
    else:
        seg_exprs.append(pl.lit(0).cast(pl.Int32).alias(f"n_segments_leg{leg}"))

# Add segment-based features
# First create segment counts
df = df.with_columns(seg_exprs)

# Then use them for derived features
df = df.with_columns([
    (pl.col("n_segments_leg0") + pl.col("n_segments_leg1")).alias("total_segments"),
    (pl.col("n_segments_leg0") == 1).cast(pl.Int32).alias("is_direct_leg0"),
    pl.when(pl.col("is_one_way") == 1).then(0)
        .otherwise((pl.col("n_segments_leg1") == 1).cast(pl.Int32)).alias("is_direct_leg1"),
])

# More derived features
df = df.with_columns([
    (pl.col("is_direct_leg0") & pl.col("is_direct_leg1")).cast(pl.Int32).alias("both_direct"),
    ((pl.col("isVip") == 1) | (pl.col("n_ff_programs") > 0)).cast(pl.Int32).alias("is_vip_freq"),
    (pl.col("baggage_total") > 0).cast(pl.Int32).alias("has_baggage"),
    (pl.col("total_fees") > 0).cast(pl.Int32).alias("has_fees"),
    (pl.col("total_fees") / (pl.col("totalPrice") + 1)).alias("fee_rate"),
    pl.col("Id").count().over("ranker_id").alias("group_size"),
])

# Add major carrier flag if column exists
if "legs0_segments0_marketingCarrier_code" in df.columns:
    df = df.with_columns(
        pl.col("legs0_segments0_marketingCarrier_code").is_in(["SU", "S7", "U6"])
            .cast(pl.Int32).alias("is_major_carrier")
    )
else:
    df = df.with_columns(pl.lit(0).alias("is_major_carrier"))

df = df.with_columns(pl.col("group_size").log1p().alias("group_size_log"))

# Time features - batch process
time_exprs = []
for col in ("legs0_departureAt", "legs0_arrivalAt", "legs1_departureAt", "legs1_arrivalAt"):
    if col in df.columns:
        dt = pl.col(col).str.to_datetime(strict=False)
        h = dt.dt.hour().fill_null(12)
        time_exprs.extend([
            h.alias(f"{col}_hour"),
            dt.dt.weekday().fill_null(0).alias(f"{col}_weekday"),
            (((h >= 6) & (h <= 9)) | ((h >= 17) & (h <= 20))).cast(pl.Int32).alias(f"{col}_business_time")
        ])
if time_exprs:
    df = df.with_columns(time_exprs)

# Batch rank computations - more efficient with single pass
# First apply the columns that will be used for ranking
df = df.with_columns([
    pl.col("group_size").log1p().alias("group_size_log"),
])

# Price and duration basic ranks
rank_exprs = []
for col, alias in [("totalPrice", "price"), ("total_duration", "duration")]:
    rank_exprs.append(pl.col(col).rank().over("ranker_id").alias(f"{alias}_rank"))

# Price-specific features
price_exprs = [
    (pl.col("totalPrice").rank("average").over("ranker_id") / 
     pl.col("totalPrice").count().over("ranker_id")).alias("price_pct_rank"),
    (pl.col("totalPrice") == pl.col("totalPrice").min().over("ranker_id")).cast(pl.Int32).alias("is_cheapest"),
    ((pl.col("totalPrice") - pl.col("totalPrice").median().over("ranker_id")) / 
     (pl.col("totalPrice").std().over("ranker_id") + 1)).alias("price_from_median"),
    (pl.col("l0_seg") == pl.col("l0_seg").min().over("ranker_id")).cast(pl.Int32).alias("is_min_segments"),
]

# Apply initial ranks
df = df.with_columns(rank_exprs + price_exprs)

# Cheapest direct - more efficient
direct_cheapest = (
    df.filter(pl.col("is_direct_leg0") == 1)
    .group_by("ranker_id")
    .agg(pl.col("totalPrice").min().alias("min_direct"))
)

df = df.join(direct_cheapest, on="ranker_id", how="left").with_columns(
    ((pl.col("is_direct_leg0") == 1) & 
     (pl.col("totalPrice") == pl.col("min_direct"))).cast(pl.Int32).fill_null(0).alias("is_direct_cheapest")
).drop("min_direct")

In [7]:
# Fill nulls
data = df.with_columns(
    [pl.col(c).fill_null(0) for c in df.select(pl.selectors.numeric()).columns] +
    [pl.col(c).fill_null("missing") for c in df.select(pl.selectors.string()).columns]
)

## Feature Selection

In [8]:
# Categorical features
cat_features = [
    'nationality', 'searchRoute', 'corporateTariffCode',
    'bySelf', 'sex', 'companyID',
    # Leg 0 segments 0-1
    'legs0_segments0_aircraft_code', 'legs0_segments0_arrivalTo_airport_city_iata',
    'legs0_segments0_arrivalTo_airport_iata', 'legs0_segments0_departureFrom_airport_iata',
    'legs0_segments0_marketingCarrier_code', 'legs0_segments0_operatingCarrier_code',
    'legs0_segments0_flightNumber',
    'legs0_segments1_aircraft_code', 'legs0_segments1_arrivalTo_airport_city_iata',
    'legs0_segments1_arrivalTo_airport_iata', 'legs0_segments1_departureFrom_airport_iata',
    'legs0_segments1_marketingCarrier_code', 'legs0_segments1_operatingCarrier_code',
    'legs0_segments1_flightNumber',
    # Leg 1 segments 0-1
    'legs1_segments0_aircraft_code', 'legs1_segments0_arrivalTo_airport_city_iata',
    'legs1_segments0_arrivalTo_airport_iata', 'legs1_segments0_departureFrom_airport_iata',
    'legs1_segments0_marketingCarrier_code', 'legs1_segments0_operatingCarrier_code',
    'legs1_segments0_flightNumber',
    'legs1_segments1_aircraft_code', 'legs1_segments1_arrivalTo_airport_city_iata',
    'legs1_segments1_arrivalTo_airport_iata', 'legs1_segments1_departureFrom_airport_iata',
    'legs1_segments1_marketingCarrier_code', 'legs1_segments1_operatingCarrier_code',
    'legs1_segments1_flightNumber',
]

# Columns to exclude (uninformative or problematic)
exclude_cols = [
    'Id', 'ranker_id', 'selected', 'profileId', 'requestDate',
    'legs0_departureAt', 'legs0_arrivalAt', 'legs1_departureAt', 'legs1_arrivalAt',
    'miniRules0_percentage', 'miniRules1_percentage',  # >90% missing
    'frequentFlyer',  # Already processed
    # Exclude constant columns
    'pricingInfo_passengerCount'
]


# Exclude segment 2-3 columns (>98% missing)
for leg in [0, 1]:
    for seg in [2, 3]:
        for suffix in ['aircraft_code', 'arrivalTo_airport_city_iata', 'arrivalTo_airport_iata',
                      'baggageAllowance_quantity', 'baggageAllowance_weightMeasurementType',
                      'cabinClass', 'departureFrom_airport_iata', 'duration', 'flightNumber',
                      'marketingCarrier_code', 'operatingCarrier_code', 'seatsAvailable']:
            exclude_cols.append(f'legs{leg}_segments{seg}_{suffix}')

feature_cols = [col for col in data.columns if col not in exclude_cols]
cat_features_final = [col for col in cat_features if col in feature_cols]

print(f"Using {len(feature_cols)} features ({len(cat_features_final)} categorical)")

X = data.select(feature_cols)
y = data.select('selected')
groups = data.select('ranker_id')

Using 112 features (34 categorical)


## Model Training and Tuning

## AutoGluon

In [22]:
from autogluon.tabular import TabularPredictor


In [21]:
autogluon_df = pl.concat([X, y], how="horizontal")
autogluon_df

bySelf,companyID,corporateTariffCode,nationality,isAccess3D,isVip,legs0_duration,legs0_segments0_aircraft_code,legs0_segments0_arrivalTo_airport_city_iata,legs0_segments0_arrivalTo_airport_iata,legs0_segments0_baggageAllowance_quantity,legs0_segments0_baggageAllowance_weightMeasurementType,legs0_segments0_cabinClass,legs0_segments0_departureFrom_airport_iata,legs0_segments0_duration,legs0_segments0_flightNumber,legs0_segments0_marketingCarrier_code,legs0_segments0_operatingCarrier_code,legs0_segments0_seatsAvailable,legs0_segments1_aircraft_code,legs0_segments1_arrivalTo_airport_city_iata,legs0_segments1_arrivalTo_airport_iata,legs0_segments1_baggageAllowance_quantity,legs0_segments1_baggageAllowance_weightMeasurementType,legs0_segments1_cabinClass,legs0_segments1_departureFrom_airport_iata,legs0_segments1_duration,legs0_segments1_flightNumber,legs0_segments1_marketingCarrier_code,legs0_segments1_operatingCarrier_code,legs0_segments1_seatsAvailable,legs1_duration,legs1_segments0_aircraft_code,legs1_segments0_arrivalTo_airport_city_iata,legs1_segments0_arrivalTo_airport_iata,legs1_segments0_baggageAllowance_quantity,legs1_segments0_baggageAllowance_weightMeasurementType,…,total_fees,is_popular_route,avg_cabin_class,cabin_class_diff,n_segments_leg0,n_segments_leg1,total_segments,is_direct_leg0,is_direct_leg1,both_direct,is_vip_freq,has_baggage,has_fees,fee_rate,group_size,is_major_carrier,group_size_log,legs0_departureAt_hour,legs0_departureAt_weekday,legs0_departureAt_business_time,legs0_arrivalAt_hour,legs0_arrivalAt_weekday,legs0_arrivalAt_business_time,legs1_departureAt_hour,legs1_departureAt_weekday,legs1_departureAt_business_time,legs1_arrivalAt_hour,legs1_arrivalAt_weekday,legs1_arrivalAt_business_time,price_rank,duration_rank,price_pct_rank,is_cheapest,price_from_median,is_min_segments,is_direct_cheapest,selected
bool,i64,i64,i64,bool,bool,i64,str,str,str,f64,f64,f64,str,i64,str,str,str,f64,str,str,str,f64,f64,f64,str,i64,str,str,str,f64,i64,str,str,str,f64,f64,…,f64,i32,f64,f64,i32,i32,i32,i32,i32,i32,i32,i32,i32,f64,u32,i32,f64,i8,i8,i32,i8,i8,i32,i8,i8,i32,i8,i8,i32,f64,f64,f64,i32,f64,i32,i32,i64
true,57323,0,36,false,false,160,"""YK2""","""KJA""","""KJA""",1.0,0.0,1.0,"""TLK""",160,"""216""","""KV""","""KV""",9.0,"""missing""","""missing""","""missing""",0.0,0.0,0.0,"""missing""",0,"""missing""","""missing""","""missing""",0.0,155,"""YK2""","""TLK""","""TLK""",1.0,0.0,…,0.0,0,1.0,0.0,2,2,4,0,0,0,1,1,0,0.0,25,0,3.258097,15,6,0,16,6,0,9,2,1,14,2,0,1.0,1.0,0.04,1,-1.947024,1,0,1
true,57323,123,36,true,false,445,"""E70""","""OVB""","""OVB""",1.0,0.0,1.0,"""TLK""",170,"""5358""","""S7""","""S7""",4.0,"""E70""","""KJA""","""KJA""",1.0,0.0,1.0,"""OVB""",80,"""5311""","""S7""","""S7""",4.0,505,"""E70""","""OVB""","""OVB""",1.0,0.0,…,5800.0,0,1.0,0.0,2,2,4,0,0,0,1,1,1,0.113445,25,1,3.258097,9,6,1,14,6,0,22,2,0,8,3,1,4.5,3.5,0.18,0,-0.135934,0,0,0
true,57323,0,36,false,false,445,"""E70""","""OVB""","""OVB""",1.0,0.0,1.0,"""TLK""",170,"""5358""","""S7""","""S7""",4.0,"""E70""","""KJA""","""KJA""",1.0,0.0,1.0,"""OVB""",80,"""5311""","""S7""","""S7""",4.0,505,"""E70""","""OVB""","""OVB""",1.0,0.0,…,5800.0,0,1.0,0.0,2,2,4,0,0,0,1,1,1,0.108015,25,1,3.258097,9,6,1,14,6,0,22,2,0,8,3,1,10.5,3.5,0.42,0,0.0,0,0,0
true,57323,123,36,true,false,445,"""E70""","""OVB""","""OVB""",1.0,0.0,1.0,"""TLK""",170,"""5358""","""S7""","""S7""",4.0,"""E70""","""KJA""","""KJA""",1.0,0.0,1.0,"""OVB""",80,"""5311""","""S7""","""S7""",4.0,505,"""E70""","""OVB""","""OVB""",1.0,0.0,…,0.0,0,1.0,0.0,2,2,4,0,0,0,1,1,0,0.0,25,1,3.258097,9,6,1,14,6,0,22,2,0,8,3,1,16.5,3.5,0.66,0,1.490774,0,0,0
true,57323,0,36,false,false,445,"""E70""","""OVB""","""OVB""",1.0,0.0,1.0,"""TLK""",170,"""5358""","""S7""","""S7""",4.0,"""E70""","""KJA""","""KJA""",1.0,0.0,1.0,"""OVB""",80,"""5311""","""S7""","""S7""",4.0,505,"""E70""","""OVB""","""OVB""",1.0,0.0,…,0.0,0,1.0,0.0,2,2,4,0,0,0,1,1,0,0.0,25,1,3.258097,9,6,1,14,6,0,22,2,0,8,3,1,22.5,3.5,0.9,0,1.712393,0,0,0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
false,57320,65,36,true,false,165,"""32A""","""ASF""","""ASF""",2.0,0.0,1.0,"""SVO""",165,"""1678""","""SU""","""SU""",4.0,"""missing""","""missing""","""missing""",0.0,0.0,0.0,"""missing""",0,"""missing""","""missing""","""missing""",0.0,0,"""missing""","""missing""","""missing""",0.0,0.0,…,0.0,0,1.0,1.0,2,2,4,0,0,0,0,1,0,0.0,12,1,2.564949,9,3,1,12,3,0,12,0,0,12,0,0,10.5,9.5,0.875,0,1.361367,1,0,0
false,57320,65,36,true,false,165,"""32A""","""ASF""","""ASF""",1.0,0.0,1.0,"""SVO""",165,"""1172""","""SU""","""SU""",4.0,"""missing""","""missing""","""missing""",0.0,0.0,0.0,"""missing""",0,"""missing""","""missing""","""missing""",0.0,0,"""missing""","""missing""","""missing""",0.0,0.0,…,5600.0,0,1.0,1.0,2,2,4,0,0,0,0,1,1,0.478551,12,1,2.564949,21,3,0,1,4,0,12,0,0,12,0,0,6.5,9.5,0.541667,0,0.0,1,0,0
false,57320,65,36,true,false,165,"""32A""","""ASF""","""ASF""",2.0,0.0,1.0,"""SVO""",165,"""1172""","""SU""","""SU""",4.0,"""missing""","""missing""","""missing""",0.0,0.0,0.0,"""missing""",0,"""missing""","""missing""","""missing""",0.0,0,"""missing""","""missing""","""missing""",0.0,0.0,…,0.0,0,1.0,1.0,2,2,4,0,0,0,0,1,0,0.0,12,1,2.564949,21,3,0,1,4,0,12,0,0,12,0,0,10.5,9.5,0.875,0,1.361367,1,0,0
false,57320,65,36,true,false,160,"""32B""","""ASF""","""ASF""",1.0,0.0,1.0,"""SVO""",160,"""1174""","""SU""","""SU""",9.0,"""missing""","""missing""","""missing""",0.0,0.0,0.0,"""missing""",0,"""missing""","""missing""","""missing""",0.0,0,"""missing""","""missing""","""missing""",0.0,0.0,…,5600.0,0,1.0,1.0,2,2,4,0,0,0,0,1,1,0.388027,12,1,2.564949,15,3,0,18,3,1,12,0,0,12,0,0,9.0,5.5,0.75,0,0.776705,1,0,0


In [29]:
hyperparameters = {
    'GBM': {},      # LightGBM
    'CAT': {},      # CatBoost
    'RF': {},       # RandomForest
    'XT': {},       # ExtraTrees
    'KNN': {},      # k-NN
    'NN': {},       # MXNet neural-net
    'LR': {},       # LinearModel
    'XGB': {        # XGBoost 
    }
}


In [None]:
from sklearn.metrics import ndcg_score
from autogluon.core.metrics import make_scorer
from autogluon.tabular import TabularPredictor


# 1) Define a (rough) NDCG@3 scorer for validation:
def ndcg_at_3(y_true, y_pred, **kwargs):
    # This simple wrapper only works if you pass in one group at a time.
    return ndcg_score([y_true], [y_pred], k=3)
ndcg_scorer = make_scorer(name='ndcg@3', score_func=ndcg_at_3, greater_is_better=True)

# 2) Fit all models under regression:
predictor = TabularPredictor(
    label='selected',
    problem_type='regression',
    eval_metric=ndcg_scorer,
).fit(
    autogluon_df.to_pandas(),
    hyperparameters=hyperparameters,
    time_limit = config["autogluon_time"],
    presets = config["autogluon_preset"]
    # you can also tune time_limit, presets, etc., as needed
)



No path specified. Models will be saved in: "AutogluonModels/ag-20250718_110104"
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.3.1
Python Version:     3.10.17
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP PREEMPT_DYNAMIC Sun Oct  8 14:23:56 UTC 2023
CPU Count:          96
Memory Avail:       162.20 GB / 334.56 GB (48.5%)
Disk Space Avail:   19.50 GB / 19.52 GB (99.9%)
Presets specified: ['medium_quality']
Beginning AutoGluon training ... Time limit = 100s
AutoGluon will save models to "/kaggle/working/AutogluonModels/ag-20250718_110104"
Train Data Rows:    25043148
Train Data Columns: 112
Label Column:       selected
Problem Type:       regression
Preprocessing data ...
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    202838.26 MB
	Train Data (Original)  Memory Usage: 54569.85 MB (26.9% of available memory)
	Inferring data type of each feature based on column val

In [None]:
# Once trained, view performance on validation:
predictor.leaderboard(val_data, extra_metrics=True)

## 5. Submission

In [None]:
predictions = predictor.predict(test.select(feature_cols).to_pandas())

In [None]:
predictions.write_csv("submission.csv")