# LightGBM CTR pipeline (step-by-step)

This notebook mirrors `src/training/lgbm_pipeline.py` so you can run each stage manually: import → load data → train → evaluate → submit.


In [1]:
import psutil, os

process = psutil.Process(os.getpid())
process.memory_info().rss / 1024**3

0.068359375

In [2]:
from pathlib import Path
import sys
import gc
import json

import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

try:
    import optuna
except Exception:
    optuna = None

PROJECT_ROOT = Path.cwd().resolve()
if not (PROJECT_ROOT / 'src').exists():
    PROJECT_ROOT = PROJECT_ROOT.parent
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

from src.utils.metrics import average_precision, competition_score, weighted_log_loss


In [3]:
# Paths
train_path = PROJECT_ROOT / 'data/train/train.parquet'
test_path = PROJECT_ROOT / 'data/test/test.parquet'
submission_template = PROJECT_ROOT / 'data/submission/sample_submission.csv'
output_model = PROJECT_ROOT / 'models/lgbm_model.txt'
output_submission = PROJECT_ROOT / 'reports/submission_lgbm.csv'
output_metrics = PROJECT_ROOT / 'reports/metrics_lgbm.json'

# Columns
target_col = 'clicked'
id_col = 'ID'

# Training controls
valid_frac = 0.1  # set 0.0 to disable validation
seed = 42
num_boost_round = 800
early_stopping_rounds = 50

# Default categorical columns (kept only if they exist in the data)
categorical_cols = ['gender', 'age_group', 'inventory_id', 'day_of_week', 'hour', 'seq']


In [4]:
# Load training data
train_df = pd.read_parquet(train_path)

In [5]:
train_df = train_df.sample(frac=0.1, random_state=42)

In [6]:
if target_col not in train_df.columns:
    raise ValueError(f'Missing target column: {target_col}')

if id_col in train_df.columns:
    train_df = train_df.drop(columns=[id_col])

y = train_df[target_col].values
X = train_df.drop(columns=[target_col])

categorical_cols = [c for c in categorical_cols if c in X.columns]

train_df.head()


Unnamed: 0,gender,age_group,inventory_id,day_of_week,hour,seq,l_feat_1,l_feat_2,l_feat_3,l_feat_4,...,history_b_22,history_b_23,history_b_24,history_b_25,history_b_26,history_b_27,history_b_28,history_b_29,history_b_30,clicked
880123,2.0,6.0,42,5,0,"9,57,74,77,318,101,138,132,532,101,497,101,132...",2.0,2.0,2.0,24.0,...,0.036189,0.036189,0.006032,0.002413,0.022518,0.025734,0.008042,0.020909,0.026539,0
2089435,1.0,8.0,41,6,18,"9,57,193,463,212,479,463,193,57,479,57,463,193...",2.0,2.0,3.0,7.0,...,0.056925,0.056925,0.009488,0.003795,0.03542,0.04048,0.01265,0.03289,0.062618,0
5702931,1.0,6.0,41,5,7,"57,516,195,27,479,269,57,516,527,74,77,207,452...",2.0,2.0,3.0,1.0,...,0.139752,0.139752,0.023292,0.009317,0.086957,0.099379,0.031056,0.080746,0.051242,0
8257709,2.0,4.0,46,3,19,"516,57,165,165,338,416,74,527,77,463,212,193,1...",2.0,2.0,3.0,7.0,...,0.264708,0.264708,0.044118,0.017647,0.164707,0.188237,0.058824,0.152942,0.09706,0
3296598,2.0,8.0,88,1,23,"321,144,516,57,165,527,74,318,77,317,75,269,45...",2.0,2.0,2.0,6.0,...,0.211266,0.211266,0.035211,0.014084,0.131454,0.150234,0.046948,0.122065,0.077464,0


In [7]:
# Apply categoricals and compute class balance
category_levels = {}
for col in categorical_cols:
    X[col] = X[col].astype('category')
    category_levels[col] = X[col].cat.categories

n_pos = y.sum()
n_neg = len(y) - n_pos
scale_pos_weight = float(n_neg / max(n_pos, 1))

params = {
    'objective': 'binary',
    'metric': ['binary_logloss', 'auc'],
    'learning_rate': 0.05,
    'num_leaves': 255,
    'min_data_in_leaf': 100,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 1,
    'max_depth': -1,
    'max_bin': 255,
    'lambda_l2': 1.0,
    'scale_pos_weight': scale_pos_weight,
    'seed': seed,
    'verbosity': -1,
    'num_threads': -1,
}


In [8]:
# Optuna tuning (optional)
use_optuna = False
optuna_trials = 25
optuna_timeout = None  # seconds
optuna_valid_frac = valid_frac if valid_frac and valid_frac > 0 else 0.1

if use_optuna:
    if optuna is None:
        raise ImportError("Optuna is not installed. Install with `pip install optuna`.")

    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y, test_size=optuna_valid_frac, random_state=seed, stratify=y
    )

    lgb_train = lgb.Dataset(
        X_train,
        label=y_train,
        categorical_feature=categorical_cols or 'auto',
        free_raw_data=False,
    )
    lgb_valid = lgb.Dataset(
        X_valid,
        label=y_valid,
        categorical_feature=categorical_cols or 'auto',
        free_raw_data=False,
    )

    def objective(trial):
        trial_params = {
            'objective': 'binary',
            'metric': ['binary_logloss', 'auc'],
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2, log=True),
            'num_leaves': trial.suggest_int('num_leaves', 31, 512),
            'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 20, 500),
            'feature_fraction': trial.suggest_float('feature_fraction', 0.6, 1.0),
            'bagging_fraction': trial.suggest_float('bagging_fraction', 0.6, 1.0),
            'bagging_freq': trial.suggest_int('bagging_freq', 1, 10),
            'max_depth': trial.suggest_categorical('max_depth', [-1, 4, 6, 8, 10, 12, 14, 16]),
            'max_bin': trial.suggest_categorical('max_bin', [127, 255, 511]),
            'lambda_l2': trial.suggest_float('lambda_l2', 1e-3, 10.0, log=True),
            'scale_pos_weight': scale_pos_weight,
            'seed': seed,
            'verbosity': -1,
            'num_threads': -1,
        }

        model_t = lgb.train(
            trial_params,
            lgb_train,
            num_boost_round=num_boost_round,
            valid_sets=[lgb_valid],
            early_stopping_rounds=early_stopping_rounds,
            verbose_eval=False,
        )
        preds = model_t.predict(X_valid, num_iteration=model_t.best_iteration)
        return competition_score(y_valid, preds)

    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=optuna_trials, timeout=optuna_timeout)

    print('Best score:', study.best_value)
    print('Best params:', study.best_params)

    params.update(study.best_params)


In [9]:
params

{'objective': 'binary',
 'metric': ['binary_logloss', 'auc'],
 'learning_rate': 0.05,
 'num_leaves': 255,
 'min_data_in_leaf': 100,
 'feature_fraction': 0.8,
 'bagging_fraction': 0.8,
 'bagging_freq': 1,
 'max_depth': -1,
 'max_bin': 255,
 'lambda_l2': 1.0,
 'scale_pos_weight': 50.85128850997869,
 'seed': 42,
 'verbosity': -1,
 'num_threads': -1}

In [10]:
# Train model
valid_info = None
if valid_frac and valid_frac > 0:
    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y, test_size=valid_frac, random_state=seed, stratify=y
    )
    lgb_train = lgb.Dataset(
        X_train,
        label=y_train,
        categorical_feature=categorical_cols or 'auto',
        free_raw_data=False,
    )
    lgb_valid = lgb.Dataset(
        X_valid,
        label=y_valid,
        categorical_feature=categorical_cols or 'auto',
        free_raw_data=False,
    )
    model = lgb.train(
        params,
        lgb_train,
        num_boost_round=num_boost_round,
        valid_sets=[lgb_train, lgb_valid],
        valid_names=['train', 'valid'],
        #early_stopping_rounds=early_stopping_rounds,
        #verbose_eval=100,
    )
    preds_valid = model.predict(X_valid, num_iteration=model.best_iteration)
    valid_info = {
        'average_precision': average_precision(y_valid, preds_valid),
        'weighted_logloss': weighted_log_loss(y_valid, preds_valid),
        'competition_score': competition_score(y_valid, preds_valid),
    }
else:
    lgb_train = lgb.Dataset(
        X,
        label=y,
        categorical_feature=categorical_cols or 'auto',
        free_raw_data=False,
    )
    model = lgb.train(
        params,
        lgb_train,
        num_boost_round=num_boost_round,
        valid_sets=[lgb_train],
        valid_names=['train'],
        verbose_eval=200,
    )


In [11]:
# Inspect metrics (only if validation was enabled)
valid_info

{'average_precision': 0.06243837096167431,
 'weighted_logloss': 1.0742282470180122,
 'competition_score': 0.2722726957292}

In [12]:
# Save model + metrics
output_model.parent.mkdir(parents=True, exist_ok=True)
output_metrics.parent.mkdir(parents=True, exist_ok=True)

model.save_model(str(output_model))

metrics_payload = {
    'best_iteration': int(model.best_iteration or num_boost_round),
    'categorical_features': categorical_cols,
}
if valid_info:
    metrics_payload.update(valid_info)

with output_metrics.open('w', encoding='utf-8') as f:
    json.dump(metrics_payload, f, indent=2)


In [13]:
# Free training data memory (optional)
del train_df, X
gc.collect()

551

In [14]:
# Load test data + predict
test_df = pd.read_parquet(test_path)
if id_col not in test_df.columns:
    raise ValueError(f'Missing id column in test: {id_col}')

test_ids = test_df[id_col].copy()
X_test = test_df.drop(columns=[id_col])

for col in categorical_cols:
    if col in X_test.columns:
        X_test[col] = X_test[col].astype(
            pd.CategoricalDtype(categories=category_levels[col])
        )

preds = model.predict(X_test, num_iteration=model.best_iteration)
preds[:5]


  X_test[col] = X_test[col].astype(


array([0.07735263, 0.0379124 , 0.06150564, 0.04825122, 0.01425259])

In [15]:
# Build submission
sub = pd.read_csv(submission_template)
if id_col not in sub.columns:
    raise ValueError(f'Missing id column in submission template: {id_col}')

sub = sub.drop(columns=[target_col], errors='ignore')
pred_df = pd.DataFrame({id_col: test_ids, target_col: preds})
sub = sub.merge(pred_df, on=id_col, how='left')

output_submission.parent.mkdir(parents=True, exist_ok=True)
sub.to_csv(output_submission, index=False)

sub.head()


Unnamed: 0,ID,clicked
0,TEST_0000000,0.077353
1,TEST_0000001,0.037912
2,TEST_0000002,0.061506
3,TEST_0000003,0.048251
4,TEST_0000004,0.014253
