# Baseline Model with LightAutoML

This notebook demonstrates how to build a baseline model using LightAutoML for viral repository prediction.

In [None]:
import pandas as pd
from lightautoml.dataset.roles import ColumnRole, DatetimeRole
from lightautoml.automl.presets.tabular_presets import TabularAutoML
from lightautoml.tasks import Task

In [None]:
# Install required packages with uv for fast installation
!uv pip install pandas scipy numpy bentoml scikit-learn optuna
!pip install lightautoml[all]


import pandas as pd
from lightautoml.dataset.roles import ColumnRole, DatetimeRole
from lightautoml.automl.presets.tabular_presets import TabularAutoML
from lightautoml.tasks import Task
from lightautoml.tuner.optuna import OptunaTuner
from lightautoml.ml_algo.boost_lgbm import LGBMClassifier
from lightautoml.tuner.tuner import AutoTuner
from lightautoml.pipelines.selection.base import ModelSelector

## 2. Prepare the dataframe and roles


In [None]:
# Label column to predict; values are 1 / 0.
# If you predict both labels, train two separate models or use a multitask preset.
TARGET = "viral_label"

# Load the repository snapshot table that the following cells work on.
SNAPSHOT_PATH = "../data/repos_snapshot.parquet"
df = pd.read_parquet(SNAPSHOT_PATH)

# Numeric features based on the RepoData model: one activity counter per
# (metric, trailing window) pair, grouped window by window (30d, 90d, 365d),
# followed by repository metadata.
_activity_metrics = ("stars", "forks", "commits", "contributors")
_window_days = (30, 90, 365)
numeric_features = [
    f"{metric}_{days}d"
    for days in _window_days
    for metric in _activity_metrics
]
numeric_features.append("repo_age")

# Categorical features
categorical_features = ["primary_language", "license"]

# Boolean features (will be treated as categorical)
boolean_features = ["has_ci", "has_wiki"]

# Column roles handed to LightAutoML. Keys are role names (or role objects when
# the role carries parameters); values are the column(s) that play that role.
roles = {
    "target": TARGET,
    "drop": ["repo_id", "abandoned_label"],  # drop ID and unused label
    "numeric": numeric_features,
    # LightAutoML's string name for this role is "category", not "categorical".
    "category": categorical_features + boolean_features,
    # A parametrised role is passed as the key itself so that base_date=True
    # actually reaches the reader; a nested {column: role} value would not.
    DatetimeRole(base_date=True): "snapshot_date",
}

# LightAutoML infers roles automatically, but it's safer to pass them explicitly when your columns are heterogeneous.


In [None]:
# Validate dataset structure
print("Dataset shape:", df.shape)
print("\nColumns in dataset:")
print(df.columns.tolist())

# Work out which expected columns are absent before touching any of them, so a
# missing target is reported by the check below instead of raising KeyError.
expected_features = numeric_features + categorical_features + boolean_features + ["snapshot_date", TARGET]
present_columns = set(df.columns)
missing_features = [name for name in expected_features if name not in present_columns]

print("\nTarget distribution:")
if TARGET in present_columns:
    print(df[TARGET].value_counts())
else:
    print(f"<column {TARGET!r} not found>")

# Total number of missing cells across the whole frame
print("\nMissing values:")
print(df.isnull().sum().sum())

# Check if all expected features are present
if missing_features:
    print(f"\nWarning: Missing expected features: {missing_features}")
else:
    print("\nAll expected features present ✓")


In [None]:
# Hold out 20% of the rows, stratified on the target, before any AutoML run so
# that the test set stays untouched for final validation.
from sklearn.model_selection import train_test_split

try:
    split_options = {
        "test_size": 0.2,
        "random_state": 42,
        "stratify": df[TARGET],
    }
    train_data, test_data = train_test_split(df, **split_options)

    # Report partition sizes first, then the class balance of each partition.
    for title, frame in (("Training set", train_data), ("Test set", test_data)):
        print(f"{title}: {frame.shape}")
    for split_name, frame in (("train", train_data), ("test", test_data)):
        print(f"Target distribution in {split_name}: {frame[TARGET].value_counts()}")

except Exception as split_error:
    # Surface the reason in the cell output, then let the failure propagate.
    print(f"Error in train/test split: {split_error}")
    raise

## 3. Define the task


In [None]:
# PR-AUC (average precision) is better suited than ROC-AUC under class imbalance.
# LightAutoML's built-in binary metric names are "auc", "logloss" and
# "accuracy"; "aucpr" is not among them, so the metric is passed as a callable
# of the form metric(y_true, y_pred, sample_weight=None).
# NOTE(review): confirm against the pinned LightAutoML version.
from sklearn.metrics import average_precision_score

task = Task(
    name="binary",
    metric=average_precision_score,
    greater_is_better=True,
)

In [None]:
# Baseline: default TabularAutoML preset trained on the training split only.
from sklearn.metrics import average_precision_score

automl = TabularAutoML(
    task               = task,
    timeout            = 3600,          # seconds – wall-clock budget
    cpu_limit          = 8,             # threads
    reader_params      = {"n_jobs": 8}, # FAST reader
    verbose            = 1
)

# Out-of-fold predictions, one row per row of train_data.
oof_pred = automl.fit_predict(train_data, roles=roles)

# Score the OOF predictions against the labels of the same rows. The labels in
# `df` also include the held-out test rows, so they do not line up with
# oof_pred. Column 0 of `.data` is used as the score, as in the test-set
# validation further down.
print("CV PR-AUC:", average_precision_score(train_data[TARGET], oof_pred.data[:, 0]))
# NOTE(review): `create_model` does not appear in the TabularAutoML API docs;
# confirm it exists in the installed version before relying on best_pipeline.
best_pipeline = automl.create_model("LightGBM")[0]

# Save the trained model
import os
import pickle
from pathlib import Path

models_dir = Path("../models")
models_dir.mkdir(exist_ok=True)

# Save the full automl model
with open(models_dir / "baseline_automl.pkl", "wb") as f:
    pickle.dump(automl, f)
print("Saved baseline AutoML model to ../models/baseline_automl.pkl")

# The preset internally builds a 3-layer ensemble (linear + LightGBM + CatBoost) with default HPO based on random search inside each layer.


## 5. Custom hyper-parameter optimisation

LightAutoML exposes an `AutoTuner` wrapper that can replace the default tuning logic with Optuna (or Hyperopt, or random search).
You can create a tuned LightGBM block and inject it into the pipeline.


In [None]:
# 1️⃣  Define base algorithm
# Starting hyper-parameters for the LightGBM block; the tuner cell below
# searches around the same four parameters.
lgb_params = dict(
    learning_rate=0.03,
    num_leaves=128,
    feature_fraction=0.9,
    min_data_in_leaf=5,
)

# Fixed seed on top of the starting parameters.
lgb = LGBMClassifier(**lgb_params, random_state=42)

In [None]:
# 2️⃣  Wrap with Optuna tuner
# Each search-space entry is (low, high, sampling kind).
# NOTE(review): confirm that OptunaTuner accepts `search_space` in the
# installed LightAutoML version.
optuna_tuner = OptunaTuner(
    search_space={
        "learning_rate": (0.005, 0.2, "loguniform"),
        "num_leaves": (16, 512, "int"),
        "feature_fraction": (0.5, 1.0, "uniform"),
        "min_data_in_leaf": (1, 50, "int"),
        # Log-scale sampling needs a strictly positive lower bound; log(0) is
        # undefined, so the range starts at a tiny positive value instead of 0.0.
        "lambda_l2": (1e-8, 5.0, "loguniform"),
    },
    n_trials=100,
    timeout=1800,          # 30 min tuning budget inside global timeout
    direction="maximize",
)

# Tune the base LightGBM block with the tuner above, on the baseline's CV splits.
tuned_lgb = AutoTuner(
    lgb,
    tuner=optuna_tuner,
    cv=automl.reader.cv_splitter,   # reuse same CV object
    scoring=task.metric
)

In [None]:
# 3️⃣  Plug tuned block into an ensemble
from sklearn.metrics import average_precision_score

custom_selector = ModelSelector(models=[tuned_lgb])

automl_custom = TabularAutoML(
    task      = task,
    timeout   = 5400,
    cpu_limit = 8,
    general_params = {"use_algos": [[custom_selector]]},  # single layer
    verbose   = 1
)

# Out-of-fold predictions, one row per row of train_data.
oof_tuned = automl_custom.fit_predict(train_data, roles=roles)
# Score against the training labels: the labels in `df` also include the
# held-out test rows and therefore do not line up with oof_tuned.
print("Tuned PR-AUC:", average_precision_score(train_data[TARGET], oof_tuned.data[:, 0]))

# Save the custom tuned model
with open(models_dir / "tuned_automl.pkl", "wb") as f:
    pickle.dump(automl_custom, f)
print("Saved tuned AutoML model to ../models/tuned_automl.pkl")


## 6. Model Validation and Best Practices

In [None]:
from sklearn.metrics import average_precision_score, roc_auc_score, classification_report
import json
from datetime import datetime


def _holdout_scores(model, frame, target):
    """Predict on ``frame`` and return ``(predictions, PR-AUC, ROC-AUC)``.

    Column 0 of ``predictions.data`` is used as the score for both metrics.
    """
    predictions = model.predict(frame)
    scores = predictions.data[:, 0]
    labels = frame[target]
    return (
        predictions,
        average_precision_score(labels, scores),
        roc_auc_score(labels, scores),
    )


def _oof_pr_auc(oof, labels):
    """Return the PR-AUC of out-of-fold predictions ``oof`` against ``labels``."""
    return float(average_precision_score(labels, oof.data[:, 0]))


def _print_top_features(scores, limit=15):
    """Print the first ``limit`` (feature, importance) pairs of ``scores``.

    Accepts either a two-column table (feature name, importance) or a Series
    indexed by feature name. Iterating a table with ``.items()`` would walk its
    columns rather than its rows, so the two shapes are handled separately.
    """
    top = scores.head(limit)
    if hasattr(top, "columns"):
        pairs = zip(top.iloc[:, 0], top.iloc[:, 1])
    else:
        pairs = top.items()
    for rank, (feature, importance) in enumerate(pairs, start=1):
        print(f"{rank:2d}. {feature}: {importance:.4f}")


print("=== BASELINE MODEL VALIDATION ===")
test_pr_auc_baseline = test_roc_auc_baseline = None
if 'automl' in locals():
    test_pred_baseline, test_pr_auc_baseline, test_roc_auc_baseline = _holdout_scores(
        automl, test_data, TARGET
    )
    print(f"Baseline Test PR-AUC: {test_pr_auc_baseline:.4f}")
    print(f"Baseline Test ROC-AUC: {test_roc_auc_baseline:.4f}")

print("\n=== TUNED MODEL VALIDATION ===")
test_pr_auc_tuned = test_roc_auc_tuned = None
if 'automl_custom' in locals():
    test_pred_tuned, test_pr_auc_tuned, test_roc_auc_tuned = _holdout_scores(
        automl_custom, test_data, TARGET
    )
    print(f"Tuned Test PR-AUC: {test_pr_auc_tuned:.4f}")
    print(f"Tuned Test ROC-AUC: {test_roc_auc_tuned:.4f}")
    if test_pr_auc_baseline is not None:
        print(f"\nImprovement from tuning: {test_pr_auc_tuned - test_pr_auc_baseline:.4f} PR-AUC points")
    # The tuned model is the one reported in detail below.
    best_pred = test_pred_tuned
    best_pred_binary = (best_pred.data[:, 0] > 0.5).astype(int)
    print("\nClassification Report (threshold=0.5):")
    print(classification_report(test_data[TARGET], best_pred_binary))
    try:
        best_model = automl_custom
        fi = best_model.get_feature_scores()
        if fi is not None:
            print("Top 15 most important features:")
            _print_top_features(fi)
    except Exception as exc:
        # Importances are optional output: report the failure instead of
        # hiding it, and carry on so the summary below is still written.
        print(f"Feature importances unavailable: {exc}")

# CV scores need both the fitted model and its out-of-fold predictions; they
# are computed against the training labels, the rows the OOF predictions cover.
has_baseline_oof = 'automl' in locals() and 'oof_pred' in locals()
has_tuned_oof = 'automl_custom' in locals() and 'oof_tuned' in locals()

performance_summary = {
    "validation_date": datetime.now().isoformat(),
    "test_set_size": test_data.shape[0],
    "baseline_model": {
        "cv_pr_auc": _oof_pr_auc(oof_pred, train_data[TARGET]) if has_baseline_oof else None,
        "test_pr_auc": float(test_pr_auc_baseline) if test_pr_auc_baseline is not None else None,
        "test_roc_auc": float(test_roc_auc_baseline) if test_roc_auc_baseline is not None else None
    },
    "tuned_model": {
        "cv_pr_auc": _oof_pr_auc(oof_tuned, train_data[TARGET]) if has_tuned_oof else None,
        "test_pr_auc": float(test_pr_auc_tuned) if test_pr_auc_tuned is not None else None,
        "test_roc_auc": float(test_roc_auc_tuned) if test_roc_auc_tuned is not None else None
    }
}

# Persist the summary as JSON in the models directory.
models_dir = Path("../models")
models_dir.mkdir(exist_ok=True)
with open(models_dir / "model_performance.json", "w") as f:
    json.dump(performance_summary, f, indent=2)
print(f"\nPerformance summary saved to {models_dir / 'model_performance.json'}")