In [1]:
import lightgbm as lgb
import numpy as np
import optuna
import pandas as pd
import warnings
from feature_engine.selection import SmartCorrelatedSelection
from optuna.visualization import plot_optimization_history
from sklearn.model_selection import cross_val_score, StratifiedKFold, RepeatedStratifiedKFold
from sklearn.feature_selection import SelectKBest, VarianceThreshold, mutual_info_classif
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss

In [2]:
train_df = pd.read_csv("../../data/train.csv")
train_df = train_df.drop(columns=["id"])

diff_cols = [col for col in train_df.columns if col.endswith("_diff")]
train_df[diff_cols] = train_df[diff_cols].fillna(0)

X_train = train_df.drop(columns=["red_win"])
y_train = train_df["red_win"]

In [3]:
# Initial feature filtering
filter_pipe = Pipeline(
    steps=[
        ("variance", VarianceThreshold(threshold=0)),
        ("mutual_info", SelectKBest(score_func=mutual_info_classif, k=100)),
    ]
)

X_train_filtered = filter_pipe.fit_transform(X_train, y_train)

Experiment Log
- 02/28/2025: 0.6290269252676522 (added venue/location based features, k = 100)
- 03/01/2025: 0.6281800250612026 (added event metadata features, k = 100)
- 03/01/2025: 0.6298790757563192 (added fighter qualitative attributes part 1, k = 100)
- 03/01/2025: 0.6308701627220016 (no additional features, k = 50)
- 03/02/2025: 0.6284594339066326 (added fighter qualitative attributes part 2, k = 100)
- 03/02/2025: 0.6296610552855587 (went back to normal CV no repeat, k = 100)
- 03/02/2025: 0.6235736054762234 (added bout metadata features and replaced target encoding with WOE, k = 100)
- 03/02/2025: 0.6065059104372694 (tried lightgbm over logistic regression, k = 100)
- 03/09/2025: 0.6246837160308553 (logistic regression, added round 1 striking and grappling features, k = 100)
- 03/09/2025: 0.6075728206237241 (lightgbm, added round 1 striking and grappling features, k = 100)
- 03/09/2025: 0.6235008472942699 (logistic regression, restricted round 1 stats to ufc only, k = 100)
- 03/09/2025: 0.6047079904234718 (lightgbm, restricted round 1 stats to ufc only, k = 100)
- 03/09/2025: 0.6182181448245128 (logistic regression, added aggregated striking and grappling features and removed all ratio features, k = 100)
- 03/09/2025: 0.599283811996892 (lightgbm, same as above, k = 100)
- 03/12/2025: 0.6019940745991823 (logistic regression, added mean devigged implied opening prob difference, k = 100)
- 03/12/2025: 0.590035580960352 (lightgbm, same as above, k = 100)

In [4]:
def objective(trial):
    selected_subset = [
        feature for feature in range(X_train_filtered.shape[1])
        if trial.suggest_categorical(f"feature_{feature}", [0, 1]) == 1
    ]

    # Avoid empty feature set
    if len(selected_subset) == 0:
        return 10.0  # Large log loss to discourage empty feature sets

    # Subset data to selected features
    X_selected = X_train_filtered[:, selected_subset]

    # Hyperparameter tuning
    C = trial.suggest_float("C", 1e-4, 1e2, log=True)  # Regularization strength

    # Create pipeline with scaling
    pipeline = Pipeline([
        ("scaler", StandardScaler()),
        ("classifier", LogisticRegression(
            penalty="l2",
            C=C,
            max_iter=300,
            random_state=42
        ))
    ])

    # Cross-validation with log loss scoring
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    score = cross_val_score(pipeline, X_selected, y_train, cv=cv, scoring="neg_log_loss", n_jobs=-1).mean() # type: ignore

    return -score

sampler = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(direction="minimize", sampler=sampler)
study.optimize(objective, n_trials=1000, show_progress_bar=True)

# Print best results
print("Best log loss:", study.best_value)
print("Best hyperparameters:", study.best_params)

# Visualize optimization history
fig = plot_optimization_history(study)
fig.show()

[I 2025-03-12 03:42:59,102] A new study created in memory with name: no-name-c4ffced5-d979-4891-93c5-a61f2fd47fa4


  0%|          | 0/1000 [00:00<?, ?it/s]

[I 2025-03-12 03:43:02,882] Trial 0 finished with value: 0.6314535104934772 and parameters: {'feature_0': 1, 'feature_1': 0, 'feature_2': 0, 'feature_3': 1, 'feature_4': 1, 'feature_5': 1, 'feature_6': 0, 'feature_7': 1, 'feature_8': 1, 'feature_9': 0, 'feature_10': 0, 'feature_11': 1, 'feature_12': 1, 'feature_13': 1, 'feature_14': 0, 'feature_15': 0, 'feature_16': 1, 'feature_17': 0, 'feature_18': 0, 'feature_19': 0, 'feature_20': 1, 'feature_21': 1, 'feature_22': 1, 'feature_23': 1, 'feature_24': 0, 'feature_25': 0, 'feature_26': 0, 'feature_27': 1, 'feature_28': 1, 'feature_29': 1, 'feature_30': 0, 'feature_31': 0, 'feature_32': 1, 'feature_33': 1, 'feature_34': 1, 'feature_35': 0, 'feature_36': 1, 'feature_37': 1, 'feature_38': 0, 'feature_39': 0, 'feature_40': 0, 'feature_41': 0, 'feature_42': 1, 'feature_43': 0, 'feature_44': 0, 'feature_45': 1, 'feature_46': 0, 'feature_47': 0, 'feature_48': 0, 'feature_49': 1, 'feature_50': 1, 'feature_51': 1, 'feature_52': 0, 'feature_53': 1,

In [5]:
def objective(trial):
    selected_subset = [
        feature for feature in range(X_train_filtered.shape[1])
        if trial.suggest_categorical(f"feature_{feature}", [0, 1]) == 1
    ]

    # Avoid empty feature set
    if len(selected_subset) == 0:
        return 10.0  # Large log loss to discourage empty feature sets

    # Subset data to selected features
    X_selected = X_train_filtered[:, selected_subset]

    # Hyperparameter tuning
    param_grid = {
        "objective": "binary",
        "metric": "binary_logloss",
        "boosting_type": "gbdt",
        "n_estimators": trial.suggest_int("n_estimators", 100, 2000),
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.3, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "max_depth": trial.suggest_int("max_depth", 3, 16),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        "min_child_weight": trial.suggest_float("min_child_weight", 1e-3, 10.0, log=True),
        "subsample": trial.suggest_float("subsample", 0.4, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.4, 1.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-3, 10.0, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-3, 10.0, log=True),
        "random_state": 42
    }

    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    cv_scores = []
    for train_idx, val_idx in cv.split(X_selected, y_train):
        X_train_fold, X_val_fold = X_selected[train_idx], X_selected[val_idx]
        y_train_fold, y_val_fold = y_train.iloc[train_idx], y_train.iloc[val_idx]

        model = lgb.LGBMClassifier(verbosity=-1, **param_grid)
        model.fit(
            X_train_fold, 
            y_train_fold, 
            eval_set=[(X_val_fold, y_val_fold)], 
            eval_metric="binary_logloss",
            callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=False), lgb.log_evaluation(period=0)],
        )

        y_pred = model.predict_proba(X_val_fold)
        score = log_loss(y_val_fold, y_pred)
        cv_scores.append(score)
    
    return np.mean(cv_scores)

sampler = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(direction="minimize", sampler=sampler)

warnings.simplefilter(action='ignore', category=UserWarning)
study.optimize(objective, n_trials=1000, show_progress_bar=True)

# Print best results
print("Best log loss:", study.best_value)
print("Best hyperparameters:", study.best_params)

# Visualize optimization history
fig = plot_optimization_history(study)
fig.show()

[I 2025-03-12 03:51:37,890] A new study created in memory with name: no-name-edfd8b89-1ff7-47b9-b80b-3091e0e0a2ca


  0%|          | 0/1000 [00:00<?, ?it/s]

[I 2025-03-12 03:51:59,270] Trial 0 finished with value: 0.6322570467263405 and parameters: {'feature_0': 1, 'feature_1': 0, 'feature_2': 0, 'feature_3': 1, 'feature_4': 1, 'feature_5': 1, 'feature_6': 0, 'feature_7': 1, 'feature_8': 1, 'feature_9': 0, 'feature_10': 0, 'feature_11': 1, 'feature_12': 1, 'feature_13': 1, 'feature_14': 0, 'feature_15': 0, 'feature_16': 1, 'feature_17': 0, 'feature_18': 0, 'feature_19': 0, 'feature_20': 1, 'feature_21': 1, 'feature_22': 1, 'feature_23': 1, 'feature_24': 0, 'feature_25': 0, 'feature_26': 0, 'feature_27': 1, 'feature_28': 1, 'feature_29': 1, 'feature_30': 0, 'feature_31': 0, 'feature_32': 1, 'feature_33': 1, 'feature_34': 1, 'feature_35': 0, 'feature_36': 1, 'feature_37': 1, 'feature_38': 0, 'feature_39': 0, 'feature_40': 0, 'feature_41': 0, 'feature_42': 1, 'feature_43': 0, 'feature_44': 0, 'feature_45': 1, 'feature_46': 0, 'feature_47': 0, 'feature_48': 0, 'feature_49': 1, 'feature_50': 1, 'feature_51': 1, 'feature_52': 0, 'feature_53': 1,