In [30]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

In [31]:
# data
df = pd.read_csv('./data/mlb_data.csv')

feature_cols = [
    "home_pitcher_true_freq", "away_pitcher_true_freq",
    "home_pitcher_vs_team_freq", "away_pitcher_vs_team_freq",
    "home_pitcher_vs_team_freq_count", "away_pitcher_vs_team_freq_count",
    "home_pitcher_last3_freq_1st", "away_pitcher_last3_freq_1st",
    "home_pitcher_momentum", "away_pitcher_momentum",
    "home_pitcher_vs_away_team_momentum", "away_pitcher_vs_home_team_momentum",
    "home_team_inning1_scaled", "away_team_inning1_scaled",
    "umpire_inning1_scaled", "stadium_inning1_scaled"
]
df.fillna(0.0, inplace=True)

In [42]:
df

Unnamed: 0,game_id,home_team,away_team,stadium,day_or_night,home_plate_umpire,inning_1_home,inning_1_away,target,home_plate_umpire_inning1_freq,...,home_pitcher_last3_freq_1st,away_pitcher_last3_freq_1st,home_team_momentum,away_team_momentum,home_pitcher_momentum,away_pitcher_momentum,home_pitcher_vs_away_team_momentum,away_pitcher_vs_home_team_momentum,home_pitcher_dominance_vs_away_team,away_pitcher_dominance_vs_home_team
0,747060,Baltimore Orioles,Los Angeles Angels,Oriole Park at Camden Yards,Dia,Adrian Johnson,2,1,1,0.578947,...,0.000000,0.000000,-0.116832,0.261905,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,746737,Cincinnati Reds,Washington Nationals,Great American Ball Park,Dia,Dan Iassogna,0,0,0,0.513514,...,0.000000,0.000000,-0.069231,0.120755,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,745445,San Diego Padres,San Francisco Giants,Petco Park,Dia,Mark Ripperger,0,0,0,0.500000,...,0.000000,0.000000,0.222772,0.052381,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,746165,Los Angeles Dodgers,St. Louis Cardinals,Dodger Stadium,Dia,Alan Porter,2,0,1,0.432432,...,0.000000,0.000000,0.090000,0.140741,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,745116,Tampa Bay Rays,Toronto Blue Jays,Tropicana Field,Dia,Todd Tichenor,1,0,1,0.526316,...,0.000000,0.000000,-0.047706,0.098077,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3207,777921,Chicago Cubs,Miami Marlins,Wrigley Field,Noche,Ron Kulpa,0,0,0,0.222222,...,0.333333,0.000000,-0.174510,-0.145098,0.175439,-0.100000,-0.320537,-0.074510,-0.030341,0.274510
3208,777918,Texas Rangers,Colorado Rockies,Globe Life Field,Noche,Nestor Ceja,4,0,1,0.500000,...,0.333333,0.666667,0.147573,0.107692,0.183333,0.333333,-0.075641,-0.185761,-0.291026,-0.480906
3209,777920,Houston Astros,Kansas City Royals,Daikin Park,Noche,David Rackley,0,0,0,0.390244,...,0.000000,0.000000,-0.147619,-0.128571,0.000000,-0.142857,0.000000,-0.004762,0.000000,0.290476
3210,777914,San Diego Padres,Los Angeles Angels,Petco Park,Noche,John Tumpane,3,0,1,0.487179,...,0.000000,0.666667,0.222772,0.261905,-0.333333,0.500000,0.595238,-0.277228,0.071429,-0.722772


In [33]:
X = df[feature_cols]
y = df['target']
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [34]:
models = {
    'LogisticRegression': LogisticRegression(max_iter=1000),
    'RandomForest': RandomForestClassifier(n_estimators=300,   
    max_depth=None,     
    min_samples_leaf=4, 
    max_features=0.1,   
    bootstrap=True,     
    random_state=42),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(),
    'SVM': SVC(),
    'Naive Bayes': GaussianNB()
}

In [35]:
results = []

for model_name, model in models.items():
    print(f"\nmodel: {model_name}")
    accs, f1s, aucs = [], [], []

    for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        # Pipeline: escalado + modelo
        pipe = Pipeline([
            ('scaler', StandardScaler()),
            ('model', model)
        ])
        pipe.fit(X_train, y_train)
        y_pred = pipe.predict(X_val)
        y_proba = pipe.predict_proba(X_val)[:, 1] if hasattr(pipe, "predict_proba") else y_pred

        accs.append(accuracy_score(y_val, y_pred))
        f1s.append(f1_score(y_val, y_pred))
        aucs.append(roc_auc_score(y_val, y_proba))

        print(f"  Fold {fold+1} - Acc: {accs[-1]:.3f} | F1: {f1s[-1]:.3f} | ROC AUC: {aucs[-1]:.3f}")

    results.append({
        'model': model_name,
        'acc_mean': sum(accs)/len(accs),
        'f1_mean': sum(f1s)/len(f1s),
        'auc_mean': sum(aucs)/len(aucs)
    })



model: LogisticRegression
  Fold 1 - Acc: 0.899 | F1: 0.891 | ROC AUC: 0.970
  Fold 2 - Acc: 0.899 | F1: 0.891 | ROC AUC: 0.973


  Fold 3 - Acc: 0.914 | F1: 0.907 | ROC AUC: 0.979
  Fold 4 - Acc: 0.900 | F1: 0.890 | ROC AUC: 0.969
  Fold 5 - Acc: 0.877 | F1: 0.865 | ROC AUC: 0.965

model: RandomForest
  Fold 1 - Acc: 0.890 | F1: 0.879 | ROC AUC: 0.959
  Fold 2 - Acc: 0.900 | F1: 0.894 | ROC AUC: 0.966
  Fold 3 - Acc: 0.928 | F1: 0.923 | ROC AUC: 0.976
  Fold 4 - Acc: 0.916 | F1: 0.908 | ROC AUC: 0.967
  Fold 5 - Acc: 0.897 | F1: 0.889 | ROC AUC: 0.962

model: XGBoost


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


  Fold 1 - Acc: 0.897 | F1: 0.890 | ROC AUC: 0.975


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


  Fold 2 - Acc: 0.904 | F1: 0.898 | ROC AUC: 0.975
  Fold 3 - Acc: 0.922 | F1: 0.918 | ROC AUC: 0.982


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


  Fold 4 - Acc: 0.919 | F1: 0.911 | ROC AUC: 0.979


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


  Fold 5 - Acc: 0.893 | F1: 0.886 | ROC AUC: 0.974

model: Gradient Boosting
  Fold 1 - Acc: 0.904 | F1: 0.897 | ROC AUC: 0.979
  Fold 2 - Acc: 0.910 | F1: 0.904 | ROC AUC: 0.980
  Fold 3 - Acc: 0.928 | F1: 0.924 | ROC AUC: 0.984
  Fold 4 - Acc: 0.925 | F1: 0.918 | ROC AUC: 0.983
  Fold 5 - Acc: 0.900 | F1: 0.893 | ROC AUC: 0.977

model: SVM
  Fold 1 - Acc: 0.904 | F1: 0.896 | ROC AUC: 0.903
  Fold 2 - Acc: 0.911 | F1: 0.904 | ROC AUC: 0.910
  Fold 3 - Acc: 0.927 | F1: 0.923 | ROC AUC: 0.927
  Fold 4 - Acc: 0.928 | F1: 0.922 | ROC AUC: 0.927
  Fold 5 - Acc: 0.907 | F1: 0.900 | ROC AUC: 0.906

model: Naive Bayes
  Fold 1 - Acc: 0.860 | F1: 0.854 | ROC AUC: 0.956
  Fold 2 - Acc: 0.882 | F1: 0.876 | ROC AUC: 0.963
  Fold 3 - Acc: 0.903 | F1: 0.898 | ROC AUC: 0.975
  Fold 4 - Acc: 0.894 | F1: 0.888 | ROC AUC: 0.961
  Fold 5 - Acc: 0.861 | F1: 0.853 | ROC AUC: 0.954


In [40]:
results_df = pd.DataFrame(results).sort_values(by='auc_mean', ascending=False).reset_index(drop=True)


In [41]:
results_df

Unnamed: 0,model,acc_mean,f1_mean,auc_mean
0,Gradient Boosting,0.913454,0.907394,0.980653
1,XGBoost,0.906916,0.900525,0.977179
2,LogisticRegression,0.897882,0.888972,0.971128
3,RandomForest,0.906296,0.89859,0.966166
4,Naive Bayes,0.880143,0.873846,0.961668
5,SVM,0.915322,0.908722,0.914761
