In [1]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [12]:
# data
# Names of the model input columns, one per line.
feature_cols = [
    "home_pitcher_true_freq",
    "away_pitcher_true_freq",
    "home_pitcher_vs_team_freq",
    "away_pitcher_vs_team_freq",
    "home_pitcher_vs_team_freq_count",
    "away_pitcher_vs_team_freq_count",
    "home_pitcher_last3_freq_1st",
    "away_pitcher_last3_freq_1st",
    "home_pitcher_momentum",
    "away_pitcher_momentum",
    "home_pitcher_vs_away_team_momentum",
    "away_pitcher_vs_home_team_momentum",
    "home_team_inning1_scaled",
    "away_team_inning1_scaled",
    "umpire_inning1_scaled",
    "stadium_inning1_scaled",
]

# Read the CSV and replace every missing value with 0. The fill applies to the
# whole frame (all columns), not only to the columns in `feature_cols`.
df = pd.read_csv('./data/mlb_data.csv').fillna(0)

In [13]:
# Baseline candidate classifiers, keyed by the name used in printed reports.
# NOTE(review): `models` is assigned again in a later cell (In [26]) with a
# tuned RandomForest, so this baseline definition is overwritten before the
# evaluation loop (In [27]) runs.
models = {
    'LogisticRegression': LogisticRegression(max_iter=1000),
    'RandomForest': RandomForestClassifier(n_estimators=100, random_state=42),
    # `use_label_encoder` is intentionally not passed: the installed XGBoost
    # reports it as an unused parameter on every fit (see the warnings in the
    # recorded output), so it only produced noise.
    'XGBoost': XGBClassifier(eval_metric='logloss', random_state=42),
}

In [25]:
# Feature matrix (the columns listed in `feature_cols`) and the label column.
X, y = df[feature_cols], df['target']

# Five stratified folds; shuffling with a fixed seed makes the split repeatable.
kf = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

In [26]:
# Candidate classifiers evaluated by the cross-validation loop, keyed by the
# name used in the printed per-fold report and in the results table.
models = {
    'LogisticRegression': LogisticRegression(max_iter=1000),
    'RandomForest': RandomForestClassifier(
        n_estimators=300,
        max_depth=None,
        min_samples_leaf=4,
        # A float is interpreted by scikit-learn as a fraction of the input
        # features (max(1, int(max_features * n_features))). With the 16
        # columns in `feature_cols` that is a single feature per split.
        # NOTE(review): confirm this is intended.
        max_features=0.1,
        bootstrap=True,
        random_state=42,
    ),
    # `use_label_encoder` is intentionally not passed: the installed XGBoost
    # reports it as an unused parameter on every fit (see the warnings in the
    # recorded output), so it only produced noise.
    'XGBoost': XGBClassifier(eval_metric='logloss', random_state=42),
}

In [27]:
# Evaluate every entry of `models` on the folds produced by `kf`, printing the
# per-fold accuracy / F1 / ROC AUC and collecting the per-model means in
# `results` (one dict per model).
results = []

for model_name, model in models.items():
    print(f"\nmodel: {model_name}")
    accs = []
    f1s = []
    aucs = []

    for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

        # Pipeline: scaling + model. The scaler is fitted on the training
        # part of the fold only, because it is fitted inside the pipeline.
        pipe = Pipeline(steps=[('scaler', StandardScaler()), ('model', model)])
        pipe.fit(X_train, y_train)

        y_pred = pipe.predict(X_val)
        # Score ROC AUC on the second predict_proba column when the pipeline
        # exposes it; otherwise fall back to the hard predictions.
        if hasattr(pipe, "predict_proba"):
            y_proba = pipe.predict_proba(X_val)[:, 1]
        else:
            y_proba = y_pred

        accs.append(accuracy_score(y_val, y_pred))
        f1s.append(f1_score(y_val, y_pred))
        aucs.append(roc_auc_score(y_val, y_proba))

        print(f"  Fold {fold+1} - Acc: {accs[-1]:.3f} | F1: {f1s[-1]:.3f} | ROC AUC: {aucs[-1]:.3f}")

    # Arithmetic mean of each metric over the folds, keyed by output column.
    fold_scores = {'acc_mean': accs, 'f1_mean': f1s, 'auc_mean': aucs}
    summary = {'model': model_name}
    for column, values in fold_scores.items():
        summary[column] = sum(values)/len(values)
    results.append(summary)



model: LogisticRegression
  Fold 1 - Acc: 0.899 | F1: 0.891 | ROC AUC: 0.970
  Fold 2 - Acc: 0.899 | F1: 0.891 | ROC AUC: 0.973
  Fold 3 - Acc: 0.914 | F1: 0.907 | ROC AUC: 0.979
  Fold 4 - Acc: 0.900 | F1: 0.890 | ROC AUC: 0.969
  Fold 5 - Acc: 0.877 | F1: 0.865 | ROC AUC: 0.965

model: RandomForest
  Fold 1 - Acc: 0.890 | F1: 0.879 | ROC AUC: 0.959
  Fold 2 - Acc: 0.900 | F1: 0.894 | ROC AUC: 0.966
  Fold 3 - Acc: 0.928 | F1: 0.923 | ROC AUC: 0.976
  Fold 4 - Acc: 0.916 | F1: 0.908 | ROC AUC: 0.967
  Fold 5 - Acc: 0.897 | F1: 0.889 | ROC AUC: 0.962

model: XGBoost
  Fold 1 - Acc: 0.897 | F1: 0.890 | ROC AUC: 0.975


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


  Fold 2 - Acc: 0.904 | F1: 0.898 | ROC AUC: 0.975


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


  Fold 3 - Acc: 0.922 | F1: 0.918 | ROC AUC: 0.982
  Fold 4 - Acc: 0.919 | F1: 0.911 | ROC AUC: 0.979


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


  Fold 5 - Acc: 0.893 | F1: 0.886 | ROC AUC: 0.974


In [28]:
# Tabulate the per-model mean scores, then order rows by mean ROC AUC, highest first.
results_df = pd.DataFrame(results)
results_df = results_df.sort_values('auc_mean', ascending=False)

In [29]:
# Bare last expression: the notebook renders the summary table as the cell output.
results_df

Unnamed: 0,model,acc_mean,f1_mean,auc_mean
2,XGBoost,0.906916,0.900525,0.977179
0,LogisticRegression,0.897882,0.888972,0.971128
1,RandomForest,0.906296,0.89859,0.966166
