In [2]:
import pandas as pd

In [3]:
df_2025 = pd.read_csv('./data/2025/data_2025_test.csv')

In [4]:
features = [
    "home_pitcher_true_freq", "away_pitcher_true_freq",
    "home_pitcher_vs_team_freq", "away_pitcher_vs_team_freq",
    "home_pitcher_vs_team_freq_count", "away_pitcher_vs_team_freq_count",
    "home_pitcher_last3_freq_1st", "away_pitcher_last3_freq_1st",
    "home_pitcher_momentum", "away_pitcher_momentum",
    "home_pitcher_vs_away_team_momentum", "away_pitcher_vs_home_team_momentum",
    "home_team_inning1_scaled", "away_team_inning1_scaled",
    "umpire_inning1_scaled", "stadium_inning1_scaled"
]

In [5]:
import requests
def inning_run_1(game_id):
    url = f"https://statsapi.mlb.com/api/v1/game/{game_id}/linescore"
    try:
        response = requests.get(url)
        data = response.json()

        inning_scores = data.get("innings", [])
        if len(inning_scores) >= 1:
            home_runs = inning_scores[0]['home']['runs']
            away_runs = inning_scores[0]['away']['runs']
            return 1 if (home_runs > 0 or away_runs > 0) else 0
        return 0
    except Exception as e:
        print(f"Error con gamePk {game_id}: {e}")
        return 0


In [6]:
df_2025['target']=df_2025['game_id'].apply(inning_run_1)

In [7]:
df_2025

Unnamed: 0,game_id,home_team,away_team,stadium,day_or_night,home_pitcher,away_pitcher,home_pitcher_true_freq,away_pitcher_true_freq,home_pitcher_vs_team_freq,...,away_team_momentum,home_pitcher_momentum,away_pitcher_momentum,home_pitcher_vs_away_team_momentum,away_pitcher_vs_home_team_momentum,home_plate_umpire_inning1_freq,stadium_inning1_freq,stadium_inning1_scaled,umpire_inning1_scaled,target
0,777869,Texas Rangers,Houston Astros,Globe Life Field,Día,Jack Leiter,Framber Valdez,0.428571,0.222222,-1.00,...,-0.522200,0.238095,-0.222222,-0.760295,0.006546,0.562500,0.423077,0.191763,0.778846,0
1,777873,Baltimore Orioles,Washington Nationals,Oriole Park at Camden Yards,Día,Zach Eflin,Michael Soroka,0.083333,0.125000,0.00,...,-0.137184,0.250000,-0.125000,-0.387184,-0.447760,0.722222,0.504950,0.699534,1.000000,1
2,777880,Philadelphia Phillies,Pittsburgh Pirates,Citizens Bank Park,Día,Mick Abel,Paul Skenes,0.000000,0.100000,-1.00,...,-0.345591,0.000000,-0.100000,-0.345591,-0.561538,0.470588,0.504854,0.698937,0.651584,0
3,777876,Boston Red Sox,Atlanta Braves,Fenway Park,Día,Brayan Bello,Spencer Schwellenbach,0.352941,0.076923,-1.00,...,-0.429222,0.313725,0.589744,-0.742948,-1.357473,0.600000,0.519231,0.788098,0.830769,1
4,777871,Toronto Blue Jays,Detroit Tigers,Rogers Centre,Día,José Berríos,Jackson Jobe,0.095238,0.500000,1.00,...,-0.313300,-0.095238,0.166667,-0.218061,-0.607939,0.000000,0.490196,0.608028,0.000000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61,777812,Athletics,Los Angeles Angels,Sutter Health Park,Día,Luis Severino,Tyler Anderson,0.181818,0.166667,-1.00,...,-0.080720,-0.181818,0.500000,0.101098,-0.913008,0.600000,0.523810,0.795367,0.879104,1
62,777821,Detroit Tigers,Cleveland Guardians,Comerica Park,Noche,Jack Flaherty,Tanner Bibee,0.125000,0.368421,0.25,...,-0.339400,0.208333,-0.368421,-0.547733,0.078131,0.370370,0.504762,0.715830,0.000000,0
63,777814,Pittsburgh Pirates,Milwaukee Brewers,PNC Park,Noche,Mike Burrows,Aaron Civale,0.000000,0.466667,-1.00,...,-0.409756,0.000000,0.200000,-0.409756,-0.756660,0.542857,0.451923,0.495192,0.660341,1
64,777820,Washington Nationals,Atlanta Braves,Nationals Park,Noche,Trevor Williams,AJ Smith-Shawver,0.222222,0.400000,0.00,...,-0.429222,0.111111,-0.400000,-0.540333,0.019887,0.371429,0.403846,0.294439,0.004051,1


In [8]:
# Models

import os
import joblib

path_model = './model'  # Carpeta donde están los .pkl
models = {}

for file_ in os.listdir(path_model):
    if file_.endswith('.pkl'):
        
        name = file_.replace('.pkl', '')
        models[name] = joblib.load(os.path.join(path_model, file_))


In [9]:
result = {}

for name_model, model in models.items():
    pred = model.predict(df_2025[features])
    result[name_model] = pred

In [10]:
for model_name, pred in result.items():
    df_2025[f'pred_{model_name}'] = pred

In [11]:
df_2025.columns

Index(['game_id', 'home_team', 'away_team', 'stadium', 'day_or_night',
       'home_pitcher', 'away_pitcher', 'home_pitcher_true_freq',
       'away_pitcher_true_freq', 'home_pitcher_vs_team_freq',
       'away_pitcher_vs_team_freq', 'home_pitcher_vs_team_freq_count',
       'away_pitcher_vs_team_freq_count', 'home_pitcher_last3_freq_1st',
       'away_pitcher_last3_freq_1st', 'home_team_inning1_last10_freq',
       'away_team_inning1_last10_freq', 'home_team_inning1_scaled',
       'away_team_inning1_scaled', 'home_team_momentum', 'away_team_momentum',
       'home_pitcher_momentum', 'away_pitcher_momentum',
       'home_pitcher_vs_away_team_momentum',
       'away_pitcher_vs_home_team_momentum', 'home_plate_umpire_inning1_freq',
       'stadium_inning1_freq', 'stadium_inning1_scaled',
       'umpire_inning1_scaled', 'target', 'pred_LogisticRegression',
       'pred_RandomForest', 'pred_Naive_Bayes', 'pred_SVM',
       'pred_gradient_boosting_ft', 'pred_Gradient _Boosting', 'pred_XGBo

In [24]:
model_pred = [
  "pred_LogisticRegression",	"pred_RandomForest",	"pred_Gradient _Boosting",	"pred_SVM",	"pred_Naive_Bayes",	"pred_XGBoost", "pred_gradient_boosting_ft"
]

In [13]:
# Filtrar modelos que tengan predict_proba
models_con_proba = {
    name: model
    for name, model in models.items()
    if hasattr(model, 'predict_proba')
}

In [14]:
result_pro = {}

for name_model, model in models_con_proba.items():
    pred = model.predict_proba(df_2025[features])
    result_pro[name_model] = pred

In [22]:
for m, item in result_pro.items():
  print (m)

LogisticRegression
RandomForest
Naive_Bayes
gradient_boosting_ft
Gradient _Boosting
XGBoost


In [16]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

In [17]:
from sklearn.metrics import r2_score

In [25]:
metrics = {}
for modelo in model_pred:
    y_true = df_2025['target']
    y_pred = df_2025[modelo]
    
    metrics[modelo] = {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1_score': f1_score(y_true, y_pred),
        'precision': precision_score(y_true, y_pred),
        'recall': recall_score(y_true, y_pred)
        

    }

In [26]:
df_2025.head(15)

Unnamed: 0,game_id,home_team,away_team,stadium,day_or_night,home_pitcher,away_pitcher,home_pitcher_true_freq,away_pitcher_true_freq,home_pitcher_vs_team_freq,...,stadium_inning1_scaled,umpire_inning1_scaled,target,pred_LogisticRegression,pred_RandomForest,pred_Naive_Bayes,pred_SVM,pred_gradient_boosting_ft,pred_Gradient _Boosting,pred_XGBoost
0,777869,Texas Rangers,Houston Astros,Globe Life Field,Día,Jack Leiter,Framber Valdez,0.428571,0.222222,-1.0,...,0.191763,0.778846,0,0,0,1,0,0,0,0
1,777873,Baltimore Orioles,Washington Nationals,Oriole Park at Camden Yards,Día,Zach Eflin,Michael Soroka,0.083333,0.125,0.0,...,0.699534,1.0,1,0,0,1,0,0,0,0
2,777880,Philadelphia Phillies,Pittsburgh Pirates,Citizens Bank Park,Día,Mick Abel,Paul Skenes,0.0,0.1,-1.0,...,0.698937,0.651584,0,0,0,1,0,0,0,0
3,777876,Boston Red Sox,Atlanta Braves,Fenway Park,Día,Brayan Bello,Spencer Schwellenbach,0.352941,0.076923,-1.0,...,0.788098,0.830769,1,0,1,1,1,0,0,0
4,777871,Toronto Blue Jays,Detroit Tigers,Rogers Centre,Día,José Berríos,Jackson Jobe,0.095238,0.5,1.0,...,0.608028,0.0,1,0,1,1,0,0,1,0
5,777874,Cincinnati Reds,Cleveland Guardians,Great American Ball Park,Día,Andrew Abbott,Luis L. Ortiz,0.1875,0.363636,-1.0,...,0.31103,0.674556,0,0,0,1,0,0,0,0
6,777877,Miami Marlins,Tampa Bay Rays,loanDepot park,Día,Cal Quantrill,Shane Baz,0.235294,0.272727,-1.0,...,1.0,0.580645,0,0,0,1,1,0,0,0
7,777864,Kansas City Royals,St. Louis Cardinals,Kauffman Stadium,Día,Michael Wacha,Matthew Liberatore,0.157895,0.285714,0.0,...,0.34397,0.574109,0,0,0,1,0,0,0,0
8,777865,Milwaukee Brewers,Minnesota Twins,American Family Field,Día,Freddy Peralta,Zebby Matthews,0.2,0.142857,0.0,...,0.668831,0.709193,0,0,0,1,0,0,0,0
9,777872,Chicago Cubs,Chicago White Sox,Wrigley Field,Día,Colin Rea,Jonathan Cannon,0.421053,0.352941,-1.0,...,0.0,0.807692,1,0,0,1,1,0,0,0


In [27]:
df_2025[(df_2025['home_team'] == 'Houston Astros') | (df_2025['away_team'] == 'Houston Astros')]


Unnamed: 0,game_id,home_team,away_team,stadium,day_or_night,home_pitcher,away_pitcher,home_pitcher_true_freq,away_pitcher_true_freq,home_pitcher_vs_team_freq,...,stadium_inning1_scaled,umpire_inning1_scaled,target,pred_LogisticRegression,pred_RandomForest,pred_Naive_Bayes,pred_SVM,pred_gradient_boosting_ft,pred_Gradient _Boosting,pred_XGBoost
0,777869,Texas Rangers,Houston Astros,Globe Life Field,Día,Jack Leiter,Framber Valdez,0.428571,0.222222,-1.0,...,0.191763,0.778846,0,0,0,1,0,0,0,0
18,777858,Tampa Bay Rays,Houston Astros,George M. Steinbrenner Field,Noche,Ryan Pepiot,Colton Gordon,0.45,0.0,-1.0,...,1.0,0.95875,0,0,0,1,1,0,0,0
31,777848,Tampa Bay Rays,Houston Astros,George M. Steinbrenner Field,Noche,Zack Littell,Brandon Walter,0.368421,0.0,0.0,...,1.0,0.733255,0,0,1,1,1,0,0,0
43,777832,Tampa Bay Rays,Houston Astros,George M. Steinbrenner Field,Día,Taj Bradley,Hunter Brown,0.277778,0.35,1.0,...,1.0,0.802867,1,0,1,1,1,1,1,1
65,777822,Houston Astros,Seattle Mariners,Daikin Park,Noche,Lance McCullers Jr.,George Kirby,1.0,0.315789,-1.0,...,0.0,0.660341,1,0,0,1,1,0,0,0


## threshold


In [43]:
threshold = 0.90
dataframes_filtrados = {}

X = df_2025[features]
y_true = df_2025['target']

for name, model in models_con_proba.items():
    # Obtener ambas probabilidades
    y_proba = model.predict_proba(X)
    y_pred = (y_proba[:, 1] >= threshold).astype(int)
    
    # Crear DataFrame copia con columnas nuevas
    df_result = df_2025.copy()
    df_result[f'proba_0'] = y_proba[:, 0]  # probabilidad clase 0
    df_result[f'proba_1'] = y_proba[:, 1]  # probabilidad clase 1
    df_result[f'pred'] = y_pred

    # Filtrar por threshold sobre clase 1
    # df_filtrado = df_result[(df_result[f'proba_1'] >= threshold) | (df_result[f'proba_0'] >= threshold)]

    df_filtrado = df_result[(df_result[f'proba_1'] >= threshold)]
    # df_filtrado = df_result[(df_result[f'proba_0'] >= threshold)]

    # Columnas a guardar
    columnas_resultado = [
        'game_id', 'home_team', 'away_team', 'target',
        f'proba_0', f'proba_1', f'pred'
    ] + [col for col in features if col in df_filtrado.columns]

    # Guardar DataFrame filtrado
    dataframes_filtrados[name] = df_filtrado[columnas_resultado]

    print(f"{name}: {len(df_filtrado)} partidos con proba clase 1 >= {threshold}")


LogisticRegression: 2 partidos con proba clase 1 >= 0.9
RandomForest: 0 partidos con proba clase 1 >= 0.9
Naive_Bayes: 59 partidos con proba clase 1 >= 0.9
gradient_boosting_ft: 0 partidos con proba clase 1 >= 0.9
Gradient _Boosting: 5 partidos con proba clase 1 >= 0.9
XGBoost: 6 partidos con proba clase 1 >= 0.9


In [44]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Diccionario para guardar resultados
metricas_por_modelo = {}

for nombre_modelo, df in dataframes_filtrados.items():
    # Usar el nombre del modelo para identificar la columna de predicción
    columna_pred = f'pred'
    
    y_true = df['target']
    y_pred = df[columna_pred]

    # Calcular métricas
    metricas_por_modelo[nombre_modelo] = {
        'accuracy': accuracy_score(y_true, y_pred),
        'precision': precision_score(y_true, y_pred, zero_division=0),
        'recall': recall_score(y_true, y_pred, zero_division=0),
        'f1_score': f1_score(y_true, y_pred, zero_division=0),
        'n_samples': len(df)
    }

# Convertir a DataFrame para visualizar
df_metricas = pd.DataFrame(metricas_por_modelo).T
print(df_metricas)


                      accuracy  precision  recall  f1_score  n_samples
LogisticRegression    0.000000   0.000000     0.0  0.000000        2.0
RandomForest               NaN   0.000000     0.0  0.000000        0.0
Naive_Bayes           0.508475   0.508475     1.0  0.674157       59.0
gradient_boosting_ft       NaN   0.000000     0.0  0.000000        0.0
Gradient _Boosting    0.800000   0.800000     1.0  0.888889        5.0
XGBoost               0.333333   0.333333     1.0  0.500000        6.0


  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)


In [45]:
dataframes_filtrados['RandomForest']

Unnamed: 0,game_id,home_team,away_team,target,proba_0,proba_1,pred,home_pitcher_true_freq,away_pitcher_true_freq,home_pitcher_vs_team_freq,...,home_pitcher_last3_freq_1st,away_pitcher_last3_freq_1st,home_pitcher_momentum,away_pitcher_momentum,home_pitcher_vs_away_team_momentum,away_pitcher_vs_home_team_momentum,home_team_inning1_scaled,away_team_inning1_scaled,umpire_inning1_scaled,stadium_inning1_scaled


In [46]:


columnas_a_mostrar = ['home_team', 'away_team', 'target', 'proba_0', 'proba_1', 'pred']

for nombre_modelo, df in dataframes_filtrados.items():
    if nombre_modelo == 'Naive_Bayes':
        continue
    print(f"\n### Resultados del modelo: {nombre_modelo}")
    display(df[columnas_a_mostrar])



### Resultados del modelo: LogisticRegression


Unnamed: 0,home_team,away_team,target,proba_0,proba_1,pred
50,Minnesota Twins,Cleveland Guardians,0,0.044306,0.955694,1
60,Colorado Rockies,Philadelphia Phillies,0,0.004023,0.995977,1



### Resultados del modelo: RandomForest


Unnamed: 0,home_team,away_team,target,proba_0,proba_1,pred



### Resultados del modelo: gradient_boosting_ft


Unnamed: 0,home_team,away_team,target,proba_0,proba_1,pred



### Resultados del modelo: Gradient _Boosting


Unnamed: 0,home_team,away_team,target,proba_0,proba_1,pred
4,Toronto Blue Jays,Detroit Tigers,1,0.018655,0.981345,1
12,Los Angeles Dodgers,Los Angeles Angels,1,0.052666,0.947334,1
33,Toronto Blue Jays,San Diego Padres,1,0.00731,0.99269,1
43,Tampa Bay Rays,Houston Astros,1,0.011907,0.988093,1
53,Toronto Blue Jays,San Diego Padres,0,0.083878,0.916122,1



### Resultados del modelo: XGBoost


Unnamed: 0,home_team,away_team,target,proba_0,proba_1,pred
12,Los Angeles Dodgers,Los Angeles Angels,1,0.004112,0.995888,1
27,Pittsburgh Pirates,Cincinnati Reds,0,0.041849,0.958151,1
43,Tampa Bay Rays,Houston Astros,1,0.074113,0.925887,1
50,Minnesota Twins,Cleveland Guardians,0,0.029254,0.970746,1
53,Toronto Blue Jays,San Diego Padres,0,0.000988,0.999012,1
60,Colorado Rockies,Philadelphia Phillies,0,0.00427,0.99573,1
