In [2]:
import pandas as pd

In [3]:
df_2025 = pd.read_csv('./data/2025/data_2025_test.csv')

In [4]:
features = [
    "home_pitcher_true_freq", "away_pitcher_true_freq",
    "home_pitcher_vs_team_freq", "away_pitcher_vs_team_freq",
    "home_pitcher_vs_team_freq_count", "away_pitcher_vs_team_freq_count",
    "home_pitcher_last3_freq_1st", "away_pitcher_last3_freq_1st",
    "home_pitcher_momentum", "away_pitcher_momentum",
    "home_pitcher_vs_away_team_momentum", "away_pitcher_vs_home_team_momentum",
    "home_team_inning1_scaled", "away_team_inning1_scaled",
    "umpire_inning1_scaled", "stadium_inning1_scaled"
]

In [5]:
import requests
def inning_run_1(game_id):
    url = f"https://statsapi.mlb.com/api/v1/game/{game_id}/linescore"
    try:
        response = requests.get(url)
        data = response.json()

        inning_scores = data.get("innings", [])
        if len(inning_scores) >= 1:
            home_runs = inning_scores[0]['home']['runs']
            away_runs = inning_scores[0]['away']['runs']
            return 1 if (home_runs > 0 or away_runs > 0) else 0
        return 0
    except Exception as e:
        print(f"Error con gamePk {game_id}: {e}")
        return 0


In [6]:
df_2025['target']=df_2025['game_id'].apply(inning_run_1)

In [7]:
df_2025

Unnamed: 0,game_id,home_team,away_team,stadium,day_or_night,home_pitcher,away_pitcher,home_pitcher_true_freq,away_pitcher_true_freq,home_pitcher_vs_team_freq,...,away_team_momentum,home_pitcher_momentum,away_pitcher_momentum,home_pitcher_vs_away_team_momentum,away_pitcher_vs_home_team_momentum,home_plate_umpire_inning1_freq,stadium_inning1_freq,stadium_inning1_scaled,umpire_inning1_scaled,target
0,777869,Texas Rangers,Houston Astros,Globe Life Field,Día,Jack Leiter,Framber Valdez,0.428571,0.222222,-1.00,...,-0.522200,0.238095,-0.222222,-0.760295,0.006546,0.562500,0.423077,0.191763,0.778846,0
1,777873,Baltimore Orioles,Washington Nationals,Oriole Park at Camden Yards,Día,Zach Eflin,Michael Soroka,0.083333,0.125000,0.00,...,-0.137184,0.250000,-0.125000,-0.387184,-0.447760,0.722222,0.504950,0.699534,1.000000,1
2,777880,Philadelphia Phillies,Pittsburgh Pirates,Citizens Bank Park,Día,Mick Abel,Paul Skenes,0.000000,0.100000,-1.00,...,-0.345591,0.000000,-0.100000,-0.345591,-0.561538,0.470588,0.504854,0.698937,0.651584,0
3,777876,Boston Red Sox,Atlanta Braves,Fenway Park,Día,Brayan Bello,Spencer Schwellenbach,0.352941,0.076923,-1.00,...,-0.429222,0.313725,0.589744,-0.742948,-1.357473,0.600000,0.519231,0.788098,0.830769,1
4,777871,Toronto Blue Jays,Detroit Tigers,Rogers Centre,Día,José Berríos,Jackson Jobe,0.095238,0.500000,1.00,...,-0.313300,-0.095238,0.166667,-0.218061,-0.607939,0.000000,0.490196,0.608028,0.000000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61,777812,Athletics,Los Angeles Angels,Sutter Health Park,Día,Luis Severino,Tyler Anderson,0.181818,0.166667,-1.00,...,-0.080720,-0.181818,0.500000,0.101098,-0.913008,0.600000,0.523810,0.795367,0.879104,1
62,777821,Detroit Tigers,Cleveland Guardians,Comerica Park,Noche,Jack Flaherty,Tanner Bibee,0.125000,0.368421,0.25,...,-0.339400,0.208333,-0.368421,-0.547733,0.078131,0.370370,0.504762,0.715830,0.000000,0
63,777814,Pittsburgh Pirates,Milwaukee Brewers,PNC Park,Noche,Mike Burrows,Aaron Civale,0.000000,0.466667,-1.00,...,-0.409756,0.000000,0.200000,-0.409756,-0.756660,0.542857,0.451923,0.495192,0.660341,1
64,777820,Washington Nationals,Atlanta Braves,Nationals Park,Noche,Trevor Williams,AJ Smith-Shawver,0.222222,0.400000,0.00,...,-0.429222,0.111111,-0.400000,-0.540333,0.019887,0.371429,0.403846,0.294439,0.004051,1


In [None]:
# Models

import os
import joblib

path_model = './model'  # Carpeta donde están los .pkl
models = {}

for file_ in os.listdir(path_model):
    if file_.endswith('.pkl'):
        
        name = file_.replace('.pkl', '')
        models[name] = joblib.load(os.path.join(path_model, file_))


In [None]:
result = {}

for name_model, model in models.items():
    pred = model.predict(df_2025[features])
    result[name_model] = pred

In [10]:
for model_name, pred in result.items():
    df_2025[f'pred_{model_name}'] = pred

In [11]:
df_2025.columns

Index(['game_id', 'home_team', 'away_team', 'stadium', 'day_or_night',
       'home_pitcher', 'away_pitcher', 'home_pitcher_true_freq',
       'away_pitcher_true_freq', 'home_pitcher_vs_team_freq',
       'away_pitcher_vs_team_freq', 'home_pitcher_vs_team_freq_count',
       'away_pitcher_vs_team_freq_count', 'home_pitcher_last3_freq_1st',
       'away_pitcher_last3_freq_1st', 'home_team_inning1_last10_freq',
       'away_team_inning1_last10_freq', 'home_team_inning1_scaled',
       'away_team_inning1_scaled', 'home_team_momentum', 'away_team_momentum',
       'home_pitcher_momentum', 'away_pitcher_momentum',
       'home_pitcher_vs_away_team_momentum',
       'away_pitcher_vs_home_team_momentum', 'home_plate_umpire_inning1_freq',
       'stadium_inning1_freq', 'stadium_inning1_scaled',
       'umpire_inning1_scaled', 'target', 'pred_LogisticRegression',
       'pred_RandomForest', 'pred_Naive_Bayes', 'pred_SVM',
       'pred_Gradient _Boosting', 'pred_XGBoost'],
      dtype='object')

In [12]:
model_pred = [
  "pred_LogisticRegression",	"pred_RandomForest",	"pred_Gradient _Boosting",	"pred_SVM",	"pred_Naive_Bayes",	"pred_XGBoost"
]

In [24]:
# Filtrar modelos que tengan predict_proba
models_con_proba = {
    name: model
    for name, model in models.items()
    if hasattr(model, 'predict_proba')
}

In [25]:
result_pro = {}

for name_model, model in models_con_proba.items():
    pred = model.predict_proba(df_2025[features])
    result_pro[name_model] = pred

In [26]:
result_pro

{'LogisticRegression': array([[9.99941862e-01, 5.81383551e-05],
        [9.99842235e-01, 1.57765420e-04],
        [9.99999944e-01, 5.62459033e-08],
        [9.86345116e-01, 1.36548838e-02],
        [9.85566413e-01, 1.44335866e-02],
        [9.99968664e-01, 3.13355445e-05],
        [9.99999293e-01, 7.06630399e-07],
        [9.99996424e-01, 3.57611601e-06],
        [9.99782141e-01, 2.17858787e-04],
        [9.99999991e-01, 9.40675961e-09],
        [9.99999937e-01, 6.33722413e-08],
        [9.99715069e-01, 2.84930920e-04],
        [7.68988302e-01, 2.31011698e-01],
        [9.72749598e-01, 2.72504017e-02],
        [9.92466195e-01, 7.53380543e-03],
        [2.60229897e-01, 7.39770103e-01],
        [6.35856431e-01, 3.64143569e-01],
        [9.99999989e-01, 1.12428744e-08],
        [9.99998558e-01, 1.44211792e-06],
        [9.99944230e-01, 5.57703603e-05],
        [9.99998087e-01, 1.91279047e-06],
        [9.99644365e-01, 3.55634755e-04],
        [9.99999992e-01, 8.43099262e-09],
        [3.7

In [13]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

In [19]:
from sklearn.metrics import r2_score

In [None]:
metrics = {}
for modelo in model_pred:
    y_true = df_2025['target']
    y_pred = df_2025[modelo]
    
    metrics[modelo] = {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1_score': f1_score(y_true, y_pred),
        'precision': precision_score(y_true, y_pred),
        'recall': recall_score(y_true, y_pred)
        

    }

In [30]:
df_2025.head(15)

Unnamed: 0,game_id,home_team,away_team,stadium,day_or_night,home_pitcher,away_pitcher,home_pitcher_true_freq,away_pitcher_true_freq,home_pitcher_vs_team_freq,...,stadium_inning1_freq,stadium_inning1_scaled,umpire_inning1_scaled,target,pred_LogisticRegression,pred_RandomForest,pred_Naive_Bayes,pred_SVM,pred_Gradient _Boosting,pred_XGBoost
0,777869,Texas Rangers,Houston Astros,Globe Life Field,Día,Jack Leiter,Framber Valdez,0.428571,0.222222,-1.0,...,0.423077,0.191763,0.778846,0,0,0,1,0,0,0
1,777873,Baltimore Orioles,Washington Nationals,Oriole Park at Camden Yards,Día,Zach Eflin,Michael Soroka,0.083333,0.125,0.0,...,0.50495,0.699534,1.0,1,0,0,1,0,0,0
2,777880,Philadelphia Phillies,Pittsburgh Pirates,Citizens Bank Park,Día,Mick Abel,Paul Skenes,0.0,0.1,-1.0,...,0.504854,0.698937,0.651584,0,0,0,1,0,0,0
3,777876,Boston Red Sox,Atlanta Braves,Fenway Park,Día,Brayan Bello,Spencer Schwellenbach,0.352941,0.076923,-1.0,...,0.519231,0.788098,0.830769,1,0,1,1,1,0,0
4,777871,Toronto Blue Jays,Detroit Tigers,Rogers Centre,Día,José Berríos,Jackson Jobe,0.095238,0.5,1.0,...,0.490196,0.608028,0.0,1,0,1,1,0,1,0
5,777874,Cincinnati Reds,Cleveland Guardians,Great American Ball Park,Día,Andrew Abbott,Luis L. Ortiz,0.1875,0.363636,-1.0,...,0.442308,0.31103,0.674556,0,0,0,1,0,0,0
6,777877,Miami Marlins,Tampa Bay Rays,loanDepot park,Día,Cal Quantrill,Shane Baz,0.235294,0.272727,-1.0,...,0.553398,1.0,0.580645,0,0,0,1,1,0,0
7,777864,Kansas City Royals,St. Louis Cardinals,Kauffman Stadium,Día,Michael Wacha,Matthew Liberatore,0.157895,0.285714,0.0,...,0.447619,0.34397,0.574109,0,0,0,1,0,0,0
8,777865,Milwaukee Brewers,Minnesota Twins,American Family Field,Día,Freddy Peralta,Zebby Matthews,0.2,0.142857,0.0,...,0.5,0.668831,0.709193,0,0,0,1,0,0,0
9,777872,Chicago Cubs,Chicago White Sox,Wrigley Field,Día,Colin Rea,Jonathan Cannon,0.421053,0.352941,-1.0,...,0.392157,0.0,0.807692,1,0,0,1,1,0,0


In [16]:
df_metrics  

Unnamed: 0,accuracy,f1_score,precision,recall
pred_LogisticRegression,0.454545,0.142857,0.333333,0.090909
pred_RandomForest,0.560606,0.472727,0.590909,0.393939
pred_Gradient _Boosting,0.530303,0.311111,0.583333,0.212121
pred_SVM,0.545455,0.482759,0.56,0.424242
pred_Naive_Bayes,0.5,0.652632,0.5,0.939394
pred_XGBoost,0.515152,0.238095,0.555556,0.151515


## threshold


In [73]:
threshold = 0.70
dataframes_filtrados = {}

X = df_2025[features]
y_true = df_2025['target']

for name, model in models_con_proba.items():
    # Obtener ambas probabilidades
    y_proba = model.predict_proba(X)
    y_pred = (y_proba[:, 1] >= threshold).astype(int)
    
    # Crear DataFrame copia con columnas nuevas
    df_result = df_2025.copy()
    df_result[f'proba_0'] = y_proba[:, 0]  # probabilidad clase 0
    df_result[f'proba_1'] = y_proba[:, 1]  # probabilidad clase 1
    df_result[f'pred'] = y_pred

    # Filtrar por threshold sobre clase 1
    df_filtrado = df_result[df_result[f'proba_1'] >= threshold]

    # Columnas a guardar
    columnas_resultado = [
        'game_id', 'home_team', 'away_team', 'target',
        f'proba_0', f'proba_1', f'pred'
    ] + [col for col in features if col in df_filtrado.columns]

    # Guardar DataFrame filtrado
    dataframes_filtrados[name] = df_filtrado[columnas_resultado]

    print(f"{name}: {len(df_filtrado)} partidos con proba clase 1 >= {threshold}")


LogisticRegression: 5 partidos con proba clase 1 >= 0.7
RandomForest: 5 partidos con proba clase 1 >= 0.7
Naive_Bayes: 60 partidos con proba clase 1 >= 0.7
Gradient _Boosting: 9 partidos con proba clase 1 >= 0.7
XGBoost: 9 partidos con proba clase 1 >= 0.7


In [74]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Diccionario para guardar resultados
metricas_por_modelo = {}

for nombre_modelo, df in dataframes_filtrados.items():
    # Usar el nombre del modelo para identificar la columna de predicción
    columna_pred = f'pred'
    
    y_true = df['target']
    y_pred = df[columna_pred]

    # Calcular métricas
    metricas_por_modelo[nombre_modelo] = {
        'accuracy': accuracy_score(y_true, y_pred),
        'precision': precision_score(y_true, y_pred, zero_division=0),
        'recall': recall_score(y_true, y_pred, zero_division=0),
        'f1_score': f1_score(y_true, y_pred, zero_division=0),
        'n_samples': len(df)
    }

# Convertir a DataFrame para visualizar
df_metricas = pd.DataFrame(metricas_por_modelo).T
print(df_metricas)


                    accuracy  precision  recall  f1_score  n_samples
LogisticRegression  0.400000   0.400000     1.0  0.571429        5.0
RandomForest        0.800000   0.800000     1.0  0.888889        5.0
Naive_Bayes         0.500000   0.500000     1.0  0.666667       60.0
Gradient _Boosting  0.666667   0.666667     1.0  0.800000        9.0
XGBoost             0.555556   0.555556     1.0  0.714286        9.0


In [75]:
dataframes_filtrados['RandomForest']

Unnamed: 0,game_id,home_team,away_team,target,proba_0,proba_1,pred,home_pitcher_true_freq,away_pitcher_true_freq,home_pitcher_vs_team_freq,...,home_pitcher_last3_freq_1st,away_pitcher_last3_freq_1st,home_pitcher_momentum,away_pitcher_momentum,home_pitcher_vs_away_team_momentum,away_pitcher_vs_home_team_momentum,home_team_inning1_scaled,away_team_inning1_scaled,umpire_inning1_scaled,stadium_inning1_scaled
15,777859,Pittsburgh Pirates,Cincinnati Reds,1,0.243029,0.756971,1,0.25,0.266667,0.333333,...,0.666667,0.666667,0.416667,0.4,-0.814918,-0.95666,0.65666,0.598251,0.772973,0.280847
33,777845,Toronto Blue Jays,San Diego Padres,1,0.199731,0.800269,1,0.318182,0.434783,1.0,...,0.333333,0.666667,0.015152,0.231884,-0.32816,-0.673156,0.741272,0.813008,0.847826,0.425002
42,777834,Pittsburgh Pirates,Cincinnati Reds,1,0.252703,0.747297,1,0.263158,0.25,0.0,...,0.333333,0.666667,0.070175,0.416667,-0.468427,-0.973327,0.65666,0.598251,0.682927,0.280847
43,777832,Tampa Bay Rays,Houston Astros,1,0.221745,0.778255,1,0.277778,0.35,1.0,...,0.333333,0.333333,0.055556,-0.016667,-0.577756,-0.387495,0.604162,0.6222,0.802867,1.0
60,777817,Colorado Rockies,Philadelphia Phillies,0,0.199036,0.800964,1,0.333333,0.25,0.0,...,0.666667,0.333333,0.333333,0.083333,-0.846341,-0.526655,0.743322,0.813008,1.0,1.0


In [84]:
from IPython.display import display

columnas_a_mostrar = ['home_team', 'away_team', 'target', 'proba_0', 'proba_1', 'pred']

for nombre_modelo, df in dataframes_filtrados.items():
    print(f"\n### Resultados del modelo: {nombre_modelo}")
    display(df[columnas_a_mostrar])



### Resultados del modelo: LogisticRegression


Unnamed: 0,home_team,away_team,target,proba_0,proba_1,pred
15,Pittsburgh Pirates,Cincinnati Reds,1,0.26023,0.73977,1
27,Pittsburgh Pirates,Cincinnati Reds,0,0.17639,0.82361,1
42,Pittsburgh Pirates,Cincinnati Reds,1,0.219327,0.780673,1
50,Minnesota Twins,Cleveland Guardians,0,0.044306,0.955694,1
60,Colorado Rockies,Philadelphia Phillies,0,0.004023,0.995977,1



### Resultados del modelo: RandomForest


Unnamed: 0,home_team,away_team,target,proba_0,proba_1,pred
15,Pittsburgh Pirates,Cincinnati Reds,1,0.243029,0.756971,1
33,Toronto Blue Jays,San Diego Padres,1,0.199731,0.800269,1
42,Pittsburgh Pirates,Cincinnati Reds,1,0.252703,0.747297,1
43,Tampa Bay Rays,Houston Astros,1,0.221745,0.778255,1
60,Colorado Rockies,Philadelphia Phillies,0,0.199036,0.800964,1



### Resultados del modelo: Naive_Bayes


Unnamed: 0,home_team,away_team,target,proba_0,proba_1,pred
0,Texas Rangers,Houston Astros,0,3.623108e-06,0.999996,1
1,Baltimore Orioles,Washington Nationals,1,3.544737e-07,1.0,1
2,Philadelphia Phillies,Pittsburgh Pirates,0,5.666861e-12,1.0,1
3,Boston Red Sox,Atlanta Braves,1,1.103305e-13,1.0,1
4,Toronto Blue Jays,Detroit Tigers,1,3.85435e-12,1.0,1
5,Cincinnati Reds,Cleveland Guardians,0,0.0004199639,0.99958,1
6,Miami Marlins,Tampa Bay Rays,0,1.602406e-13,1.0,1
7,Kansas City Royals,St. Louis Cardinals,0,4.732749e-05,0.999953,1
8,Milwaukee Brewers,Minnesota Twins,0,6.604973e-08,1.0,1
9,Chicago Cubs,Chicago White Sox,1,1.069351e-11,1.0,1



### Resultados del modelo: Gradient _Boosting


Unnamed: 0,home_team,away_team,target,proba_0,proba_1,pred
4,Toronto Blue Jays,Detroit Tigers,1,0.018655,0.981345,1
12,Los Angeles Dodgers,Los Angeles Angels,1,0.052666,0.947334,1
15,Pittsburgh Pirates,Cincinnati Reds,1,0.113857,0.886143,1
33,Toronto Blue Jays,San Diego Padres,1,0.00731,0.99269,1
42,Pittsburgh Pirates,Cincinnati Reds,1,0.112145,0.887855,1
43,Tampa Bay Rays,Houston Astros,1,0.011907,0.988093,1
50,Minnesota Twins,Cleveland Guardians,0,0.173786,0.826214,1
53,Toronto Blue Jays,San Diego Padres,0,0.083878,0.916122,1
60,Colorado Rockies,Philadelphia Phillies,0,0.134614,0.865386,1



### Resultados del modelo: XGBoost


Unnamed: 0,home_team,away_team,target,proba_0,proba_1,pred
12,Los Angeles Dodgers,Los Angeles Angels,1,0.004112,0.995888,1
15,Pittsburgh Pirates,Cincinnati Reds,1,0.202005,0.797995,1
27,Pittsburgh Pirates,Cincinnati Reds,0,0.041849,0.958151,1
33,Toronto Blue Jays,San Diego Padres,1,0.204658,0.795342,1
42,Pittsburgh Pirates,Cincinnati Reds,1,0.158113,0.841887,1
43,Tampa Bay Rays,Houston Astros,1,0.074113,0.925887,1
50,Minnesota Twins,Cleveland Guardians,0,0.029254,0.970746,1
53,Toronto Blue Jays,San Diego Padres,0,0.000988,0.999012,1
60,Colorado Rockies,Philadelphia Phillies,0,0.00427,0.99573,1
