In [3]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

In [4]:
# data
df = pd.read_csv('./data/mlb_data.csv')

feature_cols = [
    "home_pitcher_true_freq", "away_pitcher_true_freq",
    "home_pitcher_vs_team_freq", "away_pitcher_vs_team_freq",
    "home_pitcher_vs_team_freq_count", "away_pitcher_vs_team_freq_count",
    "home_pitcher_last3_freq_1st", "away_pitcher_last3_freq_1st",
    "home_pitcher_momentum", "away_pitcher_momentum",
    "home_pitcher_vs_away_team_momentum", "away_pitcher_vs_home_team_momentum",
    "home_team_inning1_scaled", "away_team_inning1_scaled",
    "umpire_inning1_scaled", "stadium_inning1_scaled"
]
df.fillna(0.0, inplace=True)

In [4]:
df

Unnamed: 0,game_id,home_team,away_team,stadium,day_or_night,home_plate_umpire,inning_1_home,inning_1_away,target,home_plate_umpire_inning1_freq,...,home_pitcher_last3_freq_1st,away_pitcher_last3_freq_1st,home_team_momentum,away_team_momentum,home_pitcher_momentum,away_pitcher_momentum,home_pitcher_vs_away_team_momentum,away_pitcher_vs_home_team_momentum,home_pitcher_dominance_vs_away_team,away_pitcher_dominance_vs_home_team
0,747060,Baltimore Orioles,Los Angeles Angels,Oriole Park at Camden Yards,Dia,Adrian Johnson,2,1,1,0.578947,...,0.000000,0.000000,-0.116832,0.261905,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,746737,Cincinnati Reds,Washington Nationals,Great American Ball Park,Dia,Dan Iassogna,0,0,0,0.513514,...,0.000000,0.000000,-0.069231,0.120755,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,745445,San Diego Padres,San Francisco Giants,Petco Park,Dia,Mark Ripperger,0,0,0,0.500000,...,0.000000,0.000000,0.222772,0.052381,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,746165,Los Angeles Dodgers,St. Louis Cardinals,Dodger Stadium,Dia,Alan Porter,2,0,1,0.432432,...,0.000000,0.000000,0.090000,0.140741,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,745116,Tampa Bay Rays,Toronto Blue Jays,Tropicana Field,Dia,Todd Tichenor,1,0,1,0.526316,...,0.000000,0.000000,-0.047706,0.098077,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3207,777921,Chicago Cubs,Miami Marlins,Wrigley Field,Noche,Ron Kulpa,0,0,0,0.222222,...,0.333333,0.000000,-0.174510,-0.145098,0.175439,-0.100000,-0.320537,-0.074510,-0.030341,0.274510
3208,777918,Texas Rangers,Colorado Rockies,Globe Life Field,Noche,Nestor Ceja,4,0,1,0.500000,...,0.333333,0.666667,0.147573,0.107692,0.183333,0.333333,-0.075641,-0.185761,-0.291026,-0.480906
3209,777920,Houston Astros,Kansas City Royals,Daikin Park,Noche,David Rackley,0,0,0,0.390244,...,0.000000,0.000000,-0.147619,-0.128571,0.000000,-0.142857,0.000000,-0.004762,0.000000,0.290476
3210,777914,San Diego Padres,Los Angeles Angels,Petco Park,Noche,John Tumpane,3,0,1,0.487179,...,0.000000,0.666667,0.222772,0.261905,-0.333333,0.500000,0.595238,-0.277228,0.071429,-0.722772


In [5]:
X = df[feature_cols]
y = df['target']
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [6]:
models = {
    'LogisticRegression': LogisticRegression(max_iter=1000),
    'RandomForest': RandomForestClassifier(n_estimators=300,   
    max_depth=None,     
    min_samples_leaf=4, 
    max_features=0.1,   
    bootstrap=True,     
    random_state=42),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(),
    'SVM': SVC(),
    'Naive Bayes': GaussianNB()
}

In [7]:
results = []
best_models = []

for model_name, model in models.items():
    print(f"\nmodel: {model_name}")
    accs, f1s, aucs = [], [], []
    fold_models = []
    fold_aucs = []

    for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        pipe = Pipeline([
            ('scaler', StandardScaler()),
            ('model', model)
        ])
        pipe.fit(X_train, y_train)
        y_pred = pipe.predict(X_val)
        y_proba = pipe.predict_proba(X_val)[:, 1] if hasattr(pipe, "predict_proba") else y_pred

        acc = accuracy_score(y_val, y_pred)
        f1 = f1_score(y_val, y_pred)
        auc = roc_auc_score(y_val, y_proba)

        accs.append(acc)
        f1s.append(f1)
        aucs.append(auc)
        fold_models.append(pipe)
        fold_aucs.append(auc)

        print(f"  Fold {fold+1} - Acc: {acc:.3f} | F1: {f1:.3f} | ROC AUC: {auc:.3f}")

    # Guardar el mejor modelo de los folds según ROC AUC
    best_fold_idx = fold_aucs.index(max(fold_aucs))
    best_models.append({
        'model': model_name,
        'best_pipeline': fold_models[best_fold_idx],
        'best_auc': fold_aucs[best_fold_idx]
    })

    results.append({
        'model': model_name,
        'acc_mean': sum(accs)/len(accs),
        'f1_mean': sum(f1s)/len(f1s),
        'auc_mean': sum(aucs)/len(aucs)
    })



# Encontrar el mejor modelo global según el mejor AUC
best_overall = max(best_models, key=lambda x: x['best_auc'])
print(f"Mejor modelo global: {best_overall['model']} con AUC: {best_overall['best_auc']:.3f}")


model: LogisticRegression
  Fold 1 - Acc: 0.899 | F1: 0.891 | ROC AUC: 0.970
  Fold 2 - Acc: 0.899 | F1: 0.891 | ROC AUC: 0.973
  Fold 3 - Acc: 0.914 | F1: 0.907 | ROC AUC: 0.979
  Fold 4 - Acc: 0.900 | F1: 0.890 | ROC AUC: 0.969
  Fold 5 - Acc: 0.877 | F1: 0.865 | ROC AUC: 0.965

model: RandomForest
  Fold 1 - Acc: 0.890 | F1: 0.879 | ROC AUC: 0.959
  Fold 2 - Acc: 0.900 | F1: 0.894 | ROC AUC: 0.966
  Fold 3 - Acc: 0.928 | F1: 0.923 | ROC AUC: 0.976
  Fold 4 - Acc: 0.916 | F1: 0.908 | ROC AUC: 0.967
  Fold 5 - Acc: 0.897 | F1: 0.889 | ROC AUC: 0.962

model: XGBoost


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


  Fold 1 - Acc: 0.897 | F1: 0.890 | ROC AUC: 0.975
  Fold 2 - Acc: 0.904 | F1: 0.898 | ROC AUC: 0.975


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


  Fold 3 - Acc: 0.922 | F1: 0.918 | ROC AUC: 0.982
  Fold 4 - Acc: 0.919 | F1: 0.911 | ROC AUC: 0.979


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


  Fold 5 - Acc: 0.893 | F1: 0.886 | ROC AUC: 0.974

model: Gradient Boosting
  Fold 1 - Acc: 0.904 | F1: 0.897 | ROC AUC: 0.979
  Fold 2 - Acc: 0.910 | F1: 0.904 | ROC AUC: 0.980
  Fold 3 - Acc: 0.927 | F1: 0.923 | ROC AUC: 0.984
  Fold 4 - Acc: 0.925 | F1: 0.918 | ROC AUC: 0.983
  Fold 5 - Acc: 0.900 | F1: 0.893 | ROC AUC: 0.977

model: SVM
  Fold 1 - Acc: 0.904 | F1: 0.896 | ROC AUC: 0.903
  Fold 2 - Acc: 0.911 | F1: 0.904 | ROC AUC: 0.910
  Fold 3 - Acc: 0.927 | F1: 0.923 | ROC AUC: 0.927
  Fold 4 - Acc: 0.928 | F1: 0.922 | ROC AUC: 0.927
  Fold 5 - Acc: 0.907 | F1: 0.900 | ROC AUC: 0.906

model: Naive Bayes
  Fold 1 - Acc: 0.860 | F1: 0.854 | ROC AUC: 0.956
  Fold 2 - Acc: 0.882 | F1: 0.876 | ROC AUC: 0.963
  Fold 3 - Acc: 0.903 | F1: 0.898 | ROC AUC: 0.975
  Fold 4 - Acc: 0.894 | F1: 0.888 | ROC AUC: 0.961
  Fold 5 - Acc: 0.861 | F1: 0.853 | ROC AUC: 0.954
Mejor modelo global: Gradient Boosting con AUC: 0.984


In [8]:
results_df = pd.DataFrame(results).sort_values(by='auc_mean', ascending=False).reset_index(drop=True)


In [9]:
results_df

Unnamed: 0,model,acc_mean,f1_mean,auc_mean
0,Gradient Boosting,0.913142,0.907039,0.980618
1,XGBoost,0.906916,0.900525,0.977179
2,LogisticRegression,0.897882,0.888972,0.971128
3,RandomForest,0.906296,0.89859,0.966166
4,Naive Bayes,0.880143,0.873846,0.961668
5,SVM,0.915322,0.908722,0.914761


In [11]:
import joblib

# Guardar todos los mejores modelos de cada algoritmo
for bm in best_models:
    filename = f"model/{bm['model']}.pkl"
    joblib.dump(bm['best_pipeline'], filename)
    print(f"Modelo guardado: {filename} con AUC: {bm['best_auc']:.3f}")

Modelo guardado: model/LogisticRegression.pkl con AUC: 0.979
Modelo guardado: model/RandomForest.pkl con AUC: 0.976
Modelo guardado: model/XGBoost.pkl con AUC: 0.982
Modelo guardado: model/Gradient Boosting.pkl con AUC: 0.984
Modelo guardado: model/SVM.pkl con AUC: 0.927
Modelo guardado: model/Naive Bayes.pkl con AUC: 0.975


In [None]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
import numpy as np

# Definir los modelos base
base_models = {
    'LogisticRegression': LogisticRegression(max_iter=1000, random_state=42),
    'RandomForest': RandomForestClassifier(random_state=42),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
    'GradientBoosting': GradientBoostingClassifier(random_state=42),
    'SVM': SVC(probability=True, random_state=42),  # probability=True para predict_proba
    'NaiveBayes': GaussianNB()
}

# Definir los espacios de búsqueda de hiperparámetros
param_grids = {
    'LogisticRegression': {
        'model__C': [0.001, 0.01, 0.1, 1, 10, 100],
        'model__penalty': ['l1', 'l2'],
        'model__solver': ['liblinear', 'saga']
    },
    'RandomForest': {
        'model__n_estimators': [100, 200, 300, 500],
        'model__max_depth': [10, 20, 30, None],
        'model__min_samples_split': [2, 5, 10],
        'model__min_samples_leaf': [1, 2, 4],
        'model__max_features': ['sqrt', 'log2', 0.1, 0.2]
    },
    'XGBoost': {
        'model__n_estimators': [100, 200, 300],
        'model__max_depth': [3, 4, 5, 6],
        'model__learning_rate': [0.01, 0.1, 0.2],
        'model__subsample': [0.8, 0.9, 1.0],
        'model__colsample_bytree': [0.8, 0.9, 1.0]
    },
    'GradientBoosting': {
        'model__n_estimators': [100, 200, 300],
        'model__max_depth': [3, 4, 5, 6],
        'model__learning_rate': [0.01, 0.1, 0.2],
        'model__subsample': [0.8, 0.9, 1.0]
    },
    'SVM': {
        'model__C': [0.1, 1, 10, 100],
        'model__kernel': ['linear', 'rbf', 'poly'],
        'model__gamma': ['scale', 'auto', 0.001, 0.01, 0.1]
    },
    'NaiveBayes': {
        'model__var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6]
    }
}

# Función para optimizar hiperparámetros
def optimize_model(model_name, base_model, param_grid, X, y, cv=5, scoring='roc_auc', search_type='grid'):
    """
    Optimiza hiperparámetros usando GridSearchCV o RandomizedSearchCV
    
    Parameters:
    - model_name: nombre del modelo
    - base_model: modelo base de sklearn
    - param_grid: diccionario con parámetros a optimizar
    - X, y: datos de entrenamiento
    - cv: número de folds para cross-validation
    - scoring: métrica de evaluación
    - search_type: 'grid' para GridSearchCV, 'random' para RandomizedSearchCV
    """
    
    # Crear pipeline
    pipe = Pipeline([
        ('scaler', StandardScaler()),
        ('model', base_model)
    ])
    
    print(f"\nOptimizando {model_name}...")
    
    # Elegir tipo de búsqueda
    if search_type == 'grid':
        search = GridSearchCV(
            pipe, 
            param_grid, 
            cv=cv, 
            scoring=scoring, 
            n_jobs=-1, 
            verbose=1
        )
    else:  # random search
        search = RandomizedSearchCV(
            pipe, 
            param_grid, 
            cv=cv, 
            scoring=scoring, 
            n_jobs=-1, 
            verbose=1,
            n_iter=50,  # número de combinaciones aleatorias a probar
            random_state=42
        )
    
    # Ajustar el modelo
    search.fit(X, y)
    
    return search

# Función principal para entrenar y evaluar modelos optimizados
def train_optimized_models(X, y, kf, search_type='random'):
    """
    Entrena modelos con hiperparámetros optimizados usando cross-validation
    """
    results = []
    best_models = []
    optimized_models = {}
    
    # Primero optimizar hiperparámetros para cada modelo
    print("=== FASE 1: OPTIMIZACIÓN DE HIPERPARÁMETROS ===")
    for model_name in base_models.keys():
        if model_name in param_grids:
            optimized_search = optimize_model(
                model_name, 
                base_models[model_name], 
                param_grids[model_name], 
                X, y, 
                search_type=search_type
            )
            optimized_models[model_name] = optimized_search.best_estimator_
            print(f"{model_name} - Mejor score: {optimized_search.best_score_:.4f}")
            print(f"Mejores parámetros: {optimized_search.best_params_}")
        else:
            # Si no hay grid definido, usar modelo base
            pipe = Pipeline([
                ('scaler', StandardScaler()),
                ('model', base_models[model_name])
            ])
            optimized_models[model_name] = pipe
    
    print("\n=== FASE 2: EVALUACIÓN CON CROSS-VALIDATION ===")
    
    # Evaluar modelos optimizados con cross-validation manual
    for model_name, optimized_model in optimized_models.items():
        print(f"\nEvaluando modelo optimizado: {model_name}")
        accs, f1s, aucs = [], [], []
        fold_models = []
        fold_aucs = []
        
        for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
            X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
            y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
            
            # Crear una nueva instancia del modelo optimizado
            model_copy = Pipeline([
                ('scaler', StandardScaler()),
                ('model', optimized_model.named_steps['model'])
            ])
            
            model_copy.fit(X_train, y_train)
            y_pred = model_copy.predict(X_val)
            
            # Obtener probabilidades
            if hasattr(model_copy, "predict_proba"):
                y_proba = model_copy.predict_proba(X_val)[:, 1]
            else:
                y_proba = y_pred
            
            acc = accuracy_score(y_val, y_pred)
            f1 = f1_score(y_val, y_pred)
            auc = roc_auc_score(y_val, y_proba)
            
            accs.append(acc)
            f1s.append(f1)
            aucs.append(auc)
            fold_models.append(model_copy)
            fold_aucs.append(auc)
            
            print(f"  Fold {fold+1} - Acc: {acc:.3f} | F1: {f1:.3f} | ROC AUC: {auc:.3f}")
        
        # Guardar el mejor modelo de los folds según ROC AUC
        best_fold_idx = fold_aucs.index(max(fold_aucs))
        best_models.append({
            'model': model_name,
            'best_pipeline': fold_models[best_fold_idx],
            'best_auc': fold_aucs[best_fold_idx]
        })
        
        results.append({
            'model': model_name,
            'acc_mean': np.mean(accs),
            'acc_std': np.std(accs),
            'f1_mean': np.mean(f1s),
            'f1_std': np.std(f1s),
            'auc_mean': np.mean(aucs),
            'auc_std': np.std(aucs)
        })
        
        print(f"  Promedios - Acc: {np.mean(accs):.3f}±{np.std(accs):.3f} | "
              f"F1: {np.mean(f1s):.3f}±{np.std(f1s):.3f} | "
              f"AUC: {np.mean(aucs):.3f}±{np.std(aucs):.3f}")
    
    return results, best_models, optimized_models

# Función para mostrar resultados
def show_results(results, best_models):
    """Muestra un resumen de los resultados"""
    print("\n" + "="*80)
    print("RESUMEN DE RESULTADOS")
    print("="*80)
    
    # Ordenar por AUC promedio
    results_sorted = sorted(results, key=lambda x: x['auc_mean'], reverse=True)
    
    print(f"{'Modelo':<20} {'Accuracy':<15} {'F1-Score':<15} {'ROC AUC':<15}")
    print("-" * 65)
    
    for result in results_sorted:
        print(f"{result['model']:<20} "
              f"{result['acc_mean']:.3f}±{result['acc_std']:.3f}    "
              f"{result['f1_mean']:.3f}±{result['f1_std']:.3f}    "
              f"{result['auc_mean']:.3f}±{result['auc_std']:.3f}")
    
    # Encontrar el mejor modelo global
    best_overall = max(best_models, key=lambda x: x['best_auc'])
    print(f"\n🏆 MEJOR MODELO GLOBAL: {best_overall['model']}")
    print(f"   ROC AUC: {best_overall['best_auc']:.4f}")
    
    return best_overall

# EJEMPLO DE USO:


"\n# Supongamos que tienes tus datos X, y y tu KFold definido\nfrom sklearn.model_selection import KFold\n\n# kf = KFold(n_splits=5, shuffle=True, random_state=42)\n\n# Ejecutar optimización (puedes elegir 'grid' o 'random')\nresults, best_models, optimized_models = train_optimized_models(X, y, kf, search_type='random')\n\n# Mostrar resultados\nbest_model = show_results(results, best_models)\n\n# El mejor modelo estará en best_model['best_pipeline']\n# Los modelos optimizados (sin entrenar) están en optimized_models\n"

In [6]:
results, best_models, optimized_models = train_optimized_models(
    X, y, kf, 
    search_type='random'  # o 'grid' para búsqueda exhaustiva
)

# 2. Ver resultados
best_model = show_results(results, best_models)

# 3. Usar el mejor modelo
final_model = best_model['best_pipeline']

=== FASE 1: OPTIMIZACIÓN DE HIPERPARÁMETROS ===

Optimizando LogisticRegression...
Fitting 5 folds for each of 24 candidates, totalling 120 fits




LogisticRegression - Mejor score: 0.9712
Mejores parámetros: {'model__solver': 'liblinear', 'model__penalty': 'l2', 'model__C': 0.1}

Optimizando RandomForest...
Fitting 5 folds for each of 50 candidates, totalling 250 fits
RandomForest - Mejor score: 0.9795
Mejores parámetros: {'model__n_estimators': 300, 'model__min_samples_split': 10, 'model__min_samples_leaf': 4, 'model__max_features': 'log2', 'model__max_depth': 10}

Optimizando XGBoost...
Fitting 5 folds for each of 50 candidates, totalling 250 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


XGBoost - Mejor score: 0.9802
Mejores parámetros: {'model__subsample': 0.9, 'model__n_estimators': 300, 'model__max_depth': 5, 'model__learning_rate': 0.01, 'model__colsample_bytree': 0.8}

Optimizando GradientBoosting...
Fitting 5 folds for each of 50 candidates, totalling 250 fits
GradientBoosting - Mejor score: 0.9793
Mejores parámetros: {'model__subsample': 0.8, 'model__n_estimators': 300, 'model__max_depth': 4, 'model__learning_rate': 0.01}

Optimizando SVM...
Fitting 5 folds for each of 50 candidates, totalling 250 fits
SVM - Mejor score: 0.9773
Mejores parámetros: {'model__kernel': 'rbf', 'model__gamma': 0.01, 'model__C': 10}

Optimizando NaiveBayes...
Fitting 5 folds for each of 4 candidates, totalling 20 fits




NaiveBayes - Mejor score: 0.9618
Mejores parámetros: {'model__var_smoothing': 1e-09}

=== FASE 2: EVALUACIÓN CON CROSS-VALIDATION ===

Evaluando modelo optimizado: LogisticRegression
  Fold 1 - Acc: 0.900 | F1: 0.893 | ROC AUC: 0.970
  Fold 2 - Acc: 0.900 | F1: 0.892 | ROC AUC: 0.974
  Fold 3 - Acc: 0.911 | F1: 0.904 | ROC AUC: 0.979
  Fold 4 - Acc: 0.900 | F1: 0.890 | ROC AUC: 0.969
  Fold 5 - Acc: 0.879 | F1: 0.866 | ROC AUC: 0.965
  Promedios - Acc: 0.898±0.011 | F1: 0.889±0.012 | AUC: 0.971±0.005

Evaluando modelo optimizado: RandomForest
  Fold 1 - Acc: 0.911 | F1: 0.905 | ROC AUC: 0.978
  Fold 2 - Acc: 0.911 | F1: 0.904 | ROC AUC: 0.978
  Fold 3 - Acc: 0.930 | F1: 0.925 | ROC AUC: 0.986
  Fold 4 - Acc: 0.925 | F1: 0.919 | ROC AUC: 0.983
  Fold 5 - Acc: 0.905 | F1: 0.898 | ROC AUC: 0.978
  Promedios - Acc: 0.917±0.009 | F1: 0.910±0.010 | AUC: 0.981±0.003

Evaluando modelo optimizado: XGBoost


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


  Fold 1 - Acc: 0.910 | F1: 0.904 | ROC AUC: 0.978


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


  Fold 2 - Acc: 0.914 | F1: 0.909 | ROC AUC: 0.979


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


  Fold 3 - Acc: 0.928 | F1: 0.924 | ROC AUC: 0.987


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


  Fold 4 - Acc: 0.922 | F1: 0.915 | ROC AUC: 0.984


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


  Fold 5 - Acc: 0.902 | F1: 0.895 | ROC AUC: 0.979
  Promedios - Acc: 0.915±0.009 | F1: 0.909±0.010 | AUC: 0.981±0.003

Evaluando modelo optimizado: GradientBoosting
  Fold 1 - Acc: 0.913 | F1: 0.907 | ROC AUC: 0.980
  Fold 2 - Acc: 0.900 | F1: 0.893 | ROC AUC: 0.978
  Fold 3 - Acc: 0.925 | F1: 0.921 | ROC AUC: 0.985
  Fold 4 - Acc: 0.928 | F1: 0.922 | ROC AUC: 0.985
  Fold 5 - Acc: 0.907 | F1: 0.900 | ROC AUC: 0.977
  Promedios - Acc: 0.915±0.011 | F1: 0.909±0.011 | AUC: 0.981±0.003

Evaluando modelo optimizado: SVM
  Fold 1 - Acc: 0.899 | F1: 0.891 | ROC AUC: 0.977
  Fold 2 - Acc: 0.902 | F1: 0.894 | ROC AUC: 0.979
  Fold 3 - Acc: 0.933 | F1: 0.928 | ROC AUC: 0.984
  Fold 4 - Acc: 0.925 | F1: 0.917 | ROC AUC: 0.979
  Fold 5 - Acc: 0.899 | F1: 0.889 | ROC AUC: 0.972
  Promedios - Acc: 0.912±0.015 | F1: 0.904±0.016 | AUC: 0.978±0.004

Evaluando modelo optimizado: NaiveBayes
  Fold 1 - Acc: 0.860 | F1: 0.854 | ROC AUC: 0.956
  Fold 2 - Acc: 0.882 | F1: 0.876 | ROC AUC: 0.963
  Fold 3 - 

In [9]:
final_model

In [11]:
import joblib


filename = f"model/gradient_boosting_ft.pkl"
joblib.dump(final_model, filename)
print(f"Modelo guardado: {filename}")

Modelo guardado: model/gradient_boosting_ft.pkl
