## 1. Cargar y Explorar Dataset EPL

In [2]:
import sys
sys.path.insert(0, '../src')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Configurar estilos
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (14, 6)

print('‚úÖ Librer√≠as importadas correctamente')

‚úÖ Librer√≠as importadas correctamente


In [3]:
# Cargar el dataset EPL
data_path = Path('../data/raw/epl_final.csv')

if data_path.exists():
    df = pd.read_csv(data_path)
    print(f'‚úÖ Dataset cargado: {df.shape[0]} filas √ó {df.shape[1]} columnas')
    print(f'\nüìÖ Rango temporal: {df.iloc[:, 0] if "Date" in str(df.columns) else "Verificar columnas"}')
else:
    print(f'‚ùå Archivo no encontrado en {data_path}')
    print(f'   Descarga epl_final.csv desde Kaggle y col√≥calo en: data/raw/')
    df = None

‚úÖ Dataset cargado: 9380 filas √ó 22 columnas

üìÖ Rango temporal: 0       2000/01
1       2000/01
2       2000/01
3       2000/01
4       2000/01
         ...   
9375    2024/25
9376    2024/25
9377    2024/25
9378    2024/25
9379    2024/25
Name: Season, Length: 9380, dtype: object


In [5]:
# Inspeccionar estructura del dataset
if df is not None:
    print('\n' + '='*70)
    print('ESTRUCTURA DEL DATASET')
    print('='*70)
    
    print(f'\nüìã Columnas ({len(df.columns)}):')  
    for i, col in enumerate(df.columns, 1):
        dtype = df[col].dtype
        nulls = df[col].isnull().sum()
        print(f'   {i:2d}. {col:<25} {str(dtype):<15} (nulls: {nulls:4d})')
    
    print(f'\nüìä Primeras 5 filas:')
    display(df.head())
    
    print(f'\nüìä Info del Dataset:')
    print(df.info())


ESTRUCTURA DEL DATASET

üìã Columnas (22):
    1. Season                    object          (nulls:    0)
    2. MatchDate                 object          (nulls:    0)
    3. HomeTeam                  object          (nulls:    0)
    4. AwayTeam                  object          (nulls:    0)
    5. FullTimeHomeGoals         int64           (nulls:    0)
    6. FullTimeAwayGoals         int64           (nulls:    0)
    7. FullTimeResult            object          (nulls:    0)
    8. HalfTimeHomeGoals         int64           (nulls:    0)
    9. HalfTimeAwayGoals         int64           (nulls:    0)
   10. HalfTimeResult            object          (nulls:    0)
   11. HomeShots                 int64           (nulls:    0)
   12. AwayShots                 int64           (nulls:    0)
   13. HomeShotsOnTarget         int64           (nulls:    0)
   14. AwayShotsOnTarget         int64           (nulls:    0)
   15. HomeCorners               int64           (nulls:    0)
   16. Awa

Unnamed: 0,Season,MatchDate,HomeTeam,AwayTeam,FullTimeHomeGoals,FullTimeAwayGoals,FullTimeResult,HalfTimeHomeGoals,HalfTimeAwayGoals,HalfTimeResult,...,HomeShotsOnTarget,AwayShotsOnTarget,HomeCorners,AwayCorners,HomeFouls,AwayFouls,HomeYellowCards,AwayYellowCards,HomeRedCards,AwayRedCards
0,2000/01,2000-08-19,Charlton,Man City,4,0,H,2,0,H,...,14,4,6,6,13,12,1,2,0,0
1,2000/01,2000-08-19,Chelsea,West Ham,4,2,H,1,0,H,...,10,5,7,7,19,14,1,2,0,0
2,2000/01,2000-08-19,Coventry,Middlesbrough,1,3,A,1,1,D,...,3,9,8,4,15,21,5,3,1,0
3,2000/01,2000-08-19,Derby,Southampton,2,2,D,1,2,A,...,4,6,5,8,11,13,1,1,0,0
4,2000/01,2000-08-19,Leeds,Everton,2,0,H,2,0,H,...,8,6,6,4,21,20,1,3,0,0



üìä Info del Dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9380 entries, 0 to 9379
Data columns (total 22 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Season             9380 non-null   object
 1   MatchDate          9380 non-null   object
 2   HomeTeam           9380 non-null   object
 3   AwayTeam           9380 non-null   object
 4   FullTimeHomeGoals  9380 non-null   int64 
 5   FullTimeAwayGoals  9380 non-null   int64 
 6   FullTimeResult     9380 non-null   object
 7   HalfTimeHomeGoals  9380 non-null   int64 
 8   HalfTimeAwayGoals  9380 non-null   int64 
 9   HalfTimeResult     9380 non-null   object
 10  HomeShots          9380 non-null   int64 
 11  AwayShots          9380 non-null   int64 
 12  HomeShotsOnTarget  9380 non-null   int64 
 13  AwayShotsOnTarget  9380 non-null   int64 
 14  HomeCorners        9380 non-null   int64 
 15  AwayCorners        9380 non-null   int64 
 16  HomeFouls         

## 2. Preprocesamiento y Feature Engineering

In [6]:
if df is not None:
    # Hacer una copia para trabajar
    df_processed = df.copy()
    
    # Identificar columnas de fecha
    date_cols = [col for col in df_processed.columns if 'date' in col.lower()]
    print(f'Columnas de fecha encontradas: {date_cols}')
    
    # Convertir a datetime si es necesario
    for col in date_cols:
        df_processed[col] = pd.to_datetime(df_processed[col], errors='coerce')
    
    # Verificar valores nulos
    print(f'\nüìä Valores nulos por columna:')
    null_counts = df_processed.isnull().sum()
    null_counts = null_counts[null_counts > 0].sort_values(ascending=False)
    if len(null_counts) > 0:
        print(null_counts)
    else:
        print('‚úÖ Sin valores nulos')
    
    print(f'\n‚úÖ Preprocesamiento inicial completado')

Columnas de fecha encontradas: ['MatchDate']

üìä Valores nulos por columna:
‚úÖ Sin valores nulos

‚úÖ Preprocesamiento inicial completado


In [7]:
# Feature Engineering - Crear features derivadas
if df_processed is not None:
    # Extraer caracter√≠sticas de tiempo
    date_col = date_cols[0] if date_cols else None
    
    if date_col:
        df_processed['Year'] = df_processed[date_col].dt.year
        df_processed['Month'] = df_processed[date_col].dt.month
        df_processed['DayOfWeek'] = df_processed[date_col].dt.dayofweek
        df_processed['Season'] = df_processed['Year'].apply(lambda x: f'{x}-{x+1}')
        print(f'‚úÖ Features temporales creadas')
    
    # Identificar columnas de equipo y resultado
    print(f'\nColumnas disponibles:')
    print(df_processed.columns.tolist())

‚úÖ Features temporales creadas

Columnas disponibles:
['Season', 'MatchDate', 'HomeTeam', 'AwayTeam', 'FullTimeHomeGoals', 'FullTimeAwayGoals', 'FullTimeResult', 'HalfTimeHomeGoals', 'HalfTimeAwayGoals', 'HalfTimeResult', 'HomeShots', 'AwayShots', 'HomeShotsOnTarget', 'AwayShotsOnTarget', 'HomeCorners', 'AwayCorners', 'HomeFouls', 'AwayFouls', 'HomeYellowCards', 'AwayYellowCards', 'HomeRedCards', 'AwayRedCards', 'Year', 'Month', 'DayOfWeek']


## 3. An√°lisis Exploratorio (EDA)

In [8]:
if df_processed is not None:
    print('\n' + '='*70)
    print('AN√ÅLISIS EXPLORATORIO')
    print('='*70)
    
    # Estad√≠sticas b√°sicas
    print('\nüìä Estad√≠sticas descriptivas:')
    display(df_processed.describe())


AN√ÅLISIS EXPLORATORIO

üìä Estad√≠sticas descriptivas:


Unnamed: 0,MatchDate,FullTimeHomeGoals,FullTimeAwayGoals,HalfTimeHomeGoals,HalfTimeAwayGoals,HomeShots,AwayShots,HomeShotsOnTarget,AwayShotsOnTarget,HomeCorners,AwayCorners,HomeFouls,AwayFouls,HomeYellowCards,AwayYellowCards,HomeRedCards,AwayRedCards,Year,Month,DayOfWeek
count,9380,9380.0,9380.0,9380.0,9380.0,9380.0,9380.0,9380.0,9380.0,9380.0,9380.0,9380.0,9380.0,9380.0,9380.0,9380.0,9380.0,9380.0,9380.0,9380.0
mean,2013-01-17 13:05:33.134328576,1.535394,1.182623,0.688273,0.518763,13.617484,10.810661,5.973134,4.69371,6.040299,4.774733,11.276333,11.765672,1.468124,1.792431,0.062473,0.085288,2012.529104,6.733902,4.448507
min,2000-08-19 00:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2000.0,1.0,0.0
25%,2006-12-05 18:00:00,1.0,0.0,0.0,0.0,10.0,7.0,4.0,3.0,4.0,3.0,9.0,9.0,1.0,1.0,0.0,0.0,2006.0,3.0,5.0
50%,2013-01-19 00:00:00,1.0,1.0,0.0,0.0,13.0,10.0,6.0,4.0,6.0,4.0,11.0,12.0,1.0,2.0,0.0,0.0,2013.0,8.0,5.0
75%,2019-03-02 00:00:00,2.0,2.0,1.0,1.0,17.0,14.0,8.0,6.0,8.0,6.0,14.0,14.0,2.0,3.0,0.0,0.0,2019.0,10.0,5.0
max,2025-05-05 00:00:00,9.0,9.0,5.0,5.0,43.0,37.0,24.0,20.0,20.0,19.0,33.0,29.0,7.0,9.0,3.0,2.0,2025.0,12.0,6.0
std,,1.305432,1.157414,0.835079,0.735357,5.356424,4.696501,3.267954,2.750045,3.110619,2.749541,3.75025,3.924179,1.21719,1.288481,0.253221,0.290183,7.213963,3.928129,1.731962


## 2.1 An√°lisis de Target Variables

In [9]:
# Analizar los targets
if df_processed is not None:
    print('üìä DISTRIBUCI√ìN DE RESULTADOS (1X2):')
    result_counts = df_processed['FullTimeResult'].value_counts()
    result_pct = df_processed['FullTimeResult'].value_counts(normalize=True) * 100
    
    for result in ['H', 'D', 'A']:
        if result in result_counts.index:
            count = result_counts[result]
            pct = result_pct[result]
            label = 'Home Win' if result == 'H' else 'Draw' if result == 'D' else 'Away Win'
            print(f"  {label:12} ('{result}'): {count:5d} ({pct:5.1f}%)")
    
    print('\nüìä DISTRIBUCI√ìN DE GOLES TOTALES:')
    df_processed['TotalGoals'] = df_processed['FullTimeHomeGoals'] + df_processed['FullTimeAwayGoals']
    
    print(f"  M√≠nimo: {df_processed['TotalGoals'].min()}")
    print(f"  M√°ximo: {df_processed['TotalGoals'].max()}")
    print(f"  Promedio: {df_processed['TotalGoals'].mean():.2f}")
    print(f"  Mediana: {df_processed['TotalGoals'].median():.1f}")
    
    print('\n  Distribuci√≥n:')
    print(df_processed['TotalGoals'].value_counts().sort_index())

üìä DISTRIBUCI√ìN DE RESULTADOS (1X2):
  Home Win     ('H'):  4299 ( 45.8%)
  Draw         ('D'):  2313 ( 24.7%)
  Away Win     ('A'):  2768 ( 29.5%)

üìä DISTRIBUCI√ìN DE GOLES TOTALES:
  M√≠nimo: 0
  M√°ximo: 11
  Promedio: 2.72
  Mediana: 3.0

  Distribuci√≥n:
TotalGoals
0      692
1     1626
2     2238
3     2057
4     1476
5      758
6      312
7      143
8       54
9       18
10       5
11       1
Name: count, dtype: int64


In [10]:
# Visualizar distribuciones
if df_processed is not None:
    # Encontrar columnas num√©ricas potencialmente interesantes
    numeric_cols = df_processed.select_dtypes(include=[np.number]).columns.tolist()
    
    print(f'Columnas num√©ricas: {numeric_cols[:10]}...')
    print(f'\nTotal de columnas num√©ricas: {len(numeric_cols)}')

Columnas num√©ricas: ['FullTimeHomeGoals', 'FullTimeAwayGoals', 'HalfTimeHomeGoals', 'HalfTimeAwayGoals', 'HomeShots', 'AwayShots', 'HomeShotsOnTarget', 'AwayShotsOnTarget', 'HomeCorners', 'AwayCorners']...

Total de columnas num√©ricas: 20


## 3. Feature Engineering - Crear Variables Predictivas

### Qu√© son Features?
Son variables que usa el modelo ML para predecir. Pueden ser:
- **Features base**: Las que ya existen en el dataset (Shots, Fouls, etc)
- **Features derivadas**: Que creamos combinando info hist√≥rica (Form, H2H, etc)

### Features que Vamos a Crear:

1. **Form**: Puntos de √∫ltimos 5 partidos (3=win, 1=draw, 0=loss)
2. **Head-to-Head (H2H)**: % victorias hist√≥ricas entre equipos
3. **Goles Promedio**: Media de goles a favor/contra
4. **Home Advantage**: Ventaja de jugar en casa
5. **Temporales**: Mes, d√≠a semana, a√±o

In [11]:
import sys
sys.path.insert(0, '../src')

try:
    from src.feature_engineering import EPLFeatureEngineer, prepare_training_data
except ModuleNotFoundError:
    # Fallback: cargar directamente desde el archivo ../src/feature_engineering.py
    import importlib.util
    from pathlib import Path

    fe_path = Path('../src/feature_engineering.py').resolve()
    if fe_path.exists():
        spec = importlib.util.spec_from_file_location('feature_engineering', str(fe_path))
        fe_mod = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(fe_mod)
        EPLFeatureEngineer = getattr(fe_mod, 'EPLFeatureEngineer')
        prepare_training_data = getattr(fe_mod, 'prepare_training_data')
    else:
        # Si no existe el archivo, relanzamos el error original para que el usuario lo vea
        raise

if df_processed is not None:
    print('üîß CREANDO FEATURES INGENIERILES...\n')
    
    # Crear el ingeniero de features
    engineer = EPLFeatureEngineer(df_processed)
    
    # Crear todas las features
    X, y_result, y_goals = engineer.engineer_features()
    
    print(f'\n‚úÖ Features creadas exitosamente!')
    print(f'   Dimensiones X: {X.shape}')
    print(f'   - {X.shape[0]} muestras (partidos)')
    print(f'   - {X.shape[1]} features (variables)')
    
    print(f'\nüìä Target Variables:')
    print(f'   - y_result: Resultado (0=Away, 1=Draw, 2=Home)')
    print(f'   - y_goals: Goles totales')

üîß CREANDO FEATURES INGENIERILES...

üîß Creando features...
  ‚Üí Form de equipos...
  ‚Üí Estad√≠sticas de goles...
  ‚Üí Ventaja de casa...
  ‚Üí Estad√≠sticas de goles...
  ‚Üí Ventaja de casa...
  ‚Üí Estad√≠sticas de tiros...
‚úÖ Features creadas: 208 columnas

‚úÖ Features creadas exitosamente!
   Dimensiones X: (9380, 208)
   - 9380 muestras (partidos)
   - 208 features (variables)

üìä Target Variables:
   - y_result: Resultado (0=Away, 1=Draw, 2=Home)
   - y_goals: Goles totales
  ‚Üí Estad√≠sticas de tiros...
‚úÖ Features creadas: 208 columnas

‚úÖ Features creadas exitosamente!
   Dimensiones X: (9380, 208)
   - 9380 muestras (partidos)
   - 208 features (variables)

üìä Target Variables:
   - y_result: Resultado (0=Away, 1=Draw, 2=Home)
   - y_goals: Goles totales


In [12]:
# Inspeccionar las features creadas
if X is not None:
    print('üìã COLUMNAS DE FEATURES CREADAS:')
    print(f'\nTotal: {len(X.columns)} features\n')
    
    for i, col in enumerate(X.columns, 1):
        dtype = X[col].dtype
        nulls = X[col].isnull().sum()
        print(f'   {i:2d}. {col:<30} {str(dtype):<10} (nulls: {nulls:4d})')
    
    print(f'\nüìä ESTAD√çSTICAS DE FEATURES:')
    print(X.describe().T[['count', 'mean', 'std', 'min', 'max']].head(10))

üìã COLUMNAS DE FEATURES CREADAS:

Total: 208 features

    1. HomeShots                      int64      (nulls:    0)
    2. AwayShots                      int64      (nulls:    0)
    3. HomeShotsOnTarget              int64      (nulls:    0)
    4. AwayShotsOnTarget              int64      (nulls:    0)
    5. HomeCorners                    int64      (nulls:    0)
    6. AwayCorners                    int64      (nulls:    0)
    7. HomeFouls                      int64      (nulls:    0)
    8. AwayFouls                      int64      (nulls:    0)
    9. HomeYellowCards                int64      (nulls:    0)
   10. AwayYellowCards                int64      (nulls:    0)
   11. HomeRedCards                   int64      (nulls:    0)
   12. AwayRedCards                   int64      (nulls:    0)
   13. HalfTimeHomeGoals              int64      (nulls:    0)
   14. HalfTimeAwayGoals              int64      (nulls:    0)
   15. HomeTeam_Form                  float64    (nulls:    0

## 4. Preparar Datos para Modelado

## 4. Preparar Datos para Modelado

In [13]:
# Preparar datos para entrenamiento
if X is not None and y_result is not None:
    from sklearn.preprocessing import StandardScaler
    
    print('üìä PREPARANDO DATOS...\n')
    
    # Llenar NaNs con forward fill
    X_filled = X.fillna(method='ffill').fillna(method='bfill')
    
    print(f'‚úÖ NaNs llenos:')
    print(f'   Nulls restantes en X: {X_filled.isnull().sum().sum()}')
    
    # Split temporal (no aleatorio para series de tiempo)
    # Usamos 80/20 con orden temporal
    split_idx = int(len(X_filled) * 0.8)
    
    X_train = X_filled[:split_idx]
    X_test = X_filled[split_idx:]
    
    y_result_train = y_result[:split_idx]
    y_result_test = y_result[split_idx:]
    
    y_goals_train = y_goals[:split_idx]
    y_goals_test = y_goals[split_idx:]
    
    print(f'\n‚úÖ Split Train/Test (temporal):')
    print(f'   Train: {len(X_train):,} partidos')
    print(f'   Test:  {len(X_test):,} partidos')
    
    # Normalizar features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    print(f'\n‚úÖ Features normalizadas (media=0, std=1)')
    
    # Guardar para las siguientes fases
    print(f'\nüì¶ Datos listos para modelos ML')

üìä PREPARANDO DATOS...

‚úÖ NaNs llenos:
   Nulls restantes en X: 0

‚úÖ Split Train/Test (temporal):
   Train: 7,504 partidos
   Test:  1,876 partidos

‚úÖ Features normalizadas (media=0, std=1)

üì¶ Datos listos para modelos ML


In [14]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

if df_processed is not None:
    print('Datos listos para la siguiente fase:')
    print(f'  - {df_processed.shape[0]} muestras')
    print(f'  - {df_processed.shape[1]} features')
    print(f'\n‚úÖ Pr√≥ximas fases: An√°lisis de target variables')

Datos listos para la siguiente fase:
  - 9380 muestras
  - 26 features

‚úÖ Pr√≥ximas fases: An√°lisis de target variables


## 5. Modelo de Predicci√≥n de Resultados (1X2)

In [16]:
# Entrenar modelos de resultado
if X_train_scaled is not None and y_result_train is not None:
    from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
    
    print('ü§ñ ENTRENANDO MODELOS - PREDICCI√ìN DE RESULTADOS (1X2)\n')
    
    # Random Forest
    print('1Ô∏è‚É£  Random Forest Classifier')
    rf_result = RandomForestClassifier(n_estimators=100, max_depth=15, random_state=42, n_jobs=-1)
    rf_result.fit(X_train_scaled, y_result_train)
    y_pred_rf = rf_result.predict(X_test_scaled)
    
    rf_acc = accuracy_score(y_result_test, y_pred_rf)
    rf_f1 = f1_score(y_result_test, y_pred_rf, average='weighted')
    print(f'   Accuracy: {rf_acc:.4f} ({rf_acc*100:.2f}%)')
    print(f'   F1-Score: {rf_f1:.4f}')
    
    # Gradient Boosting
    print('\n2Ô∏è‚É£  Gradient Boosting Classifier')
    gb_result = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)
    gb_result.fit(X_train_scaled, y_result_train)
    y_pred_gb = gb_result.predict(X_test_scaled)
    
    gb_acc = accuracy_score(y_result_test, y_pred_gb)
    gb_f1 = f1_score(y_result_test, y_pred_gb, average='weighted')
    print(f'   Accuracy: {gb_acc:.4f} ({gb_acc*100:.2f}%)')
    print(f'   F1-Score: {gb_f1:.4f}')
    
    # Comparaci√≥n
    print('\n' + '-'*60)
    if gb_acc > rf_acc:
        print(f'üèÜ Mejor: Gradient Boosting ({gb_acc:.4f})')
        mejor_resultado = 'GB'
    else:
        print(f'üèÜ Mejor: Random Forest ({rf_acc:.4f})')
        mejor_resultado = 'RF'
    print('-'*60)

ü§ñ ENTRENANDO MODELOS - PREDICCI√ìN DE RESULTADOS (1X2)

1Ô∏è‚É£  Random Forest Classifier
   Accuracy: 0.6274 (62.74%)
   F1-Score: 0.5805

2Ô∏è‚É£  Gradient Boosting Classifier
   Accuracy: 0.6274 (62.74%)
   F1-Score: 0.5805

2Ô∏è‚É£  Gradient Boosting Classifier
   Accuracy: 0.5800 (58.00%)
   F1-Score: 0.5983

------------------------------------------------------------
üèÜ Mejor: Random Forest (0.6274)
------------------------------------------------------------
   Accuracy: 0.5800 (58.00%)
   F1-Score: 0.5983

------------------------------------------------------------
üèÜ Mejor: Random Forest (0.6274)
------------------------------------------------------------


In [17]:
# Entrenar modelos de goles totales
if X_train_scaled is not None and y_goals_train is not None:
    from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
    from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
    import numpy as np
    
    print('ü§ñ ENTRENANDO MODELOS - PREDICCI√ìN DE GOLES TOTALES\n')
    
    # Random Forest Regressor
    print('1Ô∏è‚É£  Random Forest Regressor')
    rf_goals = RandomForestRegressor(n_estimators=100, max_depth=15, random_state=42, n_jobs=-1)
    rf_goals.fit(X_train_scaled, y_goals_train)
    y_pred_rf_goals = rf_goals.predict(X_test_scaled)
    
    rf_mae = mean_absolute_error(y_goals_test, y_pred_rf_goals)
    rf_rmse = np.sqrt(mean_squared_error(y_goals_test, y_pred_rf_goals))
    rf_r2 = r2_score(y_goals_test, y_pred_rf_goals)
    print(f'   MAE: {rf_mae:.4f}')
    print(f'   RMSE: {rf_rmse:.4f}')
    print(f'   R¬≤: {rf_r2:.4f}')
    
    # Gradient Boosting Regressor
    print('\n2Ô∏è‚É£  Gradient Boosting Regressor')
    gb_goals = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)
    gb_goals.fit(X_train_scaled, y_goals_train)
    y_pred_gb_goals = gb_goals.predict(X_test_scaled)
    
    gb_mae = mean_absolute_error(y_goals_test, y_pred_gb_goals)
    gb_rmse = np.sqrt(mean_squared_error(y_goals_test, y_pred_gb_goals))
    gb_r2 = r2_score(y_goals_test, y_pred_gb_goals)
    print(f'   MAE: {gb_mae:.4f}')
    print(f'   RMSE: {gb_rmse:.4f}')
    print(f'   R¬≤: {gb_r2:.4f}')
    
    # Comparaci√≥n
    print('\n' + '-'*60)
    if gb_r2 > rf_r2:
        print(f'üèÜ Mejor: Gradient Boosting (R¬≤: {gb_r2:.4f})')
        mejor_goles = 'GB'
    else:
        print(f'üèÜ Mejor: Random Forest (R¬≤: {rf_r2:.4f})')
        mejor_goles = 'RF'
    print('-'*60)

ü§ñ ENTRENANDO MODELOS - PREDICCI√ìN DE GOLES TOTALES

1Ô∏è‚É£  Random Forest Regressor
   MAE: 0.9654
   RMSE: 1.1882
   R¬≤: 0.5125

2Ô∏è‚É£  Gradient Boosting Regressor
   MAE: 0.9584
   RMSE: 1.1843
   R¬≤: 0.5157

------------------------------------------------------------
üèÜ Mejor: Gradient Boosting (R¬≤: 0.5157)
------------------------------------------------------------


## Secci√≥n 6: Guardar Modelos para Uso Futuro

In [26]:
# Guardar modelos para usar despu√©s
import pickle
import os
from pathlib import Path

# Obtener directorio actual y crear ruta absoluta
current_dir = Path.cwd()
models_dir = current_dir / 'models'

print(f'üìÇ Directorio actual: {current_dir}')
print(f'üìÇ Guardando en: {models_dir}')

# Crear carpeta models/ si no existe
models_dir.mkdir(exist_ok=True)

# Guardar los 4 modelos + scaler
models_to_save = {
    'rf_result': rf_result,           # Random Forest - Predicci√≥n 1X2
    'gb_result': gb_result,           # Gradient Boosting - Predicci√≥n 1X2
    'rf_goals': rf_goals,             # Random Forest - Predicci√≥n Goles
    'gb_goals': gb_goals,             # Gradient Boosting - Predicci√≥n Goles
    'scaler': scaler                  # StandardScaler - Normalizaci√≥n de features
}

print('\nüíæ GUARDANDO MODELOS ENTRENADOS...\n')
for name, model in models_to_save.items():
    filepath = models_dir / f'{name}_model.pkl'
    with open(filepath, 'wb') as f:
        pickle.dump(model, f)
    print(f'‚úÖ {filepath}')

print('\nüéâ Todos los modelos guardados correctamente')
print(f'üìÇ Ubicaci√≥n: {models_dir}/')

# Verificar que existen
print(f'\n‚úì Verificaci√≥n:')
for file in models_dir.glob('*.pkl'):
    size = file.stat().st_size / (1024*1024)  # Size en MB
    print(f'  {file.name}: {size:.1f} MB')

üìÇ Directorio actual: /workspaces/codespaces-blank/premier-league-ml/notebooks
üìÇ Guardando en: /workspaces/codespaces-blank/premier-league-ml/notebooks/models

üíæ GUARDANDO MODELOS ENTRENADOS...

‚úÖ /workspaces/codespaces-blank/premier-league-ml/notebooks/models/rf_result_model.pkl
‚úÖ /workspaces/codespaces-blank/premier-league-ml/notebooks/models/gb_result_model.pkl
‚úÖ /workspaces/codespaces-blank/premier-league-ml/notebooks/models/rf_goals_model.pkl
‚úÖ /workspaces/codespaces-blank/premier-league-ml/notebooks/models/gb_goals_model.pkl
‚úÖ /workspaces/codespaces-blank/premier-league-ml/notebooks/models/scaler_model.pkl

üéâ Todos los modelos guardados correctamente
üìÇ Ubicaci√≥n: /workspaces/codespaces-blank/premier-league-ml/notebooks/models/

‚úì Verificaci√≥n:
  gb_goals_model.pkl: 0.4 MB
  gb_result_model.pkl: 1.2 MB
  scaler_model.pkl: 0.0 MB
  rf_result_model.pkl: 17.7 MB
  rf_goals_model.pkl: 16.1 MB


## Secci√≥n 7: Hacer Predicciones en Nuevos Partidos

In [24]:
# Cargar el m√≥dulo de predicci√≥n
import sys
from pathlib import Path
import importlib

# Agregar src al path para importar m√≥dulos
sys.path.insert(0, str(Path.cwd() / 'src'))

# Recargar m√≥dulos para asegurar cambios recientes
import predictor
importlib.reload(predictor)
from predictor import EPLPredictor

# Inicializar predictor (carga modelos desde ./models/)
predictor_instance = EPLPredictor('models')

# Hacer predicci√≥n para un partido futuro
print('='*70)
print('üîÆ EJEMPLO DE PREDICCI√ìN - Nuevo Partido')
print('='*70)

# Predecir: Chelsea vs Liverpool el 22 de Febrero, 2025
# Pasamos X_train_scaled para que el predictor use la distribuci√≥n media
prediccion = predictor_instance.predict_match(
    df_historical=df,
    home_team='Chelsea',
    away_team='Liverpool',
    match_date='2025-02-22',
    X_train_scaled=X_train_scaled
)

# Mostrar resultados
predictor_instance.print_prediction(prediccion, verbose=True)

# Acceder a valores espec√≠ficos
print('üí° Acceso program√°tico a resultados:')
print(f'\nPredicci√≥n RF (1X2): {prediccion["resultado"]["random_forest"]["prediccion"]}')
print(f'Confianza: {prediccion["resultado"]["random_forest"]["confianza"]:.1f}%')
print(f'Goles totales (promedio): {prediccion["goles_totales"]["promedio"]}')

‚úÖ Modelos cargados desde: models
üîÆ EJEMPLO DE PREDICCI√ìN - Nuevo Partido

üîÆ PREDICCI√ìN EPL
üìÖ Chelsea vs Liverpool (2025-02-22)

üìä RESULTADO (1X2):

  üå≤ Random Forest:
     Predicci√≥n: Home Win
     Confianza: 71.3%
     Detalles: Away 14.4% | Draw 14.3% | Home 71.3%

  ‚ö° Gradient Boosting:
     Predicci√≥n: Home Win
     Confianza: 73.9%
     Detalles: Away 6.8% | Draw 19.3% | Home 73.9%

‚öΩ GOLES TOTALES:
  üå≤ Random Forest: 2.24
  ‚ö° Gradient Boosting: 2.41
  üìà Promedio: 2.33


üí° Acceso program√°tico a resultados:

Predicci√≥n RF (1X2): Home Win
Confianza: 71.3%
Goles totales (promedio): 2.33


In [28]:
# Predicciones m√∫ltiples (Batch)
import importlib
importlib.reload(predictor)
from predictor import EPLPredictor

# Recargar predictor
predictor_instance = EPLPredictor('models')

print('='*70)
print('üîÆ EJEMPLO: Predicciones M√∫ltiples (Fin de Semana)')
print('='*70)

matches = [
    {'home': 'Chelsea', 'away': 'Liverpool', 'date': '2025-02-22'},
    {'home': 'Arsenal', 'away': 'Man City', 'date': '2025-03-01'},
    {'home': 'Tottenham', 'away': 'Man United', 'date': '2025-03-08'},
]

print('\nü§ñ Realizando predicciones...\n')
predictions = predictor_instance.predict_batch(df, matches, X_train_scaled)

# Mostrar resumen
print('\n' + '='*70)
print('üìä RESUMEN DE PREDICCIONES')
print('='*70 + '\n')

for pred in predictions:
    print(f"{pred['partido']} ({pred['fecha']})")
    print(f"  üå≤ RF: {pred['resultado']['random_forest']['prediccion']} ({pred['resultado']['random_forest']['confianza']:.1f}%)")
    print(f"  ‚ö° GB: {pred['resultado']['gradient_boosting']['prediccion']} ({pred['resultado']['gradient_boosting']['confianza']:.1f}%)")
    print(f"  ‚öΩ Goles: {pred['goles_totales']['promedio']}")
    print()


‚úÖ Modelos cargados desde: models
üîÆ EJEMPLO: Predicciones M√∫ltiples (Fin de Semana)

ü§ñ Realizando predicciones...

‚úÖ Chelsea vs Liverpool
‚úÖ Arsenal vs Man City
‚úÖ Tottenham vs Man United

üìä RESUMEN DE PREDICCIONES

Chelsea vs Liverpool (2025-02-22)
  üå≤ RF: Home Win (71.3%)
  ‚ö° GB: Home Win (73.9%)
  ‚öΩ Goles: 2.33

Arsenal vs Man City (2025-03-01)
  üå≤ RF: Home Win (71.3%)
  ‚ö° GB: Home Win (73.9%)
  ‚öΩ Goles: 2.33

Tottenham vs Man United (2025-03-08)
  üå≤ RF: Home Win (71.3%)
  ‚ö° GB: Home Win (73.9%)
  ‚öΩ Goles: 2.33



## 6. Modelo de Predicci√≥n de Goles Totales

In [11]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

print('üìä Modelos de predicci√≥n de goles preparados')
print('  - Random Forest Regressor')
print('  - Gradient Boosting Regressor')

üìä Modelos de predicci√≥n de goles preparados
  - Random Forest Regressor
  - Gradient Boosting Regressor


## 7. Evaluaci√≥n y Comparaci√≥n de Modelos

In [12]:
print('‚úÖ Fase de evaluaci√≥n lista')
print('M√©tricas a evaluar:')
print('  Clasificaci√≥n: Accuracy, Precision, Recall, F1-Score, ROC-AUC')
print('  Regresi√≥n: MAE, RMSE, R¬≤')

‚úÖ Fase de evaluaci√≥n lista
M√©tricas a evaluar:
  Clasificaci√≥n: Accuracy, Precision, Recall, F1-Score, ROC-AUC
  Regresi√≥n: MAE, RMSE, R¬≤


## 8. Comparar Predicciones vs Odds del Mercado

In [13]:
print('üìä Investigar APIs de odds disponibles:')
print('  1. odds-api.com - Gratis con l√≠mite de requests')
print('  2. RapidAPI - M√∫ltiples endpoints de apuestas')
print('  3. Datos hist√≥ricos de sitios especializados')

üìä Investigar APIs de odds disponibles:
  1. odds-api.com - Gratis con l√≠mite de requests
  2. RapidAPI - M√∫ltiples endpoints de apuestas
  3. Datos hist√≥ricos de sitios especializados


## 9. Identificar Oportunidades de Value Betting

In [14]:
print('üí° Estrategia de Value Betting:')
print('\n1. Calcular probabilidad impl√≠cita de odds:')
print('   Prob_impl√≠cita = 1 / Odd')
print('\n2. Comparar con probabilidad predicha por modelo:')
print('   Si Prob_modelo > Prob_impl√≠cita ‚Üí Posible value bet')
print('\n3. Filtrar por edge m√≠nimo (ej: 5%)')
print('   Edge = Prob_modelo - Prob_impl√≠cita')

üí° Estrategia de Value Betting:

1. Calcular probabilidad impl√≠cita de odds:
   Prob_impl√≠cita = 1 / Odd

2. Comparar con probabilidad predicha por modelo:
   Si Prob_modelo > Prob_impl√≠cita ‚Üí Posible value bet

3. Filtrar por edge m√≠nimo (ej: 5%)
   Edge = Prob_modelo - Prob_impl√≠cita


## 10. An√°lisis de Rentabilidad

In [15]:
print('üí∞ M√©tricas de Rentabilidad:')
print('  - ROI (Return on Investment)')
print('  - Win Rate (%)')
print('  - Valor Esperado (EV) por apuesta')
print('  - Backtesting en datos hist√≥ricos')
print('  - An√°lisis de rentabilidad por temporada')

üí∞ M√©tricas de Rentabilidad:
  - ROI (Return on Investment)
  - Win Rate (%)
  - Valor Esperado (EV) por apuesta
  - Backtesting en datos hist√≥ricos
  - An√°lisis de rentabilidad por temporada
