In [1]:
import numpy as np
import os
import pandas as pd
import pickle
import xgboost as xgb
from category_encoders import TargetEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV, train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, classification_report

In [6]:
df = pd.read_csv('data/processed_files/df_datos_completos.csv')

In [7]:
# Crear una lista de columnas para eliminar que contengan 'titu-' o 'les-'
columns_to_drop = df.filter(like='titu-').columns.tolist() + df.filter(like='les-').columns.tolist()

# Añadir las columnas adicionales a la lista de columnas a eliminar
columns_to_drop += ['index', 'fixture_id', 'resultado', 'goles_local', 'goles_totales', 'goles_visitante', 'goles_descanso_local', 'goles_descanso_visitante', 'fecha_timestamp']

# Eliminar las columnas en la lista columns_to_drop de DataFrame df_partidos
X = df.drop(columns_to_drop, axis=1)
y = df['goles_local']

In [8]:
X.head()

Unnamed: 0,id_equipo_local,id_equipo_visitante,arbitro,estadio,season,shots_on_goal_local,shots_on_goal_away,shots_off_goal_local,shots_off_goal_away,total_shots_local,...,goalkeeper_saves_away,total_pass_local,total_pass_away,odd_1,odd_x,odd_2,odd_mas_25,odd_menos_25,tiros_para_marcar_local,tiros_para_marcar_away
0,530,539,"Alfonso Alvarez Izquierdo, Spain",Estadio Vicente Calderón (Madrid),2014,5.0,3.0,5.0,4.0,0.0,...,3.0,0.0,0.0,1.2,6.5,17.0,1.75,2.28,9.25,11.333333
1,536,538,"Carlos Velasco Carballo, Spain",Estadio Ramón Sánchez Pizjuán (Sevilla),2014,6.0,1.0,3.0,7.0,0.0,...,4.0,0.0,0.0,1.67,3.8,5.5,1.81,2.19,9.0,7.5
2,797,533,"Carlos Clos Gomez, Spain",Estadio Manuel Martínez Valero (Elche),2014,7.0,3.0,5.0,6.0,0.0,...,5.0,0.0,0.0,5.0,3.6,1.75,2.3,1.77,25.0,12.0
3,544,531,"Alberto Undiano, Spain",Estadio Municipal de Riazor (A Coruña (La Coru...,2014,7.0,5.0,4.0,8.0,0.0,...,6.0,0.0,0.0,3.2,3.1,2.4,2.56,1.6,44.0,11.333333
4,535,723,"Fernando Teixeira Vitienes, Spain",Estadio La Rosaleda (Málaga),2014,8.0,5.0,2.0,2.0,0.0,...,6.0,0.0,0.0,1.53,4.1,6.5,2.04,1.99,6.0,11.0


In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Pipeline para codificar la columna 'arbitro' con OneHotEncoder
arbitro_pipeline = Pipeline([
    ('onehot', OneHotEncoder(sparse=False, handle_unknown='ignore'))
])

# Pipeline para codificar la columna 'estadio' con TargetEncoder
estadio_pipeline = Pipeline([
    ('target', TargetEncoder())
])

# ColumnTransformer para aplicar los pipelines a las columnas correspondientes
preprocessor = ColumnTransformer([
    ('arbitro', arbitro_pipeline, ['arbitro']),
    ('estadio', estadio_pipeline, ['estadio']),
    ], remainder = "passthrough")

# Pipeline final con el preprocesamiento y el modelo RandomForestClassifier
pipeline_xgb = Pipeline([
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler()), #En este caso el escalado de variables funciona mucho peor
    ('xgbr', xgb.XGBRegressor())
])

param_grid = {
    'xgbr__n_estimators': [50, 100, 200, 300, 400, 500,600,700],
    'xgbr__learning_rate': [0.01, 0.03, 0.05, 0.1, 0.2, 0.3],
    'xgbr__max_depth': [3, 4, 5, 6, 7, 8],
    'xgbr__min_child_weight': [1, 2, 3, 4, 5, 6],
    'xgbr__subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0,1.2,1.3,1.4],
    'xgbr__colsample_bytree': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    'xgbr__gamma': [0, 0.1, 0.2, 0.3, 0.4, 0.5, 1, 2, 5],
    'xgbr__reg_alpha': [0, 0.1, 0.5, 1, 2, 5],
    'xgbr__reg_lambda': [0.1, 0.5, 1, 2, 5, 10]
}


In [11]:
random_search = RandomizedSearchCV(
    estimator=pipeline_xgb,
    param_distributions=param_grid,
    n_iter=200,  # Número de combinaciones aleatorias de hiperparámetros a probar
    scoring='neg_mean_absolute_error',
    cv=4,  # Número de divisiones de validación cruzada
    verbose=2,
    n_jobs=-1)

In [12]:
random_search.fit(X, y)

Fitting 4 folds for each of 200 candidates, totalling 800 fits


292 fits failed out of a total of 800.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
116 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\gonzalo.velazquez\Anaconda3\envs\general\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\gonzalo.velazquez\Anaconda3\envs\general\lib\site-packages\sklearn\pipeline.py", line 405, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "c:\Users\gonzalo.velazquez\Anaconda3\envs\general\lib\site-packages\xgboost\core.py", line 620, in inner_f
    return func(**kwargs)
  File "c:\Users\gonzalo.velazquez\Anaconda3\envs\general\lib\site-pack

In [7]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Predecir valores en el conjunto de prueba
y_pred = random_search.predict(X_test)

# Calcular métricas de evaluación
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"MSE: {mse}")
print(f"RMSE: {rmse}")
print(f"MAE: {mae}")
print(f"R²: {r2}")

NotFittedError: This RandomizedSearchCV instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [13]:
df_goles_visitantes = pd.read_csv('data/processed_files/df_datos_completos.csv')

In [14]:
# Crear una lista de columnas para eliminar que contengan 'titu-' o 'les-'
columns_to_drop = df_goles_visitantes.filter(like='titu-').columns.tolist() + df_goles_visitantes.filter(like='les-').columns.tolist()

# Añadir las columnas adicionales a la lista de columnas a eliminar
columns_to_drop += ['index', 'fixture_id', 'resultado', 'goles_local', 'goles_totales', 'goles_visitante', 'goles_descanso_local', 'goles_descanso_visitante', 'fecha_timestamp']

# Eliminar las columnas en la lista columns_to_drop de DataFrame df_partidos
X = df_goles_visitantes.drop(columns_to_drop, axis=1)
y = df_goles_visitantes['goles_visitante']

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Pipeline para codificar la columna 'arbitro' con OneHotEncoder
arbitro_pipeline = Pipeline([
    ('onehot', OneHotEncoder(sparse=False, handle_unknown='ignore'))
])

# Pipeline para codificar la columna 'estadio' con TargetEncoder
estadio_pipeline = Pipeline([
    ('target', TargetEncoder())
])

# ColumnTransformer para aplicar los pipelines a las columnas correspondientes
preprocessor = ColumnTransformer([
    ('arbitro', arbitro_pipeline, ['arbitro']),
    ('estadio', estadio_pipeline, ['estadio']),
    ], remainder = "passthrough")

# Pipeline final con el preprocesamiento y el modelo RandomForestClassifier
pipeline_xgb = Pipeline([
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler()), #En este caso el escalado de variables funciona mucho peor
    ('xgbr', xgb.XGBRegressor())
])

param_grid = {
    'xgbr__n_estimators': [50, 100, 200, 300, 400, 500,600,700],
    'xgbr__learning_rate': [0.01, 0.03, 0.05, 0.1, 0.2, 0.3],
    'xgbr__max_depth': [3, 4, 5, 6, 7, 8],
    'xgbr__min_child_weight': [1, 2, 3, 4, 5, 6],
    'xgbr__subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0,1.2,1.3,1.4],
    'xgbr__colsample_bytree': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    'xgbr__gamma': [0, 0.1, 0.2, 0.3, 0.4, 0.5, 1, 2, 5],
    'xgbr__reg_alpha': [0, 0.1, 0.5, 1, 2, 5],
    'xgbr__reg_lambda': [0.1, 0.5, 1, 2, 5, 10]
}


In [16]:
random_search_gv= RandomizedSearchCV(
    estimator=pipeline_xgb,
    param_distributions=param_grid,
    n_iter=200,  # Número de combinaciones aleatorias de hiperparámetros a probar
    scoring='neg_mean_absolute_error',
    cv=4,  # Número de divisiones de validación cruzada
    verbose=2,
    n_jobs=-1)

In [44]:
df_corners = pd.read_csv('data/processed_files/df_datos_completos.csv')

In [45]:
df_corners['corners_total'] = df_corners['corners_local'] + df_corners['corners_away']

In [None]:
# Crear una lista de columnas para eliminar que contengan 'titu-' o 'les-'
columns_to_drop = df_corners.filter(like='titu-').columns.tolist() + df_corners.filter(like='les-').columns.tolist()

# Añadir las columnas adicionales a la lista de columnas a eliminar
columns_to_drop += ['index', 'fixture_id', 'resultado', 'goles_local', 'goles_totales', 'goles_visitante', 'goles_descanso_local', 'goles_descanso_visitante', 'fecha_timestamp','corners_total','corners_local','corners_away']

# Eliminar las columnas en la lista columns_to_drop de DataFrame df_partidos
X = df_corners.drop(columns_to_drop, axis=1)
y = df_corners['corners_total']

In [47]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Pipeline para codificar la columna 'arbitro' con OneHotEncoder
arbitro_pipeline = Pipeline([
    ('onehot', OneHotEncoder(sparse=False, handle_unknown='ignore'))
])

# Pipeline para codificar la columna 'estadio' con TargetEncoder
estadio_pipeline = Pipeline([
    ('target', TargetEncoder())
])

# ColumnTransformer para aplicar los pipelines a las columnas correspondientes
preprocessor = ColumnTransformer([
    ('arbitro', arbitro_pipeline, ['arbitro']),
    ('estadio', estadio_pipeline, ['estadio']),
    ], remainder = "passthrough")

# Pipeline final con el preprocesamiento y el modelo RandomForestClassifier
pipeline_xgb = Pipeline([
    ('preprocessor', preprocessor),
    #('scaler', StandardScaler()), #En este caso el escalado de variables funciona mucho peor
    ('xgbr', xgb.XGBRegressor())
])

param_grid = {
    'xgbr__n_estimators': [50, 100, 200, 300, 400, 500],
    'xgbr__learning_rate': [0.01, 0.03, 0.05, 0.1, 0.2, 0.3],
    'xgbr__max_depth': [3, 4, 5, 6, 7, 8],
    'xgbr__min_child_weight': [1, 2, 3, 4, 5, 6],
    'xgbr__subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    'xgbr__colsample_bytree': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    'xgbr__gamma': [0, 0.1, 0.2, 0.3, 0.4, 0.5, 1, 2, 5],
    'xgbr__reg_alpha': [0, 0.1, 0.5, 1, 2, 5],
    'xgbr__reg_lambda': [0.1, 0.5, 1, 2, 5, 10]
}


In [48]:
random_search_corner = RandomizedSearchCV(
    estimator=pipeline_xgb,
    param_distributions=param_grid,
    n_iter=200,  # Número de combinaciones aleatorias de hiperparámetros a probar
    scoring='neg_mean_absolute_error',
    cv=3,  # Número de divisiones de validación cruzada
    verbose=2,
    n_jobs=-1,  # Utilizar todos los núcleos disponibles
    random_state=42)

In [49]:
random_search_corner.fit(X, y)

Fitting 3 folds for each of 200 candidates, totalling 600 fits




In [11]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Predecir valores en el conjunto de prueba
y_pred = random_search_corner.predict(X_test)

# Calcular métricas de evaluación
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"MSE: {mse}")
print(f"RMSE: {rmse}")
print(f"MAE: {mae}")
print(f"R²: {r2}")

NotFittedError: This RandomizedSearchCV instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [50]:
df_tarjetas = pd.read_csv('data/processed_files/df_datos_completos.csv')

In [51]:
df_tarjetas['tarjetas_total'] = df_tarjetas['yellow_cards_local'] + df_tarjetas['yellow_cards_away'] + df_tarjetas['red_cards_local'] + df_tarjetas['red_cards_away']

In [52]:
# Crear una lista de columnas para eliminar que contengan 'titu-' o 'les-'
columns_to_drop = df_tarjetas.filter(like='titu-').columns.tolist() + df_tarjetas.filter(like='les-').columns.tolist()

# Añadir las columnas adicionales a la lista de columnas a eliminar
columns_to_drop += ['index', 'fixture_id', 'resultado', 'goles_local', 'goles_totales', 'goles_visitante', 'goles_descanso_local', 'goles_descanso_visitante', 'fecha_timestamp','tarjetas_total','yellow_cards_local','yellow_cards_away','red_cards_local','red_cards_away']

# Eliminar las columnas en la lista columns_to_drop de DataFrame df_partidos
X = df_tarjetas.drop(columns_to_drop, axis=1)
y = df_tarjetas['tarjetas_total']

In [53]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Pipeline para codificar la columna 'arbitro' con OneHotEncoder
arbitro_pipeline = Pipeline([
    ('onehot', OneHotEncoder(sparse=False, handle_unknown='ignore'))
])

# Pipeline para codificar la columna 'estadio' con TargetEncoder
estadio_pipeline = Pipeline([
    ('target', TargetEncoder())
])

# ColumnTransformer para aplicar los pipelines a las columnas correspondientes
preprocessor = ColumnTransformer([
    ('arbitro', arbitro_pipeline, ['arbitro']),
    ('estadio', estadio_pipeline, ['estadio']),
    ], remainder = "passthrough")

# Pipeline final con el preprocesamiento y el modelo RandomForestClassifier
pipeline_xgb = Pipeline([
    ('preprocessor', preprocessor),
    #('scaler', StandardScaler()), #En este caso el escalado de variables funciona mucho peor
    ('xgbr', xgb.XGBRegressor())
])

param_grid = {
    'xgbr__n_estimators': [50, 100, 200, 300, 400, 500],
    'xgbr__learning_rate': [0.01, 0.03, 0.05, 0.1, 0.2, 0.3],
    'xgbr__max_depth': [3, 4, 5, 6, 7, 8],
    'xgbr__min_child_weight': [1, 2, 3, 4, 5, 6],
    'xgbr__subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    'xgbr__colsample_bytree': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    'xgbr__gamma': [0, 0.1, 0.2, 0.3, 0.4, 0.5, 1, 2, 5],
    'xgbr__reg_alpha': [0, 0.1, 0.5, 1, 2, 5],
    'xgbr__reg_lambda': [0.1, 0.5, 1, 2, 5, 10]
}


In [54]:
random_search_tarjetas = RandomizedSearchCV(
    estimator=pipeline_xgb,
    param_distributions=param_grid,
    n_iter=200,  # Número de combinaciones aleatorias de hiperparámetros a probar
    scoring='neg_mean_absolute_error',
    cv=3,  # Número de divisiones de validación cruzada
    verbose=2,
    n_jobs=-1,  # Utilizar todos los núcleos disponibles
    random_state=42)

In [55]:
random_search_tarjetas.fit(X, y)

Fitting 3 folds for each of 200 candidates, totalling 600 fits




In [17]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Predecir valores en el conjunto de prueba
y_pred = random_search_tarjetas.predict(X_test)

# Calcular métricas de evaluación
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"MSE: {mse}")
print(f"RMSE: {rmse}")
print(f"MAE: {mae}")
print(f"R²: {r2}")

NotFittedError: This RandomizedSearchCV instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [22]:
id_equipo_local = 535
id_equipo_visitante = 537
odd_1 = 2.75
odd_x = 2.9
odd_2 = 2.5
odd_mas_25 = 1.7
odd_menos_25 = 2.3
arbitro ='Saul Ais Reig, Spain'
estadio = 'Estadio La Rosaleda (Málaga)'
season = 2022  


In [24]:
from utils.functions import data_processing
data_processing = data_processing()

In [19]:
with open("modelo_goles.pkl", "wb") as archivo_modelo_goles:
    pickle.dump(random_search, archivo_modelo_goles)
with open("modelo_corners.pkl", "wb") as archivo_modelo_corners:
    pickle.dump(random_search_corner, archivo_modelo_corners)
with open("modelo_corners.pkl", "wb") as archivo_modelo_tarjetas:
    pickle.dump(random_search_tarjetas, archivo_modelo_tarjetas)



In [20]:
with open("modelo_goles.pkl", "rb") as archivo_modelo_goles:
    random_search = pickle.load(archivo_modelo_goles)
with open("modelo_corners.pkl", "rb") as archivo_modelo_corners:
    random_search_corner = pickle.load(archivo_modelo_corners)
with open("modelo_corners.pkl", "rb") as archivo_modelo_tarjetas:
    random_search_tarjetas = pickle.load(archivo_modelo_tarjetas)

In [25]:
datos_nuevos = data_processing.creacion_datos_nuevos_stats(df,id_equipo_local, id_equipo_visitante,odd_1, odd_x, odd_2,odd_mas_25, odd_menos_25,arbitro, estadio, season)
datos_nuevos_corner = datos_nuevos.drop(['corners_local','corners_away'], axis = 1)     
datos_nuevos_tarjetas = datos_nuevos.drop(['yellow_cards_local','yellow_cards_away','red_cards_local','red_cards_away'], axis = 1)                              

In [61]:

y_pred_goles = random_search.predict(datos_nuevos)
y_pred_corners = random_search_corner.predict(datos_nuevos)
y_pred_tarjetas = random_search_tarjetas.predict(datos_nuevos)

In [27]:
y_pred_goles = random_search.predict(datos_nuevos)
print(f'Se marcarán {y_pred_goles} goles')

Se marcarán [2.5342925] goles


In [63]:
print(f'Se marcarán {y_pred_goles} goles')
print(f'Habrá {y_pred_corners} corners')
print(f'Se mostrarán {y_pred_tarjetas} tarjetas')

Se marcarán [6.0187254] goles
Habrá [5.3380566] corners
Se mostrarán [5.3380566] tarjetas


In [29]:
random_search.best_params_

{'xgbr__subsample': 1.0,
 'xgbr__reg_lambda': 5,
 'xgbr__reg_alpha': 2,
 'xgbr__n_estimators': 500,
 'xgbr__min_child_weight': 1,
 'xgbr__max_depth': 3,
 'xgbr__learning_rate': 0.1,
 'xgbr__gamma': 0.3,
 'xgbr__colsample_bytree': 0.7}

In [64]:
random_search.best_params_

{'xgbr__subsample': 0.5,
 'xgbr__reg_lambda': 1,
 'xgbr__reg_alpha': 0.5,
 'xgbr__n_estimators': 600,
 'xgbr__min_child_weight': 1,
 'xgbr__max_depth': 8,
 'xgbr__learning_rate': 0.01,
 'xgbr__gamma': 0.5,
 'xgbr__colsample_bytree': 0.5}

In [65]:
random_search_corner.best_params_

{'xgbr__subsample': 0.7,
 'xgbr__reg_lambda': 5,
 'xgbr__reg_alpha': 0.1,
 'xgbr__n_estimators': 400,
 'xgbr__min_child_weight': 1,
 'xgbr__max_depth': 4,
 'xgbr__learning_rate': 0.01,
 'xgbr__gamma': 0.4,
 'xgbr__colsample_bytree': 0.7}

In [66]:
random_search_tarjetas.best_params_

{'xgbr__subsample': 0.7,
 'xgbr__reg_lambda': 5,
 'xgbr__reg_alpha': 0.1,
 'xgbr__n_estimators': 400,
 'xgbr__min_child_weight': 1,
 'xgbr__max_depth': 4,
 'xgbr__learning_rate': 0.01,
 'xgbr__gamma': 0.4,
 'xgbr__colsample_bytree': 0.7}