In [4]:
from time import time

import numpy as np
import pandas as pd
from interpret.glassbox import ExplainableBoostingRegressor
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load the working dataset from the Excel workbook.
pitch = pd.read_excel("reduced_pitch.xlsx")

# Columns to standardize. 'Transfer_value' (used as the regression target in
# later cells) is standardized together with the numeric predictors.
numeric_vars = [
    'Transfer_value',
    'Exp_contr', 'Age',
    'Minutes_pl', 'Minutes_pl_BC',
    'NP_goals', 'Pen_goals', 'Pen_goals_BC',
    'Assists', 'Interceptions', 'GCA_BC',
    'Yellow_cards', 'Red_cards',
]

# NOTE(review): the scaler is fitted on every row before the train/test split
# done in later cells, so held-out rows contribute to the mean/std used here.
# Confirm this is acceptable for the reported test metrics.
scaled_values = StandardScaler().fit_transform(pitch[numeric_vars])
pitch[numeric_vars] = scaled_values

In [5]:
# Predictor columns, in the exact order they are passed to the model. The
# positional index of each entry is what shows up as the row label in the
# importance tables printed further down, so the order must not change.
ind_vars = [
    # Numeric predictors (standardized in the loading cell).
    'Exp_contr', 'Age', 'Minutes_pl', 'Minutes_pl_BC',
    'NP_goals', 'Pen_goals', 'Pen_goals_BC', 'Assists', 'Interceptions',
    'GCA_BC', 'Yellow_cards', 'Red_cards',
    # Columns prefixed 'Reduced_team_from_'.
    *(f'Reduced_team_from_{club}' for club in (
        'Inter', 'AC Milan', 'Juventus', 'Napoli', 'Paris SG',
        'Real Madrid', 'Barcelona', 'Atlético Madrid', 'Bayern Munich',
        'Bor. Dortmund', 'Chelsea', 'Arsenal', 'Man City', 'Man Utd',
        'Liverpool',
    )),
    # Columns prefixed 'Reduced_country_from_'.
    *(f'Reduced_country_from_{country}' for country in (
        'Inglaterra', 'Francia', 'España', 'Italia', 'Alemania',
        'Portugal', 'Países Bajos', 'Argentina', 'Brasil',
    )),
    # Columns prefixed 'Position_'.
    *(f'Position_{role}' for role in (
        'Centre-Forward', 'Left-Back', 'Central Midfield', 'Centre-Back',
        'Right Winger', 'Attacking Midfield', 'Defensive Midfield',
        'Left Winger', 'Second Striker', 'Left Midfield', 'Right Midfield',
    )),
]

In [6]:
# Start the wall-clock timer; the elapsed time printed at the end covers the
# split, the fit, the prediction and the scoring.
time0 = time()

# Predictor matrix (make sure ind_vars holds the significant variables) and
# the dependent variable.
X, y = pitch[ind_vars], pitch['Transfer_value']

# Hold-out split with a fixed seed; no size arguments are given, so
# scikit-learn's default proportions apply.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Baseline EBM: only the seed is set, every other hyperparameter is left at
# the library default.
ebm = ExplainableBoostingRegressor(random_state=0)
ebm.fit(X_train, y_train)

# Evaluate on the held-out rows. 'Transfer_value' was standardized in the
# loading cell, so the MSE is expressed in standardized units.
y_pred = ebm.predict(X_test)
r2, mse = r2_score(y_test, y_pred), mean_squared_error(y_test, y_pred)

print(f"R² del modelo: {r2:.4f}")
print(f"MSE del modelo: {mse:.4f}")

# Elapsed seconds since time0.
print(time() - time0)

R² del modelo: 0.4341
MSE del modelo: 0.5301
53.395737409591675


In [8]:
# Build the global explanation object for the fitted baseline EBM. Its
# .data() dict exposes parallel 'names' and 'scores' entries, which the
# following cell turns into a ranked table.
global_explanation = ebm.explain_global()

# # Show the importance of each variable (superseded by the DataFrame built
# # in the next cell; kept only as a quick plain-text alternative)
# for feature, score in zip(global_explanation.data()['names'], global_explanation.data()['scores']):
#     print(f"{feature}: {score:.4f}")

In [9]:
# Tabulate each term's name and importance score from the global explanation,
# ordered from most to least important. The explanation payload is fetched
# once and reused for both columns.
explanation_data = global_explanation.data()
importance_df = (
    pd.DataFrame(
        {
            'Variable': explanation_data['names'],
            'Importancia': explanation_data['scores'],
        }
    )
    .sort_values(by='Importancia', ascending=False)
)

# Top 20 terms; the row labels are the terms' original positions.
print(importance_df.head(20))

                                           Variable  Importancia
1                                               Age     0.218964
3                                     Minutes_pl_BC     0.189599
0                                         Exp_contr     0.184833
9                                            GCA_BC     0.137094
4                                          NP_goals     0.107687
27                  Reduced_country_from_Inglaterra     0.090287
39                             Position_Centre-Back     0.075013
8                                     Interceptions     0.053434
2                                        Minutes_pl     0.048316
7                                           Assists     0.045680
28                     Reduced_country_from_Francia     0.038727
76  Minutes_pl_BC & Reduced_country_from_Inglaterra     0.031586
31                    Reduced_country_from_Alemania     0.030115
30                      Reduced_country_from_Italia     0.029699
49                       

In [10]:
# Total of all term importance scores. Uses pandas' vectorized reduction
# rather than the Python builtin sum(), which iterates the Series one element
# at a time.
importance_df['Importancia'].sum()

2.0390276724063705

In [29]:
# Rebuild predictors, target and the hold-out split so this cell can run
# without relying on variables left over from the baseline cell. The same
# seed is used, so the split matches the baseline's.
X = pitch[ind_vars]
y = pitch['Transfer_value']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Hyperparameter search space (3 * 3 * 3 * 3 * 2 = 162 candidates).
param_grid = {
    'max_rounds': [50, 100, 200],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_leaves': [3, 5, 7],
    'min_samples_leaf': [1, 5, 10],
    # 0 disables interaction terms (main effects only); 2 lets the EBM pick
    # two pairwise interaction terms. 0 is used rather than None because an
    # integer count is the documented way to switch interactions off.
    'interactions': [0, 2]
}

# Exhaustive grid search scored by R² with 3-fold cross-validation on the
# training split only; the test split stays untouched until the end.
# GridSearchCV is imported in the notebook's import cell.
grid_search = GridSearchCV(estimator=ExplainableBoostingRegressor(random_state=0),
                           param_grid=param_grid,
                           scoring='r2',
                           cv=3,
                           n_jobs=-1,
                           verbose=2)

time0 = time()
grid_search.fit(X_train, y_train)
print(f"Tiempo de GridSearchCV: {time() - time0:.2f} segundos")

# Best hyperparameters found
print(f"Mejores hiperparámetros: {grid_search.best_params_}")

# Best mean cross-validated R²
print(f"Mejor R² en validación cruzada: {grid_search.best_score_:.4f}")

# Evaluate the refitted best estimator on the held-out test set.
# NOTE(review): in the recorded run this model scored a lower test R² (0.3888)
# than the default-parameter baseline (0.4341), so the grid may not cover the
# best region (e.g. the max_rounds values) - TODO revisit the search space.
best_ebm = grid_search.best_estimator_
y_pred_best = best_ebm.predict(X_test)
r2_best = r2_score(y_test, y_pred_best)
mse_best = mean_squared_error(y_test, y_pred_best)

print(f"R² del mejor modelo en el conjunto de prueba: {r2_best:.4f}")
print(f"MSE del mejor modelo en el conjunto de prueba: {mse_best:.4f}")

Fitting 3 folds for each of 162 candidates, totalling 486 fits
Tiempo de GridSearchCV: 1219.17 segundos
Mejores hiperparámetros: {'interactions': 2, 'learning_rate': 0.05, 'max_leaves': 3, 'max_rounds': 100, 'min_samples_leaf': 5}
Mejor R² en validación cruzada: 0.5097
R² del mejor modelo en el conjunto de prueba: 0.3888
MSE del mejor modelo en el conjunto de prueba: 0.5726


In [30]:
# Global explanation of the estimator selected by the grid search.
global_explanation = best_ebm.explain_global()

# Rank its terms by importance score, highest first. The explanation payload
# is read once and reused for both columns.
explanation_data = global_explanation.data()
term_names = explanation_data['names']
term_scores = explanation_data['scores']
importance_df = pd.DataFrame(
    {'Variable': term_names, 'Importancia': term_scores}
).sort_values(by='Importancia', ascending=False)

# Top 20 terms; the row labels are the terms' original positions.
print(importance_df.head(20))

                             Variable  Importancia
1                                 Age     0.191886
0                           Exp_contr     0.190268
3                       Minutes_pl_BC     0.177568
9                              GCA_BC     0.122098
27    Reduced_country_from_Inglaterra     0.091934
4                            NP_goals     0.087603
39               Position_Centre-Back     0.057713
2                          Minutes_pl     0.055091
7                             Assists     0.047709
47          Exp_contr & Minutes_pl_BC     0.045384
8                       Interceptions     0.043579
6                        Pen_goals_BC     0.030264
28       Reduced_country_from_Francia     0.027814
30        Reduced_country_from_Italia     0.024905
32      Reduced_country_from_Portugal     0.024101
31      Reduced_country_from_Alemania     0.022199
11                          Red_cards     0.021498
33  Reduced_country_from_Países Bajos     0.021371
48          Exp_contr & Interce