In [2]:
import numpy as np
import pandas as pd
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler

# Load the dataset from the Excel workbook.
pitch = pd.read_excel("reduced_pitch.xlsx")

# Disabled experiment: square-root transform of selected columns before scaling.
# sqrt_vars = ['Interceptions', 'Yellow_cards', 'Transfer_value', 'Exp_contr', 'Minutes_pl_BC', 'NP_goals', 'Pen_goals', 'Pen_goals_BC', 'Assists', 'GCA_BC', 'Red_cards']
# for var in sqrt_vars: 
#     pitch[var] = pitch[var].apply(np.sqrt)

# Columns to standardize. The list includes 'Transfer_value', which a later
# cell uses as the regression target, so the target is standardized as well.
numeric_vars = [
    'Transfer_value', 'Exp_contr', 'Age',
    'Minutes_pl', 'Minutes_pl_BC',
    'NP_goals', 'Pen_goals', 'Pen_goals_BC',
    'Assists', 'Interceptions', 'GCA_BC',
    'Yellow_cards', 'Red_cards',
]

# NOTE(review): the scaler is fit on the whole table at once, i.e. before any
# cross-validation split is made downstream.
scaler = StandardScaler()
pitch[numeric_vars] = scaler.fit_transform(pitch[numeric_vars])

# X and Y are defined in a later cell.


In [3]:
# Predictor columns, in the order they enter the design matrix: the twelve
# unprefixed columns first, then the `Reduced_team_from_*`,
# `Reduced_country_from_*` and `Position_*` columns.
# 'Transfer_value' is deliberately absent: it is the regression target.
ind_vars = [
    'Exp_contr', 'Age', 'Minutes_pl', 'Minutes_pl_BC', 'NP_goals', 'Pen_goals',
    'Pen_goals_BC', 'Assists', 'Interceptions', 'GCA_BC', 'Yellow_cards',
    'Red_cards',
    *(f'Reduced_team_from_{club}' for club in (
        'Inter', 'AC Milan', 'Juventus', 'Napoli', 'Paris SG', 'Real Madrid',
        'Barcelona', 'Atlético Madrid', 'Bayern Munich', 'Bor. Dortmund',
        'Chelsea', 'Arsenal', 'Man City', 'Man Utd', 'Liverpool',
    )),
    *(f'Reduced_country_from_{country}' for country in (
        'Inglaterra', 'Francia', 'España', 'Italia', 'Alemania', 'Portugal',
        'Países Bajos', 'Argentina', 'Brasil',
    )),
    *(f'Position_{role}' for role in (
        'Centre-Forward', 'Left-Back', 'Central Midfield', 'Centre-Back',
        'Right Winger', 'Attacking Midfield', 'Defensive Midfield',
        'Left Winger', 'Second Striker', 'Left Midfield', 'Right Midfield',
    )),
]

In [4]:
# Design matrix and target, both taken from the `pitch` frame.
X, Y = pitch[ind_vars], pitch["Transfer_value"]

# Base estimator; the solver is allowed up to 10000 iterations.
elastic = ElasticNet(max_iter=10000)

# Hyperparameter grid: 10 log-spaced alphas in [1e-4, 1e1] (regularization
# strength) crossed with 9 evenly spaced l1_ratio values in [0.1, 0.9]
# (L1/L2 mix).
# NOTE(review): the recorded run selected l1_ratio=0.1, the lowest value in
# this grid, so the optimum may lie outside it; consider extending the range.
param_grid = dict(
    alpha=np.logspace(-4, 1, 10),
    l1_ratio=np.linspace(0.1, 0.9, 9),
)

# Exhaustive search over the grid, scored by R^2 with 5-fold cross-validation
# and using all available cores.
grid_search = GridSearchCV(elastic, param_grid, scoring='r2', cv=5, n_jobs=-1)
grid_search.fit(X, Y)

# Winning configuration and its refit estimator.
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# In-sample metrics: the model is evaluated on the same rows it was fit on.
y_pred = best_model.predict(X)
mse = mean_squared_error(Y, y_pred)
r2 = best_model.score(X, Y)

print("Mejor alpha:", best_params['alpha'])
print("Mejor l1_ratio:", best_params['l1_ratio'])
print(f"R^2 en los datos de entrenamiento: {r2}")
print(f"MSE en los datos de entrenamiento: {mse}")
print("Coeficientes:", best_model.coef_)

Mejor alpha: 0.004641588833612782
Mejor l1_ratio: 0.1
R^2 en los datos de entrenamiento: 0.4930121059036896
MSE en los datos de entrenamiento: 0.5069878940963105
Coeficientes: [ 0.23767955 -0.32819091  0.00253545  0.18999379  0.20090948  0.01645333
 -0.00716147  0.08512242  0.1038901   0.26847681  0.02330764 -0.01946905
  0.13023642  0.          0.26607544  0.23301445  0.38743959  0.24313504
  0.03834466  0.37350318  0.28091041  0.2561352   0.21586318 -0.00868792
  0.07068971  0.00601203  0.          0.36051369 -0.17069021 -0.03178005
 -0.05919203 -0.12260525  0.34434332  0.21303507  0.05254691  0.15907358
 -0.01948434 -0.02416084  0.00640742  0.26343401 -0.0606451   0.00643253
  0.21526658 -0.04402585  0.08481516 -0.09053987  0.        ]


In [5]:
# Pair every predictor name with its fitted coefficient.
columnas = X.columns
coeficientes = best_model.coef_
coef_df = pd.DataFrame({'Variable': columnas, 'Coeficiente': coeficientes})

# Keep only the predictors whose coefficient is not exactly zero.
coef_df = coef_df.loc[coef_df['Coeficiente'].ne(0)]

# Rank the remaining rows by absolute coefficient, largest first.
orden_por_magnitud = coef_df['Coeficiente'].abs().sort_values(ascending=False).index
coef_df = coef_df.loc[orden_por_magnitud]

# Show the ranked table.
print("\nVariables con coeficiente distinto de cero:")
print(coef_df)


Variables con coeficiente distinto de cero:
                             Variable  Coeficiente
16         Reduced_team_from_Paris SG     0.387440
19  Reduced_team_from_Atlético Madrid     0.373503
27    Reduced_country_from_Inglaterra     0.360514
32      Reduced_country_from_Portugal     0.344343
1                                 Age    -0.328191
20    Reduced_team_from_Bayern Munich     0.280910
9                              GCA_BC     0.268477
14         Reduced_team_from_Juventus     0.266075
39               Position_Centre-Back     0.263434
21    Reduced_team_from_Bor. Dortmund     0.256135
17      Reduced_team_from_Real Madrid     0.243135
0                           Exp_contr     0.237680
15           Reduced_team_from_Napoli     0.233014
22          Reduced_team_from_Chelsea     0.215863
42        Position_Defensive Midfield     0.215267
33  Reduced_country_from_Países Bajos     0.213035
4                            NP_goals     0.200909
3                       Minutes_pl_BC

In [6]:
# Map each predictor name to its fitted coefficient.
coef_dict = {nombre: valor for nombre, valor in zip(X.columns, best_model.coef_)}

# (Optional) keep only the entries whose coefficient is non-zero.
coef_dict_no_cero = {
    nombre: valor
    for nombre, valor in coef_dict.items()
    if valor != 0
}

# Order by signed coefficient value, from most positive to most negative.
claves_ordenadas = sorted(coef_dict_no_cero, key=coef_dict_no_cero.get, reverse=True)
coef_ordenados = {nombre: coef_dict_no_cero[nombre] for nombre in claves_ordenadas}

# Print one "name: coefficient" line per remaining predictor.
for variable, coef in coef_ordenados.items():
    print(f"{variable}: {coef}")

Reduced_team_from_Paris SG: 0.38743959252791765
Reduced_team_from_Atlético Madrid: 0.3735031765957066
Reduced_country_from_Inglaterra: 0.36051368626614677
Reduced_country_from_Portugal: 0.3443433203884835
Reduced_team_from_Bayern Munich: 0.28091041122066135
GCA_BC: 0.2684768138221964
Reduced_team_from_Juventus: 0.2660754428863584
Position_Centre-Back: 0.26343401209763045
Reduced_team_from_Bor. Dortmund: 0.2561352023057055
Reduced_team_from_Real Madrid: 0.24313503706313497
Exp_contr: 0.23767954790993062
Reduced_team_from_Napoli: 0.23301445271920726
Reduced_team_from_Chelsea: 0.21586317674995548
Position_Defensive Midfield: 0.21526658341649524
Reduced_country_from_Países Bajos: 0.2130350726940557
NP_goals: 0.20090947898716632
Minutes_pl_BC: 0.18999379471439473
Reduced_country_from_Brasil: 0.1590735793889858
Reduced_team_from_Inter: 0.1302364203472944
Interceptions: 0.10389009864926961
Assists: 0.08512242145533629
Position_Second Striker: 0.08481515514444285
Reduced_team_from_Man City: 0.

In [7]:
# Re-score the tuned model with 5-fold cross-validation, using R^2 as the metric.
# `cross_val_score` is imported with the other scikit-learn names in the
# notebook's top import cell, so all dependencies are declared in one place.
# NOTE(review): `best_model`'s hyperparameters were chosen by the grid search
# on these same X and Y, so these scores are an optimistic estimate of
# out-of-sample performance; a nested cross-validation would be needed for an
# unbiased figure.
cv_scores = cross_val_score(best_model, X, Y, cv=5, scoring='r2')

# Report the per-fold scores together with their mean and spread.
print("R^2 por fold:", cv_scores)
print(f"R^2 promedio (validación cruzada): {cv_scores.mean():.4f}")
print(f"Desviación estándar: {cv_scores.std():.4f}")

R^2 por fold: [0.49073542 0.47137311 0.38717695 0.51223389 0.46465203]
R^2 promedio (validación cruzada): 0.4652
Desviación estándar: 0.0424
