In [3]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Load the dataset from the Excel workbook (path is relative to the working directory).
pitch = pd.read_excel("reduced_defensas.xlsx")

# Disabled experiment, kept for reference: square-root transform of selected columns.
# sqrt_vars = ['Interceptions', 'Yellow_cards', 'Transfer_value', 'Exp_contr', 'Minutes_pl_BC', 'NP_goals', 'Pen_goals', 'Pen_goals_BC', 'Assists', 'GCA_BC', 'Red_cards']
# for var in sqrt_vars: 
#     pitch[var] = pitch[var].apply(np.sqrt)


# Columns that get standardized. Note that 'Transfer_value' is in this list and
# is also the column the modelling cell uses as the target.
numeric_vars = [
    'Transfer_value',
    'Exp_contr',
    'Age',
    'Minutes_pl',
    'Minutes_pl_BC',
    'NP_goals',
    'Pen_goals',
    'Pen_goals_BC',
    'Assists',
    'Interceptions',
    'GCA_BC',
    'Yellow_cards',
    'Red_cards',
]

# Standardize the listed columns and write the result back into `pitch`.
# NOTE(review): the scaler statistics are computed from every row of `pitch`,
# i.e. before the train/test split done in the modelling cell -- confirm that
# using test rows for the scaling statistics is intended.
scaler = StandardScaler()
scaler.fit(pitch[numeric_vars])
pitch[numeric_vars] = scaler.transform(pitch[numeric_vars])

In [4]:
# Separate the target column from the predictors.
Y = pitch["Transfer_value"]
X = pitch.drop(columns="Transfer_value")

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

# Base decision-tree regressor whose hyperparameters are tuned below.
dt_model = DecisionTreeRegressor(random_state=42)

# Hyperparameter values explored by the grid search.
param_grid = {
    'max_depth': [3, 5, 10],
    'min_samples_split': [2, 5, 10, 15, 20],
    'min_samples_leaf': [1, 2, 5, 10, 15, 20],
    'criterion': ['squared_error', 'friedman_mse', 'absolute_error'],
}

# Exhaustive search with 5-fold cross-validation, fitted on the training split only.
# Candidates are ranked by negated MSE (higher is better).
grid_search = GridSearchCV(
    estimator=dt_model,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, Y_train)

# Estimator with the best cross-validated score found by the search.
best_model = grid_search.best_estimator_

# Training split: predictions, MSE, and R^2 (the regressor's `score`).
y_train_pred = best_model.predict(X_train)
mse_train = mean_squared_error(Y_train, y_train_pred)
r2_train = best_model.score(X_train, Y_train)

# Test split: the same metrics on the held-out rows.
y_test_pred = best_model.predict(X_test)
mse_test = mean_squared_error(Y_test, y_test_pred)
r2_test = best_model.score(X_test, Y_test)

# Report the selected hyperparameters followed by the metrics, one per line.
print("Mejores hiperparámetros:", grid_search.best_params_)
print(
    f"R^2 en entrenamiento: {r2_train}",
    f"MSE en entrenamiento: {mse_train}",
    f"R^2 en prueba: {r2_test}",
    f"MSE en prueba: {mse_test}",
    sep="\n",
)

Mejores hiperparámetros: {'criterion': 'squared_error', 'max_depth': 10, 'min_samples_leaf': 15, 'min_samples_split': 2}
R^2 en entrenamiento: 0.5081981463398161
MSE en entrenamiento: 0.45416962636602
R^2 en prueba: 0.45802959586798175
MSE en prueba: 0.6932410637015808


In [9]:
# Pair each importance value of the tuned tree with its predictor column name,
# then order the features from most to least important.
importances = best_model.feature_importances_
feature_importance = pd.Series(data=importances, index=X.columns)
feature_importance = feature_importance.sort_values(ascending=False)

# Show the 11 highest-ranked features.
print(feature_importance.iloc[:11])

Exp_contr                            0.445336
Interceptions                        0.175004
GCA_BC                               0.134111
NP_goals                             0.091126
Age                                  0.081297
Minutes_pl_BC                        0.031404
Reduced_country_from_Países Bajos    0.020862
Reduced_country_from_Inglaterra      0.012001
Minutes_pl                           0.004367
Assists                              0.002623
Yellow_cards                         0.001869
dtype: float64
