In [154]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

# Source workbooks, resolved relative to the notebook's working directory.
# NOTE(review): presumably `pitch` holds outfield players and `gk` goalkeepers
# (the gk frame later receives save-percentage columns) — confirm against the data.
pitch_workbook = "corrected_pitch.xlsx"
gk_workbook = "corrected_gk.xlsx"

pitch = pd.read_excel(pitch_workbook)
gk = pd.read_excel(gk_workbook)

In [155]:
def list_reduced(lista, mains, other="Otros"):
    """Collapse every value that is not in ``mains`` into one fallback label.

    Parameters
    ----------
    lista : iterable
        Values to reduce (for example a pandas Series of country or team names).
    mains : container
        Values that are kept unchanged; membership is tested with ``in``.
    other : str, optional
        Label substituted for every value not found in ``mains``. Defaults to
        ``"Otros"``, the label that was previously hard-coded.

    Returns
    -------
    list
        A new list with the same length and order as ``lista``.
    """
    return [value if value in mains else other for value in lista]

# Countries and clubs that keep their own category; list_reduced() maps every
# other value to the fallback label.
mains = [
    "Inglaterra", "Francia", "Espa√±a", "Italia", "Alemania",
    "Portugal", "Pa√≠ses Bajos", "Argentina", "Brasil",
]

mains_team = [
    "Inter", "AC Milan", "Juventus", "Napoli", "Paris SG",
    "Real Madrid", "Barcelona", "Atl√©tico Madrid",
    "Bayern Munich", "Bor. Dortmund",
    "Chelsea", "Arsenal", "Man City", "Man Utd", "Liverpool",
]

# --- pitch: reduce origin/destination countries and teams ---------------
c_pitch_from_reduced = list_reduced(pitch["Country_from"], mains)
c_pitch_to_reduced = list_reduced(pitch["Country_to"], mains)
t_pitch_from_reduced = list_reduced(pitch["Team_from"], mains_team)
t_pitch_to_reduced = list_reduced(pitch["Team_to"], mains_team)

# Columns are added countries first, then teams.
pitch["Reduced_country_from"] = c_pitch_from_reduced
pitch["Reduced_country_to"] = c_pitch_to_reduced
pitch["Reduced_team_from"] = t_pitch_from_reduced
pitch["Reduced_team_to"] = t_pitch_to_reduced

# --- gk: same reduction on the second frame -----------------------------
c_gk_from_reduced = list_reduced(gk["Country_from"], mains)
c_gk_to_reduced = list_reduced(gk["Country_to"], mains)
t_gk_from_reduced = list_reduced(gk["Team_from"], mains_team)
t_gk_to_reduced = list_reduced(gk["Team_to"], mains_team)

gk["Reduced_country_from"] = c_gk_from_reduced
gk["Reduced_country_to"] = c_gk_to_reduced
gk["Reduced_team_from"] = t_gk_from_reduced
gk["Reduced_team_to"] = t_gk_to_reduced


def _save_ratio(row, shots_col, goals_col):
    """Return (shots - goals) / shots rounded to 2 decimals, or 0 when shots is not > 0."""
    shots = row[shots_col]
    if shots > 0:
        return round((shots - row[goals_col]) / shots, 2)
    return 0


# One derived column per (shots, goals) column pair found in the gk frame.
for target, shots_col, goals_col in (
    ("Save_perc", "SoTA", "GA"),
    ("Save_perc_BC", "SoTA_BC", "GA_BC"),
    ("Save_perc_LS", "SoTA_LS", "GA_LS"),
):
    gk[target] = gk.apply(_save_ratio, axis=1, args=(shots_col, goals_col))

In [158]:
# Categorical columns of the pitch frame that are later one-hot encoded.
categoric_pitch = [
    "Reduced_team_from",
    "Reduced_team_to",
    "Reduced_country_from",
    "Reduced_country_to",
    "Position",
]

# Numeric columns of the pitch frame; "Transfer_value" is the regression target.
pitch_vars = [
    "Transfer_value",
    "Exp_contr",
    "Year",
    "Age",
    "Minutes_pl",
    "Minutes_pl_BC",
    "NP_goals",
    "Pen_goals",
    "Pen_goals_BC",
    "Assists",
    "Interceptions",
    "GCA_BC",
    "Yellow_cards",
    "Red_cards",
]

# Categorical columns for the gk frame.
# NOTE(review): these are the raw (non-"Reduced_") columns, unlike categoric_pitch — confirm intended.
categoric_gk = [
    "Team_from",
    "Team_to",
    "Country_from",
    "Country_to",
]

# Numeric columns for the gk frame, including the derived Save_perc* columns.
gk_vars = [
    "Transfer_value",
    "Exp_contr",
    "Year",
    "Age",
    "Minutes_pl",
    "Minutes_pl_LS",
    "Minutes_pl_BC",
    "Save_perc",
    "Save_perc_BC",
    "Save_perc_LS",
    "PKA",
    "PKSv",
    "Assists",
    "Yellow_cards",
    "Red_cards",
]

In [159]:
# Category orders for the one-hot encoding. The first entry of each list is the
# level removed by get_dummies(drop_first=True), i.e. the reference level:
# "Otros" for teams and countries, 'Right-Back' for positions.
mains = ["Otros", "Inglaterra", "Francia", "Espa√±a", "Italia", "Alemania", "Portugal", "Pa√≠ses Bajos", "Argentina", "Brasil"]

mains_team = ["Otros", "Inter", "AC Milan", "Juventus", "Napoli", "Paris SG", "Real Madrid", "Barcelona", "Atl√©tico Madrid",
             "Bayern Munich", "Bor. Dortmund", "Chelsea", "Arsenal", "Man City", "Man Utd", "Liverpool"]


pos = ['Right-Back', 'Centre-Forward', 'Left-Back', 'Central Midfield',
       'Centre-Back', 'Right Winger', 'Attacking Midfield',
       'Defensive Midfield', 'Left Winger', 'Second Striker',
       'Right Midfield', 'Left Midfield']

# Column -> ordered categories. Insertion order is the order in which the dummy
# blocks are appended to df_dummies.
category_orders = {
    "Reduced_team_from": mains_team,
    "Reduced_team_to": mains_team,
    "Reduced_country_from": mains,
    "Reduced_country_to": mains,
    "Position": pos,
}

# Convert EVERY column before df_dummies is built. Previously df_dummies was
# sliced out of `pitch` after only the first conversion, so on a fresh kernel the
# other four columns were still plain strings and drop_first removed the
# alphabetically first level instead of the intended reference level.
# Values absent from the category list become NaN and get all-zero dummies.
for column, categories in category_orders.items():
    pitch[column] = pd.Categorical(pitch[column], categories=categories)

# Single encoding pass. dtype=float keeps the design matrix purely numeric
# (pandas >= 2.0 would otherwise emit bool dummy columns).
df_dummies = pd.get_dummies(
    pitch[pitch_vars + categoric_pitch],
    columns=list(category_orders),
    drop_first=True,
    dtype=float,
)

In [160]:
# Full model: every encoded column except the target is a predictor.
# Requires `import statsmodels.api as sm` in the imports cell; without it this
# cell raises NameError on a fresh kernel.
X = df_dummies.drop(columns="Transfer_value")
y = df_dummies["Transfer_value"]

# Fixed seed so the split is reproducible. Only the training part is used below.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Add a constant column to the training set so the fit includes an intercept term.
X_train_sm = sm.add_constant(X_train)

# Fit statsmodels' Ordinary Least Squares (OLS) model.
model_sm = sm.OLS(y_train, X_train_sm).fit()

# p-values of the fitted coefficients (indexed by variable name, including "const").
p_values = model_sm.pvalues

# One row per fitted term: name, coefficient and p-value.
coefficients_df = pd.DataFrame({
    "Variable": model_sm.params.index,
    "Coeficiente": model_sm.params,
    "P-value": p_values,
})

# Largest absolute coefficient first.
print(coefficients_df.sort_values(by="Coeficiente", key=abs, ascending=False))

# Full statsmodels summary of the fit.
print(model_sm.summary())

                                                Variable   Coeficiente  \
const                                              const -3.023109e+08   
Reduced_team_to_Barcelona      Reduced_team_to_Barcelona  3.530942e+07   
Reduced_team_to_Man Utd          Reduced_team_to_Man Utd  3.271107e+07   
Reduced_team_to_Real Madrid  Reduced_team_to_Real Madrid  3.136076e+07   
Reduced_team_to_Man City        Reduced_team_to_Man City  2.526581e+07   
...                                                  ...           ...   
Yellow_cards                                Yellow_cards  6.059324e+04   
Interceptions                              Interceptions  2.607730e+04   
Reduced_country_to_Italia      Reduced_country_to_Italia -1.644432e+04   
Minutes_pl_BC                              Minutes_pl_BC  7.388606e+02   
Minutes_pl                                    Minutes_pl -2.454812e+02   

                                  P-value  
const                        1.127862e-01  
Reduced_team_to_Barcelo

In [161]:
# Significance threshold applied to the coefficient p-values.
alpha = 0.05

# Flat two-column view of the p-values Series (name, value).
p_values_df = pd.DataFrame({"Variable": p_values.index, "P-value": p_values.values})

# Keep predictors at or below alpha; the intercept row ("const") is never selected.
below_alpha = p_values_df["P-value"].le(alpha)
is_predictor = p_values_df["Variable"].ne("const")
significant_variables = p_values_df.loc[below_alpha & is_predictor, "Variable"].tolist()

print("Variables significativas:", significant_variables, sep="\n")

Variables significativas:
['Exp_contr', 'Age', 'Minutes_pl_BC', 'NP_goals', 'Pen_goals', 'Assists', 'Interceptions', 'GCA_BC', 'Reduced_team_from_Inter', 'Reduced_team_from_Juventus', 'Reduced_team_from_Napoli', 'Reduced_team_from_Paris SG', 'Reduced_team_from_Atl√©tico Madrid', 'Reduced_team_from_Chelsea', 'Reduced_team_to_Inter', 'Reduced_team_to_AC Milan', 'Reduced_team_to_Juventus', 'Reduced_team_to_Napoli', 'Reduced_team_to_Paris SG', 'Reduced_team_to_Real Madrid', 'Reduced_team_to_Barcelona', 'Reduced_team_to_Atl√©tico Madrid', 'Reduced_team_to_Bayern Munich', 'Reduced_team_to_Bor. Dortmund', 'Reduced_team_to_Chelsea', 'Reduced_team_to_Arsenal', 'Reduced_team_to_Man City', 'Reduced_team_to_Man Utd', 'Reduced_team_to_Liverpool', 'Reduced_country_from_Inglaterra', 'Reduced_country_from_Francia', 'Reduced_country_from_Espa√±a', 'Reduced_country_from_Portugal', 'Reduced_country_from_Pa√≠ses Bajos', 'Reduced_country_to_Inglaterra', 'Position_Centre-Back', 'Position_Defensive Midfield'

In [162]:
# Predictors whose p-value is strictly above alpha (intercept excluded).
# A NaN p-value compares False against alpha in both directions, so such a
# variable is not listed here.
no_significant_variables = p_values_df[(p_values_df["P-value"] > alpha) & (p_values_df["Variable"] != "const")]["Variable"].tolist()

# The heading previously read "Variables significativas:", mislabelling this list.
print("Variables no significativas:")
print(no_significant_variables)

Variables significativas:
['Year', 'Minutes_pl', 'Pen_goals_BC', 'Yellow_cards', 'Red_cards', 'Reduced_team_from_AC Milan', 'Reduced_team_from_Real Madrid', 'Reduced_team_from_Barcelona', 'Reduced_team_from_Bayern Munich', 'Reduced_team_from_Bor. Dortmund', 'Reduced_team_from_Arsenal', 'Reduced_team_from_Man City', 'Reduced_team_from_Man Utd', 'Reduced_team_from_Liverpool', 'Reduced_country_from_Italia', 'Reduced_country_from_Alemania', 'Reduced_country_from_Argentina', 'Reduced_country_from_Brasil', 'Reduced_country_to_Francia', 'Reduced_country_to_Espa√±a', 'Reduced_country_to_Italia', 'Reduced_country_to_Alemania', 'Reduced_country_to_Portugal', 'Reduced_country_to_Pa√≠ses Bajos', 'Reduced_country_to_Argentina', 'Reduced_country_to_Brasil', 'Position_Centre-Forward', 'Position_Left-Back', 'Position_Central Midfield', 'Position_Right Winger', 'Position_Attacking Midfield', 'Position_Left Winger', 'Position_Second Striker', 'Position_Right Midfield', 'Position_Left Midfield']


In [163]:
# Reduced model: refit using only the predictors listed in significant_variables.
# Requires `import statsmodels.api as sm` in the imports cell; without it this
# cell raises NameError on a fresh kernel.
X = df_dummies[significant_variables]
y = df_dummies["Transfer_value"]

# Same test_size and random_state as the full-model split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Add a constant column to the training set so the fit includes an intercept term.
X_train_sm = sm.add_constant(X_train)

# Fit statsmodels' Ordinary Least Squares (OLS) model.
# NOTE: model_sm, p_values and coefficients_df are rebound here, replacing the
# full-model results.
model_sm = sm.OLS(y_train, X_train_sm).fit()

# p-values of the refitted coefficients.
p_values = model_sm.pvalues

# One row per fitted term: name, coefficient and p-value.
coefficients_df = pd.DataFrame({
    "Variable": model_sm.params.index,
    "Coeficiente": model_sm.params,
    "P-value": p_values,
})

# Largest absolute coefficient first.
print(coefficients_df.sort_values(by="Coeficiente", key=abs, ascending=False))

# Full statsmodels summary of the fit.
print(model_sm.summary())

                                                            Variable  \
Reduced_team_to_Barcelona                  Reduced_team_to_Barcelona   
Reduced_team_to_Man Utd                      Reduced_team_to_Man Utd   
Reduced_team_to_Real Madrid              Reduced_team_to_Real Madrid   
Reduced_team_to_Man City                    Reduced_team_to_Man City   
Reduced_team_to_Chelsea                      Reduced_team_to_Chelsea   
Reduced_team_to_Paris SG                    Reduced_team_to_Paris SG   
Reduced_team_to_Liverpool                  Reduced_team_to_Liverpool   
Reduced_team_to_Arsenal                      Reduced_team_to_Arsenal   
Reduced_team_to_Bayern Munich          Reduced_team_to_Bayern Munich   
Reduced_team_to_Juventus                    Reduced_team_to_Juventus   
const                                                          const   
Reduced_team_from_Atl√©tico Madrid  Reduced_team_from_Atl√©tico Madrid   
Reduced_team_from_Paris SG                Reduced_team_from_Pa

In [138]:
# dummies_coef = coefficients_df[coefficients_df["Variable"].str.contains("Reduced_team_to")]

# # Show the coefficients of the dummy variables
# print(dummies_coef.sort_values(by="Coeficiente", key=abs, ascending=False))

Regresión lineal / regresión Lasso/Ridge/ElasticNet
Buen punto de partida. Puedes ver directamente la influencia de cada variable sobre el valor del jugador.

Árboles de decisión
Son fáciles de visualizar y explicar (aunque pueden sobreajustarse). Útiles para mostrar reglas del tipo: “si tiene más de 25 años y juega en la Premier, entonces...”

Reglas de decisión (RuleFit, BRL, etc.)
Modelos que combinan reglas simples con pesos. Son más interpretables que muchos otros.

🔍 Modelos complejos + técnicas de explicabilidad (black-box + XAI)
Estos modelos son más potentes, pero necesitas aplicar técnicas de XAI para entenderlos:

Random Forest / Gradient Boosting (XGBoost, LightGBM)
Excelentes para rendimiento predictivo. Puedes explicar predicciones con:

SHAP (SHapley Additive exPlanations): muestra la contribución de cada variable a la predicción.

LIME (Local Interpretable Model-agnostic Explanations): crea explicaciones locales alrededor de cada predicción.

----------------------------------------

EBM (Explainable Boosting Machine) para una solución lista y explicable.

XGBoost + SHAP para alta precisión con explicación post hoc.

GAM (pyGAM) si tus variables tienen relaciones suaves y no lineales.

Redujimos el número de equipos, 