In [20]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
from statsmodels.stats.diagnostic import het_breuschpagan

# Load the pre-processed table that the regressions below work on.
defensas = pd.read_excel("reduced_defensas.xlsx")

# Columns that are standardised; note that the target 'Transfer_value' is in this list too.
numeric_vars = [
    'Transfer_value', 'Exp_contr', 'Age',
    'Minutes_pl', 'Minutes_pl_BC',
    'NP_goals', 'Pen_goals', 'Pen_goals_BC',
    'Assists', 'Interceptions', 'GCA_BC',
    'Yellow_cards', 'Red_cards',
]

# Overwrite those columns in place with their StandardScaler-transformed values.
defensas[numeric_vars] = StandardScaler().fit_transform(defensas[numeric_vars])

In [21]:
# Candidate regressor names, grouped into the two sets that are fed to the model in turn.
dic_vars = {
    # First set: the numeric columns.
    'vars1': [
        'Age', 'Exp_contr', 'Minutes_pl_BC',
        'NP_goals', 'Pen_goals', 'Pen_goals_BC',
        'Assists', 'Interceptions', 'GCA_BC',
        'Yellow_cards', 'Red_cards',
    ],
    # Second set: the 'Reduced_team_from_*' columns followed by the 'Reduced_country_from_*' columns.
    'vars2': [
        f'Reduced_team_from_{team}'
        for team in (
            'Inter', 'AC Milan', 'Juventus', 'Napoli', 'Paris SG',
            'Real Madrid', 'Barcelona', 'Atlético Madrid',
            'Bayern Munich', 'Bor. Dortmund',
            'Chelsea', 'Arsenal', 'Man City', 'Man Utd', 'Liverpool',
        )
    ] + [
        f'Reduced_country_from_{country}'
        for country in (
            'Inglaterra', 'Francia', 'España', 'Italia', 'Alemania',
            'Portugal', 'Países Bajos', 'Argentina', 'Brasil',
        )
    ],
}

In [22]:
def multiple_lr(df, indep_vars):
    """Fit an OLS regression of ``Transfer_value`` on ``indep_vars`` with HC3 robust errors.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing a ``Transfer_value`` column and every column named in ``indep_vars``.
    indep_vars : list of str
        Names of the regressor columns.

    Returns
    -------
    tuple
        ``(results, p_values_df)`` where ``results`` is the fitted model re-evaluated with an
        HC3 heteroscedasticity-robust covariance, and ``p_values_df`` is a DataFrame with the
        columns ``"Variable"`` and ``"P-value"``, one row per column of the design matrix
        (the constant added by ``sm.add_constant`` plus the regressors).

    Side effects
    ------------
    Prints the residual mean squared error (``mse_resid``) of the fit.
    """
    # Cast the regressors to float: a frame that mixes float columns with bool dummy columns
    # converts to an object-dtype array, which statsmodels refuses to fit.
    X = df[indep_vars].astype(float)
    y = df["Transfer_value"]  # alternative target noted by the author: Transfer_value_sqrt

    # Add the column for the intercept term.
    X = sm.add_constant(X)

    # Plain OLS fit, then the same fit with HC3 robust standard errors.
    model_sm = sm.OLS(y, X).fit()
    model_sm_robust = model_sm.get_robustcov_results(cov_type='HC3')

    # One p-value per design-matrix column, labelled with the column name.
    p_values_df = pd.DataFrame({
        "Variable": X.columns,
        "P-value": model_sm_robust.pvalues,
    })

    mse = model_sm_robust.mse_resid
    print(f"MSE: {mse}")

    return (model_sm_robust, p_values_df)

In [23]:
# Fit once on the full first set of candidate regressors.
summ, p_values = multiple_lr(defensas, dic_vars['vars1'])

MSE: 0.6005690679366017


In [24]:
# Repeatedly refit on the first variable set, keeping only the regressors whose
# p-value is at most alpha, until a fit leaves nothing to discard.
alpha = 0.05
significant_variables = dic_vars['vars1']

while True:
    summ, p_values_df = multiple_lr(defensas, significant_variables)

    # Keep the significant regressors; the constant is never a candidate.
    significant_variables = p_values_df.loc[
        (p_values_df["P-value"] <= alpha) & (p_values_df["Variable"] != "const"),
        "Variable",
    ].tolist()

    # Rows of the table, minus the constant, minus the survivors = discarded this round.
    ns = len(p_values_df) - 1 - len(significant_variables)
    if ns <= 0:
        break

# Carried over as the starting point of the next stage.
significant_variables_next = significant_variables
print(summ.summary())

MSE: 0.6005690679366017
MSE: 0.6017022964283191
                            OLS Regression Results                            
Dep. Variable:         Transfer_value   R-squared:                       0.403
Model:                            OLS   Adj. R-squared:                  0.399
Method:                 Least Squares   F-statistic:                     35.75
Date:                Sat, 10 May 2025   Prob (F-statistic):           9.82e-45
Time:                        15:52:37   Log-Likelihood:                -1170.2
No. Observations:                1008   AIC:                             2356.
Df Residuals:                    1000   BIC:                             2396.
Df Model:                           7                                         
Covariance Type:                  HC3                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------

In [25]:
# Second stage: start from the survivors of the previous stage plus the second
# variable set, and again refit until every remaining regressor has p-value <= alpha.
alpha = 0.05
significant_variables = [*significant_variables_next, *dic_vars['vars2']]

while True:
    summ, p_values_df = multiple_lr(defensas, significant_variables)

    # Keep the significant regressors; the constant is never a candidate.
    significant_variables = p_values_df.loc[
        (p_values_df["P-value"] <= alpha) & (p_values_df["Variable"] != "const"),
        "Variable",
    ].tolist()

    # Rows of the table, minus the constant, minus the survivors = discarded this round.
    ns = len(p_values_df) - 1 - len(significant_variables)
    if ns <= 0:
        break

significant_variables_next = significant_variables
print(summ.summary())

MSE: 0.5569239865552015
MSE: 0.5734802009363157
MSE: 0.580201705696078
                            OLS Regression Results                            
Dep. Variable:         Transfer_value   R-squared:                       0.427
Model:                            OLS   Adj. R-squared:                  0.420
Method:                 Least Squares   F-statistic:                     256.5
Date:                Sat, 10 May 2025   Prob (F-statistic):          1.88e-281
Time:                        15:52:37   Log-Likelihood:                -1149.9
No. Observations:                1008   AIC:                             2324.
Df Residuals:                     996   BIC:                             2383.
Df Model:                          11                                         
Covariance Type:                  HC3                                         
                                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------

In [8]:
# Display the column labels of the loaded table.
defensas.columns

Index(['Unnamed: 0', 'Transfer_value', 'Exp_contr', 'Age', 'Minutes_pl',
       'Minutes_pl_BC', 'NP_goals', 'Pen_goals', 'Pen_goals_BC', 'Assists',
       'Interceptions', 'GCA_BC', 'Yellow_cards', 'Red_cards',
       'Reduced_team_from_Inter', 'Reduced_team_from_AC Milan',
       'Reduced_team_from_Juventus', 'Reduced_team_from_Napoli',
       'Reduced_team_from_Paris SG', 'Reduced_team_from_Real Madrid',
       'Reduced_team_from_Barcelona', 'Reduced_team_from_Atlético Madrid',
       'Reduced_team_from_Bayern Munich', 'Reduced_team_from_Bor. Dortmund',
       'Reduced_team_from_Chelsea', 'Reduced_team_from_Arsenal',
       'Reduced_team_from_Man City', 'Reduced_team_from_Man Utd',
       'Reduced_team_from_Liverpool', 'Reduced_country_from_Inglaterra',
       'Reduced_country_from_Francia', 'Reduced_country_from_España',
       'Reduced_country_from_Italia', 'Reduced_country_from_Alemania',
       'Reduced_country_from_Portugal', 'Reduced_country_from_Países Bajos',
       'Reduced

In [17]:









# NOTE(review): the name `defensas` is reused here for the contents of "delanteros.xlsx";
# the rest of this cell builds its reduced columns on this frame — confirm the name is intended.
defensas = pd.read_excel("delanteros.xlsx")

def list_reduced(lista, mains):
    """Return a list mirroring ``lista`` in which every entry not found in ``mains`` is replaced by ``"Otros"``.

    Parameters
    ----------
    lista : iterable
        Values to collapse.
    mains : container
        Values that are kept unchanged.

    Returns
    -------
    list
        One element per element of ``lista``, in the same order.
    """
    return [item if item in mains else "Otros" for item in lista]

# Countries that keep their own label; anything else is collapsed to "Otros" by list_reduced.
mains = [
    "Inglaterra", "Francia", "España", "Italia", "Alemania",
    "Portugal", "Países Bajos", "Argentina", "Brasil",
]

# Teams that keep their own label.
mains_team = [
    "Inter", "AC Milan", "Juventus", "Napoli", "Paris SG",
    "Real Madrid", "Barcelona", "Atlético Madrid",
    "Bayern Munich", "Bor. Dortmund",
    "Chelsea", "Arsenal", "Man City", "Man Utd", "Liverpool",
]

# Reduced country of origin.
c_defensas_from_reduced = list_reduced(defensas["Country_from"], mains)
defensas["Reduced_country_from"] = c_defensas_from_reduced

# Reduced team of origin.
t_defensas_from_reduced = list_reduced(defensas["Team_from"], mains_team)
defensas["Reduced_team_from"] = t_defensas_from_reduced

In [18]:
# Non-numeric columns carried into the encoded table ('Position' is carried along unencoded).
categoric_pitch = ['Reduced_team_from', 'Reduced_country_from', 'Position']

# Numeric columns carried into the encoded table.
pitch_vars = ['Transfer_value', 'Exp_contr',
       'Age', 'Minutes_pl', 'Minutes_pl_BC', 'NP_goals',
       'Pen_goals', 'Pen_goals_BC', 'Assists', 'Interceptions', 'GCA_BC',
       'Yellow_cards', 'Red_cards']

# Category orderings for the dummy encoding. "Otros" is placed first so that
# drop_first=True removes its column and it becomes the reference level.
mains = ["Otros", "Inglaterra", "Francia", "España", "Italia", "Alemania", "Portugal", "Países Bajos", "Argentina", "Brasil"]

mains_team = ["Otros", "Inter", "AC Milan", "Juventus", "Napoli", "Paris SG", "Real Madrid", "Barcelona", "Atlético Madrid",
             "Bayern Munich", "Bor. Dortmund", "Chelsea", "Arsenal", "Man City", "Man Utd", "Liverpool"]

# dtype=int pins the dummy columns to 0/1 integers. Without it the dtype depends on the
# pandas version (bool in pandas >= 2), and bool columns written to Excel come back as
# booleans that cannot be mixed with float regressors in a statsmodels design matrix.
defensas["Reduced_team_from"] = pd.Categorical(defensas["Reduced_team_from"], categories=mains_team)
df_dummies = pd.get_dummies(defensas[pitch_vars + categoric_pitch], columns=["Reduced_team_from"], drop_first=True, dtype=int)

df_dummies["Reduced_country_from"] = pd.Categorical(defensas["Reduced_country_from"], categories=mains)
df_dummies = pd.get_dummies(df_dummies, columns=["Reduced_country_from"], drop_first=True, dtype=int)

In [19]:
# Write the encoded table, without the 'Position' column and without the index, to Excel.
df_dummies.drop(columns=['Position']).to_excel("reduced_delanteros.xlsx", index=False)