In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
from statsmodels.stats.diagnostic import het_breuschpagan

# Load the dataset from the Excel workbook that sits next to the notebook.
delanteros = pd.read_excel("reduced_delanteros.xlsx")

# Columns that get standardised with StandardScaler's default settings.
# Note that the regression target ('Transfer_value') is scaled as well.
numeric_vars = [
    'Transfer_value',
    'Exp_contr',
    'Age',
    'Minutes_pl',
    'Minutes_pl_BC',
    'NP_goals',
    'Pen_goals',
    'Pen_goals_BC',
    'Assists',
    'Interceptions',
    'GCA_BC',
    'Yellow_cards',
    'Red_cards',
]
# Overwrite the raw columns with their scaled values.
delanteros[numeric_vars] = StandardScaler().fit_transform(
    delanteros[numeric_vars]
)

In [2]:
# Candidate regressors, grouped into the blocks that are fed to the
# variable-selection cells one after another.
dic_vars = {
    # Numeric columns (a subset of the standardised ones).
    'vars1': [
        'Age',
        'Exp_contr',
        'Minutes_pl_BC',
        'NP_goals',
        'Pen_goals',
        'Pen_goals_BC',
        'Assists',
        'Interceptions',
        'GCA_BC',
        'Yellow_cards',
        'Red_cards',
    ],
    # Presumably indicator (dummy) columns for the selling club and the
    # selling country -- TODO confirm against the workbook schema.
    'vars2': [
        'Reduced_team_from_Inter',
        'Reduced_team_from_AC Milan',
        'Reduced_team_from_Juventus',
        'Reduced_team_from_Napoli',
        'Reduced_team_from_Paris SG',
        'Reduced_team_from_Real Madrid',
        'Reduced_team_from_Barcelona',
        'Reduced_team_from_Atlético Madrid',
        'Reduced_team_from_Bayern Munich',
        'Reduced_team_from_Bor. Dortmund',
        'Reduced_team_from_Chelsea',
        'Reduced_team_from_Arsenal',
        'Reduced_team_from_Man City',
        'Reduced_team_from_Man Utd',
        'Reduced_team_from_Liverpool',
        'Reduced_country_from_Inglaterra',
        'Reduced_country_from_Francia',
        'Reduced_country_from_España',
        'Reduced_country_from_Italia',
        'Reduced_country_from_Alemania',
        'Reduced_country_from_Portugal',
        'Reduced_country_from_Países Bajos',
        'Reduced_country_from_Argentina',
        'Reduced_country_from_Brasil',
    ],
}

In [3]:
def multiple_lr(df, indep_vars, target="Transfer_value", cov_type="HC3"):
    """Fit an OLS regression with an intercept and robust standard errors.

    Parameters
    ----------
    df : pandas.DataFrame
        Data holding both the regressor columns and the ``target`` column.
    indep_vars : list of str
        Names of the columns used as regressors.
    target : str, default "Transfer_value"
        Name of the dependent-variable column. Exposed as a parameter so an
        alternative target (for instance a transformed one such as
        "Transfer_value_sqrt") can be used without editing the function.
    cov_type : str, default "HC3"
        Covariance estimator forwarded to ``get_robustcov_results``.

    Returns
    -------
    tuple
        ``(results, p_values_df)`` where ``results`` is the fitted results
        object with the robust covariance applied, and ``p_values_df`` is a
        DataFrame with the columns "Variable" and "P-value", one row per
        column of the design matrix (regressors plus the added constant).

    Notes
    -----
    Prints the residual mean square (``mse_resid``) as ``MSE: <value>``.
    """
    # Design matrix: the selected regressors plus a constant column for the
    # intercept term.
    X = sm.add_constant(df[indep_vars])
    y = df[target]

    # Plain OLS fit first, then swap in the robust covariance so that the
    # standard errors and p-values are the robust ones.
    model_sm_robust = sm.OLS(y, X).fit().get_robustcov_results(cov_type=cov_type)

    # Term names are taken from the design-matrix columns and paired
    # positionally with the p-values of the robust fit.
    p_values_df = pd.DataFrame({
        "Variable": X.columns,
        "P-value": model_sm_robust.pvalues,
    })

    mse = model_sm_robust.mse_resid
    print(f"MSE: {mse}")

    return (model_sm_robust, p_values_df)

In [4]:
# Baseline fit on the full 'vars1' block (prints the MSE as a side effect).
summ, p_values = multiple_lr(delanteros, dic_vars['vars1'])

MSE: 0.5029998102545604


In [5]:
# Iterative pruning of the 'vars1' block: refit, keep only the terms whose
# robust p-value is at most alpha, and stop once a fit has no
# non-significant regressor left.
# NOTE(review): every non-significant term is dropped in the same pass, not
# one term per refit.
alpha = 0.05
significant_variables = dic_vars['vars1']

while True:
    summ, p_values_df = multiple_lr(delanteros, significant_variables)

    # Keep the significant regressors; the intercept row is never a candidate.
    significant_variables = p_values_df.loc[
        (p_values_df["P-value"] <= alpha) & (p_values_df["Variable"] != "const"),
        "Variable",
    ].tolist()

    # Number of regressors rejected in this pass (the -1 discounts "const").
    ns = len(p_values_df) - len(significant_variables) - 1
    if ns <= 0:
        break

# Hand the surviving variables over to the next selection stage.
significant_variables_next = significant_variables
print(summ.summary())

MSE: 0.5029998102545604
MSE: 0.5038584589380325
                            OLS Regression Results                            
Dep. Variable:         Transfer_value   R-squared:                       0.499
Model:                            OLS   Adj. R-squared:                  0.497
Method:                 Least Squares   F-statistic:                     69.69
Date:                Sat, 10 May 2025   Prob (F-statistic):           2.19e-64
Time:                        15:59:16   Log-Likelihood:                -1348.7
No. Observations:                1256   AIC:                             2709.
Df Residuals:                    1250   BIC:                             2740.
Df Model:                           5                                         
Covariance Type:                  HC3                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
cons

In [6]:
# Second pruning stage: start from the variables kept so far plus the whole
# 'vars2' block, then refit and prune until every remaining regressor has a
# robust p-value of at most alpha.
# NOTE(review): every non-significant term is dropped in the same pass, not
# one term per refit.
alpha = 0.05
significant_variables = [*significant_variables_next, *dic_vars['vars2']]

while True:
    summ, p_values_df = multiple_lr(delanteros, significant_variables)

    # Keep the significant regressors; the intercept row is never a candidate.
    significant_variables = p_values_df.loc[
        (p_values_df["P-value"] <= alpha) & (p_values_df["Variable"] != "const"),
        "Variable",
    ].tolist()

    # Number of regressors rejected in this pass (the -1 discounts "const").
    ns = len(p_values_df) - len(significant_variables) - 1
    if ns <= 0:
        break

# Hand the surviving variables over to any later selection stage.
significant_variables_next = significant_variables
print(summ.summary())

MSE: 0.4717472029218676
MSE: 0.4850937686223945
MSE: 0.48482895277822113
                            OLS Regression Results                            
Dep. Variable:         Transfer_value   R-squared:                       0.519
Model:                            OLS   Adj. R-squared:                  0.516
Method:                 Least Squares   F-statistic:                     42.70
Date:                Sat, 10 May 2025   Prob (F-statistic):           6.37e-67
Time:                        15:59:19   Log-Likelihood:                -1322.5
No. Observations:                1256   AIC:                             2665.
Df Residuals:                    1246   BIC:                             2716.
Df Model:                           9                                         
Covariance Type:                  HC3                                         
                                      coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------