In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.datasets import load_diabetes
from sklearn.linear_model import LassoCV
import warnings
warnings.filterwarnings("ignore")  # Para evitar mensajes innecesarios de statsmodels


In [2]:
#%% Load the diabetes dataset (regression)
# Build a single DataFrame: one column per entry in data.feature_names,
# plus the response stored under the 'target' column.
data = load_diabetes()
df = pd.DataFrame(data.data, columns=data.feature_names)
df['target'] = data.target

# Quick sanity check: print the shape and the first rows.
print("Dimensiones del dataset:", df.shape)
print(df.head())


Dimensiones del dataset: (442, 11)
        age       sex       bmi        bp        s1        s2        s3  \
0  0.038076  0.050680  0.061696  0.021872 -0.044223 -0.034821 -0.043401   
1 -0.001882 -0.044642 -0.051474 -0.026328 -0.008449 -0.019163  0.074412   
2  0.085299  0.050680  0.044451 -0.005670 -0.045599 -0.034194 -0.032356   
3 -0.089063 -0.044642 -0.011595 -0.036656  0.012191  0.024991 -0.036038   
4  0.005383 -0.044642 -0.036385  0.021872  0.003935  0.015596  0.008142   

         s4        s5        s6  target  
0 -0.002592  0.019907 -0.017646   151.0  
1 -0.039493 -0.068332 -0.092204    75.0  
2 -0.002592  0.002861 -0.025930   141.0  
3  0.034309  0.022688 -0.009362   206.0  
4 -0.002592 -0.031988 -0.046641   135.0  


In [3]:
#%% 1. Filter method: correlation between predictor variables
def correlation_filtering(dataframe, threshold, target='target'):
    """Select predictors that are not highly correlated with an earlier predictor.

    The absolute pairwise correlation matrix of the predictors is computed and
    only its strict upper triangle is inspected, so for every correlated pair
    the column that appears *later* in the frame is the one discarded.

    Args:
        dataframe: DataFrame holding the predictors and, optionally, the
            response column.
        threshold: A predictor is dropped when its absolute correlation with
            any preceding predictor is strictly greater than this value.
        target: Name of the response column to exclude before computing
            correlations. Pass ``None`` when ``dataframe`` contains only
            predictors. Defaults to ``'target'``.

    Returns:
        List of retained predictor names, in their original column order.

    Raises:
        KeyError: If ``target`` is not ``None`` and is not a column of
            ``dataframe``.
    """
    predictors = dataframe if target is None else dataframe.drop(columns=[target])
    corr_matrix = predictors.corr().abs()

    # Strict upper triangle (k=1): each pair is looked at once and the
    # diagonal (self-correlation of 1.0) is ignored.
    upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
    upper = corr_matrix.where(upper_mask)

    # Masked cells are NaN, and NaN > threshold is False, so they never trigger a drop.
    exceeds = (upper > threshold).any(axis=0)
    to_drop = set(exceeds.index[exceeds.to_numpy(dtype=bool)])

    return [col for col in predictors.columns if col not in to_drop]

# Apply the correlation filter to the full frame with a 0.8 cutoff and show the result.
selected_corr = correlation_filtering(df, threshold=0.8)
print("\n[Filtro - Correlación] Variables seleccionadas:", selected_corr)


[Filtro - Correlación] Variables seleccionadas: ['age', 'sex', 'bmi', 'bp', 's1', 's3', 's4', 's5', 's6']


In [4]:
#%% 2. VIF-based filtering
def vif_filtering(dataframe, threshold=5, target='target'):
    """Iteratively drop the predictor with the largest VIF until all are <= threshold.

    On every round the variance inflation factor of each remaining predictor is
    computed; if the largest one exceeds ``threshold`` that predictor is
    removed and the VIFs are recomputed on the reduced set.

    An explicit intercept column is prepended to the design matrix before the
    VIFs are computed. ``variance_inflation_factor`` regresses one column on
    the other columns exactly as given, so without an intercept the auxiliary
    regressions use an uncentred R^2 and the VIFs are distorted for predictors
    whose mean is not zero. For mean-centred predictors the result is the same
    with or without the intercept.

    Args:
        dataframe: DataFrame holding the predictors and, optionally, the
            response column.
        threshold: Largest VIF tolerated for a retained predictor.
        target: Name of the response column to exclude. Pass ``None`` when
            ``dataframe`` contains only predictors. Defaults to ``'target'``.

    Returns:
        List of retained predictor names, in their original column order.

    Raises:
        KeyError: If ``target`` is not ``None`` and is not a column of
            ``dataframe``.
    """
    predictors = dataframe if target is None else dataframe.drop(columns=[target])
    features = list(predictors.columns)
    intercept = np.ones((len(predictors), 1))

    while features:
        # Build the design matrix once per round instead of once per feature.
        design = np.column_stack(
            [intercept, predictors[features].to_numpy(dtype=float)]
        )
        vif_data = pd.DataFrame({
            'variable': features,
            # Column 0 is the intercept, so feature j sits at column j + 1.
            'VIF': [variance_inflation_factor(design, j + 1)
                    for j in range(len(features))],
        })

        # Stop once the worst VIF is acceptable (a NaN maximum also stops the loop).
        max_vif = vif_data['VIF'].max()
        if not max_vif > threshold:
            break
        variable_to_drop = vif_data.loc[vif_data['VIF'].idxmax(), 'variable']
        features.remove(variable_to_drop)
    return features

# Apply the VIF filter to the full frame with a threshold of 5 and show the result.
selected_vif = vif_filtering(df, threshold=5)
print("\n[Filtro - VIF] Variables seleccionadas:", selected_vif)


[Filtro - VIF] Variables seleccionadas: ['age', 'sex', 'bmi', 'bp', 's2', 's3', 's5', 's6']


In [5]:
#%% Prepare X and y for the following methods (original dataset)
# X holds every column except 'target'; y is the 'target' column.
X = df.drop(columns=['target'])
y = df['target']


In [6]:
#%% 3. Wrapper method: Forward Selection
def forward_selection_regression(X, y, significance_level=0.05):
    """Greedy forward selection driven by OLS p-values.

    Starting from no predictors, each round fits one OLS model per remaining
    candidate (already chosen predictors + that candidate, with a constant
    added through ``sm.add_constant``) and records the candidate's p-value.
    The candidate with the smallest p-value is added if that p-value is below
    ``significance_level``; otherwise the search stops.

    Args:
        X: DataFrame of candidate predictors.
        y: Response values passed to ``sm.OLS``.
        significance_level: Entry threshold for a candidate's p-value.

    Returns:
        List of selected column names, in the order they were added.
    """
    chosen = []
    candidates = list(X.columns)

    while candidates:
        # p-value of each candidate when added on top of the current selection
        trial_pvals = pd.Series(
            [
                sm.OLS(y, sm.add_constant(X[chosen + [name]])).fit().pvalues[name]
                for name in candidates
            ],
            index=candidates,
            dtype=float,
        )

        # No candidate is significant enough: the selection is final.
        if not trial_pvals.min() < significance_level:
            break

        winner = trial_pvals.idxmin()
        chosen.append(winner)
        candidates.remove(winner)

    return chosen

# Run forward selection on X, y with a 0.05 entry threshold and show the result.
selected_forward = forward_selection_regression(X, y, significance_level=0.05)
print("\n[Envoltura - Forward] Variables seleccionadas:", selected_forward)



[Envoltura - Forward] Variables seleccionadas: ['bmi', 's5', 'bp', 's1', 'sex', 's2']


In [7]:
#%% 4. Wrapper method: Backward Elimination
def backward_elimination_regression(X, y, significance_level=0.05):
    """Backward elimination driven by OLS p-values.

    Starting from all columns of ``X``, each round fits an OLS model (with a
    constant added through ``sm.add_constant``) and removes the predictor with
    the largest p-value while that p-value is above ``significance_level``.

    Args:
        X: DataFrame of candidate predictors.
        y: Response values passed to ``sm.OLS``.
        significance_level: A predictor is removed while the largest p-value
            exceeds this threshold.

    Returns:
        List of the column names that remain, in their original order.
    """
    kept = list(X.columns)

    while kept:
        fitted = sm.OLS(y, sm.add_constant(X[kept])).fit()
        # Skip the first entry of the p-value vector (the constant term).
        feature_pvals = fitted.pvalues.iloc[1:]

        # Every remaining predictor is significant: nothing left to remove.
        if not feature_pvals.max() > significance_level:
            break

        kept.remove(feature_pvals.idxmax())

    return kept

# Run backward elimination on X, y with a 0.05 threshold and show the result.
selected_backward = backward_elimination_regression(X, y, significance_level=0.05)
print("\n[Envoltura - Backward] Variables seleccionadas:", selected_backward)



[Envoltura - Backward] Variables seleccionadas: ['sex', 'bmi', 'bp', 's1', 's2', 's5']


In [8]:
#%% 5. Wrapper method: Stepwise Selection (bidirectional)
def stepwise_selection_regression(X, y, significance_level_in=0.05, significance_level_out=0.05):
    """Bidirectional stepwise selection driven by OLS p-values.

    Each pass performs an entry step followed by a removal step:

    * Entry: every excluded column is tried on top of the current selection
      (OLS with a constant added through ``sm.add_constant``); the one with the
      smallest p-value is added if that p-value is below
      ``significance_level_in``.
    * Removal: the current selection is refitted and the column with the
      largest p-value is removed if that p-value is above
      ``significance_level_out``.

    The loop ends when a pass changes nothing, or when a previously visited
    feature subset is reached again. The procedure is deterministic, so a
    repeated subset means it would otherwise cycle forever (for example when
    ``significance_level_in`` is larger than ``significance_level_out``).

    Args:
        X: DataFrame of candidate predictors.
        y: Response values passed to ``sm.OLS``.
        significance_level_in: Entry threshold for a candidate's p-value.
        significance_level_out: Removal threshold for an included predictor's
            p-value.

    Returns:
        List of selected column names, in the order they were added.
    """
    included = []
    # Feature subsets reached so far, used to detect add/remove cycles.
    visited = {frozenset()}

    while True:
        changed = False

        # Entry step. Candidates are taken in column order (not from a set) so
        # that tie-breaking between equal p-values is reproducible across runs.
        excluded = [col for col in X.columns if col not in included]
        new_pvals = pd.Series(index=excluded, dtype=float)
        for new_feature in excluded:
            X_with_const = sm.add_constant(X[included + [new_feature]])
            model = sm.OLS(y, X_with_const).fit()
            new_pvals[new_feature] = model.pvalues[new_feature]
        # Skip the step when there is no candidate or no usable p-value.
        if new_pvals.notna().any():
            best_feature = new_pvals.idxmin()
            if new_pvals[best_feature] < significance_level_in:
                included.append(best_feature)
                changed = True

        # Removal step
        if included:
            X_with_const = sm.add_constant(X[included])
            model = sm.OLS(y, X_with_const).fit()
            # Skip the first entry of the p-value vector (the constant term).
            pvalues = model.pvalues.iloc[1:]
            if pvalues.notna().any():
                worst_feature = pvalues.idxmax()
                if pvalues[worst_feature] > significance_level_out:
                    included.remove(worst_feature)
                    changed = True

        if not changed:
            break

        # Same subset seen before: the next passes would repeat indefinitely.
        state = frozenset(included)
        if state in visited:
            break
        visited.add(state)

    return included

# Run stepwise selection on X, y with 0.05 entry and removal thresholds and show the result.
selected_stepwise = stepwise_selection_regression(X, y, significance_level_in=0.05, significance_level_out=0.05)
print("\n[Envoltura - Stepwise] Variables seleccionadas:", selected_stepwise)



[Envoltura - Stepwise] Variables seleccionadas: ['bmi', 's5', 'bp', 's1', 'sex', 's2']


In [9]:
#%% 6. Embedded method: Lasso regularization
def lasso_selection(X, y, cv=5, random_state=0):
    """Select the predictors that receive a non-zero coefficient from LassoCV.

    Args:
        X: DataFrame of candidate predictors; its column names label the
            fitted coefficients.
        y: Response values passed to ``LassoCV.fit``.
        cv: Cross-validation setting forwarded to ``LassoCV``. Defaults to 5.
        random_state: Seed forwarded to ``LassoCV``. Defaults to 0.

    Returns:
        List of column names whose fitted coefficient is different from zero,
        in their original column order.
    """
    # LassoCV cross-validates over its alpha path and refits with the best alpha.
    lasso = LassoCV(cv=cv, random_state=random_state).fit(X, y)
    coef = pd.Series(lasso.coef_, index=X.columns)
    selected_features = list(coef[coef != 0].index)
    return selected_features

# Run the Lasso-based selection on X, y and show the result.
selected_lasso = lasso_selection(X, y)
print("\n[Integrado - Lasso] Variables seleccionadas:", selected_lasso)



[Integrado - Lasso] Variables seleccionadas: ['age', 'sex', 'bmi', 'bp', 's1', 's2', 's4', 's5', 's6']
