# Machine Learning Pipeline - Feature Selection

In this notebook, we pick up the transformed datasets that we saved in the previous notebook.

## Reproducibility: Setting the seed

With the aim to ensure reproducibility between runs of the same notebook, but also between the research and production environment, for each step that includes some element of randomness, it is extremely important that we **set the seed**.

In [1]:
# to handle datasets
import pandas as pd
import numpy as np

# for plotting
import matplotlib.pyplot as plt

# to build the models
from sklearn.linear_model import Lasso, Ridge
from sklearn.feature_selection import SelectFromModel

# to visualise al the columns in the dataframe
pd.pandas.set_option('display.max_columns', None)

In [3]:
# cargar el conjunto de entrenamiento y prueba con las engineered variables

# Nosotros construimos y guardamos estos conjuntos de datos en la lección anterior.
# Si no lo has hecho, revisa el NOTEBOOK anterior para saber cómo crear estos conjuntos de datos.
X_train = pd.read_parquet('../data/xtrain_postprocess.parquet')
X_test= pd.read_parquet('../data/xtest_postprocess.parquet')

X_train.head()

Unnamed: 0,24M_MONTO,24M_TASA,EDAD,MARCA_LABORAL,PROPENSION,COMPETITIVIDAD,PRINCIPALIDAD_CONSUMO,ULTIMA_AGRUPACION,RANGO_RCI,ESTADO_CIVIL,GENERO,veces_acepto_producto,tiempo_desde_ultima_conversion,tiempo_desde_ultima_negacion,intentos_totales,meses_gestionados,dias_ultima_gestion,ultima_gestion,veces_sin_respuesta,veces_solicitud_seguimiento,promedio_dias_entre_gestiones,max_intentos_en_un_mes,veces_respuesta_positiva,veces_respuesta_negativa,_merge_variables,Bancos_PLD_Total,Cajas_PLD_Total,Retail_PLD_Total,PLD_Total,Bancos_TC_Total,Retail_TC_Total,TC_Total,Bancos_PLD_Entidades,Cajas_PLD_Entidades,Retail_PLD_Entidades,PLD_Entidades,Bancos_TC_Entidades,Retail_TC_Entidades,TC_Entidades,TC_Entidades_Mas3,Tiene_Deuda_PLD,CANTIDAD_CELULARES,veces_acepto_producto_na,tiempo_desde_ultima_conversion_na,tiempo_desde_ultima_negacion_na,intentos_totales_na,meses_gestionados_na,dias_ultima_gestion_na,veces_sin_respuesta_na,veces_solicitud_seguimiento_na,promedio_dias_entre_gestiones_na,max_intentos_en_un_mes_na,veces_respuesta_positiva_na,veces_respuesta_negativa_na,ESTADO_TASA,ESTADO_OFERTA,NUEVA_OFERTA
0,1.291955,-0.048104,-0.635266,0.430569,-1.435378,1.023889,0.047958,0.573193,1.131137,0.738081,1.027575,0.0,0.0,0.082951,-2.599419e-16,-9.124915e-16,0.144654,0.612022,0.067193,7.642187e-17,0.322698,2.800129e-16,1.185844e-16,0.536515,-0.627359,-0.41253,-0.21688,-0.250451,-0.36634,1.77878,-0.553423,2.723939,-0.405492,-0.215308,-0.245543,-0.526079,5.032017,-0.50462,3.082557,-0.070643,-0.553611,1.874272,0.627359,0.007422,0.250199,0.627359,0.627359,0.627359,0.627359,0.627359,0.627359,0.627359,0.627359,0.627359,-0.771717,-0.770394,-0.591161
1,0.003182,-0.048104,0.474877,0.430569,0.696681,-0.976669,-1.015682,1.881238,-0.567665,-1.354865,-0.973165,0.0,0.0,0.082951,-2.599419e-16,-9.124915e-16,0.144654,0.612022,0.067193,7.642187e-17,0.322698,2.800129e-16,1.185844e-16,0.536515,-0.627359,-0.41253,-0.21688,-0.250451,-0.36634,-0.562183,-0.553423,-0.391352,-0.405492,-0.215308,-0.245543,-0.526079,-0.519054,-0.50462,-0.714007,-0.070643,-0.553611,-0.915133,0.627359,0.007422,0.250199,0.627359,0.627359,0.627359,0.627359,0.627359,0.627359,0.627359,0.627359,0.627359,-0.002441,0.358935,-0.591161
2,1.203497,0.651609,0.050392,-1.273593,0.696681,1.023889,0.047958,-0.734852,-0.567665,-1.354865,-0.973165,-0.02628,0.0,0.082951,-0.449434,3.207781,0.144654,-1.726393,0.067193,-0.2363434,0.322698,-0.4853716,-0.2580264,-1.863879,1.593984,-0.41253,-0.21688,-0.250451,-0.36634,1.77878,1.806935,0.434225,-0.405492,-0.215308,-0.245543,-0.526079,3.18166,1.27419,3.082557,-0.070643,-0.553611,-0.915133,-1.593984,0.007422,0.250199,-1.593984,-1.593984,-1.593984,-1.593984,-1.593984,-1.593984,-1.593984,-1.593984,-1.593984,-0.771717,0.358935,-0.591161
3,-2.807544,-0.735512,1.64485,0.430569,0.696681,-0.976669,1.111599,-0.734852,1.980538,-1.354865,1.027575,0.0,0.0,0.082951,-2.599419e-16,-9.124915e-16,0.144654,0.612022,0.067193,7.642187e-17,0.322698,2.800129e-16,1.185844e-16,0.536515,-0.627359,2.424065,-0.21688,-0.250451,2.828669,-0.562183,-0.553423,-0.391352,2.298049,-0.215308,-0.245543,1.520659,-0.519054,-0.50462,-0.714007,-0.070643,1.806322,1.874272,0.627359,0.007422,0.250199,0.627359,0.627359,0.627359,0.627359,0.627359,0.627359,0.627359,0.627359,0.627359,-0.771717,0.358935,-0.591161
4,1.360376,0.651609,-0.855836,-0.421512,0.696681,-0.976669,0.047958,-0.734852,-0.567665,0.738081,-0.973165,-0.02628,0.0,0.082951,-0.7421025,-0.9017157,0.144654,-1.726393,0.067193,-0.2363434,0.322698,-0.6430049,-0.2580264,-1.863879,1.593984,-0.41253,-0.21688,-0.250451,-0.36634,1.77878,-0.553423,-0.36258,-0.405492,-0.215308,-0.245543,-0.526079,1.331303,-0.50462,0.551514,-0.070643,-0.553611,0.944471,-1.593984,0.007422,0.250199,-1.593984,-1.593984,-1.593984,-1.593984,-1.593984,-1.593984,-1.593984,-1.593984,-1.593984,-0.771717,-0.770394,-0.591161


In [5]:
# Cargar el objetivo (recuerda que el objetivo está transformado logarítmicamente).
y_train = pd.read_parquet('../data/ytrain_postprocess.parquet')
y_test = pd.read_parquet('../data/ytest_postprocess.parquet')

y_train.head()

Unnamed: 0,target
669583,0.0
661308,0.0
743633,0.0
148963,0.0
557373,0.0


### Feature Selection

Vamos a seleccionar un subconjunto de las características más predictivas. Hay un elemento de aleatoriedad en la regresión Lasso, así que recuerda establecer la semilla.

In [9]:
from sklearn.feature_selection import SelectFromModel
from lightgbm import LGBMClassifier  # O puedes usar XGBClassifier

# Modelo LGBM con manejo de desbalanceo
model = LGBMClassifier(n_estimators=500, importance_type='gain', scale_pos_weight=10, random_state=0)

# Selector de variables con LGBM
selector = SelectFromModel(model, threshold="mean", prefit=False)

# Entrenar el modelo y seleccionar características
selector.fit(X_train, y_train)

# Obtener las variables seleccionadas
selected_features = X_train.columns[selector.get_support()]
print("📌 Variables seleccionadas:", selected_features.tolist())


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


[LightGBM] [Info] Number of positive: 794, number of negative: 689153
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.025673 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1236
[LightGBM] [Info] Number of data points in the train set: 689947, number of used features: 57
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.001151 -> initscore=-6.766135
[LightGBM] [Info] Start training from score -6.766135
📌 Variables seleccionadas: ['24M_MONTO', 'EDAD', 'PRINCIPALIDAD_CONSUMO', 'ULTIMA_AGRUPACION', 'intentos_totales', 'max_intentos_en_un_mes', 'veces_respuesta_positiva']


Cantidad de Características seleccionadas:

In [10]:
from sklearn.feature_selection import mutual_info_classif, SelectKBest

# Selección de variables basada en información mutua
selector = SelectKBest(score_func=mutual_info_classif, k=10)  # Selecciona las 10 mejores
X_new = selector.fit_transform(X_train, y_train)

# Obtener las variables seleccionadas
selected_features = X_train.columns[selector.get_support()]
print("📌 Variables seleccionadas:", selected_features.tolist())


  y = column_or_1d(y, warn=True)


📌 Variables seleccionadas: ['COMPETITIVIDAD', 'GENERO', 'veces_respuesta_negativa', 'Bancos_TC_Total', 'Retail_TC_Total', 'PLD_Entidades', 'Tiene_Deuda_PLD', 'dias_ultima_gestion_na', 'veces_solicitud_seguimiento_na', 'NUEVA_OFERTA']


In [13]:
[['24M_MONTO', 'EDAD', 'PRINCIPALIDAD_CONSUMO', 
 'ULTIMA_AGRUPACION', 'intentos_totales',
 'max_intentos_en_un_mes', 'veces_respuesta_positiva'] ,
['COMPETITIVIDAD', 'GENERO', 'veces_respuesta_negativa', 
 'Bancos_TC_Total', 'Retail_TC_Total', 'PLD_Entidades', 
 'Tiene_Deuda_PLD', 'dias_ultima_gestion_na',
 'veces_solicitud_seguimiento_na', 'NUEVA_OFERTA']]

[['24M_MONTO',
  'EDAD',
  'PRINCIPALIDAD_CONSUMO',
  'ULTIMA_AGRUPACION',
  'intentos_totales',
  'max_intentos_en_un_mes',
  'veces_respuesta_positiva'],
 ['COMPETITIVIDAD',
  'GENERO',
  'veces_respuesta_negativa',
  'Bancos_TC_Total',
  'Retail_TC_Total',
  'PLD_Entidades',
  'Tiene_Deuda_PLD',
  'dias_ultima_gestion_na',
  'veces_solicitud_seguimiento_na',
  'NUEVA_OFERTA']]

In [11]:
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestClassifier

# Modelo Random Forest con ajuste para datos desbalanceados
model = RandomForestClassifier(class_weight='balanced', random_state=0)

# RFECV para selección de variables óptima
selector = RFECV(model, step=1, cv=5, scoring="f1")  # Usa F1-score por el desbalance

selector.fit(X_train, y_train)

# Obtener las variables seleccionadas
selected_features = X_train.columns[selector.support_]
print("📌 Variables seleccionadas:", selected_features.tolist())


  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **

  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **

KeyboardInterrupt: 

In [8]:
# Imprimamos el número de características totales y seleccionadas

# Así es como podemos hacer una lista de las características seleccionadas
selected_feats = X_train.columns[(sel_.get_support())]

# Imprimamos algunas estadísticas
print('Número de Features en Total: {}'.format((X_train.shape[1])))
print('Número de Features Seleccionados: {}'.format(len(selected_feats)))
print('Características con coeficientes reducidos a cero: {}'.format(
    np.sum(sel_.estimator_.coef_ == 0)))

Número de Features en Total: 81
Número de Features Seleccionados: 36
Características con coeficientes reducidos a cero: 45


In [9]:
# Imprimimos las Características o Features seleccionados
selected_feats

Index(['MSSubClass', 'MSZoning', 'LotArea', 'LotShape', 'LandContour',
       'LotConfig', 'Neighborhood', 'OverallQual', 'OverallCond',
       'YearRemodAdd', 'RoofStyle', 'Exterior1st', 'ExterQual', 'Foundation',
       'BsmtQual', 'BsmtExposure', 'BsmtFinType1', 'HeatingQC', 'CentralAir',
       '1stFlrSF', '2ndFlrSF', 'GrLivArea', 'BsmtFullBath', 'FullBath',
       'HalfBath', 'KitchenQual', 'TotRmsAbvGrd', 'Functional', 'Fireplaces',
       'FireplaceQu', 'GarageFinish', 'GarageCars', 'PavedDrive', 'WoodDeckSF',
       'ScreenPorch', 'SaleCondition'],
      dtype='object')

In [10]:
pd.Series(selected_feats).to_csv('../data/04_feature/selected_features.csv', index=False)