# 03 Construcción de Características

Este notebook se enfoca en la creación de nuevas características y la transformación de las existentes para mejorar la capacidad predictiva del modelo.

## 1. Cargar el Conjunto de Datos

In [5]:
## 1. Cargar el Conjunto de Datos
# Importar la librería necesarias
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Cargar el conjunto de datos
file_path = '../data/interim/bank-full.csv'
data = pd.read_csv(file_path)

# Definir las características (X) y la variable objetivo (y)
X = data.drop('y', axis=1)
y = data['y']

# Mostrar las primeras filas del conjunto de datos
data.head()

Unnamed: 0,age,default,balance,housing,loan,day,duration,campaign,pdays,previous,...,month_jul,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_other,poutcome_success,poutcome_unknown
0,0.519481,0,0.092259,1,0,0.133333,0.05307,0.0,0.0,0.0,...,False,False,False,True,False,False,False,False,False,True
1,0.337662,0,0.073067,1,0,0.133333,0.030704,0.0,0.0,0.0,...,False,False,False,True,False,False,False,False,False,True
2,0.194805,0,0.072822,1,1,0.133333,0.015453,0.0,0.0,0.0,...,False,False,False,True,False,False,False,False,False,True
3,0.376623,0,0.086476,1,0,0.133333,0.018707,0.0,0.0,0.0,...,False,False,False,True,False,False,False,False,False,True
4,0.194805,0,0.072812,0,0,0.133333,0.04026,0.0,0.0,0.0,...,False,False,False,True,False,False,False,False,False,True


## 2. Sobremuestreo de la Clase Minoritaria
Duplicar o sintetizar nuevos ejemplos de la clase minoritaria.

In [6]:
from imblearn.over_sampling import SMOTE

# Aplicar SMOTE para sobremuestreo
smote = SMOTE(random_state=8)
X_oversampled, y_oversampled = smote.fit_resample(X, y)

# Verificar la distribución de clases después del sobremuestreo
print("Antes del sobremuestreo:")
print(y.value_counts())
print("Después del sobremuestreo:")
print(y_oversampled.value_counts())

Antes del sobremuestreo:
y
0    39922
1     5289
Name: count, dtype: int64
Después del sobremuestreo:
y
0    39922
1    39922
Name: count, dtype: int64


In [7]:
from sklearn.model_selection import train_test_split

# Division de datos en Train y Test
X_train_over, X_test_over, y_train_over, y_test_over = train_test_split(X_oversampled, y_oversampled, test_size=0.25, random_state=8)

# Mostrar las dimensiones de los conjuntos de datos
print(f'X_train_over shape: {X_train_over.shape}')
print(f'X_test_over shape: {X_test_over.shape}')
print(f'y_train_over shape: {y_train_over.shape}')
print(f'y_test_over shape: {y_test_over.shape}')

X_train_over shape: (59883, 42)
X_test_over shape: (19961, 42)
y_train_over shape: (59883,)
y_test_over shape: (19961,)


In [8]:
dir_path = '../data/processed/oversampled'

# Crear un DataFrame con la data completa sobremuestreada
data_oversampled = pd.concat([X_oversampled, y_oversampled], axis=1)

# Guardar la data_oversampled
data_oversampled.to_csv(f'{dir_path}/bank-full.csv', index=False)

# Guardar los conjuntos de datos preprocesados
X_train_over.to_csv(f'{dir_path}/X_train.csv', index=False)
X_test_over.to_csv(f'{dir_path}/X_test.csv', index=False)
y_train_over.to_csv(f'{dir_path}/y_train.csv', index=False)
y_test_over.to_csv(f'{dir_path}/y_test.csv', index=False)

## 3. Submuestreo de la Clase Mayoritaria
Eliminar ejemplos de la clase mayoritaria.

In [9]:
from imblearn.under_sampling import RandomUnderSampler

# Aplicar RandomUnderSampler para submuestreo
rus = RandomUnderSampler(random_state=8)
X_undersampled, y_undersampled = rus.fit_resample(X, y)

# Verificar la distribución de clases después del submuestreo
print("Antes del submuestreo:")
print(y.value_counts())
print("Después del submuestreo:")
print(y_undersampled.value_counts())

Antes del submuestreo:
y
0    39922
1     5289
Name: count, dtype: int64
Después del submuestreo:
y
0    5289
1    5289
Name: count, dtype: int64


In [10]:
from sklearn.model_selection import train_test_split

# Division de datos en Train y Test
X_train_under, X_test_under, y_train_under, y_test_under = train_test_split(X_undersampled, y_undersampled, test_size=0.25, random_state=8)

# Mostrar las dimensiones de los conjuntos de datos
print(f'X_train_under shape: {X_train_under.shape}')
print(f'X_test_under shape: {X_test_under.shape}')
print(f'y_train_under shape: {y_train_under.shape}')
print(f'y_test_under shape: {y_test_under.shape}')

X_train_under shape: (7933, 42)
X_test_under shape: (2645, 42)
y_train_under shape: (7933,)
y_test_under shape: (2645,)


In [12]:
dir_path = '../data/processed/undersampled'

# Crear un DataFrame con la data completa submuestreada
data_undersampled = pd.concat([X_undersampled, y_undersampled], axis=1)

# Guardar la data_undersampled
data_undersampled.to_csv(f'{dir_path}/bank-full.csv', index=False)

# Guardar los conjuntos de datos preprocesados
X_train_under.to_csv(f'{dir_path}/X_train.csv', index=False)
X_test_under.to_csv(f'{dir_path}/X_test.csv', index=False)
y_train_under.to_csv(f'{dir_path}/y_train.csv', index=False)
y_test_under.to_csv(f'{dir_path}/y_test.csv', index=False)