In [66]:
from pathlib import Path

import numpy as np
import pandas as pd

In [67]:
# The project root is taken to be the parent of the current working directory.
# Paths are resolved outside the try block so the error message below can
# always reference `clean_data_path`.
project_root = Path.cwd().parent
clean_data_path = project_root / "data" / "processed" / "credit_card_clients_clean.csv"

try:
    df = pd.read_csv(clean_data_path)
except FileNotFoundError:
    print(f"Error: No se encontró el archivo: {clean_data_path}")
    # Re-raise: without `df` every later cell would fail with a misleading
    # NameError, so stop here with the real cause.
    raise

In [68]:
# Column groups for the six monthly observations, listed from September back
# to April. Each group shares a prefix and differs only in the month suffix.
bill_amt_cols, pay_amt_cols, pay_status_cols = (
    [f'{prefix}_{month_name}' for month_name in ('sept', 'aug', 'july', 'june', 'may', 'april')]
    for prefix in ('bill_amt', 'pay_amt', 'pay')
)


In [69]:
# Work on an independent (deep) copy so the loaded frame `df` stays untouched
# by the feature engineering below.
df_features = df.copy(deep=True)

In [70]:
# Small constant added to every denominator to avoid dividing by exactly zero.
# NOTE(review): when a bill amount is 0 the payment ratio becomes
# pay_amt / 1e-6, i.e. a very large value — confirm this is acceptable downstream.
epsilon = 1e-6

for month in ('sept', 'aug', 'july', 'june', 'may', 'april'):
    bill_amt = df_features[f'bill_amt_{month}']
    # Bill amount relative to the credit limit.
    df_features[f'utilization_{month}'] = bill_amt / (df_features['limit_bal'] + epsilon)
    # Amount paid relative to the amount billed in the same month.
    df_features[f'payment_ratio_{month}'] = df_features[f'pay_amt_{month}'] / (bill_amt + epsilon)

In [71]:
# Row-wise summary statistics over the six monthly bill amounts.
bill_history = df_features[bill_amt_cols]
df_features['bill_amt_avg'] = bill_history.mean(axis='columns')
df_features['bill_amt_std'] = bill_history.std(axis='columns')
df_features['bill_amt_max'] = bill_history.max(axis='columns')

In [72]:
def calculate_slope(row, cols):
    """Return the slope of a least-squares line fitted through ``row[cols]``.

    The values are fitted against the positions ``0, 1, ..., len(cols) - 1``,
    so the order of ``cols`` defines the direction of the trend.

    Parameters
    ----------
    row : pandas.Series (or any mapping-like object supporting ``row[cols]``)
        One record; typically a row passed by ``DataFrame.apply(axis=1)``.
    cols : list of str
        Labels to read from ``row``, in the order they should be fitted.

    Returns
    -------
    float
        The fitted slope, or ``nan`` when any selected value is NaN/inf
        (a trend cannot be estimated reliably from incomplete data).
    """
    # Force a float array: rows coming from a frame with mixed column types
    # arrive as object dtype, which np.polyfit may fail to process.
    y = np.asarray(row[cols], dtype=float)
    if not np.isfinite(y).all():
        return np.nan
    x = np.arange(len(cols))
    slope, _ = np.polyfit(x, y, 1)
    return slope


In [73]:
# The column lists run from September back to April; reversing them makes the
# fitted positions run from April to September.
bill_cols_reversed = list(reversed(bill_amt_cols))
pay_cols_reversed = list(reversed(pay_amt_cols))

df_features['bill_amt_slope'] = df_features.apply(calculate_slope, axis=1, args=(bill_cols_reversed,))
df_features['pay_amt_slope'] = df_features.apply(calculate_slope, axis=1, args=(pay_cols_reversed,))

In [74]:
months = ['sept', 'aug', 'july', 'june', 'may', 'april']
bill_amt_cols = [f'bill_amt_{m}' for m in months]
pay_amt_cols = [f'pay_amt_{m}' for m in months]

# Per-month difference between the billed amount and the amount paid.
for month, bill_col, pay_col in zip(months, bill_amt_cols, pay_amt_cols):
    df_features[f'bill_minus_pay_{month}'] = df_features[bill_col].sub(df_features[pay_col])

# Aggregate of the net debt: row-wise mean of the six monthly differences.
bill_minus_pay_cols = ['bill_minus_pay_' + m for m in months]
df_features['bill_minus_pay_avg'] = df_features[bill_minus_pay_cols].mean(axis=1)

In [75]:
pay_status_cols = [f'pay_{m}' for m in ('sept', 'aug', 'july', 'june', 'may', 'april')]

# Row-wise summaries of the six monthly payment-status values.
status_history = df_features[pay_status_cols]
df_features['pay_status_max'] = status_history.max(axis=1)
df_features['pay_status_avg'] = status_history.mean(axis=1)
# Number of months whose status value is strictly greater than 0.
df_features['months_with_delay'] = status_history.gt(0).sum(axis=1)

# Trend of the status values, fitted over the reversed (April -> September) column order.
df_features['pay_status_slope'] = df_features.apply(
    calculate_slope, axis=1, args=(list(reversed(pay_status_cols)),)
)

In [76]:
# Count, per row, how many of the six monthly payment amounts are exactly 0.
df_features['zero_payment_months'] = df_features[pay_amt_cols].eq(0).sum(axis=1)

In [77]:
# Separate (deep) copy of the loaded frame for the regression feature set, so
# it shares none of the columns added to `df_features`.
df_features_reg = df.copy(deep=True)

In [78]:
# Column groups used by the regression feature set.
bill_cols_reg = [f'bill_amt_{m}' for m in ('june', 'may', 'april')]
pay_cols_reg = [f'pay_amt_{m}' for m in ('may', 'april')]  # Earlier payments only
pay_status_cols_reg = [f'pay_{m}' for m in ('june', 'may', 'april')]

In [79]:
# Small constant added to the credit limit to avoid dividing by exactly zero.
epsilon = 1e-6
limit_with_epsilon = df_features_reg['limit_bal'] + epsilon

# Bill amount relative to the credit limit for June, May and April.
for month in ('june', 'may', 'april'):
    df_features_reg[f'utilization_{month}'] = df_features_reg[f'bill_amt_{month}'] / limit_with_epsilon


In [80]:
# Row-wise means over each regression column group (new column -> source columns).
reg_window_means = {
    'bill_amt_avg_3m': bill_cols_reg,
    'pay_amt_avg_2m': pay_cols_reg,
    'pay_status_avg_3m': pay_status_cols_reg,
}
for feature_name, source_cols in reg_window_means.items():
    df_features_reg[feature_name] = df_features_reg[source_cols].mean(axis=1)


In [81]:
# Trend of the bill amount over the three regression months, fitted over the
# reversed (April -> June) column order.
df_features_reg['bill_amt_slope_3m'] = df_features_reg.apply(
    calculate_slope, axis=1, args=(list(reversed(bill_cols_reg)),)
)

In [82]:
# Net debt in the previous months (available before the June payment)
for month in ('may', 'april'):
    df_features_reg[f'bill_minus_pay_{month}'] = (
        df_features_reg[f'bill_amt_{month}'] - df_features_reg[f'pay_amt_{month}']
    )

# Maximum and average delay status in the previous months (available before the June payment)
pay_status_cols_reg_safe = ['pay_may', 'pay_april']
prev_status = df_features_reg[pay_status_cols_reg_safe]
df_features_reg['pay_status_max_prev'] = prev_status.max(axis=1)
df_features_reg['pay_status_avg_prev'] = prev_status.mean(axis=1)

In [83]:
# — VIF for df_features —
import numpy as np
from statsmodels.tools.tools import add_constant
from statsmodels.stats.outliers_influence import variance_inflation_factor

# 1. Select the numeric columns (leaving out the ID and the target)
excluded_feat = ('ID', 'default_payment_next_month')
num_cols_feat = [
    name
    for name, dtype in df_features.dtypes.items()
    if name not in excluded_feat and np.issubdtype(dtype, np.number)
]

# Show which columns will be used
print("Columnas numéricas para VIF en df_features:", num_cols_feat)

# 2. Compute the VIF of every column of the design matrix (constant included)
X_feat = add_constant(df_features[num_cols_feat])
design_feat = X_feat.values
vif_values_feat = [
    variance_inflation_factor(design_feat, position)
    for position in range(design_feat.shape[1])
]
vif_feat = pd.DataFrame(
    {'variable': X_feat.columns, 'VIF': vif_values_feat}
).sort_values(by='VIF', ascending=False)

display(vif_feat)


Columnas numéricas para VIF en df_features: ['limit_bal', 'sex', 'education', 'marriage', 'age', 'pay_sept', 'pay_aug', 'pay_july', 'pay_june', 'pay_may', 'pay_april', 'bill_amt_sept', 'bill_amt_aug', 'bill_amt_july', 'bill_amt_june', 'bill_amt_may', 'bill_amt_april', 'pay_amt_sept', 'pay_amt_aug', 'pay_amt_july', 'pay_amt_june', 'pay_amt_may', 'pay_amt_april', 'utilization_sept', 'payment_ratio_sept', 'utilization_aug', 'payment_ratio_aug', 'utilization_july', 'payment_ratio_july', 'utilization_june', 'payment_ratio_june', 'utilization_may', 'payment_ratio_may', 'utilization_april', 'payment_ratio_april', 'bill_amt_avg', 'bill_amt_std', 'bill_amt_max', 'bill_amt_slope', 'pay_amt_slope', 'bill_minus_pay_sept', 'bill_minus_pay_aug', 'bill_minus_pay_july', 'bill_minus_pay_june', 'bill_minus_pay_may', 'bill_minus_pay_april', 'bill_minus_pay_avg', 'pay_status_max', 'pay_status_avg', 'months_with_delay', 'pay_status_slope', 'zero_payment_months']


  vif = 1. / (1. - r_squared_i)


Unnamed: 0,variable,VIF
16,bill_amt_may,inf
17,bill_amt_april,inf
18,pay_amt_sept,inf
19,pay_amt_aug,inf
12,bill_amt_sept,inf
20,pay_amt_july,inf
15,bill_amt_june,inf
14,bill_amt_july,inf
13,bill_amt_aug,inf
43,bill_minus_pay_july,inf


In [84]:
# — VIF for df_features_reg —
import numpy as np
from statsmodels.tools.tools import add_constant
from statsmodels.stats.outliers_influence import variance_inflation_factor

# 1. Select the numeric columns (leaving out the ID and the regression target)
# Any column dropped because of leakage should also be listed here.
# NOTE(review): only 'ID' and 'pay_amt_june' are excluded; every other numeric
# column of df_features_reg (including the later-month columns and
# 'default_payment_next_month') is kept — confirm this is intended.
excluded_reg = ('ID', 'pay_amt_june')
num_cols_reg = [
    name
    for name, dtype in df_features_reg.dtypes.items()
    if name not in excluded_reg and np.issubdtype(dtype, np.number)
]

# Show which columns will be used
print("Columnas numéricas para VIF en df_features_reg:", num_cols_reg)

# 2. Compute the VIF of every column of the design matrix (constant included)
X_reg = add_constant(df_features_reg[num_cols_reg])
design_reg = X_reg.values
vif_values_reg = [
    variance_inflation_factor(design_reg, position)
    for position in range(design_reg.shape[1])
]
vif_reg = pd.DataFrame(
    {'variable': X_reg.columns, 'VIF': vif_values_reg}
).sort_values(by='VIF', ascending=False)

display(vif_reg)


Columnas numéricas para VIF en df_features_reg: ['limit_bal', 'sex', 'education', 'marriage', 'age', 'pay_sept', 'pay_aug', 'pay_july', 'pay_june', 'pay_may', 'pay_april', 'bill_amt_sept', 'bill_amt_aug', 'bill_amt_july', 'bill_amt_june', 'bill_amt_may', 'bill_amt_april', 'pay_amt_sept', 'pay_amt_aug', 'pay_amt_july', 'pay_amt_may', 'pay_amt_april', 'default_payment_next_month', 'utilization_june', 'utilization_may', 'utilization_april', 'bill_amt_avg_3m', 'pay_amt_avg_2m', 'pay_status_avg_3m', 'bill_amt_slope_3m', 'bill_minus_pay_may', 'bill_minus_pay_april', 'pay_status_max_prev', 'pay_status_avg_prev']


  vif = 1. / (1. - r_squared_i)


Unnamed: 0,variable,VIF
17,bill_amt_april,inf
10,pay_may,inf
9,pay_june,inf
27,bill_amt_avg_3m,inf
28,pay_amt_avg_2m,inf
29,pay_status_avg_3m,inf
11,pay_april,inf
15,bill_amt_june,inf
16,bill_amt_may,inf
21,pay_amt_may,inf


In [63]:
# Output directory for the engineered datasets; created (with parents) if missing.
processed_data_path = project_root.joinpath("data", "processed")
processed_data_path.mkdir(parents=True, exist_ok=True)

In [64]:
# Write the classification feature set to CSV without the index column.
path_clasificacion = processed_data_path.joinpath("features_clasificacion.csv")
df_features.to_csv(path_or_buf=path_clasificacion, index=False)
print(f"DataFrame para clasificación guardado en: {path_clasificacion}")

DataFrame para clasificación guardado en: /Users/edusant/Desktop/personal/blue_tab/proyecto-riesgo-crediticio/data/processed/features_clasificacion.csv


In [65]:
# Write the regression feature set to CSV without the index column.
path_regresion = processed_data_path.joinpath("features_regresion.csv")
df_features_reg.to_csv(path_or_buf=path_regresion, index=False)
print(f"DataFrame para regresión guardado en: {path_regresion}")

DataFrame para regresión guardado en: /Users/edusant/Desktop/personal/blue_tab/proyecto-riesgo-crediticio/data/processed/features_regresion.csv
