In [9]:
import pandas as pd
import numpy as np
import boto3
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
import joblib
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation/future warnings raised
# by the libraries imported above — consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

In [4]:
# Session configuration: AWS clients and the name of the working bucket.
sts = boto3.client('sts')
s3 = boto3.client('s3')

# The bucket name ends with the account id reported by STS for the
# credentials in use.
account_id = sts.get_caller_identity()['Account']
bucket_name = f'ml-reestructuraciones-{account_id}'

print(f'Bucket: {bucket_name}')

Bucket: ml-reestructuraciones-029885540752


In [5]:
# Load the raw dataset produced by the EDA step from the project bucket.
ruta_raw = f's3://{bucket_name}/data/raw/maestria.csv'

print('Cargando datos...')
df = pd.read_csv(ruta_raw)

# Report (rows, columns) right after loading.
print(f'Shape inicial: {df.shape}')

Cargando datos...


severe performance issues, see also https://github.com/dask/dask/issues/10276

To fix, you should specify a lower version bound on s3fs, or
update the current installation.



Shape inicial: (711277, 45)


In [6]:
# Drop exact duplicate rows, then remove the columns that are not needed.
# NOTE: drop_duplicates keeps the original row labels, so the index of `df`
# is no longer a contiguous 0..n-1 range after this step.
df.drop_duplicates(inplace=True)

columnas_a_eliminar = [
    "oblg_dest",
    "mora_max_6m",
    "num_doc",
    "fecha_reest",
    "ult_mant",
    "nit",
    "cal_interna",
]

# Only ask pandas to drop the columns that are actually present, so this cell
# can be re-run without raising on already-removed columns.
columnas_existentes = [
    nombre for nombre in columnas_a_eliminar if nombre in df.columns
]
df.drop(columns=columnas_existentes, inplace=True)


In [10]:
# Fill missing values.
#
# Each column is reassigned with its filled version instead of calling
# `df[col].fillna(..., inplace=True)`: that chained in-place form operates on
# an intermediate Series, is deprecated in pandas 2.x and leaves `df`
# unchanged once Copy-on-Write is active (the default from pandas 3.0).

# Columns where a missing entry becomes the explicit "Desconocido" label.
columnas_desconocido = [
    "nivel_academico",
    "nivel_riesgo",
    "sub_segmento",
    "estado_civil",
    "genero",
    "ocupacion",
]
for columna in columnas_desconocido:
    df[columna] = df[columna].fillna("Desconocido")

# Columns imputed with their own median (median() skips missing values).
columnas_mediana = ["ano_nac", "ingresos_totales", "patrimonio"]
for columna in columnas_mediana:
    df[columna] = df[columna].fillna(df[columna].median())

# Columns where a missing value is replaced by 0.
columnas_cero = [
    "num_reest_ext",
    "num_oblg_mora_ext",
    "num_oblg_activa_ext",
    "num_oblg_embarg_ext",
    "vr_mora_total_ext",
    "cupo_total",
]
for columna in columnas_cero:
    df[columna] = df[columna].fillna(0)

In [11]:
# Names of the numeric feature columns. The order matters: it is the column
# order of the frame selected from `df` and of the scaled output.
vr_numericas = [
    "vr_total_reest",
    "avg_alt_mora_6m", "max_alt_mora_6m", "avg_std_mora_6m",
    "avg_meses_con_mora",
    "meses_mora_180plus", "meses_mora_90plus", "meses_mora_30plus",
    "max_sld_int_6m",
    "avg_saldo_capital_6m", "max_saldo_capital_6m",
    "max_saldo_vencido_30_6m", "max_saldo_vencido_90_6m",
    "ano_nac", "ingresos_totales", "patrimonio",
    "ipc", "tasa_desempleo", "pib", "tasa_interes",
    "num_reest_anteriores", "num_reest_ext",
    "num_oblg_mora_ext", "num_oblg_activa_ext", "num_oblg_embarg_ext",
    "vr_mora_total_ext", "cupo_total",
    "coef_tendencia_mora",
]

# Names of the columns that are one-hot encoded (after being cast to str).
vr_categoricas = [
    "segmento", "sub_segmento",
    "estado_civil", "genero", "ocupacion",
    "nivel_academico", "nivel_riesgo",
    "flag_tuvo_mora_90_dias", "flag_tuvo_mora_60_dias",
    "flag_tuvo_mora_30_dias", "flag_mora_recurrente",
]

In [12]:
# Standardize the numeric features.

# Work on a copy so the conversions below do not modify `df`.
data_numerics = df[vr_numericas].copy()

# Columns stored as object (or whose dtype name contains "decimal") are cast
# to float; every other column is passed through unchanged.
data_numerics = data_numerics.apply(
    lambda col: col.astype(float)
    if col.dtype == 'object' or 'decimal' in str(col.dtype)
    else col
)

# Any value that is still missing is replaced by the mean of its column.
data_numerics = data_numerics.fillna(data_numerics.mean())

# with_mean=False: values are scaled to unit variance but not centered.
# NOTE(review): the scaler is fitted on every row, before any train/test
# split, and is not persisted in this cell — confirm this is intended.
scaler = StandardScaler(with_mean=False)

# Keep the row labels of `df` on the scaled frame. fit_transform returns a bare
# array; wrapping it without `index=` would give it a fresh 0..n-1 index, which
# no longer matches `df` after duplicate rows were dropped, so any later
# index-based join with frames derived from `df` would pair the wrong rows.
df_numerics = pd.DataFrame(
    scaler.fit_transform(data_numerics),
    index=data_numerics.index,
    columns=vr_numericas,
)

In [13]:
# Feature matrix: scaled numeric columns followed by the one-hot encoded
# categorical columns (every categorical value is cast to str first).
# DataFrame.join aligns rows by index label, so both frames must carry the
# same index for the rows to line up.
dummies_categoricas = pd.get_dummies(df[vr_categoricas].astype(str))
X = df_numerics.join(dummies_categoricas)

In [14]:
# Target vector: the `cumple_6m` column of the cleaned frame.
y = df.loc[:, "cumple_6m"]

In [18]:
# Report the dimensions of the feature matrix and the target, followed by the
# number of rows per target value.
print(
    f'X shape: {X.shape}',
    f'y shape: {y.shape}',
    y.value_counts(),
    sep='\n',
)

X shape: (711189, 103)
y shape: (711189,)
cumple_6m
1    442242
0    268947
Name: count, dtype: int64


In [20]:
# Persist the processed features and the target to the project bucket.
# Both files are written without the index column.
ruta_X = f's3://{bucket_name}/data/features/X.csv'
ruta_y = f's3://{bucket_name}/data/features/y.csv'

X.to_csv(ruta_X, index=False)
y.to_csv(ruta_y, index=False)

print("Datos guardados correctamente")

Datos guardados correctamente
