# 02_preprocessing_local_v2.ipynb

Preprocesamiento basado en hallazgos de la EDA.

- `o` se descarta (72% nulos).
- `monto` → `log1p` para estabilizar cola larga.
- `g`: rare bucketing + codificación de frecuencia. Artefactos guardados.
- `j`: target encoding con smoothing, ajustado **sólo en TRAIN**. Artefactos guardados.
- `p`: binaria {Y=1, N=0}.
- Split estratificado 70/15/15 (train/val/test).
- Scaling (aunque los modelos de árboles no lo requieren) para mostrar el uso de artefactos.


In [1]:
import os, joblib
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [3]:
# --- Paths ---
# Location of the raw CSV and of the folders that receive the fitted artifacts
# and the processed train/val/test files.
RAW_PATH = "./MeLiFraud.csv"
ARTIFACTS_DIR = "./"
PROCESSED_DIR = "./"

# Make sure both output folders exist; exist_ok keeps a re-run of the cell harmless.
for _out_dir in (ARTIFACTS_DIR, PROCESSED_DIR):
    os.makedirs(_out_dir, exist_ok=True)

In [5]:
# --- Useful vars ---
TARGET = "fraude"  # name of the label column; it is split off from the features before modelling
RANDOM_STATE = 42  # seed passed to both train_test_split calls so the partitions are reproducible

In [7]:
# --- Load raw data ---
# Read the whole raw CSV into memory and report its dimensions.
df = pd.read_csv(RAW_PATH)
print("Shape raw:", df.shape)
# Last expression of the cell: shows the first two rows as a quick sanity check.
df.head(2)

Shape raw: (150000, 19)


Unnamed: 0,a,b,c,d,e,f,g,h,j,k,l,m,n,o,p,fecha,monto,score,fraude
0,4,0.6812,50084.12,50.0,0.0,20.0,AR,1,cat_d26ab52,0.365475,2479.0,952.0,1,,Y,2020-03-20 09:28:19,57.63,100,0
1,4,0.6694,66005.49,0.0,0.0,2.0,AR,1,cat_ea962fb,0.612728,2603.0,105.0,1,Y,Y,2020-03-09 13:58:28,40.19,25,0


In [9]:
# --- Basic cleaning ---
# This cell overwrites `df`, so each step is guarded to make a second run of the
# cell a no-op instead of an error or a silent loss of data.

# `o` is discarded (72% nulls per the notebook header). Skipping the drop when the
# column is already gone avoids the KeyError a re-run would otherwise raise.
if "o" in df.columns:
    df = df.drop(columns=["o"])

# Parse timestamps; values that cannot be parsed become NaT instead of raising.
df["fecha"] = pd.to_datetime(df["fecha"], errors="coerce")

# Encode `p` as Y -> 1, N -> 0 (any other value becomes NaN). Once encoded the
# column is numeric, and mapping it a second time would turn every value into NaN,
# so the mapping is applied only while the column still holds the raw labels.
if not pd.api.types.is_numeric_dtype(df["p"]):
    df["p"] = df["p"].map({"Y": 1, "N": 0})

In [11]:
# --- Stratified 70/15/15 split ---
# Separate the integer label from the feature columns.
y = df[TARGET].astype(int)
X = df.drop(columns=[TARGET])

# First cut: 70% train, 30% held out, stratified on the label.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.30, random_state=RANDOM_STATE, stratify=y
)
# Second cut: halve the held-out 30% into validation and test (15% of the total
# each), again stratified on the label.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, random_state=RANDOM_STATE, stratify=y_temp
)

print("Train:", X_train.shape, " Val:", X_val.shape, " Test:", X_test.shape)

Train: (105000, 17)  Val: (22500, 17)  Test: (22500, 17)


In [13]:
# Keep a float copy of the amount as `monto_raw`. Per the original note it is meant
# for the gain/profit calculation and is not used as a model feature.
# NOTE(review): the partitions come straight from train_test_split and are mutated
# in place here; if pandas emits SettingWithCopyWarning, take an explicit .copy() of
# each partition right after the split.
for part in (X_train, X_val, X_test):
    part["monto_raw"] = part["monto"].astype(float)

In [15]:
# --- g: rare bucketing + frequency encoding ---
RARE_THRESHOLD = 500  # categories with fewer rows than this are pooled into "Other"

def fit_g_frequency(col, rare_threshold=RARE_THRESHOLD):
    """Learn the rare-category list and the frequency map for a categorical column.

    Parameters
    ----------
    col : pandas.Series
        Category values to fit on; nulls are counted as a category of their own.
    rare_threshold : int
        Categories whose row count is strictly below this value are treated as rare.

    Returns
    -------
    dict
        ``"rare"``: list of the rare categories.
        ``"freq_map"``: relative frequency of every bucket after values listed in
        ``rare`` are replaced by ``"Other"`` and remaining nulls by ``"Missing"``.
    """
    counts = col.value_counts(dropna=False)
    below_threshold = (counts < rare_threshold).to_numpy()
    rare = counts.index[below_threshold].tolist()

    # Pool the rare categories first, then label whatever nulls are left.
    bucketed = col.mask(col.isin(rare), "Other")
    bucketed = bucketed.fillna("Missing")

    frequencies = bucketed.value_counts(normalize=True)
    return {"rare": rare, "freq_map": frequencies.to_dict()}

def transform_g_frequency(col, art):
    """Encode a categorical column with a previously fitted frequency artifact.

    Parameters
    ----------
    col : pandas.Series
        Raw category values; may contain nulls and categories absent at fit time.
    art : dict
        Artifact with keys ``"rare"`` (categories pooled at fit time) and
        ``"freq_map"`` (bucket label -> relative frequency).

    Returns
    -------
    pandas.Series
        Float frequencies aligned with the index of ``col``.

    Notes
    -----
    Values that are still outside ``freq_map`` after bucketing (categories never
    seen at fit time, or nulls when the fit data had none) are sent to the
    ``"Other"`` bucket. They previously received 0.0, a value that no fitted row
    can have. 0.0 remains the fallback only when the artifact has no ``"Other"``
    bucket at all.
    """
    freq_map = art["freq_map"]

    # Same bucketing as at fit time: rare -> "Other", remaining nulls -> "Missing".
    bucketed = col.where(~col.isin(art["rare"]), other="Other").fillna("Missing")

    # Anything the artifact does not know is treated like a rare category.
    unseen = ~bucketed.isin(list(freq_map))
    bucketed = bucketed.mask(unseen, "Other")

    return bucketed.map(freq_map).fillna(0.0)

# Fit the encoding on TRAIN only, then apply the same artifact to every partition.
g_art = fit_g_frequency(X_train["g"])
for part in (X_train, X_val, X_test):
    part["g_freq"] = transform_g_frequency(part["g"], g_art)

# Persist the fitted artifact next to the other preprocessing outputs.
joblib.dump(g_art, os.path.join(ARTIFACTS_DIR, "g_encoding.joblib"))
print("Artefacto g guardado")

Artefacto g guardado


In [17]:
# --- j: smoothed target encoding ---
def fit_target_encoding(series, y, m=200):
    """Learn a smoothed target-mean encoding for a categorical column.

    For each category the encoded value is
    ``(category_mean * category_count + global_mean * m) / (category_count + m)``,
    so categories with few rows are pulled towards the global mean.

    Parameters
    ----------
    series : pandas.Series
        Category values; nulls are grouped under the label ``"Missing"``.
    y : pandas.Series
        Target values, matched to ``series`` by position (``y.values`` is used).
    m : int or float
        Smoothing weight given to the global mean.

    Returns
    -------
    dict
        ``"map"`` (category -> encoded value), ``"global_mean"`` and ``"m"``.
    """
    prior = float(y.mean())

    labelled = pd.DataFrame({"cat": series.fillna("Missing"), "y": y.values})
    per_category = labelled.groupby("cat")["y"]
    cat_mean = per_category.mean()
    cat_count = per_category.count()

    smoothed = ((cat_mean * cat_count) + prior * m) / (cat_count + m)
    return {"map": smoothed.to_dict(), "global_mean": prior, "m": m}

def transform_target_encoding(series, art):
    """Apply a fitted target encoding to a categorical column.

    Nulls are looked up under ``"Missing"``; any category that is not a key of
    ``art["map"]`` falls back to ``art["global_mean"]``.

    Returns a Series aligned with the index of ``series``.
    """
    categories = series.fillna("Missing")
    encoded = categories.map(art["map"])
    return encoded.fillna(art["global_mean"])

# Fit on TRAIN only, then encode every partition with that single artifact.
# NOTE(review): TRAIN rows are encoded with statistics that include their own
# label; smoothing (m=200) dampens this, but an out-of-fold encoding for TRAIN
# would remove the optimistic bias. Confirm whether that matters downstream.
j_art = fit_target_encoding(X_train["j"], y_train, m=200)
for part in (X_train, X_val, X_test):
    part["j_te"] = transform_target_encoding(part["j"], j_art)

# Persist the fitted artifact next to the other preprocessing outputs.
joblib.dump(j_art, os.path.join(ARTIFACTS_DIR, "j_target_encoding.joblib"))
print("Artefacto j guardado")

Artefacto j guardado


In [19]:
# --- Feature engineering and imputation ---
# Log-transform the amount to stabilise its long right tail. Negative amounts are
# clipped to 0 first so log1p never receives a value below -1.
for part in (X_train, X_val, X_test):
    part["monto_log1p"] = np.log1p(np.clip(part["monto"].astype(float), a_min=0, a_max=None))

# Median imputation of every numeric column. The medians are learned on TRAIN only
# and reused for VAL/TEST, like the encoders and the scaler: imputing each partition
# with its own median lets held-out statistics leak into the features and cannot be
# reproduced at inference time, where a single row has no median of its own.
num_cols = X_train.select_dtypes(include=[np.number]).columns.tolist()
impute_medians = X_train[num_cols].median()
for part in (X_train, X_val, X_test):
    for c in num_cols:
        part[c] = part[c].fillna(impute_medians[c])

# Persist the medians so the exact same imputation can be replayed later.
joblib.dump(impute_medians.to_dict(), os.path.join(ARTIFACTS_DIR, "imputation_medians.joblib"))

In [21]:
# --- Scaling (fit on TRAIN, apply to VAL/TEST) ---
# Model features: every numeric column except the two amount columns in original
# units (monto_log1p carries the amount signal instead).
final_cols = [
    c
    for c in X_train.select_dtypes(include=[np.number]).columns
    if c != "monto_raw" and c != "monto"
]

# The scaler statistics come from TRAIN only.
scaler = StandardScaler().fit(X_train[final_cols])

# Wrap each transformed partition back into a DataFrame with the feature names.
# No index is passed, so the scaled frames get a fresh 0..n-1 index rather than the
# row labels of the partition they came from.
X_train_scaled, X_val_scaled, X_test_scaled = (
    pd.DataFrame(scaler.transform(part[final_cols]), columns=final_cols)
    for part in (X_train, X_val, X_test)
)

joblib.dump(scaler, os.path.join(ARTIFACTS_DIR, "scaler.pkl"))
print("Scaler guardado")

Scaler guardado


In [23]:
# --- Save processed datasets ---
def _assemble_output(scaled, part, target):
    """Return a copy of ``scaled`` with ``monto_raw`` and the label column appended.

    ``.values`` attaches both columns by position: ``scaled`` has a fresh 0..n-1
    index while ``part`` and ``target`` keep the row labels from the split.
    """
    out = scaled.copy()
    out["monto_raw"] = part["monto_raw"].values
    out[TARGET] = target.values
    return out

train_out = _assemble_output(X_train_scaled, X_train, y_train)
val_out = _assemble_output(X_val_scaled, X_val, y_val)
test_out = _assemble_output(X_test_scaled, X_test, y_test)

# One CSV per partition, written without the index column.
for frame, filename in (
    (train_out, "train_processed.csv"),
    (val_out, "val_processed.csv"),
    (test_out, "test_processed.csv"),
):
    frame.to_csv(os.path.join(PROCESSED_DIR, filename), index=False)
print("Procesados guardados (incluyen monto_raw).")

Procesados guardados (incluyen monto_raw).
