<a href="https://colab.research.google.com/github/dinav2/IA-avanzada-para-la-ciencia-de-datos/blob/main/Limpieza.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Importar Librerías

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from imblearn.combine import SMOTETomek
from collections import Counter
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split




## Funciones

In [3]:
def agrupar_categorias(df, threshold=0.01):
    """Collapse rare categories of every categorical column into ``'otras'``.

    For each column of dtype ``object`` or ``category``, every value whose
    relative frequency is strictly below ``threshold`` is replaced by the
    label ``'otras'``. Frequencies come from ``value_counts(normalize=True)``,
    so missing values are not counted and are left as they are.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified.
    threshold : float, default 0.01
        Minimum relative frequency a category needs in order to be kept.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with rare categories replaced by ``'otras'``.
    """
    # Work on a copy so the caller's frame is never modified as a side effect.
    result = df.copy()
    cat_cols = result.select_dtypes(include=['object', 'category']).columns
    for col in cat_cols:
        values = result[col]
        freqs = values.value_counts(normalize=True)
        rare_cats = list(freqs[freqs < threshold].index)
        if not rare_cats:
            continue
        if isinstance(values.dtype, pd.CategoricalDtype) and 'otras' not in values.cat.categories:
            # A categorical column only accepts labels it already knows about.
            values = values.cat.add_categories(['otras'])
        # Vectorised replacement instead of a per-row Python lambda.
        result[col] = values.mask(values.isin(rare_cats), 'otras')
    return result

In [4]:
def one_hot_encode(df):
    """One-hot encode every ``object``/``category`` column of ``df``.

    A fresh ``OneHotEncoder(sparse_output=False, drop='first')`` is fitted on
    the categorical columns of this frame only. The original categorical
    columns are removed and the encoded columns are appended on the right,
    keeping the row index of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with the non-categorical columns followed by the encoded
        columns (named via ``encoder.get_feature_names_out``).
    """
    cat_cols = df.select_dtypes(include=['object', 'category']).columns
    if len(cat_cols) == 0:
        # Nothing to encode: skip the encoder instead of fitting it on a
        # zero-column frame, and still hand back a new object like the
        # regular path does.
        return df.copy()

    encoder = OneHotEncoder(sparse_output=False, drop='first')
    encoded = encoder.fit_transform(df[cat_cols])
    encoded_df = pd.DataFrame(
        encoded,
        columns=encoder.get_feature_names_out(cat_cols),
        index=df.index,  # keep rows aligned with the original frame
    )
    return pd.concat([df.drop(columns=cat_cols), encoded_df], axis=1)


## Leer Datos

In [5]:
# Load the base dataset and its five variants, in this fixed order.
base, variantI, variantII, variantIII, variantIV, variantV = map(
    pd.read_csv,
    (
        'data/Base.csv',
        'data/Variant I.csv',
        'data/Variant II.csv',
        'data/Variant III.csv',
        'data/Variant IV.csv',
        'data/Variant V.csv',
    ),
)

## Eliminar Duplicados y Nulos

In [6]:
def _drop_duplicates_and_nulls(frame):
    """Return ``frame`` without duplicated rows and without rows containing nulls."""
    deduplicated = frame.drop_duplicates()
    return deduplicated.dropna()


# Same cleaning step for every dataset, one at a time.
base = _drop_duplicates_and_nulls(base)
variantI = _drop_duplicates_and_nulls(variantI)
variantII = _drop_duplicates_and_nulls(variantII)
variantIII = _drop_duplicates_and_nulls(variantIII)
variantIV = _drop_duplicates_and_nulls(variantIV)
variantV = _drop_duplicates_and_nulls(variantV)

In [7]:

# Columns to check for missing-value sentinels.
# handle_missing_with_fraud (defined below) treats -1 as "missing" in each of
# these columns, except "intended_balcon_amount", where any negative value
# counts as missing.
cols = [
    "prev_address_months_count",
    "current_address_months_count",
    "intended_balcon_amount",
    "bank_months_count",
    "session_length_in_minutes",
    "device_distinct_emails_8w"
]

def handle_missing_with_fraud(df, cols, target_col="fraud_bool", max_missing_frac=0.5):
    """Clean sentinel-encoded missing values, treating fraud rows differently.

    Each column in ``cols`` is processed in order:

    1. A value is considered missing when it equals ``-1``; for
       ``"intended_balcon_amount"`` any negative value is considered missing.
    2. If more than ``max_missing_frac`` of the current rows are missing, the
       whole column is dropped.
    3. Otherwise rows that are missing *and* have ``target_col == 0`` are
       removed, and rows that are missing *and* have ``target_col == 1`` get
       the median of the column's remaining non-missing values.

    Because rows are removed as the loop advances, the missing fraction of a
    later column is measured on the rows that survived the earlier columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; must contain ``target_col`` and every name in ``cols``.
    cols : iterable of str
        Columns to inspect.
    target_col : str, default "fraud_bool"
        Name of the 0/1 label column.
    max_missing_frac : float, default 0.5
        Columns whose missing fraction is strictly greater than this are dropped.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame (original row labels are kept, not reset).

    Raises
    ------
    KeyError
        If ``target_col`` or one of ``cols`` is not a column of ``df``.
    """

    def _is_missing(series, name):
        # "intended_balcon_amount" flags missing values with any negative
        # number; every other column uses the sentinel -1.
        if name == "intended_balcon_amount":
            return series < 0
        return series == -1

    for col in cols:
        missing_mask = _is_missing(df[col], col)

        # mean() of a boolean mask is the missing fraction; on an empty frame
        # it is NaN, so the comparison is simply False (no division by zero).
        if missing_mask.mean() > max_missing_frac:
            df = df.drop(columns=[col])
            continue

        # Remove non-fraud rows with a missing value. drop() returns a new
        # frame, so the caller's object is not touched by the writes below.
        drop_rows = (missing_mask & (df[target_col] == 0)).to_numpy()
        df = df.drop(index=df.index[drop_rows])

        # Rebuild the mask on the reduced frame rather than relying on index
        # alignment of the stale, longer mask.
        missing_mask = _is_missing(df[col], col)
        valid_values = df.loc[~missing_mask, col]
        if valid_values.empty:
            continue

        median_val = valid_values.median()
        if pd.api.types.is_integer_dtype(df[col]) and not float(median_val).is_integer():
            # A fractional median cannot be stored in an integer column; make
            # the upcast explicit instead of relying on implicit (deprecated)
            # dtype coercion during assignment.
            df[col] = df[col].astype("float64")
        df.loc[missing_mask & (df[target_col] == 1), col] = median_val

    return df


In [8]:
# Apply the sentinel-based missing-value cleaning to every dataset,
# always with the same list of columns.
base = handle_missing_with_fraud(df=base, cols=cols)
variantI = handle_missing_with_fraud(df=variantI, cols=cols)
variantII = handle_missing_with_fraud(df=variantII, cols=cols)
variantIII = handle_missing_with_fraud(df=variantIII, cols=cols)
variantIV = handle_missing_with_fraud(df=variantIV, cols=cols)
variantV = handle_missing_with_fraud(df=variantV, cols=cols)

## Agrupar Categorías con pocos valores

In [9]:
# Group rare categories in each dataset, using the function's default threshold.
# Frequencies are computed per dataset, so the set of categories that end up
# grouped can differ from one dataset to another.
base = agrupar_categorias(df=base)
variantI = agrupar_categorias(df=variantI)
variantII = agrupar_categorias(df=variantII)
variantIII = agrupar_categorias(df=variantIII)
variantIV = agrupar_categorias(df=variantIV)
variantV = agrupar_categorias(df=variantV)

## Normalizar

In [10]:
# Columns excluded from standardization: the scaling cell computes its
# numeric columns as "all numeric columns minus this list".
# NOTE(review): "month" is not a 0/1 flag despite the list's name; presumably
# it is listed here so it stays unscaled and the later `month.between(0, 5)`
# filter works on raw values. Confirm.
binary_cols = [
    "fraud_bool", "phone_home_valid", "phone_mobile_valid",
    "has_other_cards", "foreign_request", "keep_alive_session", "email_is_free", "month"
]

In [11]:

# Standardize the numeric, non-excluded columns of every dataset in place.
# Each dataset gets its own freshly fitted StandardScaler, so the means and
# variances used differ between datasets.
# NOTE(review): the scalers are fitted on whole datasets, before
# split_and_balance performs the train/test split. Confirm this is intended.
for frame in (base, variantI, variantII, variantIII, variantIV, variantV):
    num_cols = frame.select_dtypes(include=np.number).columns.difference(binary_cols)
    scaler = StandardScaler()
    frame[num_cols] = scaler.fit_transform(frame[num_cols])

# Drop the loop alias so it does not keep an extra reference to the last frame.
del frame

## One-Hot Encoding

In [12]:
# One-hot encode the categorical columns of every dataset.
# NOTE(review): one_hot_encode fits a separate encoder per call, so the
# resulting column sets may differ between datasets if their categories
# differ. Worth checking before the frames are concatenated later on.
base = one_hot_encode(df=base)
variantI = one_hot_encode(df=variantI)
variantII = one_hot_encode(df=variantII)
variantIII = one_hot_encode(df=variantIII)
variantIV = one_hot_encode(df=variantIV)
variantV = one_hot_encode(df=variantV)

## Aplicar SMOTE-Tomek  (Oversample y Undersample)

In [13]:
def split_and_balance(
    df,
    target_col="fraud_bool",
    test_size=0.3,
    random_state=42,
    save_train="train_balanced.csv",
    save_test_X="X_test.csv",
    save_test_y="y_test.csv",
    k_neighbors=2
):
    """Split ``df`` into train/test, rebalance the train part and save all to CSV.

    Steps:
      1. Stratified ``train_test_split`` on ``target_col``.
      2. Print the class counts of the train and test labels.
      3. Resample the train split with ``SMOTETomek`` wrapping a ``SMOTE``
         configured with ``sampling_strategy=0.2``.
      4. Write the resampled train set (features + target), the test features
         and the test labels to ``save_train``, ``save_test_X`` and
         ``save_test_y`` respectively (no index column).

    The test split is never resampled.

    Parameters
    ----------
    df : pandas.DataFrame
        Full dataset including ``target_col``.
    target_col : str
        Name of the label column.
    test_size : float
        Passed to ``train_test_split``.
    random_state : int
        Seed used for the split, SMOTE and SMOTETomek.
    save_train, save_test_X, save_test_y : str
        Output CSV paths.
    k_neighbors : int
        Passed to ``SMOTE``.

    Returns
    -------
    tuple
        ``(df_train_bal, X_test, y_test)``.
    """
    features = df.drop(columns=[target_col])
    labels = df[target_col]

    # Stratify on the label so both splits keep the original class ratio.
    X_train, X_test, y_train, y_test = train_test_split(
        features,
        labels,
        test_size=test_size,
        random_state=random_state,
        stratify=labels,
    )

    print("Distribución original en train:\n", y_train.value_counts())
    print("Distribución original en test:\n", y_test.value_counts())

    # SMOTE oversampling (sampling_strategy=0.2) combined with Tomek-link
    # cleaning, applied to the training split only.
    resampler = SMOTETomek(
        random_state=random_state,
        smote=SMOTE(
            random_state=random_state,
            k_neighbors=k_neighbors,
            sampling_strategy=0.2,
        ),
        n_jobs=-1,
    )
    X_resampled, y_resampled = resampler.fit_resample(X_train, y_train)

    # Reassemble features and target into a single training frame.
    df_train_bal = pd.DataFrame(X_resampled, columns=X_train.columns)
    df_train_bal[target_col] = y_resampled

    # Persist the three artifacts.
    df_train_bal.to_csv(save_train, index=False)
    pd.DataFrame(X_test, columns=X_test.columns).to_csv(save_test_X, index=False)
    pd.DataFrame(y_test, columns=[target_col]).to_csv(save_test_y, index=False)

    print(f"\n Train balanceado (20%) guardado en '{save_train}'")
    print(f" X_test guardado en '{save_test_X}'")
    print(f" y_test guardado en '{save_test_y}'")

    return df_train_bal, X_test, y_test

## Split and balance base

In [14]:

# df_train_bal, X_test, y_test = split_and_balance(base, target_col="fraud_bool", save_train="train_balanced_base.csv", save_test_X="X_test_base.csv", save_test_y="y_test_base.csv")

In [15]:
# Filtrar el dataframe base para incluir solo filas donde 'month' está entre 0 y 5
#base_months_0_5 = base[base['month'].between(0, 5)]

# Ahora puedes hacer el split y balanceo solo con estos datos
#df_train_bal_months_0_5, X_test_months_0_5, y_test_months_0_5 = split_and_balance(
    #base_months_0_5,
    #target_col="fraud_bool",
    #save_train="train_balanced_base_months_0_5.csv",
   # save_test_X="X_test_base_months_0_5.csv",
    #save_test_y="y_test_base_months_0_5.csv"
#)

## Combinar Base, VariantI y VariantII

In [1]:

# For each dataset: keep only rows whose month is between 0 and 5 (inclusive),
# then draw a reproducible random sample: 80% of base, 30% of variantI and
# 30% of variantII.
base_months_0_5 = (
    base.loc[base['month'].between(0, 5)]
    .sample(frac=0.8, random_state=42)
)
variantI_months_0_5 = (
    variantI.loc[variantI['month'].between(0, 5)]
    .sample(frac=0.3, random_state=42)
)
variantII_months_0_5 = (
    variantII.loc[variantII['month'].between(0, 5)]
    .sample(frac=0.3, random_state=42)
)

# Stack the three samples into one frame with a fresh 0..n-1 index.
combined_months_0_5 = pd.concat(
    [base_months_0_5, variantI_months_0_5, variantII_months_0_5],
    ignore_index=True,
)
print(combined_months_0_5.shape)

NameError: name 'base' is not defined

In [None]:


# Split and balance the combined months 0-5 data only; the three resulting
# CSV files are written under the names given below.
(
    df_train_bal_com_months_0_5,
    X_test_com_months_0_5,
    y_test_com_months_0_5,
) = split_and_balance(
    df=combined_months_0_5,
    target_col="fraud_bool",
    save_train="train_balanced_combined_months_0_5.csv",
    save_test_X="X_test_combined_months_0_5.csv",
    save_test_y="y_test_combined_months_0_5.csv",
)

Distribución original en train:
 fraud_bool
0    577505
1      7996
Name: count, dtype: int64
Distribución original en test:
 fraud_bool
0    247503
1      3427
Name: count, dtype: int64


In [None]:
#base_balanced = pd.read_csv('train_balanced_combined.csv')


In [None]:
#base_balanced.shape

(1446004, 45)

In [None]:
#base_balanced["fraud_bool"].value_counts()


fraud_bool
0    723002
1    723002
Name: count, dtype: int64