# EDA – Bank Account Fraud (BAF)

Análisis exploratorio de datos (EDA) sobre el BAF Dataset Suite.

In [None]:
import pandas as pd
import os

## Carga de datos local

Se cargan los 6 archivos CSV (base + 5 variantes) desde la carpeta `data/`. Por privacidad, estos datos están excluidos del repositorio mediante `.gitignore`.

In [None]:
# Local folder holding the CSV files, and the file names expected inside it.
DATA_DIR = "data"
files = [
    "Base.csv",
    "Variant I.csv",
    "Variant II.csv",
    "Variant III.csv",
    "Variant IV.csv",
    "Variant V.csv",
]

# Read every CSV into a DataFrame keyed by its file name and report its shape.
datasets = {}
for file in files:
    path = os.path.join(DATA_DIR, file)
    datasets[file] = df_temp = pd.read_csv(path)
    print(f"{file}: {df_temp.shape}")

## 1. Descripción general del dataset

- shape (filas/columnas)
- nombres y tipos de columnas
- nulos/missing y duplicados
- valores “missing” codificados como -1 (u otros negativos) en algunas columnas


In [None]:
# Report the number of rows and columns of every loaded dataset.
for name, df in datasets.items():
    row_count, col_count = df.shape
    print(f"{name}: {row_count:,} filas, {col_count} columnas")


In [None]:
# Column names and dtypes of Base.csv.
base_df = datasets["Base.csv"]
base_df.dtypes

In [None]:
# Column names and dtypes of Variant III.csv.
variant3_df = datasets["Variant III.csv"]
variant3_df.dtypes

In [None]:
# For each dataset, list the columns that contain NaN together with their
# counts (largest first), or state that there are none.
for name, df in datasets.items():
    print(f"-{name}")
    nan_per_column = df.isna().sum()
    columns_with_nan = nan_per_column.loc[nan_per_column.gt(0)]
    if columns_with_nan.empty:
        print("No hay NaN")
        continue
    print("Columnas con NaN y cantidad:")
    print(columns_with_nan.sort_values(ascending=False))

## 2. Resumen estadístico


In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Per-dataset summary statistics and histograms of every numeric column.
for name, df in datasets.items():
    print(f"- {name} ")
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()

    # Without numeric columns (or without rows) mode().iloc[0] raises
    # IndexError and plt.subplots() rejects a zero-row grid, so skip early.
    if not numeric_cols or df.empty:
        print("Sin columnas numéricas o sin filas; se omite el resumen.")
        continue

    # Select the numeric sub-frame once instead of once per statistic.
    numeric_df = df[numeric_cols]

    # Basic statistics. mode() may return several rows when values tie;
    # only its first row is kept.
    stats = pd.DataFrame({
        "media": numeric_df.mean(),
        "mediana": numeric_df.median(),
        "moda": numeric_df.mode().iloc[0],
        "desv": numeric_df.std(),
    })
    print(stats)

    # Histograms of all numeric columns on a grid that is 5 plots wide.
    ncols = 5
    nplots = len(numeric_cols)
    nrows = int(np.ceil(nplots / ncols))
    fig, axes = plt.subplots(nrows, ncols, figsize=(ncols*5, nrows*3))
    # Flatten the axes grid so it can be indexed with a single counter.
    axes = np.atleast_1d(axes).ravel()

    for i, col in enumerate(numeric_cols):
        axes[i].hist(numeric_df[col].dropna(), bins=30)
        axes[i].set_title(col)
        axes[i].set_ylabel("Frecuencia")

    # Hide the grid cells left over after the last histogram.
    for j in range(nplots, len(axes)):
        axes[j].axis("off")

    fig.suptitle(f"Histogramas – {name}", y=1.02)
    plt.tight_layout()
    plt.show()


## 3. Análisis de variables categóricas

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Per-dataset frequency tables and bar charts of every categorical
# (object-dtype) column, flagging categories with very few rows.
for name, df in datasets.items():
    print(f"- {name} ")

    cat_cols = df.select_dtypes(include=["object"]).columns.tolist()
    print(f"variables categóricas: {cat_cols}")

    # Count each column once; the same counts feed the frequency table,
    # the rare-category check and the bar chart below.
    freq_by_col = {col: df[col].value_counts(dropna=False) for col in cat_cols}

    # Frequency table for each categorical column.
    for col, value_counts in freq_by_col.items():
        print(f"\nFrecuencias para {col}:")
        print(value_counts)

    # plt.subplots() rejects a grid with zero rows, so there is nothing to
    # draw when the dataset has no categorical columns.
    if not cat_cols:
        continue

    ncols = 3
    nplots = len(cat_cols)
    nrows = int(np.ceil(nplots / ncols))

    fig, axes = plt.subplots(nrows, ncols, figsize=(ncols*6, nrows*4))
    # Flatten the axes grid so it can be indexed with a single counter.
    axes = np.atleast_1d(axes).ravel()

    # Categories holding fewer than 0.5% of the rows are reported as rare.
    rare_threshold = 0.005 * len(df)

    for i, (col, value_counts) in enumerate(freq_by_col.items()):
        few_data = value_counts[value_counts < rare_threshold]

        # Under-represented categories.
        if not few_data.empty:
            print(f"\n{col}: categorías con pocos datos:")
            print(few_data)

        axes[i].bar(value_counts.index.astype(str), value_counts.values)
        axes[i].set_title(col)
        axes[i].tick_params(axis="x", rotation=45)
        axes[i].set_ylabel("Frecuencia")

    # Hide the grid cells left over after the last bar chart.
    for j in range(nplots, len(axes)):
        axes[j].axis("off")

    fig.suptitle(f"Distribución de variables categóricas – {name}", y=1.02)
    plt.tight_layout()
    plt.show()

## 4. Relaciones entre variables

In [None]:
import matplotlib.pyplot as plt
import numpy as np

top_corr = 5  # number of most positively correlated pairs to report and plot

# Per-dataset correlation matrix of the numeric columns, followed by the
# pairs with the highest (positive) correlation and their scatter plots.
for name, df in datasets.items():
    print(f"- {name}")
    num_df = df.select_dtypes(include=[np.number])
    mat_corr = num_df.corr()
    print(mat_corr)

    # A pair needs two numeric columns; without them there is nothing to rank.
    if mat_corr.shape[0] < 2:
        print("\nNo hay pares de variables numéricas para correlacionar.")
        continue

    # Keep each unordered pair once: strictly-upper triangle, no diagonal.
    mask = np.triu(np.ones_like(mat_corr, dtype=bool), k=1)
    corr_pairs = (
        mat_corr.where(mask)
                .stack()
                # Drop masked/undefined entries explicitly rather than relying
                # on stack() doing it, which depends on the pandas version.
                .dropna()
                # Name the index levels and the value column explicitly instead
                # of renaming the auto-generated level_0 / level_1 / 0 labels.
                .rename_axis(["var1", "var2"])
                .reset_index(name="correlacion")
                .sort_values(by="correlacion", ascending=False)
    )

    print("\nMayor correlación (pos):")
    print(corr_pairs.head(top_corr).to_string(index=False))

    n = min(top_corr, len(corr_pairs))
    # plt.subplots() rejects a grid with zero rows (e.g. every correlation NaN).
    if n == 0:
        continue

    ncols = 3
    nrows = int(np.ceil(n / ncols))
    fig, axes = plt.subplots(nrows, ncols, figsize=(ncols*6, nrows*4))
    # Flatten the axes grid so it can be paired with the rows one by one.
    axes = np.atleast_1d(axes).ravel()

    for ax, (_, row) in zip(axes, corr_pairs.head(n).iterrows()):
        var1, var2, corr_val = row["var1"], row["var2"], row["correlacion"]
        ax.scatter(df[var1], df[var2], alpha=0.25, s=5)
        ax.set_xlabel(var1)
        ax.set_ylabel(var2)
        ax.set_title(f"{var1} vs {var2} (corr={corr_val:.2f})")

    # Hide the grid cells left over after the last scatter plot.
    for j in range(n, len(axes)):
        axes[j].axis("off")

    fig.suptitle(f"Scatter plots – Top correlaciones de {name}", y=1.02)
    plt.tight_layout()
    plt.show()


## 5. Detección de fraudes

In [None]:
import pandas as pd

# One summary row per dataset: absolute count and percentage share of each
# fraud_bool class (0 and 1), defaulting to 0 when a class is absent.
fraud_stats = []

for name, df in datasets.items():
    class_counts = df['fraud_bool'].value_counts()
    class_pct = (class_counts / len(df)) * 100
    summary_row = {
        "file": name,
        "no fraude": class_counts.get(0, 0),
        "fraude": class_counts.get(1, 0),
        "% nf": class_pct.get(0, 0),
        "% f": class_pct.get(1, 0),
    }
    fraud_stats.append(summary_row)

fraud_df = pd.DataFrame(fraud_stats)
print(fraud_df)

In [None]:
# For each dataset: mean of every numeric column per fraud_bool group, then
# the mean of fraud_bool within each level of every object-dtype column.
for name, df in datasets.items():
    print(f"- {name} ")

    numeric_names = df.select_dtypes(include=[np.number]).columns
    # The grouping column itself is left out of the averaged columns.
    num_cols = numeric_names.drop('fraud_bool', errors='ignore')
    by_label = df.groupby('fraud_bool')
    print("\nMedias de variables numéricas por grupo (fraude=1, no fraude=0):")
    print(by_label[num_cols].mean())

    for col in df.select_dtypes(include=['object']).columns:
        print(f"\nFraude por categoría en {col}:")
        fraud_rate = df.groupby(col)['fraud_bool'].mean()
        print(fraud_rate.sort_values(ascending=False))



In [None]:
import numpy as np
import matplotlib.pyplot as plt

TOP_NUM = 5          # how many numeric variables to show per dataset
MIN_COUNT_CAT = 100  # minimum number of rows a category needs to be plotted

# Per-dataset plots: the numeric variables whose means differ most between
# fraud and non-fraud rows, and the fraud rate per category for a couple of
# categorical columns.
for name, df in datasets.items():
    print(f"\n=== {name} ===")

    if 'fraud_bool' not in df.columns:
        print("No está 'fraud_bool'.")
        continue

    # -------------------------------
    # 1) NUMERIC: difference of means
    # -------------------------------
    num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    if 'fraud_bool' in num_cols:
        num_cols.remove('fraud_bool')

    # Mean of each numeric column per fraud_bool group.
    means_by_group = df.groupby('fraud_bool')[num_cols].mean()

    # Both classes must be present; otherwise .loc[1] / .loc[0] raises
    # KeyError and would also abort the categorical plots further down.
    if 0 in means_by_group.index and 1 in means_by_group.index:
        # Difference of means, fraud (1) minus non-fraud (0), ordered by
        # absolute size. These are raw, unstandardised differences, so
        # variables measured on larger scales tend to rank first.
        mean_diff = (means_by_group.loc[1] - means_by_group.loc[0]).sort_values(
            key=lambda s: s.abs(), ascending=False
        )

        # Plot the TOP_NUM largest differences; reversed so the largest bar
        # ends up at the top of the horizontal chart.
        top = mean_diff.head(TOP_NUM)
        plt.figure(figsize=(8, 4 + 0.4*len(top)))
        plt.barh(top.index[::-1], top.values[::-1])
        plt.axvline(0, linestyle='--')
        plt.title(f"Diferencia de medias (fraude - no fraude) • {name}")
        plt.xlabel("Diferencia de medias")
        plt.ylabel("Variable numérica")
        plt.tight_layout()
        plt.show()
    else:
        print("Falta alguna clase de 'fraud_bool'; se omite la diferencia de medias.")

    # -------------------------------
    # 2) CATEGORICAL: fraud rate per category
    #    (payment_type and device_os, when present)
    # -------------------------------
    for cat_col in ['payment_type', 'device_os']:
        if cat_col not in df.columns:
            continue

        vc = df[cat_col].value_counts()
        # Keep only categories with at least MIN_COUNT_CAT rows.
        keep_levels = vc[vc >= MIN_COUNT_CAT].index
        sub = df[df[cat_col].isin(keep_levels)]

        if sub.empty:
            continue

        rates = sub.groupby(cat_col)['fraud_bool'].mean().sort_values(ascending=False)

        # Reversed so the highest rate ends up at the top of the chart.
        plt.figure(figsize=(8, 4 + 0.35*len(rates)))
        plt.barh(rates.index[::-1], rates.values[::-1])
        plt.title(f"Tasa de fraude por categoría – {cat_col} • {name}")
        plt.xlabel("Tasa de fraude")
        plt.ylabel(cat_col)
        plt.tight_layout()
        plt.show()