In [1]:
# === Config + helpers ===
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Paths (adjust if needed)
# Root directory under which the input CSVs live and the reports are written.
BASE = Path("/mnt/data")
# Dataset key -> location of that dataset's CSV file under BASE.
PATHS = {
    "zenodo_fahlenbrach": BASE / "zenodo_fahlenbrach_clean.csv",
    "icpsr_villanueva":   BASE / "icpsr_villanueva_clean.csv",
    "kaggle_yanmaksi":    BASE / "kaggle_yanmaksi_clean.csv",
}

# Report output directory and its plots subdirectory.
OUT_DIR = BASE / "reports"
PLOTS_DIR = OUT_DIR / "plots"
# Create both directories up front (including missing parents); directories
# that already exist are left as they are.
OUT_DIR.mkdir(parents=True, exist_ok=True)
PLOTS_DIR.mkdir(parents=True, exist_ok=True)

def pick_col(df, candidates):
    """Return the first entry of ``candidates`` that is a column of ``df``.

    Candidates are checked in the order given; ``None`` is returned when
    none of them is present.
    """
    present = (name for name in candidates if name in df.columns)
    return next(present, None)

def detect_money_cols(df):
    """Locate the soft-cap, hard-cap and raised-funds columns of ``df``.

    For each of the three quantities, columns whose name ends in ``_usd`` and
    contains the matching keyword are tried first (in DataFrame column
    order), followed by a fixed list of known spellings.  The first candidate
    that exists in ``df`` wins.

    Column labels that are not strings (e.g. integer labels or MultiIndex
    tuples) are skipped by the keyword scan instead of raising.

    Returns:
        A ``(soft, hard, raised)`` tuple of column names; an entry is
        ``None`` when no candidate for it is present in ``df``.
    """
    # Prefer normalised *_usd columns when they exist.  Only string labels
    # can be matched by suffix/keyword, so everything else is filtered out.
    usd_cols = [c for c in df.columns if isinstance(c, str) and c.endswith("_usd")]

    soft = pick_col(
        df,
        [c for c in usd_cols if "soft" in c]
        + ["softcap_usd", "soft_cap_usd", "soft_cap", "softcap", "soft cap"],
    )
    hard = pick_col(
        df,
        [c for c in usd_cols if "hard" in c]
        + ["hardcap_usd", "hard_cap_usd", "hard_cap", "hardcap", "hard cap"],
    )
    raised = pick_col(
        df,
        [c for c in usd_cols if "raised" in c or "funds" in c]
        + ["raised_usd", "total_raised_usd", "raised", "total raised",
           "total_raised", "funds raised", "amount raised"],
    )
    return soft, hard, raised

def series_stats(s):
    """Summarise a column as count / min / median / mean / max.

    The input is coerced with ``pd.to_numeric(..., errors="coerce")``, so
    anything non-numeric becomes NaN and is ignored by every statistic.  When
    no numeric value remains, the four statistics are NaN and the count is 0.

    Returns:
        A ``pd.Series`` indexed by ``count_non_null``, ``min``, ``median``,
        ``mean`` and ``max``.
    """
    numeric = pd.to_numeric(s, errors="coerce")
    valid = numeric.notna()
    has_values = valid.any()

    # (label, NaN-aware reducer) pairs, in output order.
    reducers = (
        ("min", np.nanmin),
        ("median", np.nanmedian),
        ("mean", np.nanmean),
        ("max", np.nanmax),
    )

    summary = {"count_non_null": int(valid.sum())}
    for label, fn in reducers:
        summary[label] = float(fn(numeric)) if has_values else np.nan
    return pd.Series(summary)

def class_balance_plot(df, ds_name, success_col, outdir):
    """Draw, save and show a bar chart of the value counts in ``success_col``.

    The column is coerced to numeric, NaNs are dropped, the remainder is cast
    to ``int`` and counted per value, with bars ordered by value.  The figure
    is saved as ``outdir / "<ds_name>_class_balance.png"`` at 140 dpi and then
    shown.

    Returns:
        The path of the saved PNG, or ``None`` when the column holds no
        numeric values (nothing is plotted in that case).
    """
    labels = pd.to_numeric(df[success_col], errors="coerce").dropna()
    counts = labels.astype(int).value_counts().sort_index()
    if counts.empty:
        return None

    plt.figure()
    # pandas draws on the current axes of the fresh figure and returns them.
    ax = counts.plot(kind="bar")
    ax.set_title(f"Class balance — {ds_name}")
    ax.set_xlabel("ico_successful (0=fail, 1=success)")
    ax.set_ylabel("count")
    plt.tight_layout()

    fig_path = outdir / f"{ds_name}_class_balance.png"
    plt.savefig(fig_path, dpi=140)
    plt.show()
    return fig_path

def money_histogram(df, col, ds_name, outdir):
    """Draw, save and show a 40-bin histogram of the numeric values in ``col``.

    The column is coerced to numeric and NaNs are dropped before plotting.
    The figure is saved as ``outdir / "<ds_name>_<col>_hist.png"`` at 140 dpi
    and then shown.

    Returns:
        The path of the saved PNG, or ``None`` when the column holds no
        numeric values (nothing is plotted in that case).
    """
    values = pd.to_numeric(df[col], errors="coerce").dropna()
    if values.empty:
        return None

    plt.figure()
    ax = plt.gca()
    ax.hist(values, bins=40)
    ax.set_title(f"{ds_name} — Histogram of {col}")
    ax.set_xlabel(col)
    ax.set_ylabel("count")
    plt.tight_layout()

    target = outdir / f"{ds_name}_{col}_hist.png"
    plt.savefig(target, dpi=140)
    plt.show()
    return target

# Last expression of the notebook cell: echoes the configured dataset paths.
PATHS


{'zenodo_fahlenbrach': WindowsPath('/mnt/data/zenodo_fahlenbrach_clean.csv'),
 'icpsr_villanueva': WindowsPath('/mnt/data/icpsr_villanueva_clean.csv'),
 'kaggle_yanmaksi': WindowsPath('/mnt/data/kaggle_yanmaksi_clean.csv')}