In [1]:
%%time
import pandas as pd, numpy as np, re
from datetime import datetime

IN_PATH  = "../join/ico_union_wide.csv"
OUT_PATH = "ico_union_canonical.csv"

# --- Load ---
df = pd.read_csv(IN_PATH)
df.columns = df.columns.str.strip()

# --- Suffix config (prioridad de fuentes) ---
SUFFIXES = ["_zenodo", "_icpsr", "_yan"]  # prioridad por orden

# --- Utilities ---
def split_base_and_suffix(col: str):
    for s in SUFFIXES:
        if col.endswith(s):
            return col[:-len(s)], s
    return col, ""  # columnas sin sufijo

# agrupar columnas por "base"
base_to_cols = {}
for c in df.columns:
    b, s = split_base_and_suffix(c)
    base_to_cols.setdefault(b, []).append(c)

def _first_nonnull(series_list):
    """coalesce: primera serie con dato no nulo por fila."""
    if not series_list:
        return pd.Series([np.nan]*len(df))
    out = pd.Series([np.nan]*len(df))
    for s in series_list:
        if s is None: 
            continue
        if isinstance(s, str) and s in df.columns:
            v = df[s]
        elif isinstance(s, pd.Series):
            v = s
        else:
            continue
        out = out.where(~out.isna(), v)
    return out

def choose_cols_by_priority(base_name, prefer_numeric=False, regex=False):
    """
    Devuelve lista de columnas (nombres) para un base_name, ordenadas por prioridad de dataset.
    - Si regex=True, base_name es un patrón y trae todas las bases que 'matchean'.
    """
    candidates = []
    # matching de base exacto o por regex
    bases = []
    if regex:
        pat = re.compile(base_name, flags=re.IGNORECASE)
        bases = [b for b in base_to_cols.keys() if pat.search(b)]
    else:
        if base_name in base_to_cols:
            bases = [base_name]
        else:
            # fallback: probar case-insensitive
            for b in base_to_cols:
                if b.lower() == base_name.lower():
                    bases = [b]; break
    # por cada base, ordenar por prioridad de sufijo
    for b in bases:
        cols = base_to_cols[b]
        # separar con sufijo y sin sufijo
        with_suf = [c for c in cols if any(c.endswith(s) for s in SUFFIXES)]
        no_suf   = [c for c in cols if c not in with_suf]
        # ordenar los con sufijo por prioridad
        ordered = []
        for s in SUFFIXES:
            ordered += [c for c in with_suf if c.endswith(s)]
        ordered += no_suf  # al final, sin sufijo
        # ordenar numéricos antes si se pide
        if prefer_numeric:
            numeric_first = [c for c in ordered if pd.api.types.is_numeric_dtype(df[c])]
            non_numeric   = [c for c in ordered if not pd.api.types.is_numeric_dtype(df[c])]
            ordered = numeric_first + non_numeric
        candidates += ordered
    return candidates

def parse_money_like(s):
    """intenta homogenizar strings de montos a float."""
    if pd.isna(s): return np.nan
    x = str(s).strip().lower().replace(",", "").replace("$", "")
    try:
        mult = 1
        if "billion" in x or (re.search(r"\d", x) and x.endswith("b")): mult = 1_000_000_000
        elif "million" in x or (re.search(r"\d", x) and x.endswith("m")): mult = 1_000_000
        elif re.search(r"\d", x) and x.endswith("k"): mult = 1_000
        nums = re.findall(r"[\d.]+", x)
        return float(nums[0]) * mult if nums else np.nan
    except Exception:
        return np.nan

def clean_date_like(s):
    if pd.isna(s): return np.nan
    x = str(s).strip()
    x = re.sub(r"^(ended|end|finished|finalized)\s*:?\s*", "", x, flags=re.IGNORECASE)
    x = x.replace("—","-").replace("–","-")
    return x

def parse_date_series(series, formats=("%d %b %Y", "%Y-%m-%d", "%d/%m/%Y", "%b %d, %Y")):
    """intenta parsear fechas a Timestamp, probando varios formatos."""
    out = pd.to_datetime(series, errors="coerce", dayfirst=True)
    # si sigue muy NaT y hay strings, intentar otros formatos
    if out.isna().any():
        s = series.astype(str)
        for fmt in formats:
            mask = out.isna()
            try:
                out.loc[mask] = pd.to_datetime(s[mask], format=fmt, errors="coerce", dayfirst=True)
            except Exception:
                pass
    return out

def boolify(x):
    if pd.isna(x): return np.nan
    if isinstance(x, (int, float)) and not pd.isna(x):
        return int(float(x) != 0)
    s = str(x).strip().lower()
    if s in {"1","true","yes","y","si","sí"}: return 1
    if s in {"0","false","no","n"}: return 0
    return np.nan

def extract_min_max(s):
    """parsea ranges tipo '0.1-10 ETH' -> (min, max) (en unidades sin conversión de divisa)."""
    if pd.isna(s): return (np.nan, np.nan)
    x = str(s).lower()
    nums = re.findall(r"[\d.]+", x)
    if not nums: return (np.nan, np.nan)
    if len(nums) == 1:
        v = float(nums[0]); return (v, v)
    return (float(nums[0]), float(nums[1]))

# --- Canonical schema (muy amplio) ---
CANON = {
    # Identidad
    "name_std":           [["name_std"]],
    "symbol_std":         [["symbol_std"]],

    # Fechas (start/end)
    "ico_start_date":     [["start_date","start","ico_start","sale_start","preico_start","token_sale_start"],
                           [r"start_end_date_coin_sell", r"date_range", r"ico_dates"]],  # tomaremos el extremo izquierdo si viene rango
    "ico_end_date":       [["end_date_parsed","end_date","end","ico_end","sale_end","token_sale_end"],
                           [r"start_end_date_coin_sell", r"date_range", r"ico_dates"]],  # tomaremos el extremo derecho si viene rango

    # Recaudación y objetivos
    "goal_usd":           [["fundraising_goal","goal","soft_cap","softcap","target"]],
    "hard_cap_usd":       [["hard_cap","hardcap","max_cap","maximum_cap"]],
    "amount_raised_usd":  [["received_money","amount_raised","raised","raised_usd","received_money.1"]],  # variantes
    "ico_successful":     [["ico_successful","success","successful"]],

    # Tokenomics
    "token_price_usd":    [["ico_token_price","token_price"]],
    "total_tokens":       [["total_tokens","supply_total","token_supply_total"]],
    "tokens_for_sale":    [["available_for_token_sale","token_sale_amount","for_sale"]],
    "min_investment_raw": [["min_investment","min_max_personal_cap","minimum_investment"]],
    "max_investment_raw": [["max_investment","min_max_personal_cap","maximum_investment"]],
    "token_type":         [["token_type","type"]],
    "role_of_token":      [["role_of_token","role"]],

    # Acceso / Compliance / Jurisdicción
    "whitelist":          [["whitelist"]],
    "kyc":                [["kyc"]],
    "jurisdiction":       [["jurisdiction","country"]],
    "accepts":            [["accepts","currencies_accepted"]],

    # Señales de ejecución / presencia
    "has_github":         [["has_github","github","github_available"]],
    "has_telegram":       [["has_telegram","telegram"]],
    "has_reddit":         [["has_reddit","reddit"]],
    "website_available":  [["website_available","website","site"]],

    # Equipo / rating / interés / docs
    "team_size":          [["team_size","teamsize"]],
    "rating":             [["rating","score","ico_rating"]],
    "interest":           [["interest"]],
    "discount_max_pct":   [["crowdsale max. discount","max_discount","discount"]],
    "roadmap_available":  [["development road map available","roadmap_available","has_roadmap"]],
    "whitepaper_available":[["whitepaper_available","whitepaper","has_whitepaper"]],
}

# --- Resolver columnas por base + prioridad fuente ---
def resolve_base_to_series(base_tokens):
    """
    base_tokens: lista de bases (strings exactos) o patrones regex (si empiezan con '^' o contienen '.*')
    Devuelve lista de columnas reales en orden de prioridad de fuente.
    """
    cols = []
    for bt in base_tokens:
        if re.search(r"[\^\$\.\*\+\|\(\)\[\]\?]", bt, flags=re.I):  # patrón regex
            cols += choose_cols_by_priority(bt, regex=True)
        else:
            cols += choose_cols_by_priority(bt, regex=False)
    # quitar duplicados manteniendo orden
    seen = set(); cols_unique = []
    for c in cols:
        if c not in seen and c in df.columns:
            seen.add(c); cols_unique.append(c)
    return cols_unique

# --- Construcción del DataFrame canónico ---
out = pd.DataFrame(index=df.index)

# Identidad directa
for canon, groups in CANON.items():
    if canon in ["name_std","symbol_std"]:
        cols = resolve_base_to_series(groups[0])
        out[canon] = _first_nonnull(cols)
    else:
        out[canon] = np.nan  # inicializamos

# Fechas: start/end con soporte de rango
def pick_date_left_right():
    # Start: primero intentamos columnas de inicio directas
    start_cols = resolve_base_to_series(CANON["ico_start_date"][0])
    start_series = _first_nonnull(start_cols)
    # Si todo NaT, intentamos rango (tomar izquierda del rango)
    if start_series.isna().all() and len(CANON["ico_start_date"])>1:
        rng_cols = resolve_base_to_series(CANON["ico_start_date"][1])
        if rng_cols:
            left = df[rng_cols[0]].astype(str).str.extract(r"^\s*([^-–—|to]+)", expand=False).map(clean_date_like)
            start_series = left
    start_series = parse_date_series(start_series)

    # End: primero columnas de fin directas
    end_cols = resolve_base_to_series(CANON["ico_end_date"][0])
    end_series = _first_nonnull(end_cols)
    # si NaT, intentar derecha del rango
    if end_series.isna().all() and len(CANON["ico_end_date"])>1:
        rng_cols = resolve_base_to_series(CANON["ico_end_date"][1])
        if rng_cols:
            right = df[rng_cols[0]].astype(str).str.extract(r"[-–—|to]\s*(.*)$", expand=False).map(clean_date_like)
            end_series = right
    end_series = parse_date_series(end_series)
    return start_series, end_series

out["ico_start_date"], out["ico_end_date"] = pick_date_left_right()

# Numéricos principales (coalesce con preferencia de columnas numéricas)
def coalesce_numeric(cand_groups):
    cols = []
    for g in cand_groups:
        cols += resolve_base_to_series(g)
    # ordenar numeric dtype primero por cada grupo de prioridad
    numeric_first = [c for c in cols if pd.api.types.is_numeric_dtype(df[c])]
    others = [c for c in cols if c not in numeric_first]
    cols_ordered = numeric_first + others
    ser = _first_nonnull(cols_ordered)
    # intentar parseo para strings tipo "3M"
    ser = ser.apply(parse_money_like)
    return ser

out["goal_usd"]          = coalesce_numeric(CANON["goal_usd"])
out["hard_cap_usd"]      = coalesce_numeric(CANON["hard_cap_usd"])
out["amount_raised_usd"] = coalesce_numeric(CANON["amount_raised_usd"])

# Etiqueta de éxito: si hay varias, priorizamos por fuentes (implícito en union_wide)
succ_cols = resolve_base_to_series(CANON["ico_successful"][0])
succ_series = _first_nonnull(succ_cols)
out["ico_successful"] = succ_series.map(boolify)

# Tokenomics
out["token_price_usd"] = coalesce_numeric(CANON["token_price_usd"])
out["total_tokens"]    = coalesce_numeric(CANON["total_tokens"])
out["tokens_for_sale"] = coalesce_numeric(CANON["tokens_for_sale"])

# Min/Max investment: si no hay columnas directas, intentar parsear min_max_personal_cap
min_cols = resolve_base_to_series(CANON["min_investment_raw"][0])
max_cols = resolve_base_to_series(CANON["max_investment_raw"][0])

if not min_cols and not max_cols:
    # buscar cualquier base que contenga 'min_max_personal_cap'
    mm = choose_cols_by_priority(r"min[_\- ]?max[_\- ]?personal[_\- ]?cap", regex=True)
    if mm:
        mn, mx = zip(*df[mm[0]].map(extract_min_max))
        out["min_investment_usd"] = pd.to_numeric(mn, errors="coerce")
        out["max_investment_usd"] = pd.to_numeric(mx, errors="coerce")
else:
    if min_cols:
        out["min_investment_usd"] = coalesce_numeric([min_cols])
    if max_cols:
        out["max_investment_usd"] = coalesce_numeric([max_cols])

# Categóricas / flags
for canon in ["token_type","role_of_token","jurisdiction","accepts","interest","rating"]:
    cols = []
    for g in CANON[canon]:
        cols += resolve_base_to_series(g)
    out[canon] = _first_nonnull(cols)

for canon in ["whitelist","kyc","has_github","has_telegram","has_reddit","website_available",
              "roadmap_available","whitepaper_available"]:
    cols = []
    for g in CANON[canon]:
        cols += resolve_base_to_series(g)
    out[canon] = _first_nonnull(cols).map(boolify)

# Descuento crowd-sale
disc_cols = []
for g in CANON["discount_max_pct"]:
    disc_cols += resolve_base_to_series(g)
disc = _first_nonnull(disc_cols)
disc = disc.astype(str).str.extract(r"([\d.]+)", expand=False)
out["discount_max_pct"] = pd.to_numeric(disc, errors="coerce")

# Reglas derivadas
out["hit_softcap"] = ((out["amount_raised_usd"] >= out["goal_usd"]) & out["amount_raised_usd"].notna() & out["goal_usd"].notna()).astype("Int64")
out["hit_hardcap"] = ((out["amount_raised_usd"] >= out["hard_cap_usd"]) & out["amount_raised_usd"].notna() & out["hard_cap_usd"].notna()).astype("Int64")

# Orden de columnas final (identidad -> fechas -> funding -> tokenomics -> acceso -> señales -> equipo/rating/docs)
ordered_cols = [
    "name_std","symbol_std",
    "ico_start_date","ico_end_date",
    "goal_usd","hard_cap_usd","amount_raised_usd","ico_successful","hit_softcap","hit_hardcap",
    "token_price_usd","total_tokens","tokens_for_sale","min_investment_usd","max_investment_usd",
    "token_type","role_of_token","whitelist","kyc","jurisdiction","accepts",
    "has_github","has_telegram","has_reddit","website_available",
    "team_size","rating","interest","discount_max_pct","roadmap_available","whitepaper_available",
]
# añade cualquier columna canónica que haya quedado fuera por no existir
ordered_cols = [c for c in ordered_cols if c in out.columns] + [c for c in out.columns if c not in ordered_cols]

out = out[ordered_cols].copy()

# --- Diagnostics ---
def missing_pct(s): 
    return round(100*s.isna().mean(), 2)

report = pd.DataFrame({
    "column": out.columns,
    "dtype": [str(out[c].dtype) for c in out.columns],
    "missing_%": [missing_pct(out[c]) for c in out.columns]
}).sort_values(["missing_%","column"], ascending=[False, True])

print(f"Filas: {len(out):,}  |  Columnas canónicas: {out.shape[1]}")
print("\nTop 20 columnas con más missing (%):")
display(report.head(20))

# --- Guardar ---
out.to_csv(OUT_PATH, index=False)
print(f"\n✅ Guardado canónico: {OUT_PATH}")


Filas: 2,692  |  Columnas canónicas: 33

Top 20 columnas con más missing (%):




Unnamed: 0,column,dtype,missing_%
21,has_github,float64,100.0
23,has_reddit,float64,100.0
22,has_telegram,float64,100.0
2,ico_start_date,datetime64[ns],100.0
32,max_investment_raw,float64,100.0
31,min_investment_raw,float64,100.0
26,rating,float64,100.0
25,team_size,float64,100.0
24,website_available,float64,100.0
17,whitelist,float64,100.0



✅ Guardado canónico: ico_union_canonical.csv
CPU times: total: 906 ms
Wall time: 1.02 s


In [10]:
%%time
import pandas as pd, numpy as np, re
from datetime import datetime

# ------------------ Config ------------------
IN_PATH  = "../join/ico_union_wide.csv"
OUT_PATH = "ico_union_canonical.csv"   # cambialo si querés

# ------------------ Load --------------------
df = pd.read_csv(IN_PATH)
df.columns = df.columns.str.strip()

# prioridad de fuentes (izq→der)
SUFFIXES = ["_zenodo", "_icpsr", "_yan"]

def split_base_and_suffix(col: str):
    for s in SUFFIXES:
        if col.endswith(s):
            return col[:-len(s)], s
    return col, ""   # sin sufijo

# indexar base -> columnas
base_to_cols = {}
for c in df.columns:
    b, s = split_base_and_suffix(c)
    base_to_cols.setdefault(b, []).append(c)

# ------------------ Helpers -----------------
def money_like_to_float(s):
    if pd.isna(s): return np.nan
    x = str(s).strip().lower().replace(",", "").replace("$", "")
    try:
        mult = 1
        if "billion" in x or (re.search(r"\d", x) and x.endswith("b")): mult = 1_000_000_000
        elif "million" in x or (re.search(r"\d", x) and x.endswith("m")): mult = 1_000_000
        elif re.search(r"\d", x) and x.endswith("k"): mult = 1_000
        nums = re.findall(r"[\d.]+", x)
        return float(nums[0]) * mult if nums else np.nan
    except Exception:
        return np.nan

def clean_date_like(s):
    if pd.isna(s): return np.nan
    x = str(s).strip()
    x = re.sub(r"^(ended|end|finished|finalized)\s*:?\s*", "", x, flags=re.IGNORECASE)
    x = x.replace("—","-").replace("–","-")
    return x

def boolify(x):
    if pd.isna(x): return np.nan
    if isinstance(x, (int, float)) and not pd.isna(x):
        return int(float(x) != 0)
    s = str(x).strip().lower()
    if s in {"1","true","yes","y","si","sí","present","available","ok"}: return 1
    if s in {"0","false","no","n","absent","unavailable","none"}: return 0
    return np.nan

def _first_nonnull(series_list):
    """Coalesce alineando SIEMPRE contra df.index."""
    out = pd.Series(np.nan, index=df.index)
    for s in series_list or []:
        if s is None: 
            continue
        if isinstance(s, str):
            if s in df.columns: v = df[s]
            else:               continue
        elif isinstance(s, pd.Series):
            v = s
        else:
            continue
        v = pd.Series(v).reindex(out.index)  # <- clave para evitar shape mismatch
        out = out.where(~out.isna(), v)
    return out

def order_by_priority(cols):
    """*_zenodo > *_icpsr > *_yan > sin sufijo, deduplicado y existentes."""
    with_suf = [c for c in cols if any(c.endswith(s) for s in SUFFIXES)]
    no_suf   = [c for c in cols if c not in with_suf]
    ordered = []
    for s in SUFFIXES:
        ordered += [c for c in with_suf if c.endswith(s)]
    ordered += no_suf
    seen, uniq = set(), []
    for c in ordered:
        if c in df.columns and c not in seen:
            uniq.append(c); seen.add(c)
    return uniq

def find_cols_by_regex(patterns):
    """patterns: lista de regex; devuelve columnas reales (con sufijos) ordenadas por prioridad."""
    cols = []
    for pat in patterns:
        r = re.compile(pat, flags=re.IGNORECASE)
        for base in base_to_cols.keys():
            if r.search(base):
                cols += base_to_cols[base]
    return order_by_priority(cols)

def coalesce_numeric_from_patterns(pattern_groups):
    cols = []
    for group in pattern_groups:
        cols += find_cols_by_regex(group)
    if not cols:
        return pd.Series(np.nan, index=df.index)
    ser = _first_nonnull(cols)
    # intentar convertir directo
    ser_num = pd.to_numeric(ser, errors="coerce")
    # si hay demasiados NaN, probar parseo tipo "3M"
    if ser_num.notna().sum() < 0.2 * len(ser_num):
        ser_num = ser.apply(money_like_to_float)
    return ser_num

def coalesce_text_from_patterns(pattern_groups):
    cols = []
    for group in pattern_groups:
        cols += find_cols_by_regex(group)
    if not cols:
        return pd.Series(np.nan, index=df.index)
    ser = _first_nonnull(cols).astype(str).replace({"nan": np.nan})
    return ser

def coalesce_bool_from_patterns(pattern_groups):
    cols = []
    for group in pattern_groups:
        cols += find_cols_by_regex(group)
    if not cols:
        return pd.Series(pd.array([pd.NA]*len(df), dtype="Int64"), index=df.index)
    ser = _first_nonnull(cols).map(boolify).astype("Int64")
    return ser

def extract_min_max(s):
    if pd.isna(s): return (np.nan, np.nan)
    x = str(s).lower()
    nums = re.findall(r"[\d.]+", x)
    if not nums: return (np.nan, np.nan)
    if len(nums) == 1:
        v = float(nums[0]); return (v, v)
    return (float(nums[0]), float(nums[1]))

def parse_dates_robust(s: pd.Series) -> pd.Series:
    """Parsea fechas con intentos explícitos (sin warnings) y luego fallback."""
    s = pd.Series(s, index=df.index)
    parsed = pd.to_datetime(s, format="%d %b %Y", errors="coerce", dayfirst=True)
    for fmt in ("%Y-%m-%d", "%d/%m/%Y", "%b %d, %Y"):
        need = parsed.isna()
        if need.any():
            parsed.loc[need] = pd.to_datetime(s[need], format=fmt, errors="coerce", dayfirst=True)
    need = parsed.isna()
    if need.any():
        parsed.loc[need] = pd.to_datetime(s[need], errors="coerce", dayfirst=True)
    return parsed

# ------------------ Aliases (regex) -----------------
ALIASES = {
    # Identidad
    "name_std":   [[r"^name_std$"]],
    "symbol_std": [[r"^symbol_std$"]],

    # Fechas
    "start_direct": [[r"^start_date", r"\bico_start\b", r"sale_start", r"preico_start"]],
    "end_direct":   [[r"^end_date_parsed$", r"^end_date\b", r"\bico_end\b", r"sale_end", r"token_sale_end"]],
    "date_range":   [[r"start_end_date_coin_sell", r"ico_dates", r"date_range"]],

    # Recaudación / objetivos
    "goal_usd":           [[r"fundraising[_ ]?goal", r"^goal$", r"soft[_ ]?cap", r"softcap", r"target"]],
    "hard_cap_usd":       [[r"hard[_ ]?cap", r"max[_ ]?cap", r"maximum[_ ]?cap"]],
    "amount_raised_usd":  [[r"received[_ ]?money(\.1)?$", r"amount[_ ]?raised(_usd)?$", r"^raised(_usd)?$"]],

    # Etiqueta éxito
    "ico_successful":     [[r"ico[_ ]?success(ful)?$", r"^success$", r"^successful$"]],

    # Tokenomics
    "token_price_usd":    [[r"ico[_ ]?token[_ ]?price", r"token[_ ]?price"]],
    "total_tokens":       [[r"total[_ ]?tokens", r"token[_ ]?supply[_ ]?total"]],
    "tokens_for_sale":    [[r"available[_ ]?for[_ ]?token[_ ]?sale", r"tokens?[_ ]?for[_ ]?sale", r"tokensfsale"]],
    "min_investment":     [[r"min[_ ]?investment", r"minimum[_ ]?investment", r"min[_ ]?max[_ ]?personal[_ ]?cap"]],
    "max_investment":     [[r"max[_ ]?investment", r"maximum[_ ]?investment", r"min[_ ]?max[_ ]?personal[_ ]?cap"]],
    "token_type":         [[r"token[_ ]?type", r"utility[_ ]?token[_ ]?enables", r"role[_ ]?of[_ ]?token", r"\btype\b"]],
    "role_of_token":      [[r"role[_ ]?of[_ ]?token", r"\brole\b"]],

    # Acceso / Compliance / Jurisdicción
    "whitelist":          [[r"whitelist", r"qualified[_ ]?investors[_ ]?only", r"us[_ ]?retail[_ ]?investors[_ ]?excluded"]],
    "kyc":                [[r"\bkyc\b", r"regulkyc", r"reg[_ ]?kyc"]],
    "jurisdiction":       [[r"jurisdiction", r"country", r"legal[_ ]?(form|entity)", r"registered[_ ]?in[_ ]?offshore"]],
    "accepts":            [[r"\baccepts\b", r"currencies[_ ]?accepted"]],

    # Señales de ejecución / presencia
    "has_github":         [[r"project[_ ]?code[_ ]?available", r"smart[_ ]?contract[_ ]?code[_ ]?available", r"github", r"code[_ ]?available"]],
    "has_telegram":       [[r"telegram"]],
    "has_reddit":         [[r"reddit"]],
    "website_available":  [[r"website[_ ]?available", r"\bwebsite\b", r"\bsite\b"]],

    # Equipo / rating / interés / docs
    "team_size":          [[r"team[_ ]?size", r"teamsize"]],
    "rating":             [[r"rating", r"ico[_ ]?rating", r"score"]],
    "interest":           [[r"interest"]],
    "discount_max_pct":   [[r"crowdsale[_ ]?max\.?[_ ]?discount", r"presale[_ ]?discount", r"max[_ ]?discount"]],
    "roadmap_available":  [[r"development[_ ]?road[_ ]?map[_ ]?available", r"roadmap[_ ]?available", r"has[_ ]?roadmap"]],
    "whitepaper_available":[[r"whitepaper[_ ]?available", r"whitepaper[_ ]?page", r"white[_ ]?paper"]],

    # Duraciones / IEO / RegTax
    "ico_length_actual":  [[r"length[_ ]?of[_ ]?ico.*\(calendar days, actual\)", r"ico[_ ]?length[_ ]?actual"]],
    "ico_length_planned": [[r"length[_ ]?of[_ ]?ico.*\(calendar days, planned\)", r"ico[_ ]?length[_ ]?planned"]],
    "ieo":                [[r"\bieo\b", r"initial[_ ]?exchange[_ ]?offering", r"used[_ ]?an[_ ]?exchange"]],
    "regtax":             [[r"reg[_ ]?tax", r"tax[_ ]?reg(ulation)?", r"regulation[_ ]?on[_ ]?transfer"]],
}

# ------------------ Build canónico ------------------
out = pd.DataFrame(index=df.index)

# Identidad (si existen en union_wide)
out["name_std"]   = _first_nonnull(order_by_priority(find_cols_by_regex(ALIASES["name_std"][0])))
out["symbol_std"] = _first_nonnull(order_by_priority(find_cols_by_regex(ALIASES["symbol_std"][0])))

# ---- Fechas (FIX robusto) ----
start_direct = coalesce_text_from_patterns(ALIASES["start_direct"])
end_direct   = coalesce_text_from_patterns(ALIASES["end_direct"])
date_range   = coalesce_text_from_patterns(ALIASES["date_range"])

start_direct = pd.Series(start_direct, index=df.index)
end_direct   = pd.Series(end_direct,   index=df.index)
date_range   = pd.Series(date_range,   index=df.index)

left_from_range  = pd.Series(np.nan, index=df.index)
right_from_range = pd.Series(np.nan, index=df.index)
lr = date_range.astype(str).str.extract(r"^\s*([^-–—|to]+)", expand=False)
rr = date_range.astype(str).str.extract(r"[-–—|to]\s*(.*)$",   expand=False)
left_from_range.loc[lr.index]  = lr
right_from_range.loc[rr.index] = rr

def _clean_date_series(s):
    s = pd.Series(s, index=df.index)
    return s.astype(str).map(clean_date_like).replace({"nan": np.nan})

start_txt = start_direct.copy()
start_txt = start_txt.mask(start_txt.notna(), start_txt).mask(start_txt.isna(), _clean_date_series(left_from_range))

end_txt = end_direct.copy()
end_txt = end_txt.mask(end_txt.notna(), end_txt).mask(end_txt.isna(), _clean_date_series(right_from_range))

out["ico_start_date"] = parse_dates_robust(start_txt)
out["ico_end_date"]   = parse_dates_robust(end_txt)

# ---- Recaudación / objetivos ----
out["goal_usd"]          = coalesce_numeric_from_patterns(ALIASES["goal_usd"])
out["hard_cap_usd"]      = coalesce_numeric_from_patterns(ALIASES["hard_cap_usd"])
out["amount_raised_usd"] = coalesce_numeric_from_patterns(ALIASES["amount_raised_usd"])

# ---- Éxito ----
succ = coalesce_text_from_patterns(ALIASES["ico_successful"])
out["ico_successful"] = succ.map(boolify)

# ---- Tokenomics ----
out["token_price_usd"] = coalesce_numeric_from_patterns(ALIASES["token_price_usd"])
out["total_tokens"]    = coalesce_numeric_from_patterns(ALIASES["total_tokens"])
out["tokens_for_sale"] = coalesce_numeric_from_patterns(ALIASES["tokens_for_sale"])

# ---- Min/Max investment ----
mininv_raw = coalesce_text_from_patterns(ALIASES["min_investment"])
maxinv_raw = coalesce_text_from_patterns(ALIASES["max_investment"])
mn, mx  = zip(*mininv_raw.map(extract_min_max))
mn2, mx2 = zip(*maxinv_raw.map(extract_min_max))
mx_final = pd.Series(mx, index=df.index).where(pd.notna(pd.Series(mx, index=df.index)), pd.Series(mx2, index=df.index))
out["min_investment_usd"] = pd.to_numeric(pd.Series(mn, index=df.index), errors="coerce")
out["max_investment_usd"] = pd.to_numeric(mx_final, errors="coerce")

# ---- Categóricas / flags ----
out["token_type"]    = coalesce_text_from_patterns(ALIASES["token_type"])
out["role_of_token"] = coalesce_text_from_patterns(ALIASES["role_of_token"])
out["whitelist"]     = coalesce_bool_from_patterns(ALIASES["whitelist"])
out["kyc"]           = coalesce_bool_from_patterns(ALIASES["kyc"])
out["jurisdiction"]  = coalesce_text_from_patterns(ALIASES["jurisdiction"])
out["accepts"]       = coalesce_text_from_patterns(ALIASES["accepts"])

# ---- Señales de ejecución / presencia ----
out["has_github"]        = coalesce_bool_from_patterns(ALIASES["has_github"])
out["has_telegram"]      = coalesce_bool_from_patterns(ALIASES["has_telegram"])
out["has_reddit"]        = coalesce_bool_from_patterns(ALIASES["has_reddit"])
out["website_available"] = coalesce_bool_from_patterns(ALIASES["website_available"])

# ---- Equipo / rating / interés / docs ----
out["team_size"] = pd.to_numeric(coalesce_text_from_patterns(ALIASES["team_size"]), errors="coerce")
out["rating"]    = pd.to_numeric(coalesce_text_from_patterns(ALIASES["rating"]), errors="coerce")
out["interest"]  = coalesce_text_from_patterns(ALIASES["interest"])

disc = coalesce_text_from_patterns(ALIASES["discount_max_pct"]).astype(str).str.extract(r"([\d.]+)", expand=False)
out["discount_max_pct"] = pd.to_numeric(disc, errors="coerce")

out["roadmap_available"] = coalesce_bool_from_patterns(ALIASES["roadmap_available"])
wp = coalesce_text_from_patterns(ALIASES["whitepaper_available"])
wp_num = pd.to_numeric(wp, errors="coerce")
out["whitepaper_available"] = pd.Series(
    np.where(wp_num.notna() & (wp_num>0), 1, np.where(wp.notna(), wp.map(boolify), np.nan)),
    dtype="Int64", index=df.index
)

# ---- Duraciones / IEO / RegTax ----
out["ico_length_actual_days"]  = pd.to_numeric(coalesce_text_from_patterns(ALIASES["ico_length_actual"]), errors="coerce")
out["ico_length_planned_days"] = pd.to_numeric(coalesce_text_from_patterns(ALIASES["ico_length_planned"]), errors="coerce")
out["is_ieo"]           = coalesce_bool_from_patterns(ALIASES["ieo"])
out["is_tax_regulated"] = coalesce_bool_from_patterns(ALIASES["regtax"])

# ---- Flags derivados ----
out["hit_softcap"] = ((out["amount_raised_usd"] >= out["goal_usd"]) & out["amount_raised_usd"].notna() & out["goal_usd"].notna()).astype("Int64")
out["hit_hardcap"] = ((out["amount_raised_usd"] >= out["hard_cap_usd"]) & out["amount_raised_usd"].notna() & out["hard_cap_usd"].notna()).astype("Int64")

# ---- Orden final ----
ordered = [
    "name_std","symbol_std",
    "ico_start_date","ico_end_date","ico_length_actual_days","ico_length_planned_days",
    "goal_usd","hard_cap_usd","amount_raised_usd","ico_successful","hit_softcap","hit_hardcap",
    "token_price_usd","total_tokens","tokens_for_sale","min_investment_usd","max_investment_usd",
    "token_type","role_of_token","whitelist","kyc","jurisdiction","accepts",
    "has_github","has_telegram","has_reddit","website_available",
    "team_size","rating","interest","discount_max_pct","roadmap_available","whitepaper_available",
    "is_ieo","is_tax_regulated",
]
ordered = [c for c in ordered if c in out.columns] + [c for c in out.columns if c not in ordered]
out = out[ordered].copy()

# ------------------ Reporte -----------------
def missing_pct(s): 
    return round(100*s.isna().mean(), 2)

report = pd.DataFrame({
    "column": out.columns,
    "dtype": [str(out[c].dtype) for c in out.columns],
    "missing_%": [missing_pct(out[c]) for c in out.columns]
}).sort_values(["missing_%","column"], ascending=[False, True])

print(f"Filas: {len(out):,}  |  Columnas canónicas: {out.shape[1]}")
print("\nTop 25 columnas con más missing (%):")
try:
    display(report.head(25))
except Exception:
    print(report.head(25).to_string(index=False))

# ------------------ Save --------------------
out.to_csv(OUT_PATH, index=False)
print(f"\n✅ Guardado canónico: {OUT_PATH}")




Filas: 2,692  |  Columnas canónicas: 35

Top 25 columnas con más missing (%):


Unnamed: 0,column,dtype,missing_%
25,has_reddit,Int64,100.0
24,has_telegram,Int64,100.0
4,ico_length_actual_days,float64,100.0
5,ico_length_planned_days,float64,100.0
2,ico_start_date,datetime64[ns],100.0
9,ico_successful,float64,100.0
28,rating,float64,100.0
26,website_available,Int64,100.0
16,max_investment_usd,float64,99.48
15,min_investment_usd,float64,99.48



✅ Guardado canónico: final/ico_union_canonical.csv
CPU times: total: 562 ms
Wall time: 581 ms


In [14]:
print(df.columns.tolist())

['__key__', 'name_std', 'symbol_std', 'name_other', 'name_cmc', 'ticker_symbol_cmc', 'ico_successful', 'soft_cap', 'hard_cap', 'cap_unit', 'cap_includes_presale', 'token_type', 'number_of_contributors', 'crowdsale_tokens_sold', 'total_number_of_tokens', 'token_standard', 'additional_token_emissions', 'crowdsale_token_price_min', 'crowdsale_token_price_max', 'crowdsale_actual_token_price_max', 'crowdsale is auction', 'has a presale', 'presale_tokens_sold', 'presale_token_price_min', 'presale_token_price_max', 'development road map available', 'whitepaper page count', 'product or prototype developed', 'product can be tried out', 'years since foundation', 'issuer has customers for product', 'business model available', 'utility token enables decentralization', 'smart contract code available', 'project code available', 'use of proceeds mentioned', 'use of proceeds disclosed in detail', 'token share team (ex ante)', 'token share crowdsale investors (ex ante)', 'token share presale investors 

In [2]:
%%time
import pandas as pd, numpy as np, re
from datetime import datetime

# ------------------ Config ------------------
IN_PATH  = "../join/ico_union_wide.csv"
OUT_PATH = "ico_union_canonical_v3.csv"

# ------------------ Load --------------------
df = pd.read_csv(IN_PATH)
df.columns = df.columns.str.strip()

# prioridad de fuentes (izq→der)
SUFFIXES = ["_zenodo", "_icpsr", "_yan"]

def split_base_and_suffix(col: str):
    for s in SUFFIXES:
        if col.endswith(s):
            return col[:-len(s)], s
    return col, ""   # sin sufijo

# indexar base -> columnas
base_to_cols = {}
for c in df.columns:
    b, s = split_base_and_suffix(c)
    base_to_cols.setdefault(b, []).append(c)

# ------------------ Helpers -----------------
def money_like_to_float(s):
    if pd.isna(s): return np.nan
    x = str(s).strip().lower().replace(",", "").replace("$", "")
    try:
        mult = 1
        if "billion" in x or (re.search(r"\d", x) and x.endswith("b")): mult = 1_000_000_000
        elif "million" in x or (re.search(r"\d", x) and x.endswith("m")): mult = 1_000_000
        elif re.search(r"\d", x) and x.endswith("k"): mult = 1_000
        nums = re.findall(r"[\d.]+", x)
        return float(nums[0]) * mult if nums else np.nan
    except Exception:
        return np.nan

def clean_date_like(s):
    if pd.isna(s): return np.nan
    x = str(s).strip()
    x = re.sub(r"^(ended|end|finished|finalized)\s*:?\s*", "", x, flags=re.IGNORECASE)
    x = x.replace("—","-").replace("–","-")
    return x

def boolify(x):
    if pd.isna(x): return np.nan
    if isinstance(x, (int, float)) and not pd.isna(x):
        return int(float(x) != 0)
    s = str(x).strip().lower()
    if s in {"1","true","yes","y","si","sí","present","available","ok"}: return 1
    if s in {"0","false","no","n","absent","unavailable","none"}: return 0
    return np.nan

def _first_nonnull(series_list):
    """Coalesce alineando SIEMPRE contra df.index."""
    out = pd.Series(np.nan, index=df.index)
    for s in series_list or []:
        if s is None: 
            continue
        if isinstance(s, str):
            if s in df.columns: v = df[s]
            else:               continue
        elif isinstance(s, pd.Series):
            v = s
        else:
            continue
        v = pd.Series(v).reindex(out.index)  # <- clave para evitar shape mismatch
        out = out.where(~out.isna(), v)
    return out

def order_by_priority(cols):
    """*_zenodo > *_icpsr > *_yan > sin sufijo, deduplicado y existentes."""
    with_suf = [c for c in cols if any(c.endswith(s) for s in SUFFIXES)]
    no_suf   = [c for c in cols if c not in with_suf]
    ordered = []
    for s in SUFFIXES:
        ordered += [c for c in with_suf if c.endswith(s)]
    ordered += no_suf
    seen, uniq = set(), []
    for c in ordered:
        if c in df.columns and c not in seen:
            uniq.append(c); seen.add(c)
    return uniq

def find_cols_by_regex(patterns):
    """patterns: lista de regex; devuelve columnas reales (con sufijos) ordenadas por prioridad."""
    cols = []
    for pat in patterns:
        r = re.compile(pat, flags=re.IGNORECASE)
        for base in base_to_cols.keys():
            if r.search(base):
                cols += base_to_cols[base]
    return order_by_priority(cols)

def coalesce_numeric_from_patterns(pattern_groups):
    cols = []
    for group in pattern_groups:
        cols += find_cols_by_regex(group)
    if not cols:
        return pd.Series(np.nan, index=df.index)
    ser = _first_nonnull(cols)
    # intentar convertir directo
    ser_num = pd.to_numeric(ser, errors="coerce")
    # si hay demasiados NaN, probar parseo tipo "3M"
    if ser_num.notna().sum() < 0.2 * len(ser_num):
        ser_num = ser.apply(money_like_to_float)
    return ser_num

def coalesce_text_from_patterns(pattern_groups):
    cols = []
    for group in pattern_groups:
        cols += find_cols_by_regex(group)
    if not cols:
        return pd.Series(np.nan, index=df.index)
    ser = _first_nonnull(cols).astype(str).replace({"nan": np.nan})
    return ser

def coalesce_bool_from_patterns(pattern_groups):
    cols = []
    for group in pattern_groups:
        cols += find_cols_by_regex(group)
    if not cols:
        return pd.Series(pd.array([pd.NA]*len(df), dtype="Int64"), index=df.index)
    ser = _first_nonnull(cols).map(boolify).astype("Int64")
    return ser

def extract_min_max(s):
    if pd.isna(s): return (np.nan, np.nan)
    x = str(s).lower()
    nums = re.findall(r"[\d.]+", x)
    if not nums: return (np.nan, np.nan)
    if len(nums) == 1:
        v = float(nums[0]); return (v, v)
    return (float(nums[0]), float(nums[1]))

def parse_dates_robust(s: pd.Series) -> pd.Series:
    """Parsea fechas con intentos explícitos (sin warnings) y luego fallback."""
    s = pd.Series(s, index=df.index)
    parsed = pd.to_datetime(s, format="%d %b %Y", errors="coerce", dayfirst=True)
    for fmt in ("%Y-%m-%d", "%d/%m/%Y", "%b %d, %Y"):
        need = parsed.isna()
        if need.any():
            parsed.loc[need] = pd.to_datetime(s[need], format=fmt, errors="coerce", dayfirst=True)
    need = parsed.isna()
    if need.any():
        parsed.loc[need] = pd.to_datetime(s[need], errors="coerce", dayfirst=True)
    return parsed

# ------------------ Aliases (regex) -----------------
# Basado en los nombres reales que pasaste de cada dataset
ALIASES = {
    # Identidad
    "name_std":   [[r"^name_std$"]],
    "symbol_std": [[r"^symbol_std$"]],

    # Fechas (Zenodo + ICPSR + Yan)
    "start_direct": [[r"^ico_start_date$", r"^ico starts$", r"\bico_start\b", r"sale_start", r"preico_start"]],
    "end_direct":   [[r"^ico_end_date_actual$", r"^ico_end_date_planned$", r"^ico ends$", r"^end_date_parsed$", r"^end_date\b", r"\bico_end\b", r"sale_end", r"token_sale_end"]],
    "date_range":   [[r"^start_end_date_coin_sell$", r"ico_dates", r"date_range"]],

    # Recaudación / objetivos
    "goal_usd":           [[r"^soft_cap_usd$", r"^soft_cap$", r"^softcap_usd$", r"^softcap$", r"^fundraising_goal$", r"^goal$"]],
    "hard_cap_usd":       [[r"^hard_cap_usd$", r"^hard_cap$", r"^hardcap_usd$", r"^hardcap$"]],
    "amount_raised_usd":  [[r"^total amount raised \(usdm\)$", r"^total amount raised \(usdm\)\.1$", r"^amount raised_usd$", r"^amount raised$", r"^received_money(\.1)?$"]],

    # Éxito (ICPSR/Zenodo/Yan)
    "ico_successful":     [[r"^ico_successful$", r"^ico success$", r"^success$", r"^successful$"]],

    # Tokenomics
    "token_price_usd":    [[r"^crowdsale_actual_token_price_max$", r"^price ico$", r"^ico_token_price$"]],
    "total_tokens":       [[r"^total_number_of_tokens$", r"^circsupply$", r"^total_tokens$"]],
    "tokens_for_sale":    [[r"^crowdsale_tokens_sold$", r"^tokens f sale$", r"^available_for_token_sale$"]],

    # Min/Max inversión
    "min_investment":     [[r"^mininvest$", r"min[_ ]?investment", r"minimum[_ ]?investment", r"min[_ ]?max[_ ]?personal[_ ]?cap"]],
    "max_investment":     [[r"^mininvest$", r"max[_ ]?investment", r"maximum[_ ]?investment", r"min[_ ]?max[_ ]?personal[_ ]?cap"]],  # Yan no trae max explícito

    # Tipo/rol token
    "token_type":         [[r"^token_type$", r"^protocol type$", r"utility token enables decentralization", r"\btype\b"]],
    "role_of_token":      [[r"^role_of_token$", r"\brole\b"]],

    # Compliance / Jurisdicción
    "whitelist":          [[r"^qualified investors only$", r"^us retail investors excluded$", r"^wlist$", r"^whitelist$"]],
    "kyc":                [[r"^kyc/aml procedure$", r"^kyc$", r"^regulkyc$"]],
    "jurisdiction":       [[r"^registration_country$", r"^country$", r"^legal_structure$"]],
    "accepts":            [[r"^accepting$", r"\baccepts\b", r"currencies[_ ]?accepted"]],

    # Señales de ejecución / presencia (GitHub/Website)
    "has_github":         [[r"^project code available$", r"^smart contract code available$", r"github", r"code[_ ]?available"]],
    "website_available":  [[r"website[_ ]?available", r"\bwebsite\b", r"\bsite\b"]],

    # Equipo / rating / interés / docs
    "team_size":          [[r"^team size$", r"^team_size$", r"teamsize"]],
    "rating":             [[r"^rating$", r"^ico[_ ]?rating$", r"^score$"]],
    "interest":           [[r"^interest$"]],
    "discount_max_pct":   [[r"^crowdsale max\. discount \(%\)$", r"^presale discount \(%\)$", r"max[_ ]?discount"]],
    "roadmap_available":  [[r"^development road map available$", r"roadmap[_ ]?available", r"has[_ ]?roadmap"]],
    "whitepaper_available":[[r"whitepaper[_ ]?available", r"white[_ ]?paper"]],
    "whitepaper_page_count": [[r"^whitepaper page count$"]],

    # Duraciones / IEO / RegTax
    "ico_length_actual":  [[r"^length of crowdsale \(calendar days, actual\)$", r"ico[_ ]?length[_ ]?actual"]],
    "ico_length_planned": [[r"^length of crowdsale \(calendar days, planned\)$", r"ico[_ ]?length[_ ]?planned"]],
    "ieo":                [[r"^ieo$", r"initial[_ ]?exchange[_ ]?offering", r"used[_ ]?an[_ ]?exchange"]],
    "regtax":             [[r"^regtax$", r"reg[_ ]?tax", r"tax[_ ]?reg(ulation)?", r"regulation[_ ]?on[_ ]?transfer"]],
}

# ------------------ Build canónico ------------------
out = pd.DataFrame(index=df.index)

# Identidad (si existen en union_wide)
out["name_std"]   = _first_nonnull(order_by_priority(find_cols_by_regex(ALIASES["name_std"][0])))
out["symbol_std"] = _first_nonnull(order_by_priority(find_cols_by_regex(ALIASES["symbol_std"][0])))

# ---- Fechas (robusto) ----
start_direct = coalesce_text_from_patterns(ALIASES["start_direct"])
end_direct   = coalesce_text_from_patterns(ALIASES["end_direct"])
date_range   = coalesce_text_from_patterns(ALIASES["date_range"])

start_direct = pd.Series(start_direct, index=df.index)
end_direct   = pd.Series(end_direct,   index=df.index)
date_range   = pd.Series(date_range,   index=df.index)

left_from_range  = pd.Series(np.nan, index=df.index)
right_from_range = pd.Series(np.nan, index=df.index)
lr = date_range.astype(str).str.extract(r"^\s*([^-–—|to]+)", expand=False)
rr = date_range.astype(str).str.extract(r"[-–—|to]\s*(.*)$",   expand=False)
left_from_range.loc[lr.index]  = lr
right_from_range.loc[rr.index] = rr

def _clean_date_series(s):
    s = pd.Series(s, index=df.index)
    return s.astype(str).map(clean_date_like).replace({"nan": np.nan})

start_txt = start_direct.copy()
start_txt = start_txt.mask(start_txt.isna(), _clean_date_series(left_from_range))

end_txt = end_direct.copy()
end_txt = end_txt.mask(end_txt.isna(), _clean_date_series(right_from_range))

out["ico_start_date"] = parse_dates_robust(start_txt)
out["ico_end_date"]   = parse_dates_robust(end_txt)

# ---- Recaudación / objetivos ----
out["goal_usd"]          = coalesce_numeric_from_patterns(ALIASES["goal_usd"])
out["hard_cap_usd"]      = coalesce_numeric_from_patterns(ALIASES["hard_cap_usd"])
out["amount_raised_usd"] = coalesce_numeric_from_patterns(ALIASES["amount_raised_usd"])

# ---- Éxito (ICPSR > Zenodo > Yan) ----
succ = coalesce_text_from_patterns(ALIASES["ico_successful"])
out["ico_successful"] = succ.map(boolify)

# ---- Tokenomics ----
out["token_price_usd"] = coalesce_numeric_from_patterns(ALIASES["token_price_usd"])
out["total_tokens"]    = coalesce_numeric_from_patterns(ALIASES["total_tokens"])
out["tokens_for_sale"] = coalesce_numeric_from_patterns(ALIASES["tokens_for_sale"])

# ---- Min/Max investment ----
mininv_raw = coalesce_text_from_patterns(ALIASES["min_investment"])
maxinv_raw = coalesce_text_from_patterns(ALIASES["max_investment"])
mn, mx  = zip(*mininv_raw.map(extract_min_max))
mn2, mx2 = zip(*maxinv_raw.map(extract_min_max))
mx_final = pd.Series(mx, index=df.index).where(pd.notna(pd.Series(mx, index=df.index)), pd.Series(mx2, index=df.index))
out["min_investment_usd"] = pd.to_numeric(pd.Series(mn, index=df.index), errors="coerce")
out["max_investment_usd"] = pd.to_numeric(mx_final, errors="coerce")

# ---- Categóricas / flags ----
out["token_type"]    = coalesce_text_from_patterns(ALIASES["token_type"])
out["role_of_token"] = coalesce_text_from_patterns(ALIASES["role_of_token"])
out["whitelist"]     = coalesce_bool_from_patterns(ALIASES["whitelist"])
out["kyc"]           = coalesce_bool_from_patterns(ALIASES["kyc"])
out["jurisdiction"]  = coalesce_text_from_patterns(ALIASES["jurisdiction"])
out["accepts"]       = coalesce_text_from_patterns(ALIASES["accepts"])

# ---- Señales de ejecución / presencia ----
out["has_github"]        = coalesce_bool_from_patterns(ALIASES["has_github"])
# (Telegram/Reddit casi no existen en estos tres; si aparecen en el futuro, se capturan por regex)
out["has_telegram"]      = pd.Series(pd.NA, index=df.index, dtype="Int64")
out["has_reddit"]        = pd.Series(pd.NA, index=df.index, dtype="Int64")
out["website_available"] = coalesce_bool_from_patterns(ALIASES["website_available"])

# ---- Equipo / rating / interés / docs ----
out["team_size"] = pd.to_numeric(coalesce_text_from_patterns(ALIASES["team_size"]), errors="coerce")
out["rating"]    = pd.to_numeric(coalesce_text_from_patterns(ALIASES["rating"]), errors="coerce")
out["interest"]  = coalesce_text_from_patterns(ALIASES["interest"])

disc = coalesce_text_from_patterns(ALIASES["discount_max_pct"]).astype(str).str.extract(r"([\d.]+)", expand=False)
out["discount_max_pct"] = pd.to_numeric(disc, errors="coerce")

out["roadmap_available"] = coalesce_bool_from_patterns(ALIASES["roadmap_available"])

# --- Whitepaper availability por "page count" + flags ---
wp_count = pd.to_numeric(coalesce_text_from_patterns(ALIASES["whitepaper_page_count"]), errors="coerce")
wp_flag  = (wp_count > 0).astype("Int64")
wp_other = coalesce_bool_from_patterns(ALIASES["whitepaper_available"])
out["whitepaper_available"] = wp_flag.where(wp_flag.notna(), wp_other)

# ---- Duraciones / IEO / RegTax ----
out["ico_length_actual_days"]  = pd.to_numeric(coalesce_text_from_patterns(ALIASES["ico_length_actual"]), errors="coerce")
out["ico_length_planned_days"] = pd.to_numeric(coalesce_text_from_patterns(ALIASES["ico_length_planned"]), errors="coerce")
out["is_ieo"]           = coalesce_bool_from_patterns(ALIASES["ieo"])
out["is_tax_regulated"] = coalesce_bool_from_patterns(ALIASES["regtax"])

# ---- Flags derivados ----
out["hit_softcap"] = ((out["amount_raised_usd"] >= out["goal_usd"]) & out["amount_raised_usd"].notna() & out["goal_usd"].notna()).astype("Int64")
out["hit_hardcap"] = ((out["amount_raised_usd"] >= out["hard_cap_usd"]) & out["amount_raised_usd"].notna() & out["hard_cap_usd"].notna()).astype("Int64")

# ---- Backfills mínimos para modelar ya ----
# A) si ico_successful está vacío, usar hit_softcap como proxy
if "ico_successful" in out.columns and "hit_softcap" in out.columns:
    mask_empty = out["ico_successful"].isna()
    out.loc[mask_empty, "ico_successful"] = out.loc[mask_empty, "hit_softcap"]

# B) si goal_usd es NaN y hard_cap_usd no, usar hardcap como proxy (opcional)
mask_goal_missing = out["goal_usd"].isna() & out["hard_cap_usd"].notna()
out.loc[mask_goal_missing, "goal_usd"] = out.loc[mask_goal_missing, "hard_cap_usd"]

# ---- Orden final ----
ordered = [
    "name_std","symbol_std",
    "ico_start_date","ico_end_date","ico_length_actual_days","ico_length_planned_days",
    "goal_usd","hard_cap_usd","amount_raised_usd","ico_successful","hit_softcap","hit_hardcap",
    "token_price_usd","total_tokens","tokens_for_sale","min_investment_usd","max_investment_usd",
    "token_type","role_of_token","whitelist","kyc","jurisdiction","accepts",
    "has_github","has_telegram","has_reddit","website_available",
    "team_size","rating","interest","discount_max_pct","roadmap_available","whitepaper_available",
    "is_ieo","is_tax_regulated",
]
ordered = [c for c in ordered if c in out.columns] + [c for c in out.columns if c not in ordered]
out = out[ordered].copy()

# ------------------ Reporte -----------------
def missing_pct(s): 
    return round(100*s.isna().mean(), 2)

report = pd.DataFrame({
    "column": out.columns,
    "dtype": [str(out[c].dtype) for c in out.columns],
    "missing_%": [missing_pct(out[c]) for c in out.columns]
}).sort_values(["missing_%","column"], ascending=[False, True])

print(f"Filas: {len(out):,}  |  Columnas canónicas: {out.shape[1]}")
print("\nTop 25 columnas con más missing (%):")
try:
    display(report.head(25))
except Exception:
    print(report.head(25).to_string(index=False))

# ------------------ Save --------------------
out.to_csv(OUT_PATH, index=False)
print(f"\n✅ Guardado canónico: {OUT_PATH}")


Filas: 2,692  |  Columnas canónicas: 35

Top 25 columnas con más missing (%):




Unnamed: 0,column,dtype,missing_%
25,has_reddit,Int64,100.0
24,has_telegram,Int64,100.0
28,rating,float64,100.0
26,website_available,Int64,100.0
29,interest,object,92.61
18,role_of_token,object,92.57
27,team_size,float64,89.52
30,discount_max_pct,float64,89.23
5,ico_length_planned_days,float64,88.74
23,has_github,Int64,88.63



✅ Guardado canónico: ico_union_canonical_v3.csv
CPU times: total: 234 ms
Wall time: 238 ms


In [3]:
print(out.columns.tolist)

<bound method IndexOpsMixin.tolist of Index(['name_std', 'symbol_std', 'ico_start_date', 'ico_end_date',
       'ico_length_actual_days', 'ico_length_planned_days', 'goal_usd',
       'hard_cap_usd', 'amount_raised_usd', 'ico_successful', 'hit_softcap',
       'hit_hardcap', 'token_price_usd', 'total_tokens', 'tokens_for_sale',
       'min_investment_usd', 'max_investment_usd', 'token_type',
       'role_of_token', 'whitelist', 'kyc', 'jurisdiction', 'accepts',
       'has_github', 'has_telegram', 'has_reddit', 'website_available',
       'team_size', 'rating', 'interest', 'discount_max_pct',
       'roadmap_available', 'whitepaper_available', 'is_ieo',
       'is_tax_regulated'],
      dtype='object')>


In [7]:
%%time
import pandas as pd, numpy as np, re
from datetime import datetime

# ------------------ Config ------------------
IN_PATH   = "../join/ico_union_wide.csv"
OUT_ALL   = "../join/ico_union_canonical_v6.csv"         # dataset canónico (todo)
OUT_EXANT = "../join/ico_exante_features_v3.csv"         # solo features pre-ICO + y

# ------------------ Load --------------------
df = pd.read_csv(IN_PATH)
df.columns = df.columns.str.strip()

# prioridad de fuentes (izq→der)
SUFFIXES = ["_zenodo", "_icpsr", "_yan"]

def split_base_and_suffix(col: str):
    for s in SUFFIXES:
        if col.endswith(s):
            return col[:-len(s)], s
    return col, ""

# indexar base -> columnas
base_to_cols = {}
for c in df.columns:
    b, s = split_base_and_suffix(c)
    base_to_cols.setdefault(b, []).append(c)

# ------------------ Helpers -----------------
def money_like_to_float(s):
    if pd.isna(s): return np.nan
    x = str(s).strip().lower().replace(",", "").replace("$", "")
    try:
        mult = 1
        if "billion" in x or (re.search(r"\d", x) and x.endswith("b")): mult = 1_000_000_000
        elif "million" in x or (re.search(r"\d", x) and x.endswith("m")): mult = 1_000_000
        elif re.search(r"\d", x) and x.endswith("k"): mult = 1_000
        nums = re.findall(r"[\d.]+", x)
        return float(nums[0]) * mult if nums else np.nan
    except Exception:
        return np.nan

def clean_date_like(s):
    if pd.isna(s): return np.nan
    x = str(s).strip()
    x = re.sub(r"^(ended|end|finished|finalized)\s*:?\s*", "", x, flags=re.IGNORECASE)
    x = x.replace("—","-").replace("–","-")
    return x

def boolify(x):
    if pd.isna(x): return np.nan
    if isinstance(x, (int, float)) and not pd.isna(x):
        return int(float(x) != 0)
    s = str(x).strip().lower()
    if s in {"1","true","yes","y","si","sí","present","available","ok"}: return 1
    if s in {"0","false","no","n","absent","unavailable","none"}: return 0
    return np.nan

def _first_nonnull(series_list):
    out = pd.Series(np.nan, index=df.index)
    for s in series_list or []:
        if s is None: 
            continue
        if isinstance(s, str):
            if s in df.columns: v = df[s]
            else:               continue
        elif isinstance(s, pd.Series):
            v = s
        else:
            continue
        v = pd.Series(v).reindex(out.index)
        out = out.where(~out.isna(), v)
    return out

def order_by_priority(cols):
    with_suf = [c for c in cols if any(c.endswith(s) for s in SUFFIXES)]
    no_suf   = [c for c in cols if c not in with_suf]
    ordered = []
    for s in SUFFIXES:
        ordered += [c for c in with_suf if c.endswith(s)]
    ordered += no_suf
    seen, uniq = set(), []
    for c in ordered:
        if c in df.columns and c not in seen:
            uniq.append(c); seen.add(c)
    return uniq

def find_cols_by_regex(patterns):
    cols = []
    for pat in patterns:
        r = re.compile(pat, flags=re.IGNORECASE)
        for base in base_to_cols.keys():
            if r.search(base):
                cols += base_to_cols[base]
    return order_by_priority(cols)

def coalesce_numeric_from_patterns(pattern_groups):
    cols = []
    for group in pattern_groups:
        cols += find_cols_by_regex(group)
    if not cols:
        return pd.Series(np.nan, index=df.index)
    ser = _first_nonnull(cols)
    ser_num = pd.to_numeric(ser, errors="coerce")
    if ser_num.notna().sum() < 0.2 * len(ser_num):
        ser_num = ser.apply(money_like_to_float)
    return ser_num

def coalesce_text_from_patterns(pattern_groups):
    cols = []
    for group in pattern_groups:
        cols += find_cols_by_regex(group)
    if not cols:
        return pd.Series(np.nan, index=df.index)
    ser = _first_nonnull(cols).astype(str).replace({"nan": np.nan})
    return ser

def coalesce_bool_from_patterns(pattern_groups):
    cols = []
    for group in pattern_groups:
        cols += find_cols_by_regex(group)
    if not cols:
        return pd.Series(pd.array([pd.NA]*len(df), dtype="Int64"), index=df.index)
    ser = _first_nonnull(cols).map(boolify).astype("Int64")
    return ser

def extract_min_max(s):
    if pd.isna(s): return (np.nan, np.nan)
    x = str(s).lower()
    nums = re.findall(r"[\d.]+", x)
    if not nums: return (np.nan, np.nan)
    if len(nums) == 1:
        v = float(nums[0]); return (v, v)
    return (float(nums[0]), float(nums[1]))

def parse_dates_robust(s: pd.Series) -> pd.Series:
    s = pd.Series(s, index=df.index)
    parsed = pd.to_datetime(s, format="%d %b %Y", errors="coerce", dayfirst=True)
    for fmt in ("%Y-%m-%d", "%d/%m/%Y", "%b %d, %Y"):
        need = parsed.isna()
        if need.any():
            parsed.loc[need] = pd.to_datetime(s[need], format=fmt, errors="coerce", dayfirst=True)
    need = parsed.isna()
    if need.any():
        parsed.loc[need] = pd.to_datetime(s[need], errors="coerce", dayfirst=True)
    return parsed

# ------------------ Aliases (regex) -----------------
ALIASES = {
    # Identidad
    "name_std":   [[r"^name_std$"]],
    "symbol_std": [[r"^symbol_std$"]],

    # Fechas
    "start_direct": [[r"^ico_start_date$", r"^ico starts$", r"\bico_start\b", r"sale_start", r"preico_start"]],
    "end_direct":   [[r"^ico_end_date_actual$", r"^ico_end_date_planned$", r"^ico ends$", r"^end_date_parsed$", r"^end_date\b", r"\bico_end\b", r"sale_end", r"token_sale_end"]],
    "date_range":   [[r"^start_end_date_coin_sell$", r"ico_dates", r"date_range"]],

    # Recaudación / objetivos
    "goal_usd":           [[r"^soft_cap_usd$", r"^soft_cap$", r"^softcap_usd$", r"^softcap$", r"^fundraising_goal$", r"^goal$"]],
    "hard_cap_usd":       [[r"^hard_cap_usd$", r"^hard_cap$", r"^hardcap_usd$", r"^hardcap$"]],
    "amount_raised_usd":  [[r"^total amount raised \(usdm\)$", r"^total amount raised \(usdm\)\.1$", r"^amount raised_usd$", r"^amount raised$", r"^received_money(\.1)?$"]],

    # Éxito
    "ico_successful":     [[r"^ico_successful$", r"^ico success$", r"^success$", r"^successful$"]],

    # Tokenomics
    "token_price_usd":    [[r"^crowdsale_actual_token_price_max$", r"^price ico$", r"^ico_token_price$"]],
    "total_tokens":       [[r"^total_number_of_tokens$", r"^circsupply$", r"^total_tokens$"]],
    "tokens_for_sale":    [[r"^crowdsale_tokens_sold$", r"^tokens f sale$", r"^available_for_token_sale$"]],

    # Min/Max inversión
    "min_investment":     [[r"^mininvest$", r"min[_ ]?investment", r"minimum[_ ]?investment", r"min[_ ]?max[_ ]?personal[_ ]?cap"]],
    "max_investment":     [[r"^mininvest$", r"max[_ ]?investment", r"maximum[_ ]?investment", r"min[_ ]?max[_ ]?personal[_ ]?cap"]],

    # Tipo/rol token
    "token_type":         [[r"^token_type$", r"^protocol type$", r"utility token enables decentralization", r"\btype\b"]],
    "role_of_token":      [[r"^role_of_token$", r"\brole\b"]],
    "industry":           [[r"^industry$"],[r"^category$"]],

    # Compliance / Jurisdicción
    "whitelist":          [[r"^qualified investors only$", r"^us retail investors excluded$", r"^wlist$", r"^whitelist$"]],
    "kyc":                [[r"^kyc/aml procedure$", r"^kyc$", r"^regulkyc$"]],
    "jurisdiction":       [[r"^registration_country$", r"^country$", r"^legal_structure$"]],
    "accepts":            [[r"^accepting$", r"\baccepts\b", r"currencies[_ ]?accepted"]],

    # Señales de ejecución / presencia
    "has_github":         [[r"^project code available$", r"^smart contract code available$", r"github", r"code[_ ]?available"]],
    "website_available":  [[r"website[_ ]?available", r"\bwebsite\b", r"\bsite\b"]],

    # Equipo / rating / interés / docs
    "team_size":          [[r"^team size$", r"^team_size$", r"teamsize"]],
    "rating":             [[r"^rating$", r"^ico[_ ]?rating$", r"^score$"]],
    "interest":           [[r"^interest$"]],
    "discount_max_pct":   [[r"^crowdsale max\. discount \(%\)$", r"^presale discount \(%\)$", r"max[_ ]?discount"]],
    "roadmap_available":  [[r"^development road map available$", r"roadmap[_ ]?available", r"has[_ ]?roadmap"]],
    "whitepaper_available":[[r"whitepaper[_ ]?available", r"white[_ ]?paper"]],
    "whitepaper_page_count": [[r"^whitepaper page count$"]],

    # Duraciones / IEO / RegTax
    "ico_length_actual":  [[r"^length of crowdsale \(calendar days, actual\)$", r"ico[_ ]?length[_ ]?actual"]],
    "ico_length_planned": [[r"^length of crowdsale \(calendar days, planned\)$", r"ico[_ ]?length[_ ]?planned"]],
    "ieo":                [[r"^ieo$", r"initial[_ ]?exchange[_ ]?offering", r"used[_ ]?an[_ ]?exchange"]],
    "regtax":             [[r"^regtax$", r"reg[_ ]?tax", r"tax[_ ]?reg(ulation)?", r"regulation[_ ]?on[_ ]?transfer"]],
}

# ------------------ Build canónico ------------------
out = pd.DataFrame(index=df.index)

# Identidad
out["name_std"]   = _first_nonnull(order_by_priority(find_cols_by_regex(ALIASES["name_std"][0])))
out["symbol_std"] = _first_nonnull(order_by_priority(find_cols_by_regex(ALIASES["symbol_std"][0])))

# Fechas
start_direct = coalesce_text_from_patterns(ALIASES["start_direct"])
end_direct   = coalesce_text_from_patterns(ALIASES["end_direct"])
date_range   = coalesce_text_from_patterns(ALIASES["date_range"])

left_from_range  = date_range.astype(str).str.extract(r"^\s*([^-–—|to]+)", expand=False)
right_from_range = date_range.astype(str).str.extract(r"[-–—|to]\s*(.*)$",   expand=False)

def _clean_date_series(s):
    return s.astype(str).map(clean_date_like).replace({"nan": np.nan})

start_txt = start_direct.copy()
start_txt = start_txt.mask(start_txt.notna(), start_txt).mask(start_txt.isna(), _clean_date_series(left_from_range))
end_txt   = end_direct.copy()
end_txt   = end_txt.mask(end_txt.notna(), end_txt).mask(end_txt.isna(), _clean_date_series(right_from_range))

out["ico_start_date"] = parse_dates_robust(start_txt)
out["ico_end_date"]   = parse_dates_robust(end_txt)

# Recaudación / objetivos
out["goal_usd"]          = coalesce_numeric_from_patterns(ALIASES["goal_usd"])
out["hard_cap_usd"]      = coalesce_numeric_from_patterns(ALIASES["hard_cap_usd"])
out["amount_raised_usd"] = coalesce_numeric_from_patterns(ALIASES["amount_raised_usd"])

# Éxito (target)
succ = coalesce_text_from_patterns(ALIASES["ico_successful"])
out["ico_successful"] = succ.map(boolify)

# Tokenomics
out["token_price_usd"] = coalesce_numeric_from_patterns(ALIASES["token_price_usd"])
out["total_tokens"]    = coalesce_numeric_from_patterns(ALIASES["total_tokens"])
out["tokens_for_sale"] = coalesce_numeric_from_patterns(ALIASES["tokens_for_sale"])

# Min/Max investment
mininv_raw = coalesce_text_from_patterns(ALIASES["min_investment"])
maxinv_raw = coalesce_text_from_patterns(ALIASES["max_investment"])
mn, mx  = zip(*mininv_raw.map(extract_min_max))
mn2, mx2 = zip(*maxinv_raw.map(extract_min_max))
mx_final = pd.Series(mx, index=df.index).where(pd.notna(pd.Series(mx, index=df.index)), pd.Series(mx2, index=df.index))
out["min_investment_usd"] = pd.to_numeric(pd.Series(mn, index=df.index), errors="coerce")
out["max_investment_usd"] = pd.to_numeric(mx_final, errors="coerce")

# Categóricas / flags
out["industry"]    = coalesce_text_from_patterns(ALIASES["industry"])
out["token_type"]    = coalesce_text_from_patterns(ALIASES["token_type"])
out["role_of_token"] = coalesce_text_from_patterns(ALIASES["role_of_token"])
out["whitelist"]     = coalesce_bool_from_patterns(ALIASES["whitelist"])
out["kyc"]           = coalesce_bool_from_patterns(ALIASES["kyc"])
out["jurisdiction"]  = coalesce_text_from_patterns(ALIASES["jurisdiction"])
out["accepts"]       = coalesce_text_from_patterns(ALIASES["accepts"])

# Señales de ejecución / presencia
out["has_github"]        = coalesce_bool_from_patterns(ALIASES["has_github"])
out["has_telegram"]      = pd.Series(pd.NA, index=df.index, dtype="Int64")
out["has_reddit"]        = pd.Series(pd.NA, index=df.index, dtype="Int64")
out["website_available"] = coalesce_bool_from_patterns(ALIASES["website_available"])

# Equipo / rating / interés / docs
out["team_size"] = pd.to_numeric(coalesce_text_from_patterns(ALIASES["team_size"]), errors="coerce")
out["rating"]    = pd.to_numeric(coalesce_text_from_patterns(ALIASES["rating"]), errors="coerce")
out["interest"]  = coalesce_text_from_patterns(ALIASES["interest"])
disc = coalesce_text_from_patterns(ALIASES["discount_max_pct"]).astype(str).str.extract(r"([\d.]+)", expand=False)
out["discount_max_pct"] = pd.to_numeric(disc, errors="coerce")
out["roadmap_available"] = coalesce_bool_from_patterns(ALIASES["roadmap_available"])

# Whitepaper: page_count > 0 OR flag explícito
wp_count = pd.to_numeric(coalesce_text_from_patterns(ALIASES["whitepaper_page_count"]), errors="coerce")
wp_flag  = (wp_count > 0).astype("Int64")
wp_other = coalesce_bool_from_patterns(ALIASES["whitepaper_available"])
out["whitepaper_available"] = wp_flag.where(wp_flag.notna(), wp_other)

# Duraciones / IEO / RegTax
out["ico_length_actual_days"]  = pd.to_numeric(coalesce_text_from_patterns(ALIASES["ico_length_actual"]), errors="coerce")
out["ico_length_planned_days"] = pd.to_numeric(coalesce_text_from_patterns(ALIASES["ico_length_planned"]), errors="coerce")
out["is_ieo"]           = coalesce_bool_from_patterns(ALIASES["ieo"])
out["is_tax_regulated"] = coalesce_bool_from_patterns(ALIASES["regtax"])

# Flags derivados (ojo: son post-ICO, NO usarlos como features ex-ante)
out["hit_softcap"] = ((out["amount_raised_usd"] >= out["goal_usd"]) & out["amount_raised_usd"].notna() & out["goal_usd"].notna()).astype("Int64")
out["hit_hardcap"] = ((out["amount_raised_usd"] >= out["hard_cap_usd"]) & out["amount_raised_usd"].notna() & out["hard_cap_usd"].notna()).astype("Int64")

# Backfills mínimos para el target
if "ico_successful" in out.columns and "hit_softcap" in out.columns:
    mask_empty = out["ico_successful"].isna()
    out.loc[mask_empty, "ico_successful"] = out.loc[mask_empty, "hit_softcap"]

# Si falta goal_usd pero hay hardcap, usar hardcap como proxy
mask_goal_missing = out["goal_usd"].isna() & out["hard_cap_usd"].notna()
out.loc[mask_goal_missing, "goal_usd"] = out.loc[mask_goal_missing, "hard_cap_usd"]

# Orden final y dedupe
ordered = [
    "name_std","symbol_std",
    "ico_start_date","ico_end_date","ico_length_actual_days","ico_length_planned_days",
    "goal_usd","hard_cap_usd","amount_raised_usd","ico_successful","hit_softcap","hit_hardcap",
    "token_price_usd","total_tokens","tokens_for_sale","min_investment_usd","max_investment_usd",
    "industry","token_type","role_of_token","whitelist","kyc","jurisdiction","accepts",
    "has_github","has_telegram","has_reddit","website_available",
    "team_size","rating","interest","discount_max_pct","roadmap_available","whitepaper_available",
    "is_ieo","is_tax_regulated",
]
ordered = [c for c in ordered if c in out.columns] + [c for c in out.columns if c not in ordered]
out = out[ordered].copy()

# Deduplicar por (name_std, symbol_std) conservando la primera ocurrencia con más info (heurística: menos NaN)
def _nan_count_row(r): return r.isna().sum()
out["_nan_cnt"] = out.apply(_nan_count_row, axis=1)
out.sort_values(by=["name_std","symbol_std","_nan_cnt"], ascending=[True, True, True], inplace=True)
out = out.drop_duplicates(subset=["name_std","symbol_std"], keep="first").drop(columns=["_nan_cnt"])


# ------------------ Reporte canónico -----------------
def missing_pct(s): 
    return round(100*s.isna().mean(), 2)

report_all = pd.DataFrame({
    "column": out.columns,
    "dtype": [str(out[c].dtype) for c in out.columns],
    "missing_%": [missing_pct(out[c]) for c in out.columns]
}).sort_values(["missing_%","column"], ascending=[False, True])

print(f'Columnas dataset inicial: {df.columns.tolist()}')
print(f'Columnas dataset finaaal: {out.columns.tolist()}')

#intersection = list(set(df.columns.tolist()).intersection(set(out.columns.tolist()))
diferencia = list(set(df.columns.tolist()) - set((out.columns.tolist())))
                  
#print(f'Interseccion de columnas: {intersection}')
print(f'Diferencia de columnas: {diferencia}')

print(f"[ALL] Filas: {len(out):,}  |  Columnas: {out.shape[1]}")
print("\nTop 20 columnas con más missing (%):")
try:
    display(report_all.head(20))
except Exception:
    print(report_all.head(20).to_string(index=False))


# ------------------ Construir EX-ANTE ------------------
# Columnas que consideramos POST-ICO (leakage) y se excluyen del feature set ex-ante:
LEAKY_COLS = {
    "amount_raised_usd",        # resultado
    "hit_softcap", "hit_hardcap",
    "ico_length_actual_days",   # duración real
    # 'ico_end_date' podría ser planificada o real según la fuente -> la EXCLUIMOS del ex-ante por prudencia
    "ico_end_date",
    # Derivadas que no guardamos acá (pct_goal_reached, hardcap_ratio, etc.) si existieran
}

# Mantenemos el target (ico_successful) para entrenamiento/validación, pero no como feature.
KEEP_ALWAYS = {"name_std","symbol_std","ico_successful"}

exante_cols = [c for c in out.columns if c not in LEAKY_COLS]  # removemos leaky
out_ex = out[exante_cols].copy()

# Reporte ex-ante (y lista de leaky removidas)
removed_present = sorted(list(LEAKY_COLS.intersection(set(out.columns))))
print("\n[EX-ANTE] Columnas removidas por leakage:", removed_present)
print(f"[EX-ANTE] Filas: {len(out_ex):,}  |  Columnas: {out_ex.shape[1]}")

report_ex = pd.DataFrame({
    "column": out_ex.columns,
    "dtype": [str(out_ex[c].dtype) for c in out_ex.columns],
    "missing_%": [missing_pct(out_ex[c]) for c in out_ex.columns]
}).sort_values(["missing_%","column"], ascending=[False, True])

try:
    display(report_ex.head(20))
except Exception:
    print(report_ex.head(20).to_string(index=False))


# Renombre de columnas para consistencia con el dataset de foundico
out.rename(columns={'goal_usd': 'soft_cap', 
                    'hard_cap_usd': 'hard_cap',
                    'hit_softcap': 'hit_soft_cap', 
                    'hit_hardcap': 'hit_hard_cap', 
                    'amount_raised_usd': 'amount_raised'}, 
              inplace=True)
out_ex.rename(columns={'goal_usd': 'soft_cap', 
                    'hard_cap_usd': 'hard_cap', 
                    'hit_softcap': 'hit_soft_cap', 
                    'hit_hardcap': 'hit_hard_cap', 
                    'amount_raised_usd': 'amount_raised'}, 
              inplace=True)


# ------------------ Save --------------------
out.to_csv(OUT_ALL, index=False)
out_ex.to_csv(OUT_EXANT, index=False)
print(f"\n✅ Guardados:\n - Canónico: {OUT_ALL}\n - Ex-ante:  {OUT_EXANT}")
print(f"\nColumnas en el dataset final ex-ante: {out_ex.columns.tolist()}")


Columnas dataset inicial: ['__key__', 'name_std', 'symbol_std', 'name_other', 'name_cmc', 'ticker_symbol_cmc', 'ico_successful', 'soft_cap', 'hard_cap', 'cap_unit', 'cap_includes_presale', 'token_type', 'number_of_contributors', 'crowdsale_tokens_sold', 'total_number_of_tokens', 'token_standard', 'additional_token_emissions', 'crowdsale_token_price_min', 'crowdsale_token_price_max', 'crowdsale_actual_token_price_max', 'crowdsale is auction', 'has a presale', 'presale_tokens_sold', 'presale_token_price_min', 'presale_token_price_max', 'development road map available', 'whitepaper page count', 'product or prototype developed', 'product can be tried out', 'years since foundation', 'issuer has customers for product', 'business model available', 'utility token enables decentralization', 'smart contract code available', 'project code available', 'use of proceeds mentioned', 'use of proceeds disclosed in detail', 'token share team (ex ante)', 'token share crowdsale investors (ex ante)', 'toke



Unnamed: 0,column,dtype,missing_%
26,has_reddit,Int64,100.0
25,has_telegram,Int64,100.0
29,rating,float64,100.0
27,website_available,Int64,100.0
30,interest,object,92.6
19,role_of_token,object,92.57
28,team_size,float64,89.52
31,discount_max_pct,float64,89.22
5,ico_length_planned_days,float64,88.74
24,has_github,Int64,88.62



[EX-ANTE] Columnas removidas por leakage: ['amount_raised_usd', 'hit_hardcap', 'hit_softcap', 'ico_end_date', 'ico_length_actual_days']
[EX-ANTE] Filas: 2,690  |  Columnas: 31


Unnamed: 0,column,dtype,missing_%
21,has_reddit,Int64,100.0
20,has_telegram,Int64,100.0
24,rating,float64,100.0
22,website_available,Int64,100.0
25,interest,object,92.6
14,role_of_token,object,92.57
23,team_size,float64,89.52
26,discount_max_pct,float64,89.22
3,ico_length_planned_days,float64,88.74
19,has_github,Int64,88.62



✅ Guardados:
 - Canónico: ../join/ico_union_canonical_v6.csv
 - Ex-ante:  ../join/ico_exante_features_v3.csv

Columnas en el dataset final ex-ante: ['name_std', 'symbol_std', 'ico_start_date', 'ico_length_planned_days', 'soft_cap', 'hard_cap', 'ico_successful', 'token_price_usd', 'total_tokens', 'tokens_for_sale', 'min_investment_usd', 'max_investment_usd', 'industry', 'token_type', 'role_of_token', 'whitelist', 'kyc', 'jurisdiction', 'accepts', 'has_github', 'has_telegram', 'has_reddit', 'website_available', 'team_size', 'rating', 'interest', 'discount_max_pct', 'roadmap_available', 'whitepaper_available', 'is_ieo', 'is_tax_regulated']
CPU times: total: 281 ms
Wall time: 295 ms


In [8]:

print(out["ico_successful"].value_counts(dropna=False))

ico_successful
0.0    2072
1.0     618
Name: count, dtype: int64
