In [13]:
from pathlib import Path
import pandas as pd
import numpy as np
import re
from functools import reduce

# Input locations (adjust if needed)
BASE = Path("../processed")
PATHS = dict(
    zenodo=BASE.joinpath("zenodo_fahlenbrach_clean.csv"),
    icpsr=BASE.joinpath("icpsr_villanueva_clean.csv"),
    yan=BASE.joinpath("kaggle_yanmaksi_clean.csv"),
)

# Output locations: every artefact lives directly under OUT_DIR
OUT_DIR = Path("../join")
OUT_UNION, OUT_MULTI, OUT_COLL = (
    OUT_DIR.joinpath(filename)
    for filename in (
        "ico_union_wide.csv",
        "present_in_multiple.csv",
        "value_collisions.csv",
    )
)

def normalize_text(s: pd.Series) -> pd.Series:
    """Normalise free-text identifiers into comparable join tokens.

    Each value is cast to ``str``, lower-cased, stripped, and reduced to the
    ASCII characters ``a-z`` and ``0-9`` (everything else, including
    whitespace, punctuation and non-ASCII letters, is removed).

    Missing values (``NaN``/``None``) become the empty string. They are
    blanked *before* the string cast so that a token whose real text is
    "NaN" keeps its normalised form "nan", and so that ``None`` does not
    turn into the key "none".

    Parameters
    ----------
    s : pd.Series
        Raw names or symbols.

    Returns
    -------
    pd.Series
        Normalised strings, same index as ``s``.
    """
    # Go through object dtype so the "" placeholder is valid for any input dtype.
    blanked = s.astype(object).where(s.notna(), "")
    return (
        blanked.astype(str)
               .str.lower()
               .str.strip()
               .str.replace(r"[^a-z0-9]", "", regex=True)
    )

def ensure_keys(df: pd.DataFrame) -> pd.DataFrame:
    """Guarantee that ``df`` carries ``name_std`` and ``symbol_std`` columns.

    Column labels are lower-cased first. When a key column is missing it is
    derived with ``normalize_text`` from the first matching alternative
    column, or set to "" when no alternative exists. Key columns that already
    exist are *not* re-normalised; they only have missing values blanked, are
    cast to ``str`` and have surrounding whitespace stripped so that padded
    values cannot produce non-matching join keys.

    The frame is modified in place and the same object is returned.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to complete.

    Returns
    -------
    pd.DataFrame
        ``df`` itself, with lower-cased column labels and both key columns.
    """
    df.columns = df.columns.str.lower()

    if "name_std" not in df.columns:
        name_col = next((c for c in ["name","project name","project_name","ico_name","token"] if c in df.columns), None)
        df["name_std"] = normalize_text(df[name_col]) if name_col else ""

    if "symbol_std" not in df.columns:
        sym_col = next((c for c in ["symbol","ticker","ticker_symbol","coin_ticker","short","abbr"] if c in df.columns), None)
        df["symbol_std"] = normalize_text(df[sym_col]) if sym_col else ""

    # Remove residual NaN and surrounding whitespace from both key columns.
    for key_col in ("name_std", "symbol_std"):
        df[key_col] = df[key_col].fillna("").astype(str).str.strip()
    return df

def load_tagged(path: Path, tag: str) -> pd.DataFrame:
    """Read the CSV at ``path``, ensure its key columns, and label its origin.

    The frame returned by ``ensure_keys`` gets a ``__source__`` column holding
    ``tag`` on every row.
    """
    tagged = ensure_keys(pd.read_csv(path))
    tagged["__source__"] = tag
    return tagged

def make_key(df: pd.DataFrame) -> pd.Series:
    """Build the join key ``"<name_std>||<symbol_std>"`` for every row.

    Missing parts are treated as "". A row keeps the composite key as long as
    at least one part is non-empty (e.g. ``"foo||"``). Only rows where *both*
    parts are empty get a positional fallback key so they are not lost.

    The fallback is ``"__row__:<source>:<position>"`` when ``df`` has a
    ``__source__`` column and ``"__row__:<position>"`` otherwise. Including
    the source keeps fully empty rows from different datasets from sharing a
    key and being joined to each other.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with ``name_std`` and ``symbol_std`` columns.

    Returns
    -------
    pd.Series
        One key per row, indexed like ``df``.
    """
    name = df["name_std"].fillna("").astype(str)
    symbol = df["symbol_std"].fillna("").astype(str)
    key = name + "||" + symbol

    both_empty = (name == "") & (symbol == "")
    if both_empty.any():
        # Positional counter (0..n-1), independent of the frame's index labels.
        position = pd.Series(np.arange(len(df)).astype(str), index=df.index)
        if "__source__" in df.columns:
            prefix = "__row__:" + df["__source__"].fillna("").astype(str) + ":"
        else:
            prefix = "__row__:"
        key = key.where(~both_empty, prefix + position)
    return key

def friendly_cols(df: pd.DataFrame) -> pd.DataFrame:
    """Drop columns whose label repeats an earlier one, keeping the first occurrence."""
    # True for every column label that already appeared further left.
    is_repeat = df.columns.duplicated()
    first_occurrence = ~is_repeat
    return df.loc[:, first_occurrence]


In [14]:
### Quick check
# Load every input CSV; each frame is tagged with its source label.
dfs = {tag: load_tagged(path, tag) for tag, path in PATHS.items()}

for tag, d in dfs.items():
    print(tag, d.shape, "keys vacíos:", (d["name_std"]=="" ).sum(), (d["symbol_std"]=="").sum())

# Per-dataset join key, then drop exact duplicate column labels.
for tag in dfs:
    dfs[tag]["__key__"] = make_key(dfs[tag])
    dfs[tag] = friendly_cols(dfs[tag])

# Outer merge of the three datasets; clashing columns get a per-dataset suffix.
order = ["zenodo","icpsr","yan"]
suf_map = {"zenodo":"_zenodo","icpsr":"_icpsr","yan":"_yan"}


def _rename_overlaps(right, taken, suffix, key="__key__"):
    """Return `right` with each non-key column that clashes with `taken` renamed via `suffix`.

    The suffix is appended repeatedly until the label is unused, so the merge
    can never yield two columns with the same name.
    """
    used = set(taken)
    renames = {}
    for col in right.columns:
        if col == key:
            continue
        new_name = col
        while new_name in used:
            new_name = f"{new_name}{suffix}"
        renames[col] = new_name
        used.add(new_name)
    return right.rename(columns=renames)


# Start from the first dataset; its columns keep their original names.
merged = dfs[order[0]].copy()

for nxt in order[1:]:
    # Rename clashes on the incoming frame *before* merging. The result has the
    # same naming as suffixes=("", suffix) but never contains duplicate labels,
    # so no after-the-fact patching of `merged.columns.values` is needed.
    incoming = _rename_overlaps(dfs[nxt], merged.columns, suf_map[nxt])
    merged = merged.merge(incoming, how="outer", on="__key__")

# Rows that exist only in a later dataset have their key parts in the suffixed
# columns; fill the unsuffixed ones so every row is labelled.
for key_part in ("name_std", "symbol_std"):
    if key_part not in merged.columns:
        continue
    for nxt in order[1:]:
        alt = f"{key_part}{suf_map[nxt]}"
        if alt in merged.columns:
            merged[key_part] = merged[key_part].fillna(merged[alt])

# Reorder: key columns first (when present), everything else after.
front = [c for c in ["__key__","name_std","symbol_std"] if c in merged.columns]
other = [c for c in merged.columns if c not in front]
merged = merged[front + other]

# Save the "wide" union (all features, with suffixes); create the folder if missing.
OUT_DIR.mkdir(parents=True, exist_ok=True)
merged.to_csv(OUT_UNION, index=False)
print(f"✅ Guardado union wide: {OUT_UNION} — filas: {len(merged):,}, cols: {merged.shape[1]}")


zenodo (306, 132) keys vacíos: 0 0
icpsr (2186, 79) keys vacíos: 0 15
yan (200, 28) keys vacíos: 0 29
✅ Guardado union wide: ..\join\ico_union_wide.csv — filas: 2,692, cols: 240


In [15]:
# === Cell 3 (patched) — Tokens in >1 dataset + value collisions ===
# Works on `dfs` / `merged` from cell 2 when available and rebuilds them otherwise.

dataset_tags = ["zenodo","icpsr","yan"]
suffixes = ["_zenodo","_icpsr","_yan"]

# 0) Make sure `dfs` exists (in case cell 2 was not run); uses the cell 1 helpers.
if 'dfs' not in globals():
    dfs = {tag: load_tagged(path, tag) for tag, path in PATHS.items()}

# 1) Make sure every frame has its key columns and a __key__.
for tag in list(dfs.keys()):
    d = dfs[tag]
    if "name_std" not in d.columns or "symbol_std" not in d.columns:
        d = ensure_keys(d)
    if "__key__" not in d.columns:
        d["__key__"] = make_key(d)
    dfs[tag] = d

# 2) Try to have `merged` available.
minimal_merged = False
if 'merged' not in globals():
    try:
        # Load the previously saved wide union.
        merged = pd.read_csv(OUT_UNION)
    except (OSError, ValueError):
        # Missing/unreadable file (OSError) or empty/unparsable CSV (pandas
        # raises ValueError subclasses): fall back to a minimal frame.
        minimal_merged = True
    else:
        if "__key__" not in merged.columns:
            # Rebuild the key from name_std + symbol_std when both are present.
            if "name_std" in merged.columns and "symbol_std" in merged.columns:
                merged["__key__"] = (
                    merged["name_std"].fillna("").astype(str)
                    + "||"
                    + merged["symbol_std"].fillna("").astype(str)
                )
            else:
                minimal_merged = True

# 2.b) No usable wide union: build a minimal `merged` from the keys alone.
if minimal_merged:
    key_union = sorted(set().union(*[set(dfs[t]["__key__"]) for t in dfs]))
    merged = pd.DataFrame({"__key__": key_union})
    for col in ["name_std", "symbol_std"]:
        # Coalesce by priority zenodo > icpsr > yan.
        val = pd.Series("", index=merged.index)
        for t in dataset_tags:
            if col in dfs[t].columns:
                # One value per key, so duplicated keys in a dataset cannot
                # change the number of rows being coalesced.
                lookup = dfs[t].drop_duplicates("__key__").set_index("__key__")[col]
                candidate = merged["__key__"].map(lookup).fillna("")
                val = val.where(val != "", candidate)
        merged[col] = val

# Key columns used to label both outputs. name_std/symbol_std in `merged` may
# be filled only for rows of the first dataset, so take the first non-null
# value among the unsuffixed column and its suffixed variants.
base_cols = [c for c in ["__key__", "name_std", "symbol_std"] if c in merged.columns]
key_frame = merged[base_cols].copy()
for col in ("name_std", "symbol_std"):
    if col not in key_frame.columns:
        continue
    for sfx in suffixes:
        alt = f"{col}{sfx}"
        if alt in merged.columns:
            key_frame[col] = key_frame[col].fillna(merged[alt])

# This cell writes files; make sure the output folder exists.
OUT_DIR.mkdir(parents=True, exist_ok=True)

# ----- (A) tokens present in more than one dataset -----
presence_cols = [f"present_{tag}" for tag in dataset_tags]

# Original key sets per dataset.
keys_sets = {tag: set(dfs[tag]["__key__"].tolist()) for tag in dfs}

present_df = key_frame.copy()
for tag, col in zip(dataset_tags, presence_cols):
    present_df[col] = present_df["__key__"].isin(keys_sets[tag]).astype(int)

present_df["in_n_datasets"] = present_df[presence_cols].sum(axis=1)
multi = present_df[present_df["in_n_datasets"] >= 2].copy()
multi.to_csv(OUT_MULTI, index=False)
print(f"✅ Guardado tokens en >1 dataset: {OUT_MULTI} — filas: {len(multi):,}")
# NOTE(review): the recorded run reports 0 shared keys. A key only matches when
# name_std AND symbol_std are both identical across datasets — confirm that the
# upstream cleaning normalises both columns the same way in every source.

# ----- (B) value collisions (same feature, different values) -----
# Without a wide union (suffixed columns) there is nothing to compare; write a
# controlled empty file instead of reporting false positives.
has_wide = any(col.endswith(tuple(suffixes)) for col in merged.columns)

if not has_wide:
    coll_df = pd.DataFrame(columns=["__key__","name_std","symbol_std","feature_base"])
    coll_df.to_csv(OUT_COLL, index=False)
    print(f"ℹ️ No hay columnas con sufijos *_zenodo/_icpsr/_yan en 'merged'; "
          f"se guarda {OUT_COLL} vacío (corré la Celda 2 para generar el union wide).")
else:
    # Bookkeeping columns are not features: key parts are equal by construction
    # for matching keys, and __source__ always differs between datasets.
    bookkeeping = {"__key__", "name_std", "symbol_std", "__source__"}

    def base_name(c):
        """Return the column name without its dataset suffix, or None if it has none."""
        for s in suffixes:
            if c.endswith(s):
                return c[: -len(s)]
        return None

    def norm_series(s):
        """Render a column as comparable text (numbers as rounded floats, text stripped)."""
        if pd.api.types.is_numeric_dtype(s):
            # Cast to float first so integer 5 and float 5.0 compare as equal.
            return pd.to_numeric(s, errors="coerce").astype("float64").round(8).astype(str)
        return s.astype(str).str.strip()

    # Group suffixed columns by their base feature name.
    by_base = {}
    for c in merged.columns:
        b = base_name(c)
        if b and b not in bookkeeping:
            by_base.setdefault(b, []).append(c)

    collisions = []
    for b, suffixed in by_base.items():
        # The unsuffixed column holds the same feature from an earlier dataset
        # in the merge order, so it takes part in the comparison as well.
        cols_same = ([b] if b in merged.columns else []) + suffixed
        if len(cols_same) < 2:
            continue

        sub = key_frame.copy()
        for c in cols_same:
            sub[c] = norm_series(merged[c])

        # Blank out missing cells, then count distinct remaining values per row;
        # two or more distinct values means at least two sources disagree.
        is_value = merged[cols_same].notna() & ~sub[cols_same].isin(["", "nan"])
        comparable = sub[cols_same].where(is_value)
        mask = comparable.nunique(axis=1) >= 2

        if mask.any():
            tmp = sub.loc[mask].copy()
            if "feature_base" not in tmp.columns:
                # Place the feature label right after the key columns.
                tmp.insert(len(key_frame.columns), "feature_base", b)
            collisions.append(tmp)

    coll_df = pd.concat(collisions, axis=0, ignore_index=True) if collisions else \
              pd.DataFrame(columns=["__key__","name_std","symbol_std","feature_base"])
    coll_df.to_csv(OUT_COLL, index=False)
    print(f"✅ Guardado colisiones de valores: {OUT_COLL} — filas: {len(coll_df):,}")

# Quick look (coll_df is defined by both branches above).
display(multi.head(10))
display(coll_df.head(10))


✅ Guardado tokens en >1 dataset: ..\join\present_in_multiple.csv — filas: 0
✅ Guardado colisiones de valores: ..\join\value_collisions.csv — filas: 0


Unnamed: 0,__key__,name_std,symbol_std,present_zenodo,present_icpsr,present_yan,in_n_datasets


Unnamed: 0,__key__,name_std,symbol_std,feature_base


In [16]:
# List the column names of each dataset, in merge order, one labelled line per
# dataset with a blank line between them.
column_report_labels = ("Zenodo", "ICPSR", "Yan")
for position, label in enumerate(column_report_labels):
    is_last = position == len(column_report_labels) - 1
    trailer = "" if is_last else "\n"
    print(f"{label}: {dfs[order[position]].columns.tolist()}{trailer}")

Zenodo: ['name_other', 'name_cmc', 'ticker_symbol_cmc', 'ico_successful', 'soft_cap', 'hard_cap', 'cap_unit', 'cap_includes_presale', 'token_type', 'number_of_contributors', 'crowdsale_tokens_sold', 'total_number_of_tokens', 'token_standard', 'additional_token_emissions', 'crowdsale_token_price_min', 'crowdsale_token_price_max', 'crowdsale_actual_token_price_max', 'crowdsale is auction', 'has a presale', 'presale_tokens_sold', 'presale_token_price_min', 'presale_token_price_max', 'development road map available', 'whitepaper page count', 'product or prototype developed', 'product can be tried out', 'years since foundation', 'issuer has customers for product', 'business model available', 'utility token enables decentralization', 'smart contract code available', 'project code available', 'use of proceeds mentioned', 'use of proceeds disclosed in detail', 'token share team (ex ante)', 'token share crowdsale investors (ex ante)', 'token share presale investors (ex ante)', 'token share prod