In [3]:
import pandas as pd, numpy as np, re, time, requests, os, hmac, hashlib, base64, json
from difflib import SequenceMatcher
from datetime import datetime, timezone, UTC

#### Limpieza del dataset de Yan Maksi obtenido en Kaggle

In [4]:
# -------- CONFIG AND HELPERS --------

# Rebinds UTC to timezone.utc (the same tzinfo that `datetime.UTC` refers to).
UTC = timezone.utc

# Input / output locations for the Kaggle (Yan Maksi) dataset.
dataset_path = "../raw/ico_yanmaksi_kaggle.csv"
output_path  = "../processed/kaggle_yanmaksi_clean.csv"

# Headers sent to the public JSON APIs.
USER_AGENT = "Mozilla/5.0 (compatible; TFM-ICO/1.2)"
HEADERS    = {"User-Agent": USER_AGENT, "Accept": "application/json"}

CMC_MAP_URL = "https://pro-api.coinmarketcap.com/v1/cryptocurrency/map"
# The API key is read from the environment only; credentials must never live in the notebook.
# With an empty key the CoinMarketCap lookup is skipped (it returns (None, None)).
# NOTE(review): the key that used to be hardcoded here should be treated as leaked and revoked.
CMC_KEY = os.getenv("CMC_API_KEY") or ""
# Pause (seconds) between consecutive CoinMarketCap page requests.
SLEEP_CMC = 0.35

def parse_money(x):
    """Convert a money-like value ('$5M', '1,200,000', '3 million', '0.75B') to a float.

    The first number found in the text is used. A magnitude suffix is honoured only
    when it directly follows that number (optionally after whitespace) and is not the
    start of a longer word, so '500 BTC' or '1000 (max)' are not scaled by a stray
    'b' / 'm' / 'k' elsewhere in the string.

    Returns np.nan for missing values and for text that contains no number.
    """
    if pd.isna(x):
        return np.nan
    # Real numbers need no parsing; going through str() would mangle large floats
    # (str(1e16) == '1e+16' would be read as 1.0).
    if isinstance(x, (int, float, np.number)) and not isinstance(x, bool):
        return float(x)
    text = str(x).lower().strip().replace(",", "").replace("$", "")
    scale = {
        "billion": 1_000_000_000, "bln": 1_000_000_000, "bn": 1_000_000_000, "b": 1_000_000_000,
        "million": 1_000_000, "mln": 1_000_000, "mil": 1_000_000, "m": 1_000_000,
        "thousand": 1_000, "k": 1_000,
    }
    # Longer suffixes come first in the alternation; the look-ahead rejects suffixes
    # that are merely the first letters of another word (e.g. the 'b' in 'btc').
    found = re.search(
        r"(\d+(?:\.\d*)?|\.\d+)\s*(?:(billion|million|thousand|bln|mln|mil|bn|b|m|k)(?![a-z]))?",
        text,
    )
    if found is None:
        return np.nan
    return float(found.group(1)) * scale.get(found.group(2), 1)

def clean_end_date(s):
    """Strip a leading 'Ended:' / 'End' / 'Finished' / 'Finalized' marker and unify dashes.

    Missing input yields np.nan; otherwise the trimmed text is returned with the
    prefix removed and em/en dashes replaced by a plain hyphen.
    """
    if pd.isna(s):
        return np.nan
    prefix = re.compile(r"^(ended|end|finished|finalized)\s*:?\s*", flags=re.IGNORECASE)
    without_prefix = prefix.sub("", str(s).strip())
    return without_prefix.replace("—", "-").replace("–", "-")

def normalize_text_series(s):
    return (
        s.astype(str)
         .str.lower()
         .str.strip()
         .str.replace(r"[^a-z0-9]", "", regex=True)
    )

def _similar(a, b):
    return SequenceMatcher(None, a, b).ratio()

# --- Foundico API ---
FD_BASE = "https://foundico.com/api/v1"
FD_ICOS = FD_BASE + "/icos/"
# Key pair is taken from the environment; each falls back to an empty string when unset.
FOUNDICO_PUBLIC  = os.environ.get("FOUNDICO_PUBLIC_KEY") or ""
FOUNDICO_PRIVATE = os.environ.get("FOUNDICO_PRIVATE_KEY") or ""
def _sign_foundico(private_key: str, payload_json: str) -> str:
    mac = hmac.new(private_key.encode("utf-8"), payload_json.encode("utf-8"), hashlib.sha256).digest()
    return base64.b64encode(mac).decode("utf-8")

def _foundico_search_name(name: str, max_pages: int = 8, sleep: float = 0.4):
    if not (FOUNDICO_PUBLIC and FOUNDICO_PRIVATE) or not name:
        return None
    name_std = re.sub(r"[^a-z0-9]", "", name.lower())
    best = None; best_score = 0.0
    for page in range(1, max_pages + 1):
        payload = {"status": "past", "page": page}
        body = json.dumps(payload, ensure_ascii=False)
        headers = {
            "Content-Type": "application/json",
            "User-Agent": USER_AGENT,
            "X-Foundico-Public-Key": FOUNDICO_PUBLIC,
            "X-Foundico-Access-Key": _sign_foundico(FOUNDICO_PRIVATE, body),
        }
        try:
            r = requests.post(FD_ICOS, headers=headers, data=body, timeout=25)
            if r.status_code != 200:
                break
            data = r.json() or {}
            items = data.get("data") or []
            if not items:
                break
            for item in items:
                nm = (item.get("main", {}).get("name") or "").strip()
                sc = (item.get("finance", {}).get("ticker") or "").strip()
                score = _similar(name_std, re.sub(r"[^a-z0-9]", "", nm.lower()))
                if score > best_score and sc:
                    best_score = score
                    best = {"symbol": sc, "source": "foundico"}
            time.sleep(sleep)
        except Exception:
            break
    return best if (best and best.get("symbol")) else None
    
def _cmc_find_symbol_by_name(name: str, api_key: str, pages: int = 4, limit: int = 500):
    """
    Busca por NOMBRE en CMC paginando /v1/cryptocurrency/map y haciendo fuzzy por 'name'.
    Devuelve (SYMBOL, 'cmc') o (None, None). 'pages' * 'limit' controla cuánto escaneamos.
    """
    if not api_key or not name:
        return None, None
    headers = {"X-CMC_PRO_API_KEY": api_key, "Accept": "application/json", "User-Agent": USER_AGENT}
    name_std = re.sub(r"[^a-z0-9]", "", str(name).strip().lower())
    best_sym, best_score = None, 0.0

    start = 1
    for _ in range(max(1, pages)):
        params = {
            "listing_status": "active,inactive,untracked",
            "aux": "name,symbol,slug",
            "start": start,
            "limit": limit
        }
        try:
            r = requests.get(CMC_MAP_URL, headers=headers, params=params, timeout=25)
            if r.status_code != 200:
                break
            data = r.json() or {}
            arr = data.get("data") or []
            if not arr:
                break
            for it in arr:
                nm = (it.get("name") or "").strip()
                sym = (it.get("symbol") or "").strip()
                if not nm or not sym:
                    continue
                score = SequenceMatcher(None, name_std, re.sub(r"[^a-z0-9]", "", nm.lower())).ratio()
                if score > best_score:
                    best_score, best_sym = score, sym
            # siguiente página
            start += limit
            time.sleep(SLEEP_CMC)
        except Exception:
            break

    if best_sym and best_score >= 0.83:
        return best_sym.upper(), "cmc"
    return None, None

def _best_symbol_by_name(candidates, name_std):
    """Return the stripped symbol of the candidate whose name is most similar to name_std ('' if none)."""
    if not candidates:
        return ""
    best = max(
        candidates,
        key=lambda c: _similar(name_std, re.sub(r"[^a-z0-9]", "", str(c.get("name", "")).lower())),
    )
    return (best.get("symbol") or "").strip()


def resolve_symbol_by_name(name, sleep=0.30):
    """Resolve a token symbol from a project name.

    Sources are tried in order: CoinGecko -> CoinMarketCap -> CoinPaprika -> Foundico.
    The first source that yields a symbol wins. Errors from a source are printed and
    the next source is tried.

    Args:
        name: project name; None, NaN, empty or whitespace-only names are not queried.
        sleep: seconds to wait after each remote source (rate limiting).

    Returns:
        (SYMBOL, source) with SYMBOL upper-cased, or (None, None) when nothing was found.
    """
    # NaN is truthy, so it needs its own check to avoid querying the literal name "nan".
    if not name or (isinstance(name, float) and name != name):
        return None, None
    name_q = str(name).strip()
    if not name_q:
        return None, None
    name_std = re.sub(r"[^a-z0-9]", "", name_q.lower())

    # 1) CoinGecko
    try:
        r = requests.get(
            "https://api.coingecko.com/api/v3/search",
            headers=HEADERS,
            params={"query": name_q},  # let requests do the URL encoding
            timeout=20,
        )
        if r.status_code == 200:
            sym = _best_symbol_by_name(r.json().get("coins", []), name_std)
            if sym:
                time.sleep(sleep)
                print(f"Symbol {sym} encontrado en CoinGecko para {name_q}")
                return sym.upper(), "coingecko"
    except Exception as e:
        print(e)
    time.sleep(sleep)

    # 2) CoinMarketCap (by name, paginated)
    try:
        sym, src = _cmc_find_symbol_by_name(name_q, CMC_KEY, pages=4, limit=500)
        if sym:
            time.sleep(sleep)
            print(f"Symbol {sym} encontrado en CoinMarketCap para {name_q}")
            return sym, src
    except Exception as e:
        print(e)
    time.sleep(sleep)

    # 3) CoinPaprika
    try:
        url = "https://api.coinpaprika.com/v1/search"
        params = {"q": name_q, "c": "currencies,icos", "limit": 20}
        r = requests.get(url, headers=HEADERS, params=params, timeout=20)
        if r.status_code == 200:
            data = r.json() or {}
            cand = (data.get("currencies") or []) + (data.get("icos") or [])
            sym = _best_symbol_by_name(cand, name_std)
            if sym:
                time.sleep(sleep)
                print(f"Symbol {sym} encontrado en CoinPaprika para {name_q}")
                return sym.upper(), "coinpaprika"
    except Exception as e:
        print(e)
    time.sleep(sleep)

    # 4) Foundico
    try:
        res = _foundico_search_name(name_q)
        if res and res.get("symbol"):
            # Single quotes inside the f-string: reusing the outer quote is a SyntaxError before Python 3.12.
            print(f"Symbol {res['symbol']} encontrado en FoundICO para {name_q}")
            return res["symbol"].upper(), "foundico"
    except Exception as e:
        print(e)

    print(f"No se encontro Symbol para {name_q}")
    return None, None


In [64]:
%%time

# -------- 1) LOAD --------
df = pd.read_csv(dataset_path)
# Normalise headers (trim + lowercase) so the column lookups below can use lowercase names.
df.columns = df.columns.str.strip().str.lower()
# Drop the first column by position.
# NOTE(review): presumably the index column written by the original export — confirm against the CSV.
df = df.drop(df.columns[0], axis=1)
print(f"Dataset original: {df.shape[0]} filas, {df.shape[1]} columnas")

# -------- 2) BASIC CLEANING (no external calls) --------
# Amounts: every money-like column that exists is parsed with parse_money.
for col in ["goal", "fundraising_goal", "received_money", "received_money.1"]:
    if col in df.columns:
        df[col] = df[col].apply(parse_money)

# Choose the goal / received columns by preference order; None when neither variant exists.
goal_col = "fundraising_goal" if "fundraising_goal" in df.columns else ("goal" if "goal" in df.columns else None)
recv_col = "received_money" if "received_money" in df.columns else ("received_money.1" if "received_money.1" in df.columns else None)

# Dates: clean the raw text, then parse strictly as "%d %b %Y"; anything else becomes NaT.
if "end_date" in df.columns:
    df["end_date_clean"] = df["end_date"].map(clean_end_date)
    df["end_date_parsed"] = pd.to_datetime(df["end_date_clean"], format="%d %b %Y", errors="coerce", dayfirst=True)
else:
    df["end_date_parsed"] = pd.NaT

# Fallback for rows whose end date did not parse: take the text after the last dash
# of the sale range (the leading `.*` is greedy) and parse it the same way.
if "start_end_date_coin_sell" in df.columns and df["end_date_parsed"].isna().any():
    need = df["end_date_parsed"].isna()
    rng = df.loc[need, "start_end_date_coin_sell"].astype(str)
    right = rng.str.extract(r".*[-–—]\s*(.*)$")[0].map(clean_end_date)
    df.loc[need, "end_date_parsed"] = pd.to_datetime(right, format="%d %b %Y", errors="coerce", dayfirst=True)

# Flags
# `today` is the current UTC date as a midnight Timestamp.
today = pd.Timestamp(datetime.now(UTC).date())
ended = df["end_date_parsed"].notna() & (df["end_date_parsed"] <= today)

# A row counts as an ICO when at least one of the token-sale columns is non-null.
signals = []
for c in ["ico_token_price", "available_for_token_sale", "sold_coins", "start_end_date_coin_sell"]:
    signals.append(df[c].notna() if c in df.columns else pd.Series(False, index=df.index))
is_ico = np.logical_or.reduce(signals) if signals else pd.Series(False, index=df.index)
# NOTE(review): `ended` and `is_ico` are computed before the de-dupe; step 5 of this cell
# rebuilds the same conditions on the de-duplicated frame instead of reusing them.

# Label
# 1 when received >= goal and both are present; rows missing either value get 0, not NaN.
if goal_col and recv_col:
    df["ico_successful"] = ((df[recv_col] >= df[goal_col]) & df[recv_col].notna() & df[goal_col].notna()).astype(int)
else:
    df["ico_successful"] = np.nan

# -------- 3) NORMALISE NAME (no API) --------
ticker_col = "coin_ticker" if "coin_ticker" in df.columns else None
if ticker_col:
    # in this dataset, coin_ticker holds the NAME, not the symbol
    df["name_std"] = normalize_text_series(df[ticker_col])
else:
    # No name column: every row gets the same empty key.
    df["name_std"] = ""

# -------- 4) DE-DUPE (before any API call) --------
before = len(df)
df = df.drop_duplicates()  # exact duplicate rows
after_full = len(df)

# Keep one row per normalised name: sort so that, within a name, the row with the most
# non-null fields comes first (ties broken by the latest end date), then keep that first row.
if "name_std" in df.columns:
    cols_for_score = [c for c in df.columns if c not in ["name_std"]]
    score = df[cols_for_score].notna().sum(axis=1)
    df["_score"] = score
    df["_end"]  = pd.to_datetime(df.get("end_date_parsed"), errors="coerce")
    df = df.sort_values(by=["name_std","_score","_end"], ascending=[True, False, False])
    df = df.drop_duplicates(subset=["name_std"], keep="first")
    # Helper columns are only needed for the ordering above.
    df = df.drop(columns=["_score","_end"], errors="ignore")
after_name = len(df)
print(f"De-dupe: {before} -> sin idénticos {after_full} -> por nombre único {after_name}")

# -------- 5) FINAL FILTER (ICO & ENDED) --------
# Keep rows whose parsed end date is on or before `today` AND that have at least one
# non-null token-sale column. A missing column contributes a plain False to the OR.
keep = (
    (df["end_date_parsed"].notna() & (df["end_date_parsed"] <= today))
    &
    (
        (df["ico_token_price"].notna() if "ico_token_price" in df.columns else False)
        | (df["available_for_token_sale"].notna() if "available_for_token_sale" in df.columns else False)
        | (df["sold_coins"].notna() if "sold_coins" in df.columns else False)
        | (df["start_end_date_coin_sell"].notna() if "start_end_date_coin_sell" in df.columns else False)
    )
)
# .copy() so the columns added afterwards are written to an independent frame.
df = df.loc[keep].copy()
print(f"Filtrado final (ICO & ended): de {after_name} a {len(df)} filas")

# -------- 6) RESOLVE SYMBOL ONLY FOR THESE UNIQUE ROWS --------
# Clean the names once so the cache keys and the lookup keys are built identically.
# fillna("") must come BEFORE astype(str); otherwise NaN becomes the literal name "nan".
if "coin_ticker" in df.columns:
    names_clean = df["coin_ticker"].fillna("").astype(str).str.strip()
else:
    names_clean = pd.Series("", index=df.index)
name_keys = names_clean.str.lower()

unique_names = [n for n in names_clean.unique().tolist() if n]
print(f"Buscando 'symbol' para {len(unique_names)} tokens.")
cache_sym = {}
for name in unique_names:
    key = name.lower()
    if key not in cache_sym:
        # resolve_symbol_by_name returns a (symbol, source) pair
        cache_sym[key] = resolve_symbol_by_name(name)

# Rows without a name or without a hit end up with empty strings in both columns.
df["symbol_resolved"] = name_keys.map(lambda k: cache_sym.get(k, (None, None))[0]).fillna("")
df["symbol_resolved_source"] = name_keys.map(lambda k: cache_sym.get(k, (None, None))[1]).fillna("")

# normalised form of the symbol
df["symbol_std"] = normalize_text_series(df["symbol_resolved"])

# -------- 7) RESUMEN + EXPORT --------
def pct(s):
    """Share of each distinct value of s as a percentage rounded to 2 decimals, returned as a dict."""
    shares = s.value_counts(normalize=True).mul(100).round(2)
    return shares.to_dict()

print(f"\nFilas finales para export: {len(df)}")
if "ico_successful" in df.columns and df["ico_successful"].notna().any():
    print("ICOs Éxitosas (%):", pct(df["ico_successful"]))

if "symbol_resolved_source" in df.columns:
    print(df['symbol_resolved_source'].value_counts())

# Export is opt-in. The write used to be commented out while the "saved" message was
# still printed, so the output claimed a file had been written when it had not.
SAVE_CLEAN_DATASET = False
if SAVE_CLEAN_DATASET:
    df.to_csv(output_path, index=False)
    print(f"✅ Dataset limpio guardado en: {output_path}")
    print("   (incluye: dedupe previo, filtro ICO+ended y resolución de symbol por nombre con CoinGecko->CoinMarketCap->Paprika->FoundICO)")
else:
    print(f"Export desactivado (SAVE_CLEAN_DATASET=False): no se escribió {output_path}")


Dataset original: 12380 filas, 20 columnas
De-dupe: 12380 -> sin idénticos 539 -> por nombre único 539
Filtrado final (ICO & ended): de 539 a 200 filas
Buscando 'symbol' para 200 tokens.
Symbol ACA encontrado en CoinPaprika para Acala Network
Symbol BLD encontrado en CoinGecko para Agoric
Symbol CARAT encontrado en CoinGecko para Alaska Gold Rush
Symbol AMPL encontrado en CoinGecko para Ampleforth
Symbol THOL encontrado en CoinGecko para AngelBlock
Symbol API3 encontrado en CoinPaprika para API3
Symbol APT encontrado en CoinPaprika para Aptos
Symbol ARBI encontrado en CoinPaprika para ArbiPad
Symbol ARB encontrado en CoinPaprika para Arbitrum
Symbol ARCH encontrado en CoinPaprika para Archway
Symbol DANA encontrado en CoinPaprika para Ardana
Symbol $ARKEN encontrado en CoinGecko para Arken Finance
Symbol ARTT encontrado en CoinPaprika para ARTT Network
Symbol AURORA encontrado en CoinGecko para Aurora
Symbol AURY encontrado en CoinGecko para Aurory
Symbol AXL encontrado en CoinGecko pa

In [13]:
%%time

# -------- RETRY: fill in the symbol ONLY where it is still missing --------
print("-------- REINTENTO: completar symbol SOLO donde sigue faltando --------")
import pandas as pd
import numpy as np

# NOTE(review): this path differs from the "../processed/..." output_path of the config cell —
# confirm which working directory this cell is expected to run from.
dataset_path = "datasets/processed/kaggle_yanmaksi_clean.csv"
df = pd.read_csv(dataset_path)

# Make sure the expected columns exist
for col in ["symbol_resolved", "symbol_resolved_source"]:
    if col not in df.columns:
        df[col] = ""

# Mask of rows that are "still unresolved": source is missing or an empty string
mask_missing = df["symbol_resolved_source"].fillna("").str.len().eq(0)

print("Antes del reintento:")
print(df["symbol_resolved_source"].fillna("").replace({"": "<EMPTY>"}).value_counts().head(10))

# Unique names to query (only for the rows that are still missing a symbol)
if "coin_ticker" not in df.columns:
    raise ValueError("No encuentro la columna 'coin_ticker' (que aquí usamos como NOMBRE).")

names_to_query = (
    df.loc[mask_missing, "coin_ticker"]
      .fillna("")      # must precede astype(str); otherwise NaN becomes the literal name "nan"
      .astype(str)
      .str.strip()
)
unique_names = sorted({n for n in names_to_query.tolist() if n})

print(f"Buscando 'symbol' para {len(unique_names)} tokens (únicos, solo faltantes).")

# Resolve in batch with a cache keyed by the lowercase name
cache_sym = {}
for name in unique_names:
    key = name.lower()
    if key not in cache_sym:
        # CG -> CMC(name) -> Paprika -> Foundico; returns a (symbol, source) pair
        cache_sym[key] = resolve_symbol_by_name(name)

# Build the result series aligned to df (ONLY for the rows that were missing)
def _sym_mapper(x):
    """Symbol cached for name x (trimmed, lowercase key), or '' when there is none."""
    return cache_sym.get(str(x).strip().lower(), (None, None))[0] or ""

def _src_mapper(x):
    """Source cached for name x (trimmed, lowercase key), or '' when there is none."""
    return cache_sym.get(str(x).strip().lower(), (None, None))[1] or ""

_pending_names = df.loc[mask_missing, "coin_ticker"].astype(str)
df.loc[mask_missing, "symbol_resolved"] = _pending_names.map(_sym_mapper)
df.loc[mask_missing, "symbol_resolved_source"] = _pending_names.map(_src_mapper)

# Normalizar symbol_std (opcional)
def normalize_text_series(s):
    """Lowercase and trim each value as text, then remove every character outside [a-z0-9]."""
    as_text = s.astype(str)
    cleaned = as_text.str.lower().str.strip()
    return cleaned.str.replace(r"[^a-z0-9]", "", regex=True)

df["symbol_std"] = normalize_text_series(df["symbol_resolved"])

# Metric: how many of the previously missing rows now have a source
new_resolved = df.loc[mask_missing, "symbol_resolved_source"].replace("", np.nan).notna().sum()
print("\nDespués del reintento:")
print(df["symbol_resolved_source"].fillna("").replace({"": "<EMPTY>"}).value_counts().head(10))
print(f"Se resolvieron {new_resolved} nuevos symbols en esta pasada.")

# Save OVER the same CSV that was read at the top of this cell (the input is replaced in place).
df.to_csv(dataset_path, index=False)
print(f"✅ Dataset actualizado: {dataset_path}")


-------- REINTENTO: completar symbol SOLO donde sigue faltando --------
Antes del reintento:
symbol_resolved_source
coinpaprika    110
coingecko       61
<EMPTY>         29
Name: count, dtype: int64
Buscando 'symbol' para 29 tokens (únicos, solo faltantes).
No se encontro Symbol para BladeDAO
No se encontro Symbol para Blockstack
No se encontro Symbol para BlueSale
No se encontro Symbol para Cogito Protocol
No se encontro Symbol para Coniun
No se encontro Symbol para CowSwap
No se encontro Symbol para EGO (Paysenger)
No se encontro Symbol para G4AL
No se encontro Symbol para Gearbox Protocol
No se encontro Symbol para Glory Finance
No se encontro Symbol para Goldfinch Finance
No se encontro Symbol para Goracle
No se encontro Symbol para HydraDX
No se encontro Symbol para Ikonic
No se encontro Symbol para KryptAI
No se encontro Symbol para Manta Network Crowdloan
No se encontro Symbol para Oiler Network
No se encontro Symbol para PARMA Fan Token
No se encontro Symbol para PolyGame
No se



In [None]:
# %%time

# Input / output paths for this cell.
# NOTE(review): these use a "datasets/..." prefix while the config cell uses "../raw" and "../processed" — confirm the intended working directory.
dataset_path = "datasets/raw/ico_yanmaksi_kaggle.csv"
output_path = "datasets/processed/kaggle_yanmaksi_clean.csv"

# --- Load dataset ---
df = pd.read_csv(dataset_path)
print(f"Dataset original: {df.shape[0]} filas, {df.shape[1]} columnas")

# Normalise headers (trim + lowercase) so the lookups below can use lowercase names.
df.columns = df.columns.str.strip().str.lower()

# ---------- helpers ----------
def parse_money(x):
    """Convert amounts such as '$5M', '1,200,000' or '3 million' to a float (USD assumed).

    The first number found in the text is used. A magnitude suffix is honoured only
    when it directly follows that number (optionally after whitespace) and is not the
    start of a longer word, so '500 BTC' or '1000 (max)' are not scaled by a stray
    'b' / 'm' / 'k' elsewhere in the string.

    Returns np.nan for missing values and for text that contains no number.
    """
    if pd.isna(x):
        return np.nan
    # Real numbers need no parsing; going through str() would mangle large floats
    # (str(1e16) == '1e+16' would be read as 1.0).
    if isinstance(x, (int, float, np.number)) and not isinstance(x, bool):
        return float(x)
    text = str(x).lower().strip().replace(",", "").replace("$", "")
    scale = {
        "billion": 1_000_000_000, "bln": 1_000_000_000, "bn": 1_000_000_000, "b": 1_000_000_000,
        "million": 1_000_000, "mln": 1_000_000, "mil": 1_000_000, "m": 1_000_000,
        "thousand": 1_000, "k": 1_000,
    }
    # Longer suffixes come first in the alternation; the look-ahead rejects suffixes
    # that are merely the first letters of another word (e.g. the 'b' in 'btc').
    found = re.search(
        r"(\d+(?:\.\d*)?|\.\d+)\s*(?:(billion|million|thousand|bln|mln|mil|bn|b|m|k)(?![a-z]))?",
        text,
    )
    if found is None:
        return np.nan
    return float(found.group(1)) * scale.get(found.group(2), 1)

def clean_end_date(s):
    """Clean strings such as 'Ended: 3 Jun 2023' or 'End: 07/02/2018' and return only the date text."""
    if pd.isna(s):
        return np.nan
    text = str(s).strip()
    marker = re.match(r"^(ended|end|finished|finalized)\s*:?\s*", text, flags=re.IGNORECASE)
    if marker is not None:
        # Drop the leading status word (and its optional colon / spaces).
        text = text[marker.end():]
    for dash in ("—", "–"):
        text = text.replace(dash, "-")
    return text

def normalize_text(s):
    """Return s as lowercase, trimmed text with every character outside [a-z0-9] removed."""
    text = s.astype(str).str.lower()
    text = text.str.strip()
    text = text.str.replace(r"[^a-z0-9]", "", regex=True)
    return text

# ---------- clean amounts ----------
# Every money-like column that exists is parsed with parse_money.
for col in ["goal", "fundraising_goal", "received_money", "received_money.1"]:
    if col in df.columns:
        df[col] = df[col].apply(parse_money)

# Choose the goal / received columns by preference order; None when neither variant exists.
goal_col = "fundraising_goal" if "fundraising_goal" in df.columns else ("goal" if "goal" in df.columns else None)
recv_col = "received_money" if "received_money" in df.columns else ("received_money.1" if "received_money.1" in df.columns else None)

# ---------- clean and parse dates ----------
# Parsing is strict ("%d %b %Y"); anything in another format becomes NaT.
if "end_date" in df.columns:
    df["end_date_clean"] = df["end_date"].map(clean_end_date)
    df["end_date_parsed"] = pd.to_datetime(df["end_date_clean"], format="%d %b %Y", errors="coerce", dayfirst=True)
else:
    df["end_date_parsed"] = pd.NaT

# If there is a "start_end_date_coin_sell" range, take its right-hand end (text after the last dash)
# for the rows whose end date did not parse.
# NOTE(review): a value like "Jan 10, 2018 - Feb 12, 2018" would not match "%d %b %Y" and would stay NaT — confirm the real format.
if "start_end_date_coin_sell" in df.columns and df["end_date_parsed"].isna().any():
    need = df["end_date_parsed"].isna()
    rng = df.loc[need, "start_end_date_coin_sell"].astype(str)
    right = rng.str.extract(r".*[-–—]\s*(.*)$")[0].map(clean_end_date)
    df.loc[need, "end_date_parsed"] = pd.to_datetime(right, format="%d %b %Y", errors="coerce", dayfirst=True)

# ---------- decide whether the ICO has finished, from its parsed end date ----------
# `today` is the current UTC date as a midnight Timestamp.
today = pd.Timestamp(datetime.now(UTC).date())
ended = df["end_date_parsed"].notna() & (df["end_date_parsed"] <= today)

# ---------- heuristic: it is an ICO if any price / token-sale column is non-null ----------
signals = []
for c in ["ico_token_price", "available_for_token_sale", "sold_coins", "start_end_date_coin_sell"]:
    signals.append(df[c].notna() if c in df.columns else pd.Series(False, index=df.index))
is_ico = np.logical_or.reduce(signals) if signals else pd.Series(False, index=df.index)

# ---------- derive success ----------
# 1 when received >= goal and both are present; rows missing either value get 0, not NaN.
if goal_col and recv_col:
    df["ico_successful"] = ((df[recv_col] >= df[goal_col]) & df[recv_col].notna() & df[goal_col].notna()).astype(int)
else:
    df["ico_successful"] = np.nan

# ---------- text normalisation ----------
ticker_col = "coin_ticker" if "coin_ticker" in df.columns else None
if ticker_col:
    # Both keys are built from the same column, so name_std and symbol_std are identical here.
    df["name_std"] = normalize_text(df[ticker_col])
    df["symbol_std"] = normalize_text(df[ticker_col])
else:
    df["name_std"] = ""
    df["symbol_std"] = ""

# ---------- final filter ----------
# Keep rows that look like an ICO and have already ended; .copy() detaches the result from df.
keep = is_ico & ended
df_out = df.loc[keep].copy()

# ---------- resumen ----------
def pct(s):
    """Percentage distribution of the values of s (2 decimals) as a {value: percent} dict."""
    relative = s.value_counts(normalize=True)
    return (relative * 100).round(2).to_dict()

print(f"\nFilas con evidencia de ICO: {int(is_ico.sum())} / {len(df)}")
print(f"Filas 'ended' por fecha:     {int(ended.sum())} / {len(df)}")
print(f"Filas finales (ICO & ended): {len(df_out)}, de un total de {len(df)}")
if df_out["ico_successful"].notna().any():
    print("ICOs Éxitosas (%):", pct(df_out["ico_successful"]))

# ---------- export ----------
# Export is opt-in. The write used to be commented out while the "saved" message was
# still printed, so the output claimed a file had been written when it had not.
SAVE_CLEAN_DATASET = False
if SAVE_CLEAN_DATASET:
    df_out.to_csv(output_path, index=False)
    print(f"\n✅ Dataset limpio guardado en: {output_path}")
else:
    print(f"\nExport desactivado (SAVE_CLEAN_DATASET=False): no se escribió {output_path}")


#### Limpieza del dataset de Vanessa Villanueva obtenido en ICPSR

In [17]:
%%time

# Input / output paths for the ICPSR (Villanueva) dataset
dataset_path = "datasets/raw/ICO_VillanuevaVanessa_OpenICPSR.xlsx"
output_path  = "datasets/processed/icpsr_villanueva_clean.csv"

# ---------- helpers ----------
def parse_money(x):
    """Convert '$5M', '1,200,000', '3 million', '0.75B' to a float (USD assumed).

    The first number found in the text is used. A magnitude suffix is honoured when it
    follows that number, with or without whitespace ('5M' and '5 M'), and is not the
    start of a longer word ('5 months' is not scaled).

    Numeric inputs (as read from Excel) are returned unchanged as floats.
    Returns np.nan for missing values and for text that contains no number.
    """
    if pd.isna(x):
        return np.nan
    # Real numbers need no parsing; going through str() would mangle large floats
    # (str(1e16) == '1e+16' would be read as 1.0).
    if isinstance(x, (int, float, np.number)) and not isinstance(x, bool):
        return float(x)
    text = str(x).lower().strip().replace(",", "").replace("$", "")
    scale = {
        "billion": 1_000_000_000, "bln": 1_000_000_000, "bn": 1_000_000_000, "b": 1_000_000_000,
        "million": 1_000_000, "mln": 1_000_000, "mil": 1_000_000, "m": 1_000_000,
        "thousand": 1_000, "k": 1_000,
    }
    # Longer suffixes come first in the alternation; the look-ahead rejects suffixes
    # that are merely the first letters of another word.
    found = re.search(
        r"(\d+(?:\.\d*)?|\.\d+)\s*(?:(billion|million|thousand|bln|mln|mil|bn|b|m|k)(?![a-z]))?",
        text,
    )
    if found is None:
        return np.nan
    return float(found.group(1)) * scale.get(found.group(2), 1)

def normalize_text(s):
    """Lowercase and trim each value as text, then strip every character outside [a-z0-9]."""
    prepared = s.astype(str).str.lower().str.strip()
    return prepared.str.replace(r"[^a-z0-9]", "", regex=True)
    
def split_symbol(s):
    """Split each value (as text) on the first space into a list of at most two parts.

    `n` is passed by keyword: pandas 2.x accepts only `pat` positionally in
    `Series.str.split`, so the former `.str.split(' ', 1)` raises a TypeError there.
    """
    return s.astype(str).str.split(" ", n=1)

def first_col(df, candidates):
    """Return the first name in `candidates` (lowercase names) that is a column of df, or None."""
    return next((name for name in candidates if name in df.columns), None)

# ---------- load ----------
df = pd.read_excel(dataset_path)
print(f"Dataset original (ICPSR): {df.shape[0]} filas, {df.shape[1]} columnas")

# normalise headers (trim + lowercase)
df.columns = df.columns.str.strip().str.lower()

# drop leftover 'unnamed' columns (matched on the lowercased header)
drop_unnamed = [c for c in df.columns if c.startswith("unnamed")]
if drop_unnamed:
    df = df.drop(columns=drop_unnamed)

# key columns (lowercase, matching the normalised headers)
name_col   = 'name'
symbol_col = 'ticker'
succ_col   = 'ico success'
soft_col   = 'softcap'
hard_col   = 'hardcap'
raised_col = 'amount raised'

# ---------- standardise join keys ----------
df["name_std"]   = normalize_text(df[name_col])

# -- some rows carry the word "token" after the symbol, so only the first token is kept. --
# The result is then normalised (lowercase, alphanumeric only) like name_std, so the key
# can match symbol_std values that were built with the same normalisation elsewhere.
# Missing tickers stay missing instead of turning into a text key.
_first_token = df[symbol_col].astype("string").str.split(n=1).str[0]
df["symbol_std"] = normalize_text(_first_token).where(_first_token.notna())

# ---------- amount parsing (if present) ----------
# Each amount column gets a parsed "<column>_usd" companion.
for col in [soft_col, hard_col, raised_col]:
    if col:
        df[col + "_usd"] = df[col].apply(parse_money)

# ---------- success label ----------
if succ_col:
    # use the label shipped with the dataset (preferred);
    # non-numeric or missing values become 0 and the result is clipped to {0, 1}.
    df["ico_successful"] = (
        pd.to_numeric(df[succ_col], errors="coerce")
          .fillna(0).clip(0,1).astype(int)
    )
else:
    # derive: raised >= soft cap (if both exist)
    if raised_col and soft_col:
        s = pd.to_numeric(df[soft_col + "_usd"], errors="coerce")
        r = pd.to_numeric(df[raised_col + "_usd"], errors="coerce")
        df["ico_successful"] = ((r >= s) & r.notna() & s.notna()).astype(int)
    else:
        df["ico_successful"] = np.nan  # cannot be inferred reliably

# ---------- light normalisation of useful categorical columns ----------
# Only object-dtype columns that exist are trimmed; astype(str) also turns missing values into text.
for cat in ["industry","platform","country","kyc","ieo","regtax","regulk yc","regulkyc","regulation","jurisdiction"]:
    if cat in df.columns and df[cat].dtype == "O":
        df[cat] = df[cat].astype(str).str.strip()

# Map the integer codes to labels; code 0 becomes missing.
# NOTE(review): the keys are ints, so this only matches if the column is still numeric here — confirm.
# NOTE(review): "Enterntainment" and "Infrastructre" look like typos — confirm against the dataset codebook before changing.
df["industry"] = df["industry"].replace({0: np.nan,
    1:"Artificial Intelligence", 2:"Art", 3:"Banking", 4:"Big data",
    5:"Business services", 6:"charity", 7:"Communication", 8:"Cryptocurrency",
    9:"Education", 10:"Electronics", 11:"Energy", 12:"Enterntainment",
    13:"Health", 14:"Infrastructre", 15:"Internet", 16:"Investment",
    17:"Legal", 18:"Manufacturing", 19:"Media", 20:"Platform",
    21:"real estate", 22:"Retail", 23:"smart contract", 24:"software",
    25:"sports", 26:"Tourism", 27:"virtual reality", 28:"other",
})

# Country codes to names; code 0 becomes missing.
df["country"] = df["country"].replace({0: np.nan,
    1:"Singapore", 2:"UK", 3:"USA", 4:"Estonia", 5:"Switzerland", 6:"Russia",
    7:"Hong Kong", 8:"Germany", 9:"Cayman Islands", 10:"Australia",
    11:"Malta", 12:"Netherlands", 13:"Canada", 14:"Gibraltar",
    15:"United Arab Emirates", 16:"Taiwan", 17:"British virgin Islands",
    18:"New Zealand", 19:"Belize",
})

# Platform codes to names; unlike the two maps above, code 0 maps to "Others" rather than missing.
df["platform"] = df["platform"].replace({0: "Others",
    1: "Monero", 2: "Nem", 3: "Waves",
    4: "Stellar", 5: "Neo", 6: "Ethereum"
})

# ---------- dates (optional; only if columns containing 'date' exist) ----------
# Each such column gets a "<column>_parsed" companion; unparseable values become NaT.
date_cols = [c for c in df.columns if "date" in c]
for c in date_cols:
    df[c + "_parsed"] = pd.to_datetime(df[c], errors="coerce", dayfirst=True)

# ---------- summary ----------
print("\nColumnas detectadas:")
print("  name_col   :", name_col)
print("  symbol_col :", symbol_col)
print("  succ_col   :", succ_col)
print("  soft_col   :", soft_col)
print("  hard_col   :", hard_col)
print("  raised_col :", raised_col)

# Label distribution as percentages
if df["ico_successful"].notna().any():
    dist = df["ico_successful"].value_counts(normalize=True) * 100
    print("\nÉxito (%):", dist.round(2).to_dict())

# sanity check de montos
def quick_stats(series):
    """Count, min, median and max of the numeric values in `series`.

    Values that cannot be converted to numbers are ignored. When nothing is
    numeric, count is 0 and the three statistics are None.
    """
    numeric = pd.to_numeric(series, errors="coerce")
    has_values = bool(numeric.notna().any())

    def _stat(reducer):
        # NaN-aware reducer applied only when there is at least one number.
        return float(reducer(numeric)) if has_values else None

    return {
        "count": int(numeric.notna().sum()),
        "min": _stat(np.nanmin),
        "median": _stat(np.nanmedian),
        "max": _stat(np.nanmax),
    }

# Sanity-check the parsed amount columns.
if raised_col:
    print("\nStats raised_usd:", quick_stats(df[raised_col + "_usd"]))
if soft_col:
    print("Stats softcap_usd:", quick_stats(df[soft_col + "_usd"]))
if hard_col:
    print("Stats hardcap_usd:", quick_stats(df[hard_col + "_usd"]))

# ---------- export ----------
# No rows are filtered in this cell: the whole frame is written out.
df_out = df.copy()
df_out.to_csv(output_path, index=False)
print(f"\n✅ Guardado: {output_path}")
print(f"Filas finales: {len(df_out):,}")


Dataset original (ICPSR): 2186 filas, 71 columnas

Columnas detectadas:
  name_col   : name
  symbol_col : ticker
  succ_col   : ico success
  soft_col   : softcap
  hard_col   : hardcap
  raised_col : amount raised

Éxito (%): {0: 61.39, 1: 38.61}

Stats raised_usd: {'count': 2183, 'min': 0.0, 'median': 0.0, 'max': 1000000000.0}
Stats softcap_usd: {'count': 1417, 'min': 0.0, 'median': 2409600.0, 'max': 60000000008.0}
Stats hardcap_usd: {'count': 1811, 'min': 12543.0, 'median': 30000000.0, 'max': 100000000000.0}

✅ Guardado: datasets/processed/icpsr_villanueva_clean.csv
Filas finales: 2,186
CPU times: total: 938 ms
Wall time: 982 ms


In [10]:
%%time
# === Preprocesado Fahlenbrach (Zenodo) ===
import pandas as pd, numpy as np, re
from datetime import datetime

# Input / output paths for the Zenodo (Fahlenbrach) dataset
PATH_ZENODO = "datasets/raw/ICO_Fahlenbrach_Zenodo.xlsx"
OUT_ZENODO  = "datasets/processed/zenodo_fahlenbrach_clean.csv"

def normalize_text(s):
    """Return s as lowercase, trimmed text with every character outside [a-z0-9] removed."""
    lowered = s.astype(str).str.lower()
    trimmed = lowered.str.strip()
    return trimmed.str.replace(r"[^a-z0-9]", "", regex=True)

def parse_money(x):
    """Convert a money-like value ('$5M', '1,200,000', '3 million', '0.75B') to a float.

    The first number found in the text is used. A magnitude suffix is honoured only
    when it directly follows that number (optionally after whitespace) and is not the
    start of a longer word, so a stray 'b' / 'm' / 'k' elsewhere in the text no longer
    scales the value.

    Numeric inputs (as read from Excel) are returned unchanged as floats.
    Returns np.nan for missing values and for text that contains no number.
    """
    if pd.isna(x):
        return np.nan
    # Real numbers need no parsing; going through str() would mangle large floats
    # (str(1e16) == '1e+16' would be read as 1.0).
    if isinstance(x, (int, float, np.number)) and not isinstance(x, bool):
        return float(x)
    text = str(x).lower().replace(",", "").replace("$", "").strip()
    scale = {
        "billion": 1_000_000_000, "bln": 1_000_000_000, "bn": 1_000_000_000, "b": 1_000_000_000,
        "million": 1_000_000, "mln": 1_000_000, "mil": 1_000_000, "m": 1_000_000,
        "thousand": 1_000, "k": 1_000,
    }
    # Longer suffixes come first in the alternation; the look-ahead rejects suffixes
    # that are merely the first letters of another word.
    found = re.search(
        r"(\d+(?:\.\d*)?|\.\d+)\s*(?:(billion|million|thousand|bln|mln|mil|bn|b|m|k)(?![a-z]))?",
        text,
    )
    if found is None:
        return np.nan
    return float(found.group(1)) * scale.get(found.group(2), 1)

df = pd.read_excel(PATH_ZENODO)
print(f"Dataset original (Zenodo): {df.shape[0]} filas, {df.shape[1]} columnas")
# Normalise headers (trim + lowercase) so the names below can be lowercase.
df.columns = df.columns.str.strip().str.lower()

# Key columns
name_col   = "name_other"
symbol_col = "ticker_symbol_cmc"
succ_col   = "ico_successful"
soft_col   = "soft_cap"
hard_col   = "hard_cap"
# NOTE(review): this column name reads like a yes/no indicator rather than an amount raised —
# confirm it is the intended column; it is later parsed as money and used to check the label.
raised_col = "independent custodian for ico funds"

# Join keys: lowercase alphanumeric versions of the name and the symbol.
df["name_std"]   = normalize_text(df[name_col]) if name_col else ""
df["symbol_std"] = normalize_text(df[symbol_col]) if symbol_col else ""

# convert amounts to numeric if present; each column gets a "<column>_usd" companion
for col in [soft_col, hard_col, raised_col]:
    if col: df[col + "_usd"] = df[col].apply(parse_money)

# detect success
if succ_col:
    # Use the label shipped with the dataset; non-numeric values become NaN.
    df["ico_successful"] = pd.to_numeric(df[succ_col], errors="coerce")
    print("\nDistribución original de ico_successful:")
    print(df["ico_successful"].value_counts(dropna=False))
    # Simple cross-check: mean of the parsed raised column per label
    if raised_col and "percentage" not in raised_col:
        mean_success = df.groupby("ico_successful")[raised_col + "_usd"].mean()
        print("\nMedia de recaudación por etiqueta:")
        print(mean_success)
else:
    # derive: success if raised >= soft cap
    if raised_col and soft_col:
        r = pd.to_numeric(df[raised_col + "_usd"], errors="coerce")
        s = pd.to_numeric(df[soft_col + "_usd"], errors="coerce")
        df["ico_successful"] = ((r >= s) & r.notna() & s.notna()).astype(int)
    else:
        df["ico_successful"] = np.nan

# normalise the label: make sure 1 = success
# If the group labelled 0 has the higher mean of the parsed raised column, the 0/1 coding is flipped.
# NOTE(review): the check relies on raised_col being a real amount; with an indicator-like column
# the flip decision is not meaningful — confirm before trusting it.
if df.groupby("ico_successful").mean(numeric_only=True)[raised_col + "_usd"].idxmax() == 0:
    df["ico_successful"] = 1 - df["ico_successful"]  # invert the coding

# export the cleaned frame (no rows are filtered in this cell)
df.to_csv(OUT_ZENODO, index=False)
print(f"\n✅ Guardado: {OUT_ZENODO}")
print(f"Filas finales: {len(df):,}")


Dataset original (Zenodo): 306 filas, 126 columnas

Distribución original de ico_successful:
ico_successful
1    300
0      6
Name: count, dtype: int64

Media de recaudación por etiqueta:
ico_successful
0    0.000000
1    0.036667
Name: independent custodian for ico funds_usd, dtype: float64

✅ Guardado: datasets/processed/zenodo_fahlenbrach_clean.csv
Filas finales: 306
CPU times: total: 1.59 s
Wall time: 2.76 s
