In [66]:
# TODO - Eliminar claves antes de commitear!
%env CMC_API_KEY="87f241ea-b56c-4a2f-9707-e25b4352ceb6"
%env FOUNDI_PUBLIC_KEY="5e6fc744cf305a8fea1e5669057c5d11"
%env FOUNDI_PRIVATE_KEY="3f824123907803e8b9a402ed7bdb0dd2"

env: CMC_API_KEY="87f241ea-b56c-4a2f-9707-e25b4352ceb6"
env: FOUNDI_PUBLIC_KEY="5e6fc744cf305a8fea1e5669057c5d11"
env: FOUNDI_PRIVATE_KEY="3f824123907803e8b9a402ed7bdb0dd2"


In [67]:
# %% [markdown]
# # Enriquecimiento ex-ante de ICOs (CG, CMC, Paprika, FoundICO)
# Completa datos faltantes del dataset final sin pisar valores existentes (a menos que se indique).
# Respeta ex-ante si se activa EXANTE_ONLY para evitar leakage.

# %%
import os, re, json, time, base64, hmac, hashlib
from typing import Dict, Any, Optional
import numpy as np
import pandas as pd
import requests
import unicodedata
from difflib import SequenceMatcher
from urllib.parse import quote

# === Rutas ===
#IN_PATH  = "../join/ico_exante_features_v1.csv"       # dataset final actual
#OUT_PATH = "../join/ico_exante_enriched_v1.csv"       # salida enriquecida
CACHE_PATH = ".cache_enrich.json"                      # cache local
IN_PATH  = "../final/ico_dataset_final_v2_clean.csv"       # dataset final actual
OUT_PATH = "../final/ico_dataset_final_v2_clean_enriquecido.csv"       # salida enriquecida

# === Flags ===
EXANTE_ONLY = True          # limitar a features pre-ICO (evita leakage)
OVERWRITE_MISSING = True    # solo completar vacíos (recomendado)
MAX_ROWS = 0                # 0 = todas las filas, >0 para depurar parcial
FOUN_FIND_MAX_PAGES = 40    # paginado máximo en FoundICO (búsqueda fuzzy)

# === Credenciales (setearlas como variables de entorno) ===
CMC_API_KEY  = os.getenv("CMC_API_KEY")
FOUN_PUB     = os.getenv("FOUNDI_PUBLIC_KEY")
FOUN_PRIV    = os.getenv("FOUNDI_PRIVATE_KEY")
CG_API_KEY   = os.getenv("CG_API_KEY")

CMC_API_KEY  = '87f241ea-b56c-4a2f-9707-e25b4352ceb6'
FOUN_PUB     = '5e6fc744cf305a8fea1e5669057c5d11'
FOUN_PRIV    = '3f824123907803e8b9a402ed7bdb0dd2'

# Rate limits suaves
SLEEP_CG   = 0.25
SLEEP_CMC  = 0.35
SLEEP_PAP  = 0.25
SLEEP_FOUN = 0.45

USER_AGENT = "TFM-ICO-Enricher/1.0 (+local)"
HEADERS_JSON = {
    "User-Agent": USER_AGENT,
    "Accept": "application/json",
    "Content-Type": "application/json",
}


In [68]:
# %% Utilidades (cache, normalizadores, parseos)

def load_cache(path: str) -> Dict[str, Any]:
    if os.path.exists(path):
        try:
            with open(path, "r", encoding="utf-8") as f:
                return json.load(f)
        except Exception:
            return {}
    return {}

def save_cache(cache: Dict[str, Any], path: str):
    try:
        with open(path, "w", encoding="utf-8") as f:
            json.dump(cache, f, ensure_ascii=False, indent=2)
    except Exception:
        pass

def normalize_text(x) -> str:
    if x is None or (isinstance(x, float) and np.isnan(x)): return ""
    s = str(x).strip()
    return re.sub(r"\s+", " ", s)

def similar(a: str, b: str) -> float:
    return SequenceMatcher(None, a.lower(), b.lower()).ratio()

def parse_money_like(x) -> Optional[float]:
    if x is None or (isinstance(x, float) and np.isnan(x)): return None
    s = str(x).lower().replace(",", "").replace("$", "").strip()
    mult = 1.0
    if s.endswith("b"): mult = 1e9; s = re.sub(r"b$", "", s)
    elif s.endswith("m"): mult = 1e6; s = re.sub(r"m$", "", s)
    elif s.endswith("k"): mult = 1e3; s = re.sub(r"k$", "", s)
    nums = re.findall(r"[\d.]+", s)
    return float(nums[0]) * mult if nums else None

def to_int(x) -> Optional[int]:
    try:
        if x is None or (isinstance(x, float) and np.isnan(x)): return None
        return int(float(x))
    except Exception:
        return None

def boolify(x) -> Optional[int]:
    if x is None or (isinstance(x, float) and np.isnan(x)): return None
    s = str(x).strip().lower()
    if s in {"1", "true", "yes", "y", "si", "sí"}: return 1
    if s in {"0", "false", "no", "n"}: return 0
    return None

def to_slug(s: str) -> str:
    if s is None: return ""
    s = str(s).strip().lower()
    # normalizar acentos
    s = unicodedata.normalize("NFKD", s).encode("ascii","ignore").decode("ascii")
    s = s.replace("&","and")
    s = re.sub(r"[^a-z0-9]+", "-", s)  # mantener guiones
    s = re.sub(r"-+", "-", s).strip("-")
    return s

def is_nonempty(x):
    return x not in [None, "", [], {}, np.nan]

In [76]:
# %% Clientes API (CoinGecko, CMC, CoinPaprika, FoundICO)

class CoinGecko:
    def __init__(self, api_key: Optional[str] = None):
        self.api_key = api_key
        self.base = "https://api.coingecko.com/api/v3"
        self.headers = {"User-Agent": USER_AGENT}
        if api_key:
            self.headers["x-cg-pro-api-key"] = api_key

    def search(self, query: str):
        url = f"{self.base}/search?query={quote(query)}"
        r = requests.get(url, headers=self.headers, timeout=30)
        if r.status_code != 200: time.sleep(SLEEP_CG); return None
        data = r.json()
        coins = data.get("coins", [])
        time.sleep(SLEEP_CG)
        return coins[0] if coins else None

    def get_coin(self, cg_id: str):
        url = f"{self.base}/coins/{cg_id}"
        params = {"localization":"false","tickers":"false","market_data":"false","community_data":"true","developer_data":"true","sparkline":"false"}
        r = requests.get(url, headers=self.headers, params=params, timeout=30)
        if r.status_code != 200: time.sleep(SLEEP_CG); return None
        payload = r.json()
        time.sleep(SLEEP_CG)
        return payload

    def resolve(self, symbol: str, name: str):
        q = symbol or name
        if not q: return None
        hit = self.search(q)
        if not hit and name:
            hit = self.search(name)
        if not hit: return None
        cg_id = hit.get("id")
        full = self.get_coin(cg_id) if cg_id else None
        if not full: return None
        links = (full.get("links") or {})
        return {
            "source": "coingecko",
            "cg_id": cg_id,
            "token_symbol": full.get("symbol"),
            "token_name": full.get("name"),
            "market_cap_rank": links.get("market_cap_rank"),
            "sentiment_votes_up_percentage": links.get("sentiment_votes_up_percentage"),
            "sentiment_votes_down_percentage": links.get("sentiment_votes_down_percentage"),
            "homepage": (links.get("homepage") or [None])[0],
            "whitepaper_url": links.get("whitepaper"),
            "github_url": (links.get("repos_url") or {}).get("github", [None])[0] if links.get("repos_url") else None,
            "twitter_url": links.get("twitter_screen_name") and f"https://twitter.com/{links.get('twitter_screen_name')}",
            "telegram_url": links.get("telegram_channel_identifier") and f"https://t.me/{links.get('telegram_channel_identifier')}",
            "reddit_url": links.get("subreddit_url"),
            "has_github": 1 if (links.get("repos_url") and links["repos_url"].get("github")) else None,
        }

# ===== CoinMarketCap =====
class CoinMarketCap:
    def __init__(self, api_key: Optional[str]):
        self.api_key = api_key
        self.base = "https://pro-api.coinmarketcap.com"
        self.headers = {
            "User-Agent": "TFM-ICO-Enricher/1.1",
            "Accept": "application/json"
        }
        if api_key:
            self.headers["X-CMC_PRO_API_KEY"] = api_key
        # cache en memoria para evitar hits repetidos
        self._cache_symbol = {}
        self._cache_slug   = {}
        self._cache_mapall = None

    def _get(self, path, params):
        if not self.api_key:
            return None
        try:
            r = requests.get(f"{self.base}{path}", headers=self.headers, params=params, timeout=30)
            if r.status_code != 200:
                # log corto para depurar
                # print(f"[CMC] HTTP {r.status_code} path={path} params={params} resp={r.text}")
                time.sleep(SLEEP_CMC)
                return None
            data = r.json()
            time.sleep(SLEEP_CMC)
            #print(f"[CMC] {data}")
            return data
        except Exception as e:
            #print(f"[CMC] error: {e}")
            return None

    def _info_by_id(self, cmc_id: int):
        data = self._get("/v2/cryptocurrency/info", {"id": str(cmc_id)})
        if not data: return None
        return (data.get("data") or {}).get(str(cmc_id))

    def _info_by_symbol(self, symbol: str):
        # v2/info acepta 'symbol' (coma-separado), aprovechamos para exact match
        data = self._get("/v2/cryptocurrency/info", {"symbol": symbol.upper()})
        if not data: return None
        dd = data.get("data") or {}
        # puede devolver múltiples entradas para el mismo símbolo
        # elegimos la primera
        for _, v in dd.items():
            return v  # primer match
        return None

    def _map_by_symbol(self, symbol: str):
        symbol = symbol.upper().strip()
        if symbol in self._cache_symbol:
            return self._cache_symbol[symbol]
        data = self._get("/v1/cryptocurrency/map", {
            "symbol": symbol,
            "listing_status": "active,inactive,untracked",
            "aux": "platform,first_historical_data,last_historical_data,is_active,status"
        })
        arr = (data or {}).get("data") or []
        out = arr[0] if arr else None
        self._cache_symbol[symbol] = out
        return out

    def _map_all(self):
        if self._cache_mapall is not None:
            return self._cache_mapall
        data = self._get("/v1/cryptocurrency/map", {
            "listing_status": "active,inactive,untracked",
            "aux": "platform,first_historical_data,last_historical_data,is_active,status"
        })
        df = None
        if data and data.get("data"):
            df = pd.DataFrame(data["data"])
        self._cache_mapall = df
        return df

    def _map_by_slug(self, slug: str):
        slug = slug.strip().lower()
        if slug in self._cache_slug:
            return self._cache_slug[slug]
        df = self._map_all()
        if df is None or df.empty:
            self._cache_slug[slug] = None
            return None
        hit = df[df["slug"].astype(str).str.lower() == slug]
        out = hit.iloc[0].to_dict() if not hit.empty else None
        self._cache_slug[slug] = out
        return out

    def resolve(self, symbol: str, name: str):
        if not self.api_key:
            return None

        symbol = (symbol or "").strip().upper()
        name   = (name or "").strip()
        # 1) intento rápido: info por símbolo (devuelve urls directas)
        if symbol:
            info = self._info_by_symbol(symbol)
            if info:
                # CoinMarketCap puede devolver:
                # - dict con IDs (p.ej. {'1027': {...}})
                # - lista de dicts (p.ej. [{'id':..., 'name':...}])
                # - dict plano (p.ej. {'id':..., 'name':...})
                if isinstance(info, dict):
                    # caso 1: dict con ID como clave
                    if all(isinstance(k, str) and isinstance(v, dict) for k, v in info.items()):
                        meta = list(info.values())[0]
                    else:
                        meta = info
                elif isinstance(info, list) and len(info) > 0:
                    meta = info[0]
                else:
                    meta = {}
            
                urls = (meta.get("urls") or {}) if isinstance(meta, dict) else {}
                return {
                    "source": "coinmarketcap",
                    "cmc_id": meta.get("id"),
                    "token_symbol": meta.get("symbol"),
                    "token_name": meta.get("name"),
                    "homepage": (urls.get("website") or [None])[0] if urls else None,
                    "whitepaper_url": (urls.get("technical_doc") or [None])[0] if urls else None,
                    "github_url": (urls.get("source_code") or [None])[0] if urls else None,
                    "twitter_url": (urls.get("twitter") or [None])[0] if urls else None,
                    "reddit_url": (urls.get("reddit") or [None])[0] if urls else None,
                    "telegram_url": (urls.get("chat") or [None])[0] if urls else None,
                    "contract_address": (meta.get("symbol") or [None])[0]
                }

        # 2) map por symbol
        if symbol:
            m = self._map_by_symbol(symbol)
            if m:
                info = self._info_by_id(m["id"])
                urls = (info or {}).get("urls") or {}
                return {
                    "source": "coinmarketcap",
                    "cmc_id": m.get("id"),
                    "token_symbol": m.get("symbol"),
                    "token_name": m.get("name"),
                    "homepage": (urls.get("website") or [None])[0] if urls else None,
                    "whitepaper_url": (urls.get("technical_doc") or [None])[0] if urls else None,
                    "github_url": (urls.get("source_code") or [None])[0] if urls else None,
                    "twitter_url": (urls.get("twitter") or [None])[0] if urls else None,
                    "reddit_url": (urls.get("reddit") or [None])[0] if urls else None,
                    "telegram_url": (urls.get("chat") or [None])[0] if urls else None,
                    "contract_address": (meta.get("symbol") or [None])[0]
                }

        # 3) slug por nombre
        if name:
            slug = to_slug(name)
            m = self._map_by_slug(slug)
            if m:
                info = self._info_by_id(m["id"])
                urls = (info or {}).get("urls") or {}
                return {
                    "source": "coinmarketcap",
                    "cmc_id": m.get("id"),
                    "token_symbol": m.get("symbol"),
                    "token_name": m.get("name"),
                    "homepage": (urls.get("website") or [None])[0] if urls else None,
                    "whitepaper_url": (urls.get("technical_doc") or [None])[0] if urls else None,
                    "github_url": (urls.get("source_code") or [None])[0] if urls else None,
                    "twitter_url": (urls.get("twitter") or [None])[0] if urls else None,
                    "reddit_url": (urls.get("reddit") or [None])[0] if urls else None,
                    "telegram_url": (urls.get("chat") or [None])[0] if urls else None,
                    "contract_address": (meta.get("symbol") or [None])[0]
                }

        # 4) fallback: map_all + fuzzy por nombre (una sola vez cacheado)
        df = self._map_all()
        if df is not None and not df.empty and name:
            nm = normalize_text(name)
            df["sim"] = df["name"].astype(str).map(lambda x: similar(x, nm))
            cand = df.sort_values("sim", ascending=False).head(1)
            if not cand.empty and cand.iloc[0]["sim"] >= 0.85:
                m = cand.iloc[0].to_dict()
                info = self._info_by_id(m["id"])
                urls = (info or {}).get("urls") or {}
                return {
                    "source": "coinmarketcap",
                    "cmc_id": m.get("id"),
                    "token_symbol": m.get("symbol"),
                    "token_name": m.get("name"),
                    "homepage": (urls.get("website") or [None])[0] if urls else None,
                    "whitepaper_url": (urls.get("technical_doc") or [None])[0] if urls else None,
                    "github_url": (urls.get("source_code") or [None])[0] if urls else None,
                    "twitter_url": (urls.get("twitter") or [None])[0] if urls else None,
                    "reddit_url": (urls.get("reddit") or [None])[0] if urls else None,
                    "telegram_url": (urls.get("chat") or [None])[0] if urls else None,
                    "contract_address": (meta.get("symbol") or [None])[0]
                }

        return None

# ===== CoinPaprika =====
class CoinPaprika:
    def __init__(self):
        self.base = "https://api.coinpaprika.com/v1"
        self.headers = {"User-Agent": "TFM-ICO-Enricher/1.1", "Accept":"application/json"}

    def _get(self, path):
        try:
            r = requests.get(f"{self.base}{path}", headers=self.headers, timeout=30)
            if r.status_code != 200:
                print(f"[Paprika] HTTP {r.status_code} path={path} resp={r.text[:160]}")
                time.sleep(SLEEP_PAP)
                return None
            data = r.json()
            time.sleep(SLEEP_PAP)
            return data
        except Exception as e:
            #print(f"[Paprika] error: {e}")
            return None

    def search(self, q: str, limit=10):
        q = quote(q)
        data = self._get(f"/search?q={q}&c=currencies,icos&limit={limit}")
        if not data: return None
        coins = (data or {}).get("currencies") or []  # Paprika separa por tipo
        if not coins:
            coins = (data or {}).get("coins") or []    # algunas respuestas usan 'coins'
        return coins

    def coin(self, coin_id: str):
        return self._get(f"/coins/{coin_id}")

    def resolve(self, symbol: str, name: str):
        symbol = (symbol or "").strip().upper()
        name   = (name or "").strip()
        # 1) buscar por symbol exacto en search
        if symbol:
            hits = self.search(symbol, limit=5) or []
            # exact symbol
            for h in hits:
                if str(h.get("symbol","")).upper() == symbol:
                    det = self.coin(h.get("id"))
                    if not det: continue
                    links = det.get("links") or {}
                    return {
                        "source": "coinpaprika",
                        "paprika_id": h.get("id"),
                        "token_symbol": det.get("symbol"),
                        "token_name": det.get("name"),
                        "homepage": (links.get("website") or [None])[0] if links.get("website") else None,
                        "reddit_url": (links.get("reddit") or [None])[0] if links.get("reddit") else None,
                    }
        # 2) fuzzy por nombre
        if name:
            hits = self.search(name, limit=10) or []
            if hits:
                name_n = normalize_text(name)
                hits.sort(key=lambda h: similar(name_n, normalize_text(h.get("name",""))), reverse=True)
                best = hits[0]
                if similar(name_n, normalize_text(best.get("name",""))) >= 0.85:
                    det = self.coin(best.get("id"))
                    if det:
                        links = det.get("links") or {}
                        return {
                            "source": "coinpaprika",
                            "paprika_id": best.get("id"),
                            "token_symbol": det.get("symbol"),
                            "token_name": det.get("name"),
                            "homepage": (links.get("website") or [None])[0] if links.get("website") else None,
                            "reddit_url": (links.get("reddit") or [None])[0] if links.get("reddit") else None,
                        }
        return None

class FoundICO:
    BASE = "https://foundico.com/api/v1"

    def __init__(self, public_key: Optional[str], private_key: Optional[str]):
        self.pub = public_key
        self.priv = private_key

    def _headers(self, body_dict: Dict[str, Any]) -> Dict[str, str]:
        payload = json.dumps(body_dict, separators=(",", ":"), ensure_ascii=False)
        signature = base64.b64encode(hmac.new(
            key=self.priv.encode("utf-8"),
            msg=payload.encode("utf-8"),
            digestmod=hashlib.sha256
        ).digest()).decode("ascii")
        return {
            "User-Agent": USER_AGENT,
            "Content-Type": "application/json",
            "X-Foundico-Public-Key": self.pub,
            "X-Foundico-Access-Key": signature
        }

    def list_icos(self, page:int=1, status:str="past"):
        if not (self.pub and self.priv): return None
        url = f"{self.BASE}/icos/"
        body = {"page": page, "status": status}
        r = requests.post(url, headers=self._headers(body), data=json.dumps(body), timeout=30)
        if r.status_code != 200: time.sleep(SLEEP_FOUN); return None
        data = r.json()
        time.sleep(SLEEP_FOUN)
        return data

    def profile(self, ico_id:int):
        if not (self.pub and self.priv): return None
        url = f"{self.BASE}/ico/"
        body = {"id": int(ico_id)}
        r = requests.post(url, headers=self._headers(body), data=json.dumps(body), timeout=30)
        if r.status_code != 200: time.sleep(SLEEP_FOUN); return None
        data = r.json()
        time.sleep(SLEEP_FOUN)
        return data

    def search_by_name(self, name: str, max_pages:int=30, status:str="past"):
        if not (self.pub and self.priv): return None
        best, best_sim = None, 0.0
        for page in range(1, max_pages+1):
            lst = self.list_icos(page=page, status=status)
            if not lst or not lst.get("data"): break
            for item in lst["data"]:
                nm = (item.get("main") or {}).get("name") or ""
                sc = similar(nm, name)
                if sc > best_sim:
                    best_sim, best = sc, item
            if best_sim >= 0.9: break
        if not best: return None
        prof = self.profile(int(best["id"]))
        if not prof or not prof.get("data"): return None
        d = prof["data"]
        main = d.get("main") or {}
        links = d.get("links") or {}
        finance = d.get("finance") or {}
        return {
            "source": "foundico",
            "foundico_id": d.get("id"),
            "token_symbol": finance.get("ticker"),
            "token_name": main.get("name"),
            "homepage": links.get("website"),
            "whitepaper_url": links.get("whitepaper"),
            "token_type": finance.get("token_type"),
            "tokens_for_sale": to_int(finance.get("tokens_for_sale")),
            "token_price_usd": float(finance.get("token_price")) if finance.get("token_price") not in [None,""] else None,
            "soft_cap_usd": parse_money_like(finance.get("soft_cap")),
            "hard_cap_usd": parse_money_like(finance.get("hard_cap")),
            "accepted_currencies": None,
            "kyc": boolify(main.get("kyc")),
            "whitelist": boolify(main.get("whitelist")),
            "jurisdiction": main.get("location"),
            "category": main.get("category"),
            "has_github": None,
            "rating": main.get("ico_score") and float(main.get("ico_score")),
            "whitepaper_available": 1 if links.get("whitepaper") else None,
            "roadmap_available": 1 if d.get("roadmap") else None,
            "team_size": len(d.get("team") or []),
        }


In [77]:
# %% Lógica de enriquecimiento por fila

def enrich_row(
    row: pd.Series,
    cg: CoinGecko,
    cmc: CoinMarketCap,
    pap: CoinPaprika,
    fou: FoundICO,
    exante_only: bool,
    overwrite_missing: bool,
    cache: Dict[str, Any],
    foundico_max_pages: int = 30,
) -> Dict[str, Any]:
    """
    Devuelve dict {col: nuevo_valor} para completar en la fila.
    """
    name = normalize_text(row.get("name_std"))
    symbol = normalize_text(row.get("symbol_std")).upper()

    # cache key
    ck = f"resolve::{symbol or '_'}::{name or '_'}"
    if ck in cache:
        hits = cache[ck]
    else:
        hits = []
        try:
            fou_need = not symbol or symbol == ""
            # FoundICO primero si falta symbol o no hay nada
            # if fou_need and (FOUN_PUB and FOUN_PRIV):
            #     fou_hit = fou.search_by_name(name or symbol, max_pages=foundico_max_pages, status="past")
            #     if fou_hit: hits.append(fou_hit)
        except Exception:
            pass
        try:
            if symbol or name:
                cg_hit = cg.resolve(symbol=symbol, name=name)
                if cg_hit:
                    #print(f'CoinGecko: {cg_hit}')
                    hits.append(cg_hit)
                else:
                    print('Nada en CoinGecko')
        except Exception as e:
            #print(f'Error CoinGecko: {e}')
            pass
        try:
            cmc_hit = cmc.resolve(symbol=symbol, name=name) if CMC_API_KEY else None
            if cmc_hit: 
                #print(f'CoinMarketCap: {cmc_hit}')
                hits.append(cmc_hit)
            else:
                print('Nada en CoinMarketCap')
        except Exception as e:
            #print(f'Error CoinMarketCap: {e}')
            pass
            
        # try:
        #     pap_hit = pap.resolve(symbol=symbol, name=name)
        #     if pap_hit: 
        #         print(f'CoinPaprika: {pap_hit}')
        #         hits.append(pap_hit)
        #     else:
        #         print('Nada en CoinPaprika')
        # except Exception as e:
        #     #print(f'Error CoinPaprika: {e}')
        #     pass
            
        # FoundICO al final si todavía no hay nada útil
        # if not hits and (FOUN_PUB and FOUN_PRIV):
         #    try:
          #       fou_hit = fou.search_by_name(name or symbol, max_pages=foundico_max_pages, status="past")
           #      if fou_hit: 
            #         print(f'FoundICO: {fou_hit}')
             #        hits.append(fou_hit)
              #   else:
               #      print('Nada en FoundICO')
            # except Exception as e:
             #    #print(f'Error FoundICO: {e}')
              #   pass

        cache[ck] = hits

    # Consolidación por prioridad de tokenomics de ICO
    priority = ["foundico", "coingecko", "coinmarketcap", "coinpaprika"]
    best = {}
    for src in priority:
        for h in hits:
            if h.get("source") == src:
                for k, v in h.items():
                    if v not in [None, "", [], {}]:
                        best.setdefault(k, v)

    # Campos ex-ante a completar
    exante_fields = [
        "token_symbol","token_name","homepage","whitepaper_url","github_url","twitter_url","telegram_url","reddit_url",
        "market_cap_rank", "sentiment_votes_up_percentage", "sentiment_votes_down_percentage", "contract_address", "has_github"
        #"token_type","tokens_for_sale","token_price_usd","soft_cap_usd","hard_cap_usd","accepted_currencies",
        #"kyc","whitelist","jurisdiction","category","team_size","has_github","roadmap_available","whitepaper_available","rating"
    ]
    fields = exante_fields if exante_only else exante_fields  # hook para futuro

    patch = {}
    for f in fields:
        cur = row.get(f)
        new = best.get(f)
        if overwrite_missing:
            if (cur in [None, "", np.nan]) and (new not in [None, "", np.nan]):
                patch[f] = new
        else:
            if (cur in [None, "", np.nan]) and (new not in [None, "", np.nan]):
                patch[f] = new

    # Si falta symbol_std y obtuvimos token_symbol → rellenar
    if (row.get("symbol_std") in [None, "", np.nan]) and best.get("token_symbol"):
        patch["symbol_std"] = str(best["token_symbol"]).upper()

    return patch


In [78]:
# %% Proceso: carga dataset, enriquece, rellena y guarda el nuevo dataset

# Instanciar clientes API
cg  = CoinGecko(api_key=CG_API_KEY)
cmc = CoinMarketCap(api_key=CMC_API_KEY)
pap = CoinPaprika()
fou = FoundICO(public_key=FOUN_PUB, private_key=FOUN_PRIV)

# Cargar dataset
df = pd.read_csv(IN_PATH)
#df = df.sample(10)
if "name_std" not in df.columns and "symbol_std" not in df.columns:
    raise RuntimeError("Se requieren columnas 'name_std' y/o 'symbol_std'.")

# Asegurar columnas destino (si no existen)
target_cols = [
    "token_symbol","token_name","homepage","whitepaper_url","github_url","twitter_url","telegram_url","reddit_url",
    "token_type","tokens_for_sale","token_price_usd","soft_cap_usd","hard_cap_usd","accepted_currencies",
    "kyc","whitelist","jurisdiction","category","team_size","has_github","roadmap_available","whitepaper_available","rating"
]
for c in target_cols:
    if c not in df.columns:
        df[c] = np.nan

#data = [{"name_std": "Binance","symbol_std": "BNB"}]
#df = pd.DataFrame(data)
#df = pd.DataFrame(data)
#df.append(data)
#df["name_std"] = "Bitcoin"
#df["symbol_std"] = "BTC"

cache = load_cache(CACHE_PATH)
MAX_ROWS=0
n = len(df) if MAX_ROWS == 0 else min(MAX_ROWS, len(df))
updates = 0
t0 = time.time()
print(n)
cg_cmc_findings = []

for i in range(n):
    row = df.iloc[i]
    print(f'{row["name_std"]} - {row["symbol_std"]}')
    patch = enrich_row(
        row=row,
        cg=cg, cmc=cmc, pap=pap, fou=fou,
        exante_only=EXANTE_ONLY,
        overwrite_missing=OVERWRITE_MISSING,
        cache=cache,
        foundico_max_pages=FOUN_FIND_MAX_PAGES
    )
    if patch:
        for k, v in patch.items():
            print(f"Hay patch: {k} - {v}")
            df.at[row.name, k] = v
        updates += 1
        cg_cmc_findings.append(patch)

    if (i+1) % 50 == 0:
        save_cache(cache, CACHE_PATH)
        print(f"[{i+1}/{n}] guardado cache...")

df_cg_cmc = pd.DataFrame(cg_cmc_findings)
df_cg_cmc_PATH = "../final/df_cg_cmc.csv"
df_cg_cmc.to_csv(df_cg_cmc_PATH, index=False)
save_cache(cache, CACHE_PATH)

# Post: normalizar tipos básicos
for bcol in ["kyc","whitelist","whitepaper_available","roadmap_available","has_github"]:
    if bcol in df.columns:
        df[bcol] = df[bcol].apply(lambda x: boolify(x) if pd.notna(x) else x)

for mcol in ["soft_cap_usd","hard_cap_usd","token_price_usd","tokens_for_sale","team_size"]:
    if mcol in df.columns:
        df[mcol] = pd.to_numeric(df[mcol], errors="coerce")

df.to_csv(OUT_PATH, index=False)
dt = time.time() - t0

print(f"✅ Guardado enriquecido: {OUT_PATH}")
print(f"Filas procesadas: {n} | Filas con cambios: {updates} | Tiempo: {dt:.1f}s")

# Resumen de cobertura
filled = []
for c in target_cols:
    if c in df.columns:
        pct = (df[c].notna() & (df[c].astype(str).str.strip() != "")).mean()*100
        filled.append((c, round(pct,2)))
filled.sort(key=lambda x: -x[1])

print("\nCobertura (no-nulos, %):")
for c, p in filled:
    print(f"{c:24s} {p:6.2f}%")


5430
baby gamium - BABYGMM
Nada en CoinGecko
Nada en CoinMarketCap
e-commerce of global consulting business - BEEPOWER
Nada en CoinGecko
Nada en CoinMarketCap
pepebinky - BEKY
Nada en CoinGecko
Nada en CoinMarketCap
bitdoge - BITDOGE
Nada en CoinGecko
Nada en CoinMarketCap
bankii - BKNY
Nada en CoinGecko
Nada en CoinMarketCap
bold - BOLD
Nada en CoinGecko
Hay patch: contract_address - B
bitcoin bsc - BTCBSC
Nada en CoinGecko
bitcoincopy - BTCC
Nada en CoinMarketCap
bitcoin minetrix - BTCMTX
Nada en CoinGecko
Nada en CoinMarketCap
c+charge - CCHG
Nada en CoinGecko
Hay patch: contract_address - C
$cvpad - CVPAD
Nada en CoinGecko
Nada en CoinMarketCap
deboard - DEVAX
Nada en CoinGecko
Nada en CoinMarketCap
dogeverse - DOGEVERSE
Nada en CoinGecko
Hay patch: contract_address - D
freedum fighters - DUM
Nada en CoinGecko
Nada en CoinMarketCap
$ecom - ECOM
Nada en CoinGecko
Hay patch: contract_address - E
ecoterra - ECOTERRA
Nada en CoinGecko
Hay patch: contract_address - E
eywa - EYWA
Nada en

In [80]:
# %%
aud = pd.read_csv(OUT_PATH)
audit_cols = ["symbol_std","token_symbol","token_name","homepage","whitepaper_url","github_url",
              "token_type","token_price_usd","soft_cap_usd","hard_cap_usd","kyc","whitelist","jurisdiction"]
coverage = aud[audit_cols].isna().mean().map(lambda x: round(100*(1-x),2)).sort_values(ascending=False)
print("Cobertura % no-nulos (desc):\n", coverage)

# Tokens aún sin símbolo resuelto
left = aud[(aud["symbol_std"].isna() | (aud["symbol_std"].astype(str).str.strip()=="")) &
           (aud["token_symbol"].isna() | (aud["token_symbol"].astype(str).str.strip()==""))]
print(f"\nPendientes de símbolo: {len(left)}")
left[["name_std"]].head(20)


Cobertura % no-nulos (desc):
 kyc                100.00
symbol_std          99.19
token_type          64.05
token_symbol         0.00
token_name           0.00
homepage             0.00
whitepaper_url       0.00
github_url           0.00
token_price_usd      0.00
soft_cap_usd         0.00
hard_cap_usd         0.00
whitelist            0.00
jurisdiction         0.00
dtype: float64

Pendientes de símbolo: 44


Unnamed: 0,name_std
5386,asseta
5387,bladedao
5388,blockstack
5389,bluesale
5390,cogitoprotocol
5391,coniun
5392,cowswap
5393,dangelfund
5394,deeptoken
5395,egopaysenger


In [59]:
bnb_cmc = pd.DataFrame({'status': {'timestamp': '2025-10-18T16:51:43.785Z', 'error_code': 0, 'error_message': None, 'elapsed': 22, 'credit_count': 1, 'notice': None}, 'data': {'BNB': [{'id': 1839, 'name': 'BNB', 'symbol': 'BNB', 'category': 'coin', 'description': 'BNB (BNB) is a cryptocurrency . BNB has a current supply of 139,180,845.25. The last known price of BNB is 1,083.71292096 USD and is up 0.65 over the last 24 hours. It is currently trading on 2825 active market(s) with $4,063,453,973.33 traded over the last 24 hours. More information can be found at https://bnbchain.org/en.', 'slug': 'bnb', 'logo': 'https://s2.coinmarketcap.com/static/img/coins/64x64/1839.png', 'subreddit': 'bnbchainofficial', 'notice': '', 'tags': ['marketplace', 'centralized-exchange', 'payments', 'smart-contracts', 'alameda-research-portfolio', 'multicoin-capital-portfolio', 'bnb-chain-ecosystem', 'layer-1', 'alleged-sec-securities', 'celsius-bankruptcy-estate', 'binance-ecosystem', 'binance-listing', 'made-in-china'], 'tag-names': ['Marketplace', 'Centralized Exchange (CEX) Token', 'Payments', 'Smart Contracts', 'Alameda Research Portfolio', 'Multicoin Capital Portfolio', 'BNB Chain Ecosystem', 'Layer 1', 'Alleged SEC Securities', 'Celsius Bankruptcy Estate', 'Binance Ecosystem', 'Binance Listing', 'Made in China'], 'tag-groups': ['INDUSTRY', 'CATEGORY', 'INDUSTRY', 'CATEGORY', 'CATEGORY', 'CATEGORY', 'PLATFORM', 'CATEGORY', 'CATEGORY', 'CATEGORY', 'CATEGORY', 'CATEGORY', 'CATEGORY'], 'urls': {'website': ['https://bnbchain.org/en'], 'twitter': ['https://twitter.com/bnbchain'], 'message_board': [], 'chat': ['https://t.me/BNBchaincommunity', 'https://t.me/bnbchain'], 'facebook': [], 'explorer': ['https://explorer.bnbchain.org/', 'https://app.nansen.ai/token-god-mode?chain=ethereum&tab=transactions&tokenAddress=0xb8c77482e45f1f44de1745f52c74426c631bdd52', 'https://bsctrace.com/', 'https://bscscan.com/token/0xbb4CdB9CBd36B01bD1cBaEBF2De08d9173bc095c', 'https://www.okx.com/web3/explorer/bsc'], 'reddit': ['https://reddit.com/r/bnbchainofficial'], 'technical_doc': [], 'source_code': ['https://github.com/bnb-chain'], 'announcement': []}, 'platform': None, 'date_added': '2017-07-25T00:00:00.000Z', 'twitter_username': 'bnbchain', 'is_hidden': 0, 'date_launched': None, 'contract_address': [{'contract_address': '0xb8c77482e45f1f44de1745f52c74426c631bdd52', 'platform': {'name': 'Ethereum', 'coin': {'id': '1027', 'name': 'Ethereum', 'symbol': 'ETH', 'slug': 'ethereum'}}}, {'contract_address': '0xeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee', 'platform': {'name': 'BNB Smart Chain (BEP20)', 'coin': {'id': '1839', 'name': 'BNB', 'symbol': 'BNB', 'slug': 'bnb'}}}], 'self_reported_circulating_supply': None, 'self_reported_tags': None, 'self_reported_market_cap': None, 'infinite_supply': False}, {'id': 38157, 'name': 'BNB AI', 'symbol': 'BNB', 'category': 'token', 'description': 'BNB AI (BNB) is a cryptocurrency launched in 2025and operates on the BNB Smart Chain (BEP20) platform. BNB AI has a current supply of 139,000,000 with 0 in circulation. The last known price of BNB AI is 0.00013723 USD and is up 1.95 over the last 24 hours. It is currently trading on 1 active market(s) with $0.00 traded over the last 24 hours. More information can be found at https://bnbaitoken.xyz.', 'slug': 'bnb-ai', 'logo': 'https://s2.coinmarketcap.com/static/img/coins/64x64/38157.png', 'subreddit': '', 'notice': 'Please note that this asset is not affiliated with [BNB Chain](https://www.bnbchain.org/en) in any way. Please exercise caution and [DYOR](https://support.coinmarketcap.com/hc/en-us/articles/360043659351-Listings-Criteria).', 'tags': ['memes', 'bnb-chain-ecosystem'], 'tag-names': ['Memes', 'BNB Chain Ecosystem'], 'tag-groups': ['INDUSTRY', 'PLATFORM'], 'urls': {'website': ['https://bnbaitoken.xyz'], 'twitter': ['https://twitter.com/BNBai_off'], 'message_board': [], 'chat': ['https://t.me/bnbaitoken'], 'facebook': [], 'explorer': ['https://bscscan.com/token/0x3376C5632f8047D009713884440c0020f2C66c85', 'https://app.nansen.ai/token-god-mode?chain=bnb&tab=transactions&tokenAddress=0x3376C5632f8047D009713884440c0020f2C66c85'], 'reddit': [], 'technical_doc': [], 'source_code': [], 'announcement': []}, 'platform': {'id': '1839', 'name': 'BNB', 'slug': 'bnb', 'symbol': 'BNB', 'token_address': '0x3376C5632f8047D009713884440c0020f2C66c85'}, 'date_added': '2025-08-21T07:23:17.000Z', 'twitter_username': 'BNBai_off', 'is_hidden': 0, 'date_launched': '2025-08-19T00:00:00.000Z', 'contract_address': [{'contract_address': '0x3376C5632f8047D009713884440c0020f2C66c85', 'platform': {'name': 'BNB Smart Chain (BEP20)', 'coin': {'id': '1839', 'name': 'BNB', 'symbol': 'BNB', 'slug': 'bnb'}}}], 'self_reported_circulating_supply': 139000000, 'self_reported_tags': None, 'self_reported_market_cap': 19075.277806922164, 'infinite_supply': False}, {'id': 38210, 'name': 'BNBTiger Inu', 'symbol': 'BNB', 'category': 'token', 'description': 'BNBTiger Inu (BNB) is a cryptocurrency launched in 2025and operates on the BNB Smart Chain (BEP20) platform. BNBTiger Inu has a current supply of 100,000,000 with 0 in circulation. The last known price of BNBTiger Inu is 0.00007918 USD and is down -0.88 over the last 24 hours. It is currently trading on 1 active market(s) with $0.00 traded over the last 24 hours. More information can be found at https://bnbtigerinu.xyz/.', 'slug': 'bnb-tiger-inu', 'logo': 'https://s2.coinmarketcap.com/static/img/coins/64x64/38210.png', 'subreddit': '', 'notice': '', 'tags': ['memes', 'bnb-chain-ecosystem'], 'tag-names': ['Memes', 'BNB Chain Ecosystem'], 'tag-groups': ['INDUSTRY', 'PLATFORM'], 'urls': {'website': ['https://bnbtigerinu.xyz/'], 'twitter': ['https://twitter.com/BnbTigerInu_off'], 'message_board': [], 'chat': ['https://t.me/bnbtigerinu1'], 'facebook': [], 'explorer': ['https://bscscan.com/token/0xa05cCD2F8ac92afE092A7240E948aA3E17cEF843', 'https://app.nansen.ai/token-god-mode?chain=bnb&tab=transactions&tokenAddress=0xa05cCD2F8ac92afE092A7240E948aA3E17cEF843'], 'reddit': [], 'technical_doc': [], 'source_code': [], 'announcement': []}, 'platform': {'id': '1839', 'name': 'BNB', 'slug': 'bnb', 'symbol': 'BNB', 'token_address': '0xa05cCD2F8ac92afE092A7240E948aA3E17cEF843'}, 'date_added': '2025-08-27T07:15:15.000Z', 'twitter_username': 'BnbTigerInu_off', 'is_hidden': 0, 'date_launched': '2025-08-24T00:00:00.000Z', 'contract_address': [{'contract_address': '0xa05cCD2F8ac92afE092A7240E948aA3E17cEF843', 'platform': {'name': 'BNB Smart Chain (BEP20)', 'coin': {'id': '1839', 'name': 'BNB', 'symbol': 'BNB', 'slug': 'bnb'}}}], 'self_reported_circulating_supply': 100000000, 'self_reported_tags': ['Memes', 'Binance Chain', 'AI Memes', 'Animal Memes', 'Binance Alpha'], 'self_reported_market_cap': 7917.733362145475, 'infinite_supply': False}, {'id': 33346, 'name': 'BINOVA', 'symbol': 'BNB', 'category': 'token', 'description': 'BINOVA (BNB) is a cryptocurrency launched in 2024and operates on the BNB Smart Chain (BEP20) platform. BINOVA has a current supply of 1,000,000,000 with 0 in circulation. The last known price of BINOVA is 0.00004727 USD and is up 0.00 over the last 24 hours. More information can be found at https://seagull-sam.com/.', 'slug': 'seagull-sam', 'logo': 'https://s2.coinmarketcap.com/static/img/coins/64x64/33346.png', 'subreddit': '', 'notice': '', 'tags': ['memes'], 'tag-names': ['Memes'], 'tag-groups': ['INDUSTRY'], 'urls': {'website': ['https://seagull-sam.com/'], 'twitter': ['https://twitter.com/seagull_sam_'], 'message_board': [], 'chat': ['https://t.me/seagull_sam'], 'facebook': [], 'explorer': ['https://bscscan.com/token/0x2bb8862f055be989383092402ff75c7ccfea790e', 'https://app.nansen.ai/token-god-mode?chain=bnb&tab=transactions&tokenAddress=0x2bb8862f055be989383092402ff75c7ccfea790e'], 'reddit': [], 'technical_doc': [], 'source_code': [], 'announcement': []}, 'platform': {'id': '1839', 'name': 'BNB', 'slug': 'bnb', 'symbol': 'BNB', 'token_address': '0x2bb8862f055be989383092402ff75c7ccfea790e'}, 'date_added': '2024-10-08T05:39:38.000Z', 'twitter_username': 'seagull_sam_', 'is_hidden': 0, 'date_launched': '2024-09-06T00:00:00.000Z', 'contract_address': [{'contract_address': '0x2bb8862f055be989383092402ff75c7ccfea790e', 'platform': {'name': 'BNB Smart Chain (BEP20)', 'coin': {'id': '1839', 'name': 'BNB', 'symbol': 'BNB', 'slug': 'bnb'}}}], 'self_reported_circulating_supply': 100000000, 'self_reported_tags': None, 'self_reported_market_cap': 0, 'infinite_supply': False}]}})
print(bnb_cmc['data'].columns.tolist())

AttributeError: 'Series' object has no attribute 'columns'