In [None]:
# from huggingface_hub import HfApi
# api = HfApi()
# models = api.list_models(
#     search="open"
# )

# models = list(models)
# print(len(models))
# # print(models[0].modelId)

def safe_str(x: Optional[str]) -> str:
    """Return ``str(x)``, mapping ``None`` to the empty string."""
    if x is None:
        return ""
    return str(x)

def read_readme_text(repo_id: str) -> str:
    """Fetch ``README.md`` of a model repo and return its contents.

    The file is obtained through ``hf_hub_download`` (``repo_type="model"``)
    and read as UTF-8 with undecodable bytes replaced. This is a best-effort
    helper: any failure while downloading or reading yields ``''``.
    """
    try:
        local_path = hf_hub_download(
            repo_id=repo_id,
            filename="README.md",
            repo_type="model",
        )
        with open(local_path, "r", encoding="utf-8", errors="replace") as handle:
            text = handle.read()
    except HfHubHTTPError:
        # Hub-side HTTP failure: treat as "no README".
        text = ""
    except Exception:
        # Anything else (I/O, decoding setup, ...) is treated the same way.
        text = ""
    return text
    

def get_license(info):
    """Return the license recorded for a model, or ``"unknown"``.

    The ``"license"`` key is looked up first in ``info.cardData`` and then in
    ``info.config``; the first source that contains the key wins. A source
    that is missing from ``info`` altogether, ``None``, or empty is skipped
    instead of raising ``AttributeError``, matching the defensive
    ``getattr(info, ..., None)`` reads used elsewhere in this file.

    Args:
        info: Object describing a model. ``cardData`` and ``config`` are
            optional attributes; when present they must support ``in`` and
            ``[...]`` lookups by key.

    Returns:
        The value stored under ``"license"`` in the first source that has
        the key, otherwise the string ``"unknown"``.
    """
    # cardData is checked first (most common place), then config.
    for source_name in ("cardData", "config"):
        source = getattr(info, source_name, None)
        if source and "license" in source:
            return source["license"]
    return "unknown"

def iso_or_blank(dt) -> str:
    """Return *dt* as an ISO 8601 string, or ``''`` when that is not possible.

    ``None`` becomes ``''``, strings are passed through untouched, and any
    other value is formatted via its ``isoformat()`` method. A value whose
    ``isoformat()`` is missing or fails also yields ``''``.
    """
    if dt is None:
        return ""
    if isinstance(dt, str):
        return dt
    try:
        formatted = dt.isoformat()
    except Exception:
        formatted = ""
    return formatted

# %% Configuration
# Search phrase.
QUERY = "open source"
# Cap on the number of results; None means no limit.
LIMIT = None
# Path of the CSV file that is written at the end.
OUT_CSV = "hf_models.csv"
# Pause between requests, in seconds, to be polite.
SLEEP = 0.2
# Set True to include the full README text in the output.
INCLUDE_README = False
# Set >0 to include a short README snippet of that many characters.
README_SNIPPET_CHARS = 0
# Token read from the HF_TOKEN environment variable (None when unset);
# a string token can be assigned here instead.
HF_TOKEN = os.environ.get("HF_TOKEN")

# %% Search and collect
# Runs the model search, fetches the info of every hit and flattens it into
# one dict per model. The dicts are accumulated in `rows` for the CSV export.
import json  # imported once here instead of on every loop iteration

api = HfApi(token=HF_TOKEN)

print(f"Searching for models matching: {QUERY!r} (limit={LIMIT})")
# `limit` is only passed when a cap is configured; omitting it means no cap.
list_kwargs = {"search": QUERY, "full": True, "direction": -1}
if LIMIT is not None:
    list_kwargs["limit"] = LIMIT
models_iter = api.list_models(**list_kwargs)

rows = []
count = 0  # number of rows collected so far (mirrors len(rows))

for m in models_iter:
    repo_id = m.modelId  # e.g., "bert-base-uncased"
    try:
        info = model_info(repo_id, token=HF_TOKEN)
    except HfHubHTTPError as e:
        # A failing repo is reported and skipped so one bad entry does not
        # abort the whole crawl; still pause before the next request.
        print(f"Skipping {repo_id}: {e}")
        time.sleep(SLEEP)
        continue

    # Extract metadata
    model_name = info.modelId
    # Fall back to the namespace part of "namespace/name" when no author is set.
    author = getattr(info, "author", "") or (model_name.split("/")[0] if "/" in model_name else "")
    pipeline_tag = getattr(info, "pipeline_tag", None)
    library_name = getattr(info, "library_name", None)
    # Use the license helper defined in this file; the previously called
    # `resolve_license` is not defined anywhere in view.
    license_name = get_license(info)
    created_at = iso_or_blank(getattr(info, "created_at", None))
    last_modified = iso_or_blank(getattr(info, "lastModified", None))
    downloads = getattr(info, "downloads", None)
    likes = getattr(info, "likes", None)
    private = getattr(info, "private", None)
    gated = getattr(info, "gated", None)

    # README / model card text: only downloaded when it will be written out.
    if INCLUDE_README or README_SNIPPET_CHARS > 0:
        readme_text = read_readme_text(repo_id)
        readme_snippet = readme_text[:README_SNIPPET_CHARS] if README_SNIPPET_CHARS > 0 else ""
    else:
        readme_text = ""
        readme_snippet = ""

    # cardData is parsed front matter from the model card
    card_data = getattr(info, "cardData", None)
    try:
        # NOTE(review): cardData may be an object rather than a plain dict in
        # some huggingface_hub versions; presumably it then offers to_dict()
        # - confirm against the installed version.
        to_dict = getattr(card_data, "to_dict", None)
        card_payload = to_dict() if callable(to_dict) else card_data
        card_data_json = json.dumps(card_payload, ensure_ascii=False) if card_payload is not None else ""
    except Exception as e:
        # Best-effort column: report the problem instead of hiding it, but
        # never abort the crawl because one card cannot be serialized.
        print(f"Could not serialize cardData for {repo_id}: {e}")
        card_data_json = ""

    row = {
        "model_id": model_name,
        "author": author,
        "type_pipeline_tag": safe_str(pipeline_tag),
        "library_name": safe_str(library_name),
        "license": license_name,
        "created_at": created_at,
        "last_modified": last_modified,
        "downloads": downloads if downloads is not None else "",
        "likes": likes if likes is not None else "",
        "private": private if private is not None else "",
        "gated": gated if gated is not None else "",
        "readme_snippet": readme_snippet,
        "readme_text": readme_text if INCLUDE_README else "",
        "card_data_json": card_data_json,
    }
    rows.append(row)
    count += 1

    # polite pacing to reduce throttling risk
    time.sleep(SLEEP)

print(f"Collected {len(rows)} models.")

# %% Save to CSV and preview
# Output columns, in file order. The names are the keys of the dicts in `rows`.
fieldnames = [
    "model_id", "author", "type_pipeline_tag", "library_name", "license",
    "created_at", "last_modified",
    "downloads", "likes", "private", "gated",
    "readme_snippet", "readme_text", "card_data_json",
]

# Build the table with a fixed column order, then write it without the index.
df = pd.DataFrame(rows, columns=fieldnames)
df.to_csv(OUT_CSV, index=False, encoding="utf-8")
print(f"Wrote {len(df)} rows to {OUT_CSV}")

# Last expression of the cell: displays the first 10 rows as a preview.
df.head(10)


20069
