# Set Up
Run `pip --install hugging_face_hub` if needed

## Usage

#!/usr/bin/env python3
"""
Search Hugging Face models for a phrase (default: "open source"),
gather metadata + README text, and save to CSV.

Usage:
  python hf_search_models.py --query "open source" --limit 200 --out models_open_source.csv

Optional:
  export HF_TOKEN=hf_xxx   # if you want to include private models or higher rate limits
"""

In [10]:

# %% Imports and helpers
import os
import time
import csv
from typing import Optional
from datetime import datetime

import pandas as pd
from huggingface_hub import HfApi, model_info, hf_hub_download
from huggingface_hub.utils import HfHubHTTPError

In [11]:
import re
from typing import Optional

try:
    import yaml  # pip install pyyaml
except Exception:
    yaml = None

def _normalize_license(val):
    """Return a clean license string from various shapes."""
    if val is None:
        return ""
    if isinstance(val, str):
        return val.strip()
    if isinstance(val, (list, tuple)):
        uniq = [str(s).strip() for s in val if s and str(s).strip()]
        return ", ".join(sorted(set(uniq)))
    if isinstance(val, dict):
        for k in ("id", "name", "license", "value"):
            if k in val and val[k]:
                return str(val[k]).strip()
        try:
            import json
            return json.dumps(val, ensure_ascii=False)
        except Exception:
            return str(val)
    return str(val).strip()

def _parse_yaml_front_matter(text: str) -> dict:
    """
    Parse YAML front-matter from README markdown text.
    Supports:
      ---\n<yaml>\n---   or
      ---\n<yaml>\n...
    Returns {} if not found or parse fails.
    """
    if not text:
        return {}
    # Match front matter at the very top of the file
    m = re.match(r"^\s*---\s*\n(.*?)\n(?:---|\.\.\.)\s*(?:\n|$)", text, re.DOTALL)
    if not m:
        return {}
    yaml_block = m.group(1)
    # Prefer PyYAML if present
    if yaml is not None:
        try:
            data = yaml.safe_load(yaml_block) or {}
            if isinstance(data, dict):
                return data
            return {}
        except Exception:
            pass
    # Minimal fallback: parse simple key: value and lists with "- item"
    data = {}
    current_key = None
    for line in yaml_block.splitlines():
        if re.match(r"^\s*#", line) or not line.strip():
            continue
        kv = re.match(r"^([A-Za-z0-9_\-]+)\s*:\s*(.*)$", line)
        if kv:
            key, value = kv.group(1), kv.group(2).strip()
            if value == "" or value == "|":
                data[key] = []
                current_key = key
            elif value.startswith("[") and value.endswith("]"):
                # simple inline list: [a, b]
                items = [s.strip() for s in value[1:-1].split(",") if s.strip()]
                data[key] = items
                current_key = None
            else:
                data[key] = value
                current_key = None
            continue
        # list item
        if current_key and re.match(r"^\s*-\s+", line):
            item = re.sub(r"^\s*-\s+", "", line).strip()
            data.setdefault(current_key, [])
            data[current_key].append(item)
    return data

def resolve_license(info, fallback_tags=None, readme_text: Optional[str] = None):
    """
    Resolve license using multiple sources, in order:
      1) info.cardData (license, licenses, license_name, license_id)
      2) info.config.license
      3) tags entries like 'license:apache-2.0'
      4) README YAML front-matter keys: license, licenses, license_name, license_id
      5) presence of LICENSE file (returns 'license-file' if found)
    Also returns an optional license_link if present in YAML (e.g., 'license_link').
    """
    # 1) cardData
    cd = getattr(info, "cardData", None)
    if isinstance(cd, dict):
        for key in ("license", "licenses", "license_name", "license_id"):
            if key in cd and cd[key]:
                lic = _normalize_license(cd[key])
                if lic:
                    return lic, cd.get("license_link", "")

    # 2) config
    cfg = getattr(info, "config", None)
    if isinstance(cfg, dict) and cfg.get("license"):
        lic = _normalize_license(cfg["license"])
        if lic:
            return lic, ""

    # 3) tags (handles the common 'license:apache-2.0' pattern)
    tags = fallback_tags or getattr(info, "tags", None)
    if tags:
        for t in tags:
            if isinstance(t, str) and t.lower().startswith("license:"):
                lic = t.split(":", 1)[1].strip()
                if lic:
                    return lic, ""

    # 4) README YAML front-matter
    if readme_text:
        meta = _parse_yaml_front_matter(readme_text)
        if meta:
            for key in ("license", "licenses", "license_name", "license_id"):
                if key in meta and meta[key]:
                    lic = _normalize_license(meta[key])
                    link = _normalize_license(meta.get("license_link", ""))
                    if lic:
                        return lic, link

    # 5) LICENSE file present?
    try:
        siblings = getattr(info, "siblings", None) or []
        has_license_file = any(
            hasattr(s, "rfilename") and s.rfilename.upper() == "LICENSE" for s in siblings
        )
        if has_license_file:
            return "license-file", ""
    except Exception:
        pass

    return "", ""  # unknown

# Hugging Face model search in a Jupyter notebook
# - No argparse; configure via variables below
# - No default limit (LIMIT=None)
# - Optional README download for NLP
# - Saves results to CSV and shows a preview as a DataFrame

In [12]:
# from huggingface_hub import HfApi, model_info, hf_hub_download
# from huggingface_hub.utils import HfHubHTTPError


def safe_str(x: Optional[str]) -> str:
    return "" if x is None else str(x)

def read_readme_text(repo_id: str) -> str:
    """Download README.md for a repo and return its text. Returns '' if missing."""
    try:
        readme_path = hf_hub_download(repo_id=repo_id, filename="README.md", repo_type="model")
        with open(readme_path, "r", encoding="utf-8", errors="replace") as f:
            return f.read()
    except HfHubHTTPError:
        return ""
    except Exception:
        return ""
    

def get_license(info):
    # First try cardData (most common place)
    if info.cardData and "license" in info.cardData:
        return info.cardData["license"]
    # Then try config
    if info.config and "license" in info.config:
        return info.config["license"]
    return "unknown"

def iso_or_blank(dt) -> str:
    """Format datetime to ISO 8601 or return blank."""
    try:
        if dt is None:
            return ""
        if isinstance(dt, str):
            return dt
        return dt.isoformat()
    except Exception:
        return ""

# %% Configuration
QUERY = "open"          # search phrase
LIMIT = None                   # None means no limit; set an int to cap results
OUT_CSV = "hf_models.csv"      # output CSV path
SLEEP = 0.2                    # seconds to sleep between requests to be polite
INCLUDE_README = False         # set True to include full README text
README_SNIPPET_CHARS = 0       # set >0 to include a short snippet
HF_TOKEN = os.getenv("HF_TOKEN", None)  # or set a string token here

# %% Search and collect
api = HfApi(token=HF_TOKEN)

print(f"Searching for models matching: {QUERY!r} (limit={LIMIT})")
if LIMIT is not None:
    models_iter = api.list_models(search=QUERY, full=True, direction=-1, limit=LIMIT)
else:
    # omit limit for no cap
    models_iter = api.list_models(search=QUERY, full=True, direction=-1)

rows = []
count = 0

for m in models_iter:
    repo_id = m.modelId  # e.g., "bert-base-uncased"
    try:
        info = model_info(repo_id, token=HF_TOKEN)
    except HfHubHTTPError as e:
        print(f"Skipping {repo_id}: {e}")
        time.sleep(SLEEP)
        continue

    # Extract metadata
    model_name = info.modelId
    author = getattr(info, "author", "") or (model_name.split("/")[0] if "/" in model_name else "")
    pipeline_tag = getattr(info, "pipeline_tag", None)
    library_name = getattr(info, "library_name", None)
    # license_name = getattr(info, "license", None)
    license_name = resolve_license(info)
    created_at = iso_or_blank(getattr(info, "created_at", None))
    last_modified = iso_or_blank(getattr(info, "lastModified", None))
    downloads = getattr(info, "downloads", None)
    likes = getattr(info, "likes", None)
    private = getattr(info, "private", None)
    gated = getattr(info, "gated", None)

    # README / model card text
    if INCLUDE_README or README_SNIPPET_CHARS > 0:
        readme_text = read_readme_text(repo_id)
        readme_snippet = readme_text[: README_SNIPPET_CHARS] if README_SNIPPET_CHARS > 0 else ""
    else:
        readme_text = ""
        readme_snippet = ""

    # cardData is parsed front matter from the model card
    card_data = getattr(info, "cardData", None)
    try:
        import json
        card_data_json = json.dumps(card_data, ensure_ascii=False) if card_data is not None else ""
    except Exception:
        card_data_json = ""

    row = {
        "model_id": model_name,
        "author": author,
        "type_pipeline_tag": safe_str(pipeline_tag),
        "library_name": safe_str(library_name),
        "license": license_name,
        "created_at": created_at,
        "last_modified": last_modified,
        "downloads": downloads if downloads is not None else "",
        "likes": likes if likes is not None else "",
        "private": private if private is not None else "",
        "gated": gated if gated is not None else "",
        "readme_snippet": readme_snippet,
        "readme_text": readme_text if INCLUDE_README else "",
        "card_data_json": card_data_json,
    }
    rows.append(row)
    count += 1

    # polite pacing to reduce throttling risk
    time.sleep(SLEEP)

print(f"Collected {len(rows)} models.")

# %% Save to CSV and preview
fieldnames = [
    "model_id",
    "author",
    "type_pipeline_tag",
    "library_name",
    "license",
    "created_at",
    "last_modified",
    "downloads",
    "likes",
    "private",
    "gated",
    "readme_snippet",
    "readme_text",
    "card_data_json",
]

df = pd.DataFrame(rows, columns=fieldnames)
df.to_csv(OUT_CSV, index=False, encoding="utf-8")
print(f"Wrote {len(df)} rows to {OUT_CSV}")

# Show a quick preview
df.head(10)



Searching for models matching: 'open' (limit=None)


Invalid model-index. Not loading eval results into CardData.
Invalid model-index. Not loading eval results into CardData.
Invalid model-index. Not loading eval results into CardData.
Invalid model-index. Not loading eval results into CardData.


Skipping OpenGVLab/InternImage: 429 Client Error: Too Many Requests for url: https://huggingface.co/api/models/OpenGVLab/InternImage (Request ID: Root=1-68e00fa3-31c0e2ef55d250161fa0a416;daa947e3-759d-4d9b-b928-2aede1d81086)

We had to rate limit your IP (152.3.43.52). To continue using our service, create a HF account or login to your existing account, and make sure you pass a HF_TOKEN if you're using the API.
Skipping josephvore/openapi: 429 Client Error: Too Many Requests for url: https://huggingface.co/api/models/josephvore/openapi (Request ID: Root=1-68e00fa3-2cd29d0e04fe5f821af9e9a2;1fdad4dc-66f0-4f49-ac5c-d65ed686020a)

We had to rate limit your IP (152.3.43.52). To continue using our service, create a HF account or login to your existing account, and make sure you pass a HF_TOKEN if you're using the API.
Skipping openhuman/humanengine: 429 Client Error: Too Many Requests for url: https://huggingface.co/api/models/openhuman/humanengine (Request ID: Root=1-68e00fa3-0d2556236d7975

HTTP Error 429 thrown while requesting GET https://huggingface.co/api/models?search=open&direction=-1&full=True&sort=trendingScore&cursor=eyIkb3IiOlt7InRyZW5kaW5nU2NvcmUiOjAsIl9pZCI6eyIkZ3QiOiI2NDk3MWQ4ODU1NTFhYzBkNDhiY2FlYzMifX0seyJ0cmVuZGluZ1Njb3JlIjp7IiRsdCI6MH19LHsidHJlbmRpbmdTY29yZSI6bnVsbH1dfQ%3D%3D
Retrying in 1s [Retry 1/20].
HTTP Error 429 thrown while requesting GET https://huggingface.co/api/models?search=open&direction=-1&full=True&sort=trendingScore&cursor=eyIkb3IiOlt7InRyZW5kaW5nU2NvcmUiOjAsIl9pZCI6eyIkZ3QiOiI2NDk3MWQ4ODU1NTFhYzBkNDhiY2FlYzMifX0seyJ0cmVuZGluZ1Njb3JlIjp7IiRsdCI6MH19LHsidHJlbmRpbmdTY29yZSI6bnVsbH1dfQ%3D%3D
Retrying in 2s [Retry 2/20].
HTTP Error 429 thrown while requesting GET https://huggingface.co/api/models?search=open&direction=-1&full=True&sort=trendingScore&cursor=eyIkb3IiOlt7InRyZW5kaW5nU2NvcmUiOjAsIl9pZCI6eyIkZ3QiOiI2NDk3MWQ4ODU1NTFhYzBkNDhiY2FlYzMifX0seyJ0cmVuZGluZ1Njb3JlIjp7IiRsdCI6MH19LHsidHJlbmRpbmdTY29yZSI6bnVsbH1dfQ%3D%3D
Retrying in 4s [Retry

HTTPError: 429 Client Error: Too Many Requests for url: https://huggingface.co/api/models?search=open&direction=-1&full=True&sort=trendingScore&cursor=eyIkb3IiOlt7InRyZW5kaW5nU2NvcmUiOjAsIl9pZCI6eyIkZ3QiOiI2NDk3MWQ4ODU1NTFhYzBkNDhiY2FlYzMifX0seyJ0cmVuZGluZ1Njb3JlIjp7IiRsdCI6MH19LHsidHJlbmRpbmdTY29yZSI6bnVsbH1dfQ%3D%3D