In [None]:
#Getting started, making sure we're getting results
from huggingface_hub import HfApi
api = HfApi()

# Searching for 'open source' only returned 68 results, so search for the
# broader term 'open' here and narrow the results down later.
# Materialize the listing right away so it can be counted.
models = list(api.list_models(search="open"))

# Sanity check: how many models came back (further filtering happens later).
print(len(models))
# 20069 models!



20069


# Grabbing Model Data
- Name
- ID
- Model type
- License
    - Always prioritize the LICENSE file
    - Then fall back to the YAML metadata in the README.md file
- Date released
- Date last modified
- No. Downloads
- Author
- License file content
- Readme file content

In [None]:
import os
import csv
from typing import Optional, List
import re, time, random

from huggingface_hub import HfApi, model_info
from huggingface_hub.utils import HfHubHTTPError

# Search term sent to the Hub; deliberately broad, results are filtered later.
SEARCH_TERM = "open"
# Destination CSV for the raw (unfiltered) model metadata.
OUT_CSV = "hf_models_open_raw.csv"
# Access tokens must never be hard-coded in source. Read the token from the
# HF_TOKEN environment variable instead (e.g. `export HF_TOKEN=hf_...`).
# When the variable is unset or empty this is None, i.e. no explicit token is
# passed to the client (optional, but a token gives higher rate limits).
# NOTE(security): a token that was ever committed in this file must be treated
# as leaked and revoked in the Hugging Face account settings.
HF_TOKEN = os.environ.get("HF_TOKEN") or None

# Hub client used for the model listing below; HF_TOKEN is passed as its token.
api = HfApi(token=HF_TOKEN)

def iso(dt) -> str:
    """Render a date-like value as text for a CSV cell.

    ``None`` becomes an empty string and strings are returned unchanged.
    Any other value is formatted through its ``isoformat()`` method; if that
    call fails for any reason, ``str(dt)`` is returned instead.
    """
    if isinstance(dt, str):
        return dt
    if dt is None:
        return ""
    try:
        rendered = dt.isoformat()
    except Exception:
        # No usable isoformat(): fall back to the generic string form.
        rendered = str(dt)
    return rendered

def resolve_license_from_metadata(info, tags: Optional[List[str]] = None) -> str:
    """Return a best-effort license string for a model, or "" if none is found.

    Sources are consulted in this order and the first non-empty value wins:

    1. The model card metadata, read from ``info.card_data`` or, failing
       that, ``info.cardData``. It may be a plain dict or an object exposing
       ``to_dict()``. The keys ``license``, ``licenses``, ``license_id`` and
       ``license_name`` are tried in that order.
    2. ``info.config["license"]`` when ``info.config`` is a dict.
    3. The first tag of the form ``license:<value>`` (prefix matched
       case-insensitively) in ``tags``.

    Args:
        info: Object carrying the model metadata attributes described above;
            missing attributes are simply skipped.
        tags: Optional list of tag strings for the model.

    Returns:
        The stripped license text. List/tuple values are joined with ", "
        after dropping empty entries.
    """

    def _clean(value) -> str:
        # Normalize a raw metadata value (scalar or list/tuple) to one string.
        if isinstance(value, (list, tuple)):
            return ", ".join(str(x) for x in value if x).strip()
        return str(value).strip()

    # Accept both the snake_case and the camelCase attribute name so the
    # card metadata is found regardless of which one the object exposes.
    card = getattr(info, "card_data", None)
    if card is None:
        card = getattr(info, "cardData", None)
    if card is not None and not isinstance(card, dict):
        # Card metadata may be an object rather than a dict; convert it if
        # it offers a to_dict() method, otherwise ignore it.
        to_dict = getattr(card, "to_dict", None)
        card = to_dict() if callable(to_dict) else None
    if isinstance(card, dict):
        for key in ("license", "licenses", "license_id", "license_name"):
            val = card.get(key)
            if val:
                text = _clean(val)
                # A value such as [""] cleans to "", keep looking in that case.
                if text:
                    return text

    cfg = getattr(info, "config", None)
    if isinstance(cfg, dict):
        lic = cfg.get("license")
        if lic:
            text = _clean(lic)
            if text:
                return text

    for t in (tags or []):
        if isinstance(t, str) and t.lower().startswith("license:"):
            text = t.split(":", 1)[1].strip()
            if text:
                return text
    return ""

# -------- minimal rate-limit helpers --------
def _status_code(exc) -> Optional[int]:
    try:
        return getattr(getattr(exc, "response", None), "status_code", None)
    except Exception:
        return None

def _retry_after_seconds(exc) -> Optional[float]:
    try:
        hdrs = getattr(getattr(exc, "response", None), "headers", {}) or {}
        ra = hdrs.get("Retry-After")
        return float(ra) if ra not in (None, "") else None
    except Exception:
        return None

def _parse_rate_limit(headers: dict):
    """
    RateLimit: "api|pages|resolvers";r=1234;t=108
    RateLimit-Policy: "fixed window";"api|pages|resolvers";q=5000;w=300
    -> returns (remaining:int|None, reset_sec:int|None)
    """
    rl = headers.get("RateLimit", "") or ""
    rem = None
    reset = None
    m_r = re.search(r"r=(\d+)", rl)
    m_t = re.search(r"t=(\d+)", rl)
    if m_r: rem = int(m_r.group(1))
    if m_t: reset = int(m_t.group(1))
    return rem, reset

def _sleep_from_headers(exc, attempt: int, max_backoff: float = 60.0) -> float:
    """Choose how many seconds to wait before retrying after ``exc``.

    Precedence:
      1. the ``Retry-After`` header, plus up to 0.5s of random jitter;
      2. the ``t=`` reset value of the ``RateLimit`` header, plus up to 0.5s
         of jitter;
      3. exponential backoff ``2 ** attempt`` capped at ``max_backoff``,
         plus up to 0.25s of jitter.

    Note that ``max_backoff`` only caps the exponential fallback; waits
    taken from the response headers are used as given.
    """
    retry_after = _retry_after_seconds(exc)
    if retry_after is not None:
        return retry_after + random.uniform(0, 0.5)

    response = getattr(exc, "response", None)
    hdrs = getattr(response, "headers", {}) or {}
    reset_sec = _parse_rate_limit(hdrs)[1]
    if reset_sec is not None:
        return float(reset_sec) + random.uniform(0, 0.5)

    # No hint from the server: back off exponentially, bounded by max_backoff.
    backoff = min(max_backoff, (2 ** attempt))
    return backoff + random.uniform(0, 0.25)

# Simple retry for model_info
def safe_model_info(repo_id: str, token: Optional[str], max_retries: int = 8):
    """Call ``model_info(repo_id, token=token)`` with retries on transient errors.

    An ``HfHubHTTPError`` whose status code is 429, 502, 503 or 504 is
    retried up to ``max_retries`` times; the wait before each retry comes
    from ``_sleep_from_headers`` and is announced with a ``[retry]`` line.
    Any other ``HfHubHTTPError``, or one raised after the retries are used
    up, is re-raised to the caller.
    """
    retries_used = 0
    while True:
        try:
            return model_info(repo_id, token=token)
        except HfHubHTTPError as err:
            status = _status_code(err)
            if status not in (429, 502, 503, 504) or retries_used >= max_retries:
                # Not a transient failure, or out of retries: give up.
                raise
            retries_used += 1
            delay = _sleep_from_headers(err, retries_used)
            print(f"[retry] model_info {repo_id} -> {status}; sleeping {delay:.1f}s (attempt {retries_used}/{max_retries})")
            time.sleep(delay)

# Resilient iterator for list_models (very simple)
def iter_models_with_retry(api: HfApi, *, search: str, full: bool, sort: str, direction: int, max_retries: int = 8):
    """Yield models from ``api.list_models`` and retry on transient HTTP errors.

    The keyword arguments are forwarded to ``api.list_models`` unchanged.
    Entries without a ``modelId`` are skipped. When an ``HfHubHTTPError``
    with status 429, 502, 503 or 504 interrupts the listing, the generator
    sleeps (see ``_sleep_from_headers``) and starts the listing again from
    the beginning; ids that were already yielded are remembered so they are
    not emitted twice. The retry budget ``max_retries`` is shared across the
    whole iteration and is not reset after a successful item. Other errors,
    or errors once the budget is spent, propagate to the caller.
    """
    retryable_codes = (429, 502, 503, 504)
    yielded_ids = set()
    retries = 0
    finished = False
    while not finished:
        try:
            listing = api.list_models(search=search, full=full, sort=sort, direction=direction)
            for model in listing:
                model_id = getattr(model, "modelId", None)
                if model_id and model_id not in yielded_ids:
                    yielded_ids.add(model_id)
                    yield model
            finished = True
        except HfHubHTTPError as err:
            status = _status_code(err)
            if status not in retryable_codes or retries >= max_retries:
                raise
            retries += 1
            pause = _sleep_from_headers(err, retries)
            print(f"[retry] list_models -> {status}; sleeping {pause:.1f}s (attempt {retries}/{max_retries})")
            time.sleep(pause)
            # The next pass restarts the listing; yielded_ids filters repeats.

# Column order of the output CSV; each row dict written below uses these keys.
FIELDNAMES = [
    "name", "id", "model_type", "license",
    "date_released", "date_last_modified", "downloads", "author",
]

count = 0    # models returned by the listing (with a usable id)
written = 0  # rows actually written to the CSV
skipped = 0  # models dropped because model_info failed for them

with open(OUT_CSV, "w", encoding="utf-8", newline="") as f:
    writer = csv.DictWriter(f, fieldnames=FIELDNAMES)
    writer.writeheader()

    for summary in iter_models_with_retry(api, search=SEARCH_TERM, full=True, sort="last_modified", direction=-1):
        mid = getattr(summary, "modelId", None)
        if not mid:
            continue
        count += 1

        try:
            info = safe_model_info(mid, token=HF_TOKEN)
        except HfHubHTTPError as e:
            # Best effort: keep going, but report the model instead of
            # dropping it silently so gaps in the CSV can be explained.
            skipped += 1
            print(f"[skip] model_info {mid} -> {_status_code(e)}")
            continue

        # Fall back to `id`, then to the listing's id, if `modelId` is absent.
        repo_id = getattr(info, "modelId", None) or getattr(info, "id", None) or mid
        name = repo_id.split("/")[-1] if "/" in repo_id else repo_id
        author = getattr(info, "author", "") or (repo_id.split("/")[0] if "/" in repo_id else "")
        model_type = getattr(info, "pipeline_tag", "") or ""
        license_val = resolve_license_from_metadata(info, tags=getattr(summary, "tags", None))
        date_released = iso(getattr(info, "created_at", None))
        # Accept both spellings of the attribute: the snake_case one (as with
        # `created_at` above) and the legacy camelCase one.
        last_modified = getattr(info, "last_modified", None)
        if last_modified is None:
            last_modified = getattr(info, "lastModified", None)
        date_last_modified = iso(last_modified)
        # Keep a genuine 0; only a missing value becomes an empty cell.
        downloads = getattr(info, "downloads", None)
        if downloads is None:
            downloads = ""

        row = {
            "name": name,
            "id": repo_id,
            "model_type": model_type,
            "license": license_val,
            "date_released": date_released,
            "date_last_modified": date_last_modified,
            "downloads": downloads,
            "author": author,
        }
        writer.writerow(row)
        written += 1

        # tiny, optional pacing to avoid hammering
        if written % 500 == 0:
            time.sleep(1.0)

print(f"Done. Wrote {written} rows to {OUT_CSV}")
if skipped:
    print(f"Skipped {skipped} of {count} models because model_info failed")


Invalid model-index. Not loading eval results into CardData.
Invalid model-index. Not loading eval results into CardData.


[retry] model_info mradermacher/openthoughts3_1k_llama3-GGUF -> 429; sleeping 169.1s (attempt 1/8)
[retry] model_info OpenMed/OpenMed-NER-GenomeDetect-TinyMed-82M -> 429; sleeping 217.3s (attempt 1/8)


Invalid model-index. Not loading eval results into CardData.


[retry] model_info mradermacher/openthoughts2_1k_32B-i1-GGUF -> 429; sleeping 221.2s (attempt 1/8)
[retry] model_info fh1628/MNLP_M3_open_model_test -> 429; sleeping 220.4s (attempt 1/8)
[retry] model_info open-unlearning/unlearn_tofu_Llama-3.2-1B-Instruct_forget10_RMU_lr1e-05_layer5_scoeff10_epoch5 -> 429; sleeping 221.4s (attempt 1/8)
[retry] model_info DevQuasar/nvidia.OpenMath-Nemotron-14B-Kaggle-GGUF -> 429; sleeping 223.5s (attempt 1/8)
[retry] model_info Godreign/gemma-2-2b-it-openvino-int8-model -> 429; sleeping 225.0s (attempt 1/8)
[retry] model_info viethq5/Qwen2.5-1.5B-Open-R1-Distill -> 429; sleeping 220.0s (attempt 1/8)
[retry] model_info caijanfeng/Qwen2.5-1.5B-Open-R1-Distill-repeat -> 429; sleeping 226.2s (attempt 1/8)
[retry] model_info OzzyGT/opensketch -> 429; sleeping 224.2s (attempt 1/8)
[retry] model_info furmaniak/openalex_pretrain_model_qwen2.5_vX -> 429; sleeping 246.4s (attempt 1/8)
[retry] model_info oodeh/openshift-qe-r32-a16-epoch10-merged-model -> 429; sle

Invalid model-index. Not loading eval results into CardData.


[retry] model_info houghtonweihu/openchat_gemma_adapter_1 -> 429; sleeping 243.5s (attempt 1/8)
[retry] model_info huangyt/falcon-7b-Open-Platypus_2.5w-r16-query_key_value -> 429; sleeping 248.3s (attempt 1/8)
[retry] model_info maddes8cht/openlm-research-open_llama_7b_v2-gguf -> 429; sleeping 240.4s (attempt 1/8)
[retry] model_info madh34rt/open-reverse-proxy -> 429; sleeping 246.5s (attempt 1/8)
[retry] model_info openclimatefix/power_perceiver -> 429; sleeping 243.5s (attempt 1/8)


Invalid model-index. Not loading eval results into CardData.
Invalid model-index. Not loading eval results into CardData.


Done. Wrote 20070 rows to hf_models_open_test-2.csv
