In [None]:
# %% config
# User-editable settings for this run; the driver code at the bottom of the file reads them.
QUERY = "open source"           # your search phrase
MODE = "full_text"              # "api" or "full_text" (any value other than "api" takes the full-text path)
SAVE_TO = "open_source_models.txt"  # set None to skip saving
VERBOSE = True                  # when True, progress and retry messages are printed

# Provide your HF token (optional but boosts API limits). Will prompt if not in env.
import os
from getpass import getpass
# The HF_TOKEN environment variable wins; otherwise the user is prompted.
# A blank answer (after stripping whitespace) becomes None. Only the "api" mode passes the token on.
HF_TOKEN = os.getenv("HF_TOKEN") or (getpass("Enter your HF token (hf_..., or leave blank): ").strip() or None)

# %% imports
import time, random, re
from typing import Optional, Iterable, List
from urllib.parse import quote_plus, urlparse, parse_qs, urljoin

import requests
from bs4 import BeautifulSoup
from huggingface_hub import HfApi
from huggingface_hub.utils import HfHubHTTPError

# %% generic retry/backoff (works for both requests and hf_hub)
def _retry_wait(attempt, base=1.0, backoff=2.0, max_delay=60.0, jitter=0.25):
    """Return how many seconds to sleep before retry number ``attempt``.

    The delay grows as ``base * backoff ** attempt`` and is capped at
    ``max_delay``. A random amount drawn from ``[0, jitter]`` is then added on
    top of the capped value, so the result can exceed ``max_delay`` by up to
    ``jitter``.
    """
    exponential = base * (backoff ** attempt)
    capped = min(max_delay, exponential)
    # Jitter is applied after the cap so simultaneous clients do not wake in lockstep.
    return capped + random.uniform(0, jitter)

def with_retry_requests(session: "requests.Session", max_retries=8, verbose=True):
    """Build a ``get(url, **kw)`` callable that retries transient HTTP failures.

    Args:
        session: Object whose ``get(url, **kw)`` performs the request.
        max_retries: Maximum number of retries per call to the returned function.
        verbose: When true, print one line per retry.

    Returns:
        A function ``_get(url, **kw)`` that returns the response once its status
        is not a retryable one (429, 502, 503, 504) or the retry budget is spent.
        ``raise_for_status()`` is called on that final response, so a remaining
        error status surfaces as the exception that method raises.

    The ``Retry-After`` header is honoured in both of its HTTP forms
    (delay in seconds, or an HTTP-date). A missing, malformed or non-finite
    value falls back to exponential backoff, and a negative or already-elapsed
    value means "retry immediately" instead of crashing ``time.sleep``.
    """
    # Local imports: only this helper needs them, so the file's import block stays untouched.
    import math
    from datetime import datetime, timezone
    from email.utils import parsedate_to_datetime

    def _retry_after_seconds(value):
        """Convert a Retry-After header value to seconds, or None if unusable."""
        if not value:
            return None
        value = value.strip()
        try:
            seconds = float(value)
        except ValueError:
            # Not a number, so try the HTTP-date form.
            try:
                when = parsedate_to_datetime(value)
            except (TypeError, ValueError):
                return None
            if when.tzinfo is None:
                # A date without zone information is treated as UTC.
                when = when.replace(tzinfo=timezone.utc)
            seconds = (when - datetime.now(timezone.utc)).total_seconds()
        if not math.isfinite(seconds):
            return None
        return max(0.0, seconds)

    def _get(url, **kw):
        attempt = 0
        while True:
            r = session.get(url, **kw)
            if r.status_code in (429, 502, 503, 504) and attempt < max_retries:
                wait = _retry_after_seconds(r.headers.get("Retry-After"))
                if wait is None:
                    wait = _retry_wait(attempt)
                attempt += 1
                if verbose:
                    print(f"[retry] GET {url} => {r.status_code}; sleep {wait:.1f}s ({attempt}/{max_retries})")
                time.sleep(wait)
                continue
            r.raise_for_status()
            return r
    return _get

def iterate_list_models_with_retry(api: "HfApi", search: str, *, full=False, direction=-1, max_retries=8, verbose=True):
    """
    Resilient generator around api.list_models(search=...).

    Yields each item returned by ``api.list_models(search=search, full=full,
    direction=direction)`` whose ``modelId`` attribute is set and has not been
    yielded before. Items without a ``modelId`` are skipped.

    If the listing raises ``HfHubHTTPError`` with status 429/502/503/504, the
    generator sleeps and restarts the listing from the beginning; ids already
    yielded are skipped thanks to the ``seen`` set. The retry budget
    (``max_retries``) is shared across the whole iteration, not reset per
    restart. Any other error, or a retryable one after the budget is spent, is
    re-raised.
    """
    seen = set()
    attempt = 0
    while True:
        try:
            for m in api.list_models(search=search, full=full, direction=direction):
                mid = getattr(m, "modelId", None)
                if not mid or mid in seen:
                    continue
                seen.add(mid)
                yield m
            return
        except HfHubHTTPError as e:
            code = getattr(getattr(e, "response", None), "status_code", None)
            if code in (429, 502, 503, 504) and attempt < max_retries:
                # Compute the delay from the zero-based attempt BEFORE incrementing,
                # so the first retry waits the base delay (same schedule as
                # with_retry_requests) instead of skipping straight to the second step.
                wait = _retry_wait(attempt)
                attempt += 1
                if verbose:
                    print(f"[retry] list_models => {code}; sleep {wait:.1f}s ({attempt}/{max_retries})")
                time.sleep(wait)
                continue
            raise

# %% helpers
# First path segments that name a site section rather than a user or organization.
# Two-segment links under these (e.g. /search/full-text, /docs/hub, /datasets/squad)
# have the same shape as /owner/repo and must not be reported as models.
# NOTE(review): the list is not exhaustive; extend it if other site sections leak through.
_NON_REPO_ROOTS = frozenset({
    "api",
    "blog",
    "collections",
    "datasets",
    "docs",
    "learn",
    "models",
    "organizations",
    "papers",
    "posts",
    "search",
    "settings",
    "spaces",
    "tasks",
})

# Exactly two path segments, each starting with an alphanumeric character.
_REPO_PATH_RE = re.compile(r"^/([A-Za-z0-9][A-Za-z0-9_\-\.]*)/([A-Za-z0-9][A-Za-z0-9_\-\.]*)$")


def _repo_id_from_href(href: str) -> Optional[str]:
    """Return 'owner/name' if href looks like a model repo path, else None."""
    match = _REPO_PATH_RE.match(href)
    if match is None:
        return None
    owner, name = match.groups()
    if owner.lower() in _NON_REPO_ROOTS:
        return None
    return f"{owner}/{name}"


def extract_repo_ids_from_fulltext_html(html: str) -> List[str]:
    """
    Parse /search/full-text HTML and extract model repo ids like 'owner/name'.

    Every anchor with an href of the form '/owner/repo' is considered. Links
    whose first segment is a known site section (see _NON_REPO_ROOTS) are
    dropped, as are paths with more or fewer than two segments.

    Returns:
        The distinct repo ids found, sorted.
    """
    soup = BeautifulSoup(html, "html.parser")
    repo_ids = set()
    for a in soup.find_all("a", href=True):
        repo_id = _repo_id_from_href(a["href"])
        if repo_id is not None:
            repo_ids.add(repo_id)
    return sorted(repo_ids)

def find_next_page_url(current_url: str, html: str) -> Optional[str]:
    """Return the absolute URL of the results page after ``current_url``, if any.

    The first anchor whose stripped, lower-cased text is exactly "next" decides
    the outcome: its href is resolved and returned, or None is returned when it
    has no href. None is also returned when no such anchor exists.

    Args:
        current_url: URL of the page that produced ``html``. Relative links are
            resolved against it, so a query-only href such as "?p=2" keeps the
            page's path instead of being attached to the site root.
        html: Markup of the current results page.
    """
    soup = BeautifulSoup(html, "html.parser")
    # Fall back to the site root only if the caller gave no usable current URL.
    base = current_url or "https://huggingface.co"
    for anchor in soup.find_all("a"):
        if anchor.text.strip().lower() == "next":
            href = anchor.get("href")
            return urljoin(base, href) if href else None
    return None

# %% main
def get_models_api(query: str, token: Optional[str]) -> List[str]:
    """
    API mode: collect model ids through HfApi.list_models(search=query).

    Per the original author's note, this search matches repo IDs and usernames
    and does NOT do full-text search across READMEs, so the result set is
    smaller than in full-text mode.

    Returns the ``modelId`` of every model yielded by
    iterate_list_models_with_retry, in the order received. When VERBOSE is
    set, a progress line is printed after every 100 models.
    """
    client = HfApi(token=token)
    stream = iterate_list_models_with_retry(
        client, search=query, full=False, direction=-1, verbose=VERBOSE
    )
    model_ids = []
    for count, info in enumerate(stream, start=1):
        model_ids.append(info.modelId)
        if VERBOSE and count % 100 == 0:
            print(f"...{count} (API)")
    return model_ids

def get_models_full_text(query: str) -> List[str]:
    """
    Full-text mode: scrapes /search/full-text?q=...&type=model (what the website shows).

    Starting from the first results page, each page is fetched with a
    30-second timeout through the retrying getter, repo ids are extracted, and
    the "next" link is followed. The loop stops when there is no next link or
    when the next URL has already been visited (guards against a pagination
    cycle).

    Returns:
        The distinct repo ids found across all pages, sorted.
    """
    base = "https://huggingface.co/search/full-text"
    url = f"{base}?q={quote_plus(query)}&type=model"
    unique_ids, seen_pages = set(), set()

    # The session is closed on every exit path, including a request error,
    # so its pooled connections are not left open.
    with requests.Session() as session:
        getter = with_retry_requests(session, verbose=VERBOSE)
        while url and url not in seen_pages:
            seen_pages.add(url)
            r = getter(url, timeout=30)
            repo_ids = extract_repo_ids_from_fulltext_html(r.text)
            # A running set keeps the per-page "total" cheap on long result lists.
            unique_ids.update(repo_ids)
            if VERBOSE:
                print(f"...{len(repo_ids)} on page; total {len(unique_ids)}")
            url = find_next_page_url(url, r.text)

    return sorted(unique_ids)

# Pick the backend: "api" uses the Hub API; every other MODE value uses the full-text scraper.
models = get_models_api(QUERY, HF_TOKEN) if MODE == "api" else get_models_full_text(QUERY)

# Report: a summary line, then one repo id per line.
print(f"\nFound {len(models)} models for '{QUERY}' in mode={MODE}\n")
for mid in models:
    print(mid)

# Persist the ids, newline-separated with no trailing newline, unless saving is disabled.
if SAVE_TO:
    with open(SAVE_TO, "w", encoding="utf-8") as f:
        f.write("\n".join(models))
    print(f"\nSaved to {SAVE_TO}")



Total models: 68

jd-opensource/JSL-joysafety-v1
opensourcerelease/DeepSeek-V3-bf16
opensource/extract_names
OpenSourceMentorShip/gpt4all
haoliu/coh_llama_on_open_source_data
Opensourced/wormgpt-24
mixtralyanis/bart_opensource
mixtralyanis/flant5-opensource
mixtralyanis/flant5-opensource-and-tuned
mrdas/open-source
OpenSourceEnjoyer/Yi-34B-200K-bnb-4bit
OpenSourceEnjoyer/Nous-Hermes-2-Mistral-7B-DPO-SFT-LoRA
OpenSourceEnjoyer/Nous-Hermes-2-Mistral-7B-DPO-SFT-FP16
openSourcerer9000/sbds-model
OpenSourceEnjoyer/Nous-Hermes-2-Mistral-7B-DPO-SFT-GGUF-Q8
George33/Opensourcecognix
PedroCintra67/llm-open-source
Pankaj001/Opensource_attack_examples
OpenSourceEnjoyer/LLaMA-3-8B-Function-Calling-GGUF
OpenSourceEnjoyer/LLaMA-3-8B-Function-Calling-FP16
openclimatefix/open-source-quartz-solar-forecast
OpenSourceEnjoyer/LLaMA-3-8B-Instruct-T-Q4_K_M-GGUF
OpenSourceEnjoyer/LLaMA-3.1-8B-Function-Calling-GGUF
OpenSourceEnjoyer/LLaMA-3.1-8B-Function-Calling-FP16
OpenSourceEnjoyer/Mistral-Nemo-Function-C