In [12]:
# If needed:
# !pip install -q huggingface_hub pandas

import io
import json
import logging
import os
import re
import time
from typing import Any, Dict, Iterable, List, Optional

import pandas as pd

from huggingface_hub import HfApi, model_info, hf_hub_download
from huggingface_hub.utils import HfHubHTTPError, RepositoryNotFoundError, RevisionNotFoundError

# ==== CONFIG ====
IN_JSON = "model_data/hf_models_open_raw.json"              # your input file with many JSON objects
OUT_CSV = "licenses_other.csv"       # output CSV: model_id, license_md_text
SLEEP_S = 0.2                        # polite pacing for API calls

# SECURITY: never hardcode access tokens in a notebook. The token is read from
# the HF_TOKEN environment variable; when it is unset, None is passed on to the
# client. Any token that was previously committed in this cell must be treated
# as leaked and revoked in the Hugging Face account settings.
HF_TOKEN = os.environ.get("HF_TOKEN")

OUT_JSON = "other_licensed_models.json"  # new JSON output
APPEND_JSON = False                      # set to True to append into an existing JSON array


api = HfApi(token=HF_TOKEN)


In [13]:
def load_json_objects(path: str) -> List[Dict[str, Any]]:
    """
    Read a file of JSON objects and return them as a list of dicts.

    Two layouts are accepted:
      - a single JSON document: an array (non-dict elements are dropped)
        or one object (returned as a one-element list);
      - newline-delimited JSON (JSONL), one object per line.
    """
    with open(path, "r", encoding="utf-8") as handle:
        text = handle.read().strip()

    # First attempt: the whole file is one JSON document.
    try:
        parsed = json.loads(text)
    except json.JSONDecodeError:
        parsed = None
    if isinstance(parsed, list):
        return [item for item in parsed if isinstance(item, dict)]
    if isinstance(parsed, dict):
        return [parsed]

    # Otherwise treat the content as JSONL. Blank lines, unparsable lines and
    # lines holding anything other than an object are skipped without notice.
    records: List[Dict[str, Any]] = []
    for raw_line in text.split("\n"):
        candidate = raw_line.strip()
        if candidate:
            try:
                decoded = json.loads(candidate)
            except json.JSONDecodeError:
                continue
            if isinstance(decoded, dict):
                records.append(decoded)
    return records

# Load every record from the input file; the bare `len(objs)` expression
# displays how many objects were read.
objs = load_json_objects(IN_JSON)
len(objs)


20076

In [14]:
def is_license_other(o: Dict[str, Any]) -> bool:
    """
    Return True when a model record declares its license as "other".

    The record is checked in three places, any one of which is sufficient:
      1. the top-level ``license`` field (a string, or a list of strings);
      2. ``card_data`` / ``cardData``, either as a dict with a ``license``
         key or as plain text containing a ``license: other`` line;
      3. the ``tags`` list, which may contain ``"license:other"``.

    Comparisons are case-insensitive and ignore surrounding whitespace.
    """

    def _is_other(value: Any) -> bool:
        # A license value may be a list of identifiers; any "other" counts.
        if isinstance(value, (list, tuple, set, frozenset)):
            return any(_is_other(v) for v in value)
        return str(value).strip().lower() == "other"

    # 1) direct license field
    if _is_other(o.get("license", "")):
        return True

    # 2) card metadata
    for key in ("card_data", "cardData"):
        card = o.get(key, "")
        # Dict-shaped card data: str() of a dict yields "'license': 'other'",
        # which the text regex below cannot match, so inspect the key directly.
        if isinstance(card, dict) and _is_other(card.get("license", "")):
            return True
        # Text-shaped card data such as "license: other\nlicense_name: ...".
        if re.search(r"\blicense\s*:\s*other\b", str(card).lower()):
            return True

    # 3) tags may include "license:other"
    tags = o.get("tags", []) or []
    if isinstance(tags, str):
        tags = [tags]  # a lone tag string, not a sequence of characters
    elif not isinstance(tags, (list, tuple, set, frozenset)):
        tags = []      # unexpected type: nothing to scan
    return any(isinstance(t, str) and t.strip().lower() == "license:other" for t in tags)

def resolve_model_id(o: Dict[str, Any]) -> Optional[str]:
    """
    Return the record's repository id with surrounding whitespace removed.

    The keys "model_id", "modelId", "id" and "repo_id" are consulted in that
    order; the first one holding a non-blank string wins. Returns None when
    none of them does.
    """
    candidates = (o.get(key) for key in ("model_id", "modelId", "id", "repo_id"))
    usable = (value.strip() for value in candidates if isinstance(value, str) and value.strip())
    return next(usable, None)

# Collect the records flagged as "license: other" that also carry a usable
# repository id; the original record is kept alongside the id under "raw".
targets = []
for o in objs:
    if not is_license_other(o):
        continue
    mid = resolve_model_id(o)
    if not mid:
        continue
    targets.append({"model_id": mid, "raw": o})

# Display the number of matches and a small preview.
len(targets), targets[:3]


(817,
 [{'model_id': 'darumatoakabeko/yuzu_v11_lcm_openvino',
   'raw': {'id': 'darumatoakabeko/yuzu_v11_lcm_openvino',
    'author': 'darumatoakabeko',
    'sha': 'd22df33650479942c36d78fc91ed0aca938fff5b',
    'last_modified': '2025-10-05 14:48:17+00:00',
    'created_at': '2025-10-05 14:46:28+00:00',
    'private': False,
    'gated': False,
    'disabled': False,
    'downloads': 0,
    'downloads_all_time': None,
    'likes': 0,
    'library_name': None,
    'gguf': None,
    'inference': None,
    'inference_provider_mapping': None,
    'tags': ['license:other', 'region:us'],
    'pipeline_tag': None,
    'mask_token': None,
    'trending_score': None,
    'card_data': 'license: other\nlicense_name: creativeml-openrail-m-addendum\nlicense_link: https://civitai.com/models/license/114059',
    'widget_data': None,
    'model_index': None,
    'config': None,
    'transformers_info': None,
    'siblings': ["RepoSibling(rfilename='.gitattributes', size=None, blob_id=None, lfs=None)",

In [17]:
def _extract_sibling_names(siblings: Iterable[Any]) -> List[str]:
    """
    Siblings can be RepoSibling objects, dicts, or even string reprs.
    Return a list of filenames.
    """
    names: List[str] = []
    for s in siblings:
        # RepoSibling object
        if hasattr(s, "rfilename"):
            try:
                n = s.rfilename
                if isinstance(n, str):
                    names.append(n)
                    continue
            except Exception:
                pass

        # Dict-like
        if isinstance(s, dict) and "rfilename" in s and isinstance(s["rfilename"], str):
            names.append(s["rfilename"])
            continue

        # String repr like: "RepoSibling(rfilename='LICENSE.md', size=None, ...)"
        if isinstance(s, str):
            m = re.search(r"rfilename='([^']+)'", s)
            if m:
                names.append(m.group(1))
                continue
    return names

def has_license_md(siblings: Iterable[Any]) -> Optional[str]:
    """
    Find a sibling whose filename is 'license.md', ignoring case.

    Returns the filename as it is spelled in the repo (e.g. 'LICENSE.md'),
    or None when no sibling matches.
    """
    matches = (
        fname
        for fname in _extract_sibling_names(siblings)
        if fname.lower() == "license.md"
    )
    return next(matches, None)

# def fetch_license_md_text(repo_id: str, token: Optional[str]) -> Optional[str]:
#     """Return text of license.md if present; None otherwise."""
#     try:
#         info = model_info(repo_id=repo_id, token=token)
#     except (RepositoryNotFoundError, RevisionNotFoundError, HfHubHTTPError, Exception):
#         return None

#     siblings = getattr(info, "siblings", None)
#     if not siblings:
#         return None

#     fname = has_license_md(siblings)
#     if not fname:
#         return None

#     try:
#         local_path = hf_hub_download(
#             repo_id=repo_id,
#             filename=fname,
#             repo_type="model",
#             token=token,
#             local_dir="hf_license_cache",
#             local_dir_use_symlinks=False
#         )
#         with open(local_path, "r", encoding="utf-8", errors="replace") as f:
#             return f.read()
#     except Exception:
#         return None

def fetch_license_md_text(repo_id: str, token: Optional[str] = None) -> Optional[str]:
    """
    Return the text of the repo's 'license.md' (any casing), or None.

    The file is downloaded into a per-repo folder under 'hf_license_cache'
    (e.g. hf_license_cache/<user>/<model>/LICENSE.md) so that files from
    different repos do not overwrite each other.

    Args:
        repo_id: Hub model id, e.g. "user/model".
        token: Hugging Face access token, or None.

    Returns:
        The file content decoded as UTF-8 (undecodable bytes replaced), or
        None when the repo/revision does not exist, the repo lists no
        'license.md', or any other error occurs. This function never raises:
        unexpected failures are logged as warnings so that auth, rate-limit
        and network problems are not mistaken for "no license file".
    """
    log = logging.getLogger(__name__)

    try:
        info = model_info(repo_id=repo_id, token=token)
    except (RepositoryNotFoundError, RevisionNotFoundError):
        # Missing repo or revision: an expected outcome, not worth reporting.
        return None
    except Exception as exc:
        log.warning("model_info failed for %s: %s", repo_id, exc)
        return None

    siblings = getattr(info, "siblings", None)
    if not siblings:
        return None

    fname = has_license_md(siblings)
    if not fname:
        return None

    try:
        # `local_dir_use_symlinks` is intentionally not passed: it is
        # deprecated in current huggingface_hub releases and only triggers a
        # warning when `local_dir` is set.
        local_path = hf_hub_download(
            repo_id=repo_id,
            filename=fname,
            repo_type="model",
            token=token,
            local_dir=os.path.join("hf_license_cache", *repo_id.split("/")),
        )
        with open(local_path, "r", encoding="utf-8", errors="replace") as f:
            return f.read()
    except Exception as exc:
        log.warning("Could not download %s from %s: %s", fname, repo_id, exc)
        return None


In [18]:
import json, time

# Fetch the license.md text for every target; repos without one are skipped.
results = []
for t in targets:  # 'targets' comes from your earlier filtering step (license == "other")
    mid = t["model_id"]
    text = fetch_license_md_text(mid, HF_TOKEN)
    if text is not None:
        results.append({"model_id": mid, "license_md_text": text})
    time.sleep(SLEEP_S)  # polite pacing between API calls

# Write JSON (overwrite by default; set APPEND_JSON=True to append into existing array)
payload = results
if APPEND_JSON and os.path.exists(OUT_JSON):
    try:
        with open(OUT_JSON, "r", encoding="utf-8") as f:
            existing = json.load(f)
    except (OSError, ValueError) as exc:
        # The file is about to be overwritten, so say that its content is lost.
        print(f"Warning: could not read existing {OUT_JSON} ({exc}); its content will be replaced")
        existing = []
    if not isinstance(existing, list):
        print(f"Warning: {OUT_JSON} does not hold a JSON array; its content will be replaced")
        existing = []

    # Drop previously stored entries for models fetched in this run, so that
    # re-running the cell in append mode does not pile up duplicates.
    fresh_ids = {r["model_id"] for r in results}
    kept = [
        e for e in existing
        if not (
            isinstance(e, dict)
            and isinstance(e.get("model_id"), str)
            and e["model_id"] in fresh_ids
        )
    ]
    payload = kept + results

with open(OUT_JSON, "w", encoding="utf-8") as f:
    json.dump(payload, f, ensure_ascii=False, indent=2)

print(f"Wrote {len(results)} items to {OUT_JSON}")


For more details, check out https://huggingface.co/docs/huggingface_hub/main/en/guides/download#download-files-to-local-folder.


LICENSE.md: 0.00B [00:00, ?B/s]

LICENSE.md: 0.00B [00:00, ?B/s]

LICENSE.md: 0.00B [00:00, ?B/s]

LICENSE.md:   0%|          | 0.00/11.9k [00:00<?, ?B/s]

LICENSE.md:   0%|          | 0.00/1.32k [00:00<?, ?B/s]

LICENSE.md: 0.00B [00:00, ?B/s]

LICENSE.md:   0%|          | 0.00/11.9k [00:00<?, ?B/s]

LICENSE.md: 0.00B [00:00, ?B/s]

Wrote 10 items to other_licensed_models.json


In [None]:
# Run summary. The notebook defines no `out_df` and writes no CSV; the fetched
# licenses live in `results` and are written to OUT_JSON, so report those.
print(f"Total JSON objects: {len(objs)}")
print(f"With license == 'other': {len(targets)}")
print(f"Found license.md for: {len(results)}")
print(f"Wrote: {OUT_JSON}")

In [None]:
import pandas as pd

# Load the CSV export of the model table.
df = pd.read_csv('model_data/hf_models_open_raw.csv')

# Keep only the rows whose license column is exactly 'other' and display them.
other_license_models = df.loc[df['license'] == 'other']
other_license_models