In [4]:
import re
from typing import Optional

try:
    import yaml  # pip install pyyaml
except Exception:
    yaml = None

def _normalize_license(val):
    """Return a clean license string from the various shapes metadata can take.

    Supported inputs:
      * ``None`` or an empty container -> ``""``
      * ``str``                        -> the stripped string
      * list / tuple / set / frozenset -> every truthy item is normalized
        recursively, de-duplicated, sorted and joined with ``", "``
      * ``dict``                       -> the first non-empty normalized value
        among the keys ``"id"``, ``"name"``, ``"license"``, ``"value"``;
        if none is usable, the dict serialized as JSON (``str(val)`` when it
        cannot be serialized)
      * anything else                  -> ``str(val).strip()``
    """
    if val is None:
        return ""
    if isinstance(val, str):
        return val.strip()
    if isinstance(val, (list, tuple, set, frozenset)):
        # Recurse so nested shapes such as [{"id": "mit"}] yield "mit"
        # instead of the dict's Python repr.
        cleaned = {_normalize_license(item) for item in val if item}
        cleaned.discard("")
        # Sorting keeps the output deterministic regardless of input order.
        return ", ".join(sorted(cleaned))
    if isinstance(val, dict):
        if not val:
            # An empty mapping carries no license information at all.
            return ""
        for k in ("id", "name", "license", "value"):
            if val.get(k):
                found = _normalize_license(val[k])
                if found:
                    return found
        # No well-known key: fall back to a readable dump of the whole dict.
        import json
        try:
            return json.dumps(val, ensure_ascii=False)
        except (TypeError, ValueError, RecursionError):
            # Non-serializable values, circular references or extreme nesting.
            return str(val)
    return str(val).strip()

def _strip_yaml_quotes(scalar: str) -> str:
    """Drop one pair of matching single or double quotes around *scalar*."""
    if len(scalar) >= 2 and scalar[0] == scalar[-1] and scalar[0] in "\"'":
        return scalar[1:-1]
    return scalar


def _parse_yaml_front_matter(text: str) -> dict:
    """
    Parse YAML front-matter from README markdown text.

    The front matter must sit at the very top of the text (optionally after a
    byte-order mark and/or whitespace), open with a line of three dashes, and
    close with a line of either three dashes or three dots.

    PyYAML is used when the module-level ``yaml`` name is not ``None``. If it
    is unavailable, or fails to parse the block, a minimal fallback parser
    handles top-level ``key: value`` pairs, inline lists (``[a, b]``) and
    ``- item`` lists; nested mappings and block scalars are not interpreted.

    Returns {} if no front matter is found, or if PyYAML parses the block to
    something other than a mapping.
    """
    if not text:
        return {}
    # A UTF-8 BOM survives a plain "utf-8" read as U+FEFF, which the regex
    # does not treat as whitespace; drop it so the opening fence still matches.
    if text.startswith("\ufeff"):
        text = text[1:]
    # Match front matter at the very top of the file
    m = re.match(r"^\s*---\s*\n(.*?)\n(?:---|\.\.\.)\s*(?:\n|$)", text, re.DOTALL)
    if not m:
        return {}
    yaml_block = m.group(1)
    # Prefer PyYAML if present
    if yaml is not None:
        try:
            data = yaml.safe_load(yaml_block) or {}
            if isinstance(data, dict):
                return data
            return {}
        except Exception:
            # Deliberate best effort: on any PyYAML failure, try the minimal
            # parser below instead of giving up on the metadata entirely.
            pass
    # Minimal fallback: parse simple key: value and lists with "- item"
    data = {}
    current_key = None  # key whose "- item" lines are currently being collected
    for line in yaml_block.splitlines():
        if re.match(r"^\s*#", line) or not line.strip():
            continue
        # Only unindented keys are recognized, so nested mappings are skipped.
        kv = re.match(r"^([A-Za-z0-9_\-]+)\s*:\s*(.*)$", line)
        if kv:
            key, value = kv.group(1), kv.group(2).strip()
            if value == "" or value == "|":
                # Value continues on following lines; collect "- item" entries.
                data[key] = []
                current_key = key
            elif value.startswith("[") and value.endswith("]"):
                # simple inline list: [a, b]
                data[key] = [
                    _strip_yaml_quotes(s.strip())
                    for s in value[1:-1].split(",")
                    if s.strip()
                ]
                current_key = None
            else:
                # Unquote so the result matches what PyYAML would return.
                data[key] = _strip_yaml_quotes(value)
                current_key = None
            continue
        # list item
        if current_key and re.match(r"^\s*-\s+", line):
            item = re.sub(r"^\s*-\s+", "", line).strip()
            data.setdefault(current_key, []).append(_strip_yaml_quotes(item))
    return data

def resolve_license(info, fallback_tags=None, readme_text: Optional[str] = None):
    """
    Resolve a repository's license from several sources, first hit wins:
      1) card metadata: info.cardData, or info.card_data when cardData is
         absent/None (keys: license, licenses, license_name, license_id);
         a non-dict object exposing to_dict() is converted first
      2) info.config["license"]
      3) tags entries like 'license:apache-2.0' (fallback_tags if truthy,
         otherwise info.tags)
      4) README YAML front-matter keys: license, licenses, license_name, license_id
      5) presence of a top-level LICENSE / LICENCE file, optionally with a
         .txt/.md/.rst extension (returns 'license-file' if found)

    Args:
        info: object carrying the repository metadata; every attribute is
            read with getattr, so missing attributes are tolerated.
        fallback_tags: optional iterable of tag strings used instead of
            info.tags.
        readme_text: optional README content to scan for front matter.

    Returns:
        A (license, license_link) tuple of strings. license_link is taken
        from the 'license_link' key of the card metadata or front matter when
        the license came from there, otherwise "". ("", "") means unknown.
    """
    license_keys = ("license", "licenses", "license_name", "license_id")

    # 1) card metadata, under either attribute spelling
    cd = getattr(info, "cardData", None)
    if cd is None:
        cd = getattr(info, "card_data", None)
    if not isinstance(cd, dict):
        # Card-data objects that are not plain dicts may offer to_dict().
        to_dict = getattr(cd, "to_dict", None)
        if callable(to_dict):
            cd = to_dict()
    if isinstance(cd, dict):
        for key in license_keys:
            if cd.get(key):
                lic = _normalize_license(cd[key])
                if lic:
                    # Normalize the link too so callers always get a str,
                    # consistent with the README branch below.
                    return lic, _normalize_license(cd.get("license_link", ""))

    # 2) config
    cfg = getattr(info, "config", None)
    if isinstance(cfg, dict) and cfg.get("license"):
        lic = _normalize_license(cfg["license"])
        if lic:
            return lic, ""

    # 3) tags (handles the common 'license:apache-2.0' pattern)
    tags = fallback_tags or getattr(info, "tags", None)
    if tags:
        for t in tags:
            if isinstance(t, str) and t.lower().startswith("license:"):
                lic = t.split(":", 1)[1].strip()
                if lic:
                    return lic, ""

    # 4) README YAML front-matter
    if readme_text:
        meta = _parse_yaml_front_matter(readme_text)
        if meta:
            for key in license_keys:
                if meta.get(key):
                    lic = _normalize_license(meta[key])
                    link = _normalize_license(meta.get("license_link", ""))
                    if lic:
                        return lic, link

    # 5) LICENSE file present at the repository root?
    try:
        for s in getattr(info, "siblings", None) or []:
            name = getattr(s, "rfilename", None)
            if not isinstance(name, str):
                continue
            # Paths containing "/" never match, so only root files count.
            stem, _, ext = name.upper().partition(".")
            if stem in ("LICENSE", "LICENCE") and ext in ("", "TXT", "MD", "RST"):
                return "license-file", ""
    except TypeError:
        # siblings was not iterable; treat as "no file listing available".
        pass

    return "", ""  # unknown

In [None]:
from huggingface_hub import model_info, hf_hub_download, HfApi
from huggingface_hub.utils import HfHubHTTPError

# Notebook cell: fetch the repo metadata and the README exactly as it exists
# at the current head commit, so it can be fed to resolve_license().
repo_id = "ChenWu98/openthoughts3_math_teachers_source_split_17000_20000_0_qwen2_5_7b_instruct"

# If the repo is private/gated, set your token here or in HF_TOKEN env var
api = HfApi()  # or HfApi(token="hf_xxx")

# Call through `api` so a token configured above is actually applied to the
# requests (the module-level helpers would ignore it).
info = api.model_info(repo_id)  # HEAD of default branch (usually 'main')
print("Default branch head SHA:", info.sha)
# NOTE(review): newer huggingface_hub releases name this attribute
# `last_modified`, older ones `lastModified`; read whichever exists.
last_modified = getattr(info, "last_modified", None)
if last_modified is None:
    last_modified = getattr(info, "lastModified", None)
print("Last modified:", last_modified)

# Define readme_text up front so later cells do not hit a NameError when the
# download below fails.
readme_text = ""
try:
    readme_path = api.hf_hub_download(
        repo_id=repo_id,
        filename="README.md",
        repo_type="model",
        revision=info.sha,           # pin to exactly what the page shows
        force_download=True,         # bypass local cache
        local_files_only=False,
    )
    with open(readme_path, "r", encoding="utf-8", errors="replace") as f:
        readme_text = f.read()
    print(readme_text[:500])  # sanity check
except HfHubHTTPError as e:
    print("Failed to download README:", e)

---
base_model: Qwen/Qwen2.5-7B-Instruct
library_name: transformers
model_name: openthoughts3_math_teachers_source_split_17000_20000_0_qwen2_5_7b_instruct
tags:
- generated_from_trainer
- trl
- sft
licence: license
---

# Model Card for openthoughts3_math_teachers_source_split_17000_20000_0_qwen2_5_7b_instruct

This model is a fine-tuned version of [Qwen/Qwen2.5-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct).
It has been trained using [TRL](https://github.com/huggingface/trl).

## Quick start

```python
from transformers import pipeline

question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
generator = pipeline("text-generation", model="None", device="cuda")
output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
print(output["generated_text"])
```

## Training procedure

[<img src="https://raw.githubusercontent.com/wandb/assets/main/wan