In [1]:
import re, json
from urllib.parse import urljoin, urlparse, urldefrag
from bs4 import BeautifulSoup
from playwright.async_api import async_playwright

# Root URL of the site being scraped.
BASE = "https://exrx.net"

# Matches any run of whitespace characters.
SPACE = re.compile(r"\s+")


def norm(t: str) -> str:
    """Trim ``t`` and collapse each internal whitespace run to a single space.

    Falsy input (``None`` or ``""``) yields the empty string.
    """
    stripped = (t if t else "").strip()
    return SPACE.sub(" ", stripped)

def last_path_key(u: str) -> str:
    """Return the last path segment of URL ``u``, ignoring trailing slashes.

    Query string and fragment are not part of the path and are therefore
    excluded. Example: ``.../Lists/ExList/ShouldWt`` -> ``ShouldWt``.
    """
    trimmed_path = urlparse(u).path.rstrip("/")
    # rpartition yields the text after the final "/" (or the whole string).
    return trimmed_path.rpartition("/")[2]

# Stable mapping: last path segment of a list URL -> human-readable macro group name
PATH2MACRO = {
    "NeckWt": "Neck",
    "ShouldWt": "Shoulders",
    "ArmWt": "Upper Arms",
    "ForeArmWt": "Forearms",
    "BackWt": "Back",
    "ChestWt": "Chest",
    "WaistWt": "Waist",
    "HipsWt": "Hips",
    "ThighWt": "Thighs",
    "CalfWt": "Calves",
    # If "Other Exercises" pages are used in the future, add their keys here
}
# Set of canonical macro names, used for fast membership checks.
MACRO_NAMES = set(PATH2MACRO.values())

In [2]:
async def fetch_html(url: str) -> str:
    """Load ``url`` in headless Chromium and return the rendered page HTML.

    A desktop Chrome user agent is sent, navigation waits for
    ``domcontentloaded``, and a short fixed pause follows before the DOM is
    serialized.

    The browser context and the browser are closed in ``finally`` blocks so
    they are released even when navigation or serialization raises; the
    exception still propagates to the caller.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            ctx = await browser.new_context(
                user_agent=("Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                            "AppleWebKit/537.36 (KHTML, like Gecko) "
                            "Chrome/124.0.0.0 Safari/537.36")
            )
            try:
                page = await ctx.new_page()
                await page.goto(url, wait_until="domcontentloaded")
                await page.wait_for_timeout(1200)  # short pause for WAF/JS
                return await page.content()
            finally:
                await ctx.close()
        finally:
            await browser.close()

In [3]:
async def parse_macro_categories(dir_url=f"{BASE}/Lists/Directory"):
    """Discover the macro muscle-group list pages linked from the directory.

    Every ``<a href>`` whose absolute URL contains ``/Lists/ExList/`` is
    considered. Links are grouped by the last path segment (the "key").
    The display name is the canonical name from ``PATH2MACRO`` when the key
    is known, otherwise the normalized link text.

    Returns a list of dicts with ``macro_key``, ``macro_name`` and
    ``list_url`` (fragment removed), sorted case-insensitively by name.
    """
    page_html = await fetch_html(dir_url)
    soup = BeautifulSoup(page_html, "lxml")

    by_key = {}
    for link in soup.select("a[href]"):
        href = link.get("href")
        if not href:
            continue
        absolute = urljoin(dir_url, href)
        if "/Lists/ExList/" not in absolute:
            continue

        key = last_path_key(absolute)           # e.g. ShouldWt
        clean_url = urldefrag(absolute)[0]      # without #fragment
        # Canonical name when the key is mapped, otherwise the visible link label.
        label = PATH2MACRO.get(key) or norm(link.get_text())

        entry = by_key.get(key)
        if entry is None:
            by_key[key] = {"macro_key": key, "macro_name": label, "list_url": clean_url}
            continue

        # Upgrade a non-canonical name if a later link carries an official one.
        if entry["macro_name"] not in MACRO_NAMES and label in MACRO_NAMES:
            entry["macro_name"] = label
        # The most recently seen fragment-free URL wins.
        entry["list_url"] = clean_url

    # Alphabetical by macro name.
    return sorted(by_key.values(), key=lambda d: d["macro_name"].lower())

In [4]:
# Fetch the directory page and print each macro category with its list URL.
macros = await parse_macro_categories()
print("Macrocategorie trovate:", len(macros))
for m in macros:
    print(f"- {m['macro_name']:12s} → {m['list_url']}")

Macrocategorie trovate: 10
- Back         → https://exrx.net/Lists/ExList/BackWt
- Calves       → https://exrx.net/Lists/ExList/CalfWt
- Chest        → https://exrx.net/Lists/ExList/ChestWt
- Forearms     → https://exrx.net/Lists/ExList/ForeArmWt
- Hips         → https://exrx.net/Lists/ExList/HipsWt
- Neck         → https://exrx.net/Lists/ExList/NeckWt
- Shoulders    → https://exrx.net/Lists/ExList/ShouldWt
- Thighs       → https://exrx.net/Lists/ExList/ThighWt
- Upper Arms   → https://exrx.net/Lists/ExList/ArmWt
- Waist        → https://exrx.net/Lists/ExList/WaistWt


In [5]:
import re, json
from urllib.parse import urljoin, urldefrag
from bs4 import BeautifulSoup

# Matches any run of whitespace characters.
SPACE = re.compile(r"\s+")
def norm(t: str) -> str:
    """Trim ``t`` and collapse whitespace runs to one space (falsy -> "")."""
    return SPACE.sub(" ", (t or "").strip())

# Substring found in a heading label -> canonical implement name.
IMPLE_MAP = {
    "barbell": "barbell",
    "body weight": "bodyweight",
    "bodyweight": "bodyweight",
    "dumbbell": "dumbbell",
    "kettlebell": "kettlebell",
    "cable": "cable",
    "lever": "lever",
    "machine": "machine",
    "smith": "smith machine",
    "smith machine": "smith machine",
    "suspended": "suspended",
    "self-assisted": "self-assisted",
    "stretch": "stretch",
    "dynamic stretch": "dynamic stretch",
    "plyometrics": "plyometrics",
    "band": "band",
    "medicine ball": "medicine ball",
    "sandbag": "sandbag",
    "sled": "sled",
    "trap bar": "trap bar",
    "trx": "suspended",
}
def norm_implement(label: str) -> str:
    """Map a free-form implement heading to its canonical implement name.

    The label is whitespace-normalized and lower-cased. Anything starting
    with ``lever`` (e.g. "Lever (plate loaded)") maps to ``"lever"``.
    Otherwise the first ``IMPLE_MAP`` key contained in the label decides.

    Keys are tried longest first, so a specific label beats a shorter key
    it happens to contain: "smith machine" no longer resolves to
    ``"machine"`` and "dynamic stretch" no longer resolves to ``"stretch"``.

    Returns the normalized label itself when nothing matches, or
    ``"unknown"`` for an empty label.
    """
    l = norm(label).lower()
    if l.startswith("lever"):  # lever (selectorized / plate loaded)
        return "lever"
    # sorted() is stable, so equal-length keys keep their IMPLE_MAP order.
    for k in sorted(IMPLE_MAP, key=len, reverse=True):
        if k in l:
            return IMPLE_MAP[k]
    return l or "unknown"

async def parse_macro_page(list_url: str, macro_name: str):
    """
    Scrape one macro-group list page into flat exercise records.

    The page is fetched with ``fetch_html`` and scanned in document order.
    Headings containing a known muscle keyword set the current micro group,
    implement headings (or ``<li>`` containers whose own text is an implement
    label) set the current implement, and anchors inside the exercise lists
    become records. Nested ``<ul>`` levels are treated as variants and named
    ``"<parent> — <variant>"``.

    Args:
        list_url: URL of the list page; also used to resolve relative links.
        macro_name: macro group name stored on every record.

    Returns:
        A list of dicts with keys ``macro_group``, ``micro_group``,
        ``implement``, ``exercise_name``, ``exercise_url`` and ``list_url``,
        de-duplicated on (url, micro group, implement, name) in first-seen
        order.
    """
    html = await fetch_html(list_url)
    soup = BeautifulSoup(html, "lxml")

    # Local whitespace normalizer, kept here so the function is self-contained.
    SPACE = re.compile(r"\s+")
    def norm(t: str) -> str:
        return SPACE.sub(" ", (t or "").strip())

    IMPLEMENT_HEADINGS = {
        "barbell","body weight","bodyweight","cable","dumbbell","kettlebell",
        "lever","machine","smith","smith machine","suspended","self-assisted",
        "stretch","dynamic stretch","plyometrics","band","medicine ball","sandbag",
        "sled","trap bar","trx"
    }
    # Fixed longest-first order: iterating the set directly made overlapping
    # labels ("dynamic stretch" vs "stretch") resolve differently between runs.
    headings_longest_first = sorted(IMPLEMENT_HEADINGS, key=lambda k: (-len(k), k))

    def is_implement_heading(text: str) -> bool:
        """True when the text contains a known implement label."""
        t = norm(text).lower()
        if t.startswith("lever"):  # lever (selectorized/plate loaded)
            return True
        return any(k in t for k in IMPLEMENT_HEADINGS)

    def norm_impl(label: str) -> str:
        """Canonical implement name for a heading label."""
        l = norm(label).lower()
        if l.startswith("lever"):
            return "lever"
        if "smith" in l:
            return "smith machine"
        if "body weight" in l or l == "bodyweight":
            return "bodyweight"
        for k in headings_longest_first:
            if k in l:
                return k
        return l or "unknown"

    def anchor(li):
        """Return (text, absolute fragment-free URL) of the first link in ``li``."""
        a = li.find("a", href=True)
        if not a:
            return None, None
        return norm(a.get_text()), urldefrag(urljoin(list_url, a["href"]))[0]

    def direct_text(el) -> str:
        """Text directly under ``el``, excluding text of nested tags."""
        if el is None:
            return ""
        txts = [t for t in el.find_all(string=True, recursive=False)]
        return norm(" ".join(txts))

    records = []
    current_micro = None
    current_impl  = None
    # id() of every <ul> already expanded, so no list is emitted twice.
    processed_uls = set()

    def emit(name, url, micro, impl):
        """Append one record; entries without a name or URL are ignored."""
        if not name or not url:
            return
        records.append({
            "macro_group": macro_name,
            "micro_group": micro or "",
            "implement": impl or "",
            "exercise_name": name,
            "exercise_url": url,
            "list_url": list_url,
        })

    def walk_variants(parent_li, base_name, base_url, micro, impl):
        """
        Walk the variants nested under parent_li:
        - each child li with an anchor is emitted as "base — child"
        - a child that has its own sub-ul is descended into recursively
        """
        ul = parent_li.find("ul")
        if not ul:
            return
        if id(ul) in processed_uls:
            return
        processed_uls.add(id(ul))

        for child in ul.find_all("li", recursive=False):
            vname, vurl = anchor(child)
            # Name handed to deeper levels; computed on every iteration so a
            # value from a previous sibling can never leak into this one.
            child_base = f"{base_name} — {vname}" if vname else base_name
            if vname and vurl:
                emit(child_base, vurl, micro, impl)
            # recurse into deeper levels
            if child.find("ul"):
                walk_variants(child, child_base, vurl or base_url, micro, impl)

    # sequential scan in document order
    for node in soup.find_all(["h1","h2","h3","h4","strong","ul","li"]):
        tag  = node.name.lower()
        text = norm(node.get_text())

        # micro group (muscle headings)
        if tag in ("h1","h2","h3") and text:
            if any(w in text.lower() for w in (
                "deltoid","quadriceps","hamstrings","gluteus","pectoralis","abdominis",
                "obliques","trapezius","rhomboids","erector","calves","soleus",
                "gastrocnemius","forearm","biceps","triceps","back","hip","waist","neck"
            )):
                current_micro = text
                current_impl  = None
                continue

        # implement given as a heading
        if tag in ("h3","h4","strong") and text and is_implement_heading(text):
            current_impl = norm_impl(text)
            continue

        # implement given as a container LI: <li>Barbell<ul>...</ul></li>
        if tag == "li" and node.find("ul"):
            impl_label = direct_text(node)
            if is_implement_heading(impl_label):
                current_impl = norm_impl(impl_label)
                # each *direct child* li of this ul is an exercise or a sub-category
                ul = node.find("ul")
                if id(ul) not in processed_uls:
                    processed_uls.add(id(ul))
                    for li in ul.find_all("li", recursive=False):
                        name, url = anchor(li)
                        if name and url:
                            emit(name, url, current_micro, current_impl)
                            # handle variants under this li (if any)
                            if li.find("ul"):
                                walk_variants(li, name, url, current_micro, current_impl)
                continue

        # "free" UL that follows an implement heading
        if tag == "ul" and current_micro:
            if node.find_parent("li") is not None:
                continue
            prev_hdr = node.find_previous(lambda t: t.name in ("h3","h4","strong","h2","h1"))
            if not (prev_hdr and is_implement_heading(prev_hdr.get_text())):
                continue
            current_impl = norm_impl(prev_hdr.get_text())
            if id(node) in processed_uls:
                continue
            processed_uls.add(id(node))
            for li in node.find_all("li", recursive=False):
                name, url = anchor(li)
                if name and url:
                    emit(name, url, current_micro, current_impl)
                    if li.find("ul"):
                        walk_variants(li, name, url, current_micro, current_impl)

    # final de-duplication on (url, micro, implement, name), keeping first-seen order
    seen, out = set(), []
    for r in records:
        key = (r["exercise_url"], r["micro_group"], r["implement"], r["exercise_name"])
        if key in seen:
            continue
        seen.add(key)
        out.append(r)
    return out

In [6]:
# Example: parse the Shoulders list page and print the first 15 records.
macros = await parse_macro_categories()
shoulders = next(m for m in macros if m["macro_name"]=="Shoulders")
recs = await parse_macro_page(shoulders["list_url"], macro_name=shoulders["macro_name"])
print("Esercizi estratti (Shoulders):", len(recs))
for r in recs[:15]:
    print(f"- [{r['micro_group']}] {r['implement'] or 'unknown'} | {r['exercise_name']} → {r['exercise_url']}")

Esercizi estratti (Shoulders): 70
- [Anterior Deltoid] barbell | Front Raise → https://exrx.net/WeightExercises/DeltoidAnterior/BBFrontRaise
- [Anterior Deltoid] barbell | Military Press → https://exrx.net/WeightExercises/DeltoidAnterior/BBMilitaryPress
- [Anterior Deltoid] barbell | Military Press — Seated → https://exrx.net/WeightExercises/DeltoidAnterior/BBSeatedMilitaryPress
- [Anterior Deltoid] cable | Front Raise → https://exrx.net/WeightExercises/DeltoidAnterior/CBSeatedFrontRaise
- [Anterior Deltoid] cable | Front Raise — Alternating → https://exrx.net/WeightExercises/DeltoidAnterior/CBAlternatingFrontRaise
- [Anterior Deltoid] cable | Front Raise — One Arm → https://exrx.net/WeightExercises/DeltoidAnterior/CBFrontRaise
- [Anterior Deltoid] cable | Shoulder Press → https://exrx.net/WeightExercises/DeltoidAnterior/CBStandingShoulderPress
- [Anterior Deltoid] cable | Shoulder Press — Seated → https://exrx.net/WeightExercises/DeltoidAnterior/CBShoulderPress
- [Anterior Deltoid] du

In [7]:
import asyncio, time

async def crawl_all_macros(delay_sec: float = 0.8, limit: int | None = None):
    """Crawl every macro list page and collect all exercise rows.

    Args:
        delay_sec: minimum time between the start of one page request and
            the start of the next (polite rate limit).
        limit: if not ``None``, only the first ``limit`` macro categories
            are crawled. ``limit=0`` crawls nothing; it used to be treated
            as "no limit".

    Returns:
        ``(macros, all_rows, per_macro_counts)``: the macro descriptors that
        were crawled, the concatenated rows, and a ``{macro_name: row_count}``
        dict.

    A failure on one page is reported and counted as zero rows so the
    remaining pages are still crawled.
    """
    macros = await parse_macro_categories()
    if limit is not None:
        macros = macros[:limit]

    all_rows = []
    per_macro_counts = {}
    total = len(macros)

    for i, m in enumerate(macros, 1):
        # Monotonic clock: unaffected by system clock adjustments.
        t0 = time.monotonic()
        try:
            rows = await parse_macro_page(m["list_url"], macro_name=m["macro_name"])
        except Exception as e:
            print(f"[{i}/{len(macros)}] ERRORE {m['macro_name']}: {e}")
            rows = []
        all_rows.extend(rows)
        per_macro_counts[m["macro_name"]] = len(rows)

        dt = time.monotonic() - t0
        # Gentle rate limit; no need to wait after the final page.
        if i < total and dt < delay_sec:
            await asyncio.sleep(delay_sec - dt)

        print(f"[{i}/{len(macros)}] {m['macro_name']:<12} → {len(rows):3d} righe (tot {len(all_rows)})")

    return macros, all_rows, per_macro_counts

# RUN (you can start with limit=3 for a quick test)
macros, rows, per_macro = await crawl_all_macros(delay_sec=0.8, limit=None)
print("\nTot righe:", len(rows))
print("Per macro:", per_macro)
print("Esempio:", rows[:3])

[1/10] Back         → 109 righe (tot 109)
[2/10] Calves       →  59 righe (tot 168)
[3/10] Chest        →  72 righe (tot 240)
[4/10] Forearms     →  28 righe (tot 268)
[5/10] Hips         → 148 righe (tot 416)
[6/10] Neck         →  17 righe (tot 433)
[7/10] Shoulders    →  70 righe (tot 503)
[8/10] Thighs       → 125 righe (tot 628)
[9/10] Upper Arms   →  60 righe (tot 688)
[10/10] Waist        → 101 righe (tot 789)

Tot righe: 789
Per macro: {'Back': 109, 'Calves': 59, 'Chest': 72, 'Forearms': 28, 'Hips': 148, 'Neck': 17, 'Shoulders': 70, 'Thighs': 125, 'Upper Arms': 60, 'Waist': 101}
Esempio: [{'macro_group': 'Back', 'micro_group': 'General Back', 'implement': 'barbell', 'exercise_name': 'Bent-over Row', 'exercise_url': 'https://exrx.net/WeightExercises/BackGeneral/BBBentOverRow', 'list_url': 'https://exrx.net/Lists/ExList/BackWt'}, {'macro_group': 'Back', 'micro_group': 'General Back', 'implement': 'barbell', 'exercise_name': 'Bent-over Row — Underhand', 'exercise_url': 'https://ex

In [8]:
from collections import defaultdict

def build_catalog(rows: list[dict]) -> dict:
    """
    Collapse flat rows into an index keyed by ``exercise_url``.

    Each entry holds:
      - name: the longest name seen for that URL
      - exercise_url
      - implements, macro_groups, micro_groups, list_urls: sorted lists of
        the distinct non-empty values seen
      - variants_by_impl: implement -> sorted list of names; rows without an
        implement are filed under "unknown"
    """
    catalog = {}
    for row in rows:
        url, name = row["exercise_url"], row["exercise_name"]
        impl = row["implement"] or ""
        micro = row["micro_group"] or ""
        macro = row["macro_group"] or ""
        list_url = row["list_url"]

        entry = catalog.get(url)
        if entry is None:
            entry = {
                "name": name,
                "exercise_url": url,
                "implements": set(),
                "macro_groups": set(),
                "micro_groups": set(),
                "variants_by_impl": defaultdict(set),
                "list_urls": set(),
            }
            catalog[url] = entry
        elif len(name) > len(entry["name"]):
            # keep the most descriptive (longest) name
            entry["name"] = name

        if impl:
            entry["implements"].add(impl)
        entry["variants_by_impl"][impl if impl else "unknown"].add(name)

        for field, value in (("macro_groups", macro),
                             ("micro_groups", micro),
                             ("list_urls", list_url)):
            if value:
                entry[field].add(value)

    # sets -> sorted lists; defaultdict -> plain dict
    for entry in catalog.values():
        for field in ("implements", "macro_groups", "micro_groups", "list_urls"):
            entry[field] = sorted(entry[field])
        entry["variants_by_impl"] = {
            impl: sorted(names) for impl, names in entry["variants_by_impl"].items()
        }
    return catalog

# Build the per-URL catalog from the crawled rows.
catalog = build_catalog(rows)
print("Esercizi unici:", len(catalog))
# print 3 examples
for i, (u, d) in enumerate(catalog.items()):
    if i >= 3: break
    print("\nURL:", u)
    print("  name:         ", d["name"])
    print("  implements:   ", d["implements"])
    print("  macro_groups: ", d["macro_groups"])
    print("  micro_groups: ", d["micro_groups"])
    print("  variants_by_impl sample:", {k: v[:3] for k,v in d["variants_by_impl"].items()})

Esercizi unici: 745

URL: https://exrx.net/WeightExercises/BackGeneral/BBBentOverRow
  name:          Bent-over Row
  implements:    ['barbell']
  macro_groups:  ['Back']
  micro_groups:  ['General Back']
  variants_by_impl sample: {'barbell': ['Bent-over Row']}

URL: https://exrx.net/WeightExercises/BackGeneral/BBUnderhandBentOverRow
  name:          Bent-over Row — Underhand
  implements:    ['barbell']
  macro_groups:  ['Back']
  micro_groups:  ['General Back']
  variants_by_impl sample: {'barbell': ['Bent-over Row — Underhand']}

URL: https://exrx.net/WeightExercises/BackGeneral/CBOneArmBentoverRow
  name:          One Arm Bent-over Row
  implements:    ['cable']
  macro_groups:  ['Back']
  micro_groups:  ['General Back']
  variants_by_impl sample: {'cable': ['One Arm Bent-over Row']}


In [9]:
# NOTE(review): displays the whole catalog dict; the output is very large — consider showing a small sample instead.
catalog

{'https://exrx.net/WeightExercises/BackGeneral/BBBentOverRow': {'name': 'Bent-over Row',
  'exercise_url': 'https://exrx.net/WeightExercises/BackGeneral/BBBentOverRow',
  'implements': ['barbell'],
  'macro_groups': ['Back'],
  'micro_groups': ['General Back'],
  'variants_by_impl': {'barbell': ['Bent-over Row']},
  'list_urls': ['https://exrx.net/Lists/ExList/BackWt']},
 'https://exrx.net/WeightExercises/BackGeneral/BBUnderhandBentOverRow': {'name': 'Bent-over Row — Underhand',
  'exercise_url': 'https://exrx.net/WeightExercises/BackGeneral/BBUnderhandBentOverRow',
  'implements': ['barbell'],
  'macro_groups': ['Back'],
  'micro_groups': ['General Back'],
  'variants_by_impl': {'barbell': ['Bent-over Row — Underhand']},
  'list_urls': ['https://exrx.net/Lists/ExList/BackWt']},
 'https://exrx.net/WeightExercises/BackGeneral/CBOneArmBentoverRow': {'name': 'One Arm Bent-over Row',
  'exercise_url': 'https://exrx.net/WeightExercises/BackGeneral/CBOneArmBentoverRow',
  'implements': ['cab

In [10]:
import re, hashlib
from urllib.parse import urlparse

def exrx_canonical_key(ex_url: str):
    """
    Return ``(first_path_segment, last_path_segment)`` for an exercise URL,
    or ``None`` when the URL has no path segments.

    Example: https://exrx.net/WeightExercises/GluteusMaximus/BWSquat
             -> ('WeightExercises', 'BWSquat')
    """
    segments = [seg for seg in urlparse(ex_url).path.split("/") if seg]
    if segments:
        return (segments[0], segments[-1])
    return None

def merge_docs(a: dict, b: dict) -> dict:
    """Merge two catalog entries that describe the same exercise.

    Starts from a shallow copy of ``a`` (so any other keys of ``a`` are
    kept) and then:
      - keeps the longer of the two ``name`` values;
      - replaces ``implements``, ``macro_groups``, ``micro_groups`` and
        ``list_urls`` with the sorted union of both sides;
      - replaces ``variants_by_impl`` with a per-implement sorted union.

    The ``variants_by_impl`` keys are inserted in sorted order so the result
    is identical from run to run; iterating the raw key set made the order
    depend on string hashing. Neither input is mutated.
    """
    out = dict(a)
    # name: keep the most descriptive (longest) one
    if len(b.get("name", "")) > len(out.get("name", "")):
        out["name"] = b["name"]

    # set-like list fields
    for k in ("implements", "macro_groups", "micro_groups", "list_urls"):
        out[k] = sorted(set(out.get(k, [])) | set(b.get(k, [])))

    # variants_by_impl: union per implement, keys in deterministic order
    vbi_a = out.get("variants_by_impl", {})
    vbi_b = b.get("variants_by_impl", {})
    out["variants_by_impl"] = {
        impl: sorted(set(vbi_a.get(impl, [])) | set(vbi_b.get(impl, [])))
        for impl in sorted(set(vbi_a) | set(vbi_b))
    }

    return out

In [11]:
def dedupe_by_slug(catalog: dict) -> dict:
    """
    Collapse a URL-indexed catalog by canonical key.

    The key is whatever ``exrx_canonical_key`` returns for the URL; when it
    returns ``None`` the fallback is ``("other", <text after the last "/">)``.
    Entries that share a key are combined with ``merge_docs``.

    Returns a new dict indexed by canonical key. Each value keeps the first
    URL seen as ``exercise_url`` and lists every merged URL, sorted, under
    ``_all_urls``.
    """
    buckets = {}

    for url, doc in catalog.items():
        key = exrx_canonical_key(url)
        if key is None:
            # URLs without a usable path are kept under a separate namespace
            key = ("other", url.rsplit("/", 1)[-1])

        if key not in buckets:
            buckets[key] = {
                **doc,
                "exercise_url": url,      # first URL seen is the "preferred" one
                "_all_urls": [url],       # trace of every URL merged into this key
            }
            continue

        merged = merge_docs(buckets[key], doc)
        merged["_all_urls"] = sorted(set(merged.get("_all_urls", []) + [url]))
        buckets[key] = merged

    return buckets

# Collapse the catalog by canonical key and report the size before/after.
dedup = dedupe_by_slug(catalog)
print("Prima:", len(catalog), "→ Dopo (per slug):", len(dedup))
# Example: look up BWSquat
for k, d in dedup.items():
    if k[1].lower() == "bwsquat":
        print("\nChiave canonica:", k)
        print("URL preferito:  ", d["exercise_url"])
        print("Tutti gli URL:  ", d["_all_urls"])
        print("Macro groups:   ", d["macro_groups"])
        print("Micro groups:   ", d["micro_groups"])
        print("Implements:     ", d["implements"])
        print("Name:           ", d["name"])
        break

Prima: 745 → Dopo (per slug): 653

Chiave canonica: ('WeightExercises', 'BWSquat')
URL preferito:   https://exrx.net/WeightExercises/GluteusMaximus/BWSquat
Tutti gli URL:   ['https://exrx.net/WeightExercises/GluteusMaximus/BWSquat', 'https://exrx.net/WeightExercises/Quadriceps/BWSquat']
Macro groups:    ['Hips', 'Thighs']
Micro groups:    ['Gluteus Maximus', 'Quadriceps']
Implements:      ['bodyweight']
Name:            Squat


In [24]:
from playwright.async_api import async_playwright

async def fetch_page_with_token(url: str):
    """
    Load ``url`` in headless Chromium and return ``(html, token)``.

    ``token`` is the value used to build
    ``exrx.glorb.com/api/video/<token>/<fID>`` clip URLs. It is requested
    from ``/api/video/token`` inside the page, so the request carries the
    page's own cookies and credentials. If that request fails for any
    reason, ``token`` is ``None`` and the HTML is still returned.

    The browser context and the browser are closed in ``finally`` blocks so
    they are released even when navigation raises.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            ctx = await browser.new_context(user_agent=(
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/124.0.0.0 Safari/537.36"
            ))
            try:
                page = await ctx.new_page()
                await page.goto(url, wait_until="domcontentloaded")
                await page.wait_for_timeout(1000)

                # full page HTML
                html = await page.content()

                # Ask the first-party backend for the token (best effort).
                token = None
                try:
                    # page.evaluate runs in the page, keeping its credentials/cookies
                    token = await page.evaluate("""(async () => {
                try {
                    const r = await fetch('/api/video/token', {cache:'no-store'});
                    const j = await r.json();
                    return j && j.token ? j.token : null;
                } catch(e){ return null; }
            })()""")
                except Exception:
                    token = None

                return html, token
            finally:
                await ctx.close()
        finally:
            await browser.close()

In [32]:
import re, json
from urllib.parse import urljoin
from bs4 import BeautifulSoup

# Matches any run of whitespace characters.
SPACE = re.compile(r"\s+")


def norm(x: str) -> str:
    """Trim ``x`` and collapse each whitespace run to one space (falsy -> "")."""
    trimmed = (x if x else "").strip()
    return SPACE.sub(" ", trimmed)

def text_after_h2(soup: BeautifulSoup, label: str) -> list:
    """Collect the content tags that follow a section heading.

    Finds the first ``<h2>`` whose text contains ``label``
    (case-insensitive) and returns, in document order, every following
    ``p``/``ul``/``ol``/``li``/``table``/``div``/``strong`` tag up to the
    next ``<h2>``. Nested tags are included as well as their containers, so
    a ``<li>`` appears both on its own and inside its ``<ul>``.

    Returns an empty list when no matching heading exists.
    """
    h2 = soup.find(lambda t: t.name == "h2" and label.lower() in t.get_text(strip=True).lower())
    if not h2:
        return []
    nodes = []
    # find_all_next() never yields the heading itself. The old `sib == h2`
    # guard compared tags structurally, so a later identical <h2> was
    # skipped instead of ending the section.
    for sib in h2.find_all_next():
        if sib.name == "h2":
            break
        if sib.name in ("p","ul","ol","li","table","div","strong"):
            nodes.append(sib)
    return nodes

def parse_classification_from_table(nodes: list) -> dict:
    """Read utility / mechanics / force from the tables found in ``nodes``.

    In every row with at least two cells, the first cell (lower-cased,
    trailing ``:`` removed) is the field name and the last cell is the
    value. Fields that are not found stay ``None``; a later matching row
    overwrites an earlier one.
    """
    out = {"utility": None, "mechanics": None, "force": None}
    for table in (n for n in nodes if n.name == "table"):
        for row in table.find_all("tr"):
            cells = [norm(cell.get_text(" ")) for cell in row.find_all(["td","th"])]
            if len(cells) < 2:
                continue
            field = cells[0].lower().rstrip(":")
            if field in out:
                out[field] = cells[-1]
    return out

def parse_instructions(nodes: list) -> tuple[str,str,str]:
    """
    Split the Instructions section into (preparation, execution, comments).

    The nodes are expected to look like:
      <p><strong>Preparation</strong></p>
      <p>...</p>
      <p><strong>Execution</strong></p>
      <p>...</p>
    A paragraph whose first <strong> starts with one of the sub-headings
    switches the active section and is otherwise skipped. Paragraph text
    seen before any sub-heading goes to preparation if nothing has been
    collected yet, otherwise to execution. Lists are appended as "- item"
    lines to the active section and ignored when none is active.
    """
    sections = {"prep": [], "exec": [], "comm": []}
    subheadings = (("preparation", "prep"), ("execution", "exec"), ("comments", "comm"))
    active = None

    for node in nodes:
        kind = node.name
        if kind == "p":
            strong = node.find("strong")
            if strong:
                heading = norm(strong.get_text()).lower()
                switched = next(
                    (key for prefix, key in subheadings if heading.startswith(prefix)),
                    None,
                )
                if switched is not None:
                    active = switched
                    continue

            # ordinary paragraph
            text = norm(node.get_text(" "))
            if not text:
                continue
            if active is not None:
                sections[active].append(text)
            elif any(sections.values()):
                sections["exec"].append(text)
            else:
                # first paragraph after "Instructions" is usually preparation
                sections["prep"].append(text)
        elif kind in ("ul","ol"):
            # lists are attached to the active section
            bullets = [norm(li.get_text(" ")) for li in node.find_all("li")]
            if bullets and active is not None:
                sections[active].append("\n".join(f"- {it}" for it in bullets))

    return (
        "\n".join(sections["prep"]).strip(),
        "\n".join(sections["exec"]).strip(),
        "\n".join(sections["comm"]).strip(),
    )

def parse_comments(soup: BeautifulSoup) -> str:
    """Return the text of the "Comments" section as newline-joined lines.

    Paragraphs contribute their normalized text; ``ul``/``ol`` lists
    contribute one ``- item`` line per ``<li>``. Other tags are ignored.
    """
    chunks = []
    for node in text_after_h2(soup, "comments"):
        if node.name == "p":
            paragraph = norm(node.get_text(" "))
            if paragraph:
                chunks.append(paragraph)
            continue
        if node.name not in ("ul","ol"):
            continue
        bullets = [f"- {norm(li.get_text(' '))}" for li in node.find_all("li")]
        if bullets:
            chunks.append("\n".join(bullets))
    return "\n".join(chunks).strip()

def parse_muscles(soup: BeautifulSoup) -> dict:
    """
    Parse the "Muscles" section: sub-headings such as
    <p><strong>Target</strong></p> followed by a <ul> of muscle names.

    Returns a dict with the keys target, synergists, stabilizers and
    dynamic_stabilizers. Each <ul> seen while a sub-heading is active
    replaces that key's list.
    """
    muscles = {"target": [], "synergists": [], "stabilizers": [], "dynamic_stabilizers": []}

    active = None
    for node in text_after_h2(soup, "muscles"):
        if node.name == "p":
            strong = node.find("strong")
            if not strong:
                continue
            heading = norm(strong.get_text()).lower()
            # Order matters: "dynamic stabilizers" must be tested before "stabilizer".
            if "target" in heading:
                active = "target"
            elif "synergist" in heading:
                active = "synergists"
            elif heading.startswith("dynamic"):
                active = "dynamic_stabilizers"
            elif "stabilizer" in heading:
                active = "stabilizers"
        elif node.name == "ul" and active:
            muscles[active] = [norm(li.get_text(" ")) for li in node.find_all("li")]
    return muscles

def parse_fid_from_script(html: str) -> int | None:
    """
    Nel tuo HTML:
      const bID = 95712;
      const fID = 10526;
      const baseUrl = 'https://exrx.glorb.com/api/video/';
    """
    m = re.search(r"const\s+fID\s*=\s*(\d+)\s*;", html)
    return int(m.group(1)) if m else None

def build_media(html: str, url: str, token: str | None):
    """Build the media descriptor (thumbnail and clip URL) for an exercise page.

    - ``thumb``: content of ``<meta property="og:image">``, else of
      ``<meta name="thumbnail">``, else ``None``.
    - ``clip_url``: the player API URL built from ``token`` and the page's
      ``fID`` when both are available; otherwise the ``src`` of the first
      ``<video>`` (or of its first ``<source>``), resolved against ``url``.
    - ``clip_type``: ``"video"`` when a clip URL was found, else ``None``.
    """
    soup = BeautifulSoup(html, "lxml")

    # thumbnail: first meta tag (in priority order) with non-empty content
    thumb = None
    for attrs in ({"property":"og:image"}, {"name":"thumbnail"}):
        meta = soup.find("meta", attrs=attrs)
        if meta and meta.get("content"):
            thumb = meta["content"]
            break

    # clip from the JS player (token + fID) when possible
    fid = parse_fid_from_script(html)
    clip_url = f"https://exrx.glorb.com/api/video/{token}/{fid}" if token and fid else None

    # fallback: <video src> or <video><source src>
    if not clip_url:
        video = soup.find("video")
        if video:
            src = video.get("src")
            if not src:
                source = video.find("source")
                if source and source.get("src"):
                    src = source["src"]
            if src:
                clip_url = urljoin(url, src)

    return {
        "thumb": thumb,
        "clip_url": clip_url,
        "clip_type": "video" if clip_url else None
    }

def infer_implement_from_slug(slug: str) -> str | None:
    slug_up = slug.upper()
    if slug_up.startswith("BW"): return "bodyweight"
    if slug_up.startswith("BB"): return "barbell"
    if slug_up.startswith("DB"): return "dumbbell"
    if slug_up.startswith("CB"): return "cable"
    if slug_up.startswith("KB"): return "kettlebell"
    if slug_up.startswith("LV"): return "lever"
    if slug_up.startswith("SM"): return "smith machine"
    if slug_up.startswith("SL"): return "sled"
    return None

def muscles_flat_lists(m: dict) -> tuple[list, list]:
    """Unisce target+synergists come lista principale; stabilizers+dynamic in seconda lista."""
    target = m.get("target", []) or []
    synerg = m.get("synergists", []) or []
    stabs  = (m.get("stabilizers", []) or []) + (m.get("dynamic_stabilizers", []) or [])
    def dedup(seq):
        seen, out = set(), []
        for x in seq:
            if x not in seen:
                seen.add(x); out.append(x)
        return out
    return dedup(target + synerg), dedup(stabs)

def tools_from_slug(slug: str) -> list[dict]:
    """Ritorna una lista di oggetti tool. Per ora solo l'implement dedotto dallo slug."""
    s = slug.upper()
    implement = None
    if s.startswith("BW"): implement = "bodyweight"
    elif s.startswith("BB"): implement = "barbell"
    elif s.startswith("DB"): implement = "dumbbell"
    elif s.startswith("CB"): implement = "cable"
    elif s.startswith("KB"): implement = "kettlebell"
    elif s.startswith("LV"): implement = "lever"
    elif s.startswith("SM"): implement = "smith machine"
    elif s.startswith("SL"): implement = "sled"
    tools = []
    if implement:
        tools.append({"type": "implement", "name": implement})
    return tools

def parse_exrx_exercise_v3(html: str, url: str, *_ignore_hints):
    """Parse an exercise page into a flat exercise document.

    Args:
        html: page markup.
        url: page URL; its last path segment is used as the slug for tools.
        *_ignore_hints: extra positional arguments are accepted and ignored.

    Returns:
        A dict with ``name``, ``source``, ``tools``, ``sections``
        (classification / preparation / execution / comments),
        ``main_muscles_involved`` and ``stabilizers``.
    """
    page = BeautifulSoup(html, "lxml")

    # Title: prefer the dedicated page-title heading, then any <h1>.
    heading = page.find("h1", class_="page-title") or page.find("h1")
    if heading:
        title = norm(heading.get_text())
    else:
        title = "Exercise"

    # Classification table.
    classification = parse_classification_from_table(text_after_h2(page, "classification"))

    # Instructions (may also carry comments, used as a fallback below).
    preparation, execution, instr_comments = parse_instructions(text_after_h2(page, "instructions"))

    # Comments: dedicated parser first, then whatever the instructions yielded.
    comments = parse_comments(page) or instr_comments

    # Muscles, flattened into two lists.
    primary_muscles, stabilizer_muscles = muscles_flat_lists(parse_muscles(page))

    # Tools (list of objects) derived from the URL slug.
    tools = tools_from_slug(url.rstrip("/").rsplit("/", 1)[-1])

    sections = {
        "classification": classification,
        "preparation": preparation,
        "execution": execution,
        "comments": comments,
    }
    return {
        "name": title,
        "source": {"site": "ExRx", "url": url},
        "tools": tools,
        "sections": sections,
        "main_muscles_involved": primary_muscles,
        "stabilizers": stabilizer_muscles,
    }

In [None]:
import os, re, json, asyncio, subprocess
from pathlib import Path
import aiohttp
import cv2
from bs4 import BeautifulSoup

def file_ok(path: str, min_bytes: int = 1024) -> bool:
    """Return True when ``path`` exists and is at least ``min_bytes`` bytes long.

    Any filesystem or argument problem (missing file, unreadable path, ``None``
    path) yields ``False``. Only those error types are caught, so
    ``KeyboardInterrupt`` / ``SystemExit`` are no longer swallowed.
    """
    try:
        # getsize raises OSError for a missing path, so no separate exists() check is needed.
        return os.path.getsize(path) >= min_bytes
    except (OSError, TypeError, ValueError):
        return False

def can_read_first_frame(video_path: str) -> bool:
    """Return True when OpenCV can decode at least one frame from ``video_path``."""
    cap = cv2.VideoCapture(video_path)
    try:
        ok, frame = cap.read()
    finally:
        # Release the capture handle even if read() raises.
        cap.release()
    return bool(ok and frame is not None)

async def download_binary(url: str, dest: str) -> bool:
    """Download ``url`` to ``dest`` and report success.

    The body is streamed in chunks to ``<dest>.part`` and renamed onto ``dest``
    only after the whole response has been written, so a failed download never
    leaves a truncated file at ``dest``. Parent directories are created as needed.

    Returns:
        ``True`` on success; ``False`` on any error (the error is printed).
    """
    part_path = None
    try:
        dest_path = Path(dest)
        part_path = dest_path.with_name(dest_path.name + ".part")
        dest_path.parent.mkdir(parents=True, exist_ok=True)
        # aiohttp expects a ClientTimeout object; bare numbers are deprecated.
        timeout = aiohttp.ClientTimeout(total=120)
        async with aiohttp.ClientSession(timeout=timeout) as sess:
            async with sess.get(url) as r:
                r.raise_for_status()
                with part_path.open("wb") as fh:
                    async for chunk in r.content.iter_chunked(64 * 1024):
                        fh.write(chunk)
        part_path.replace(dest_path)
        return True
    except Exception as e:
        print("download_binary error:", e)
        if part_path is not None:
            try:
                part_path.unlink(missing_ok=True)
            except OSError:
                pass  # best effort: a leftover .part file is harmless
        return False

def parse_fid_from_script(html: str) -> int | None:
    m = re.search(r"const\s+fID\s*=\s*(\d+)\s*;", html)
    return int(m.group(1)) if m else None

def first_non_logo_thumb(html: str, base_url: str) -> str | None:
    """Pick a thumbnail URL for a page.

    Preference order: ``og:image`` meta tag, ``thumbnail`` meta tag, then the
    first ``<img>`` whose ``src`` does not contain "logo". Every candidate is
    resolved against ``base_url`` with ``urljoin`` so relative, root-relative
    and protocol-relative references all become absolute URLs.

    Returns:
        An absolute URL, or ``None`` when the page offers no usable image.
    """
    from urllib.parse import urljoin  # local import keeps this cell self-contained

    soup = BeautifulSoup(html, "lxml")

    for attrs in ({"property": "og:image"}, {"name": "thumbnail"}):
        meta = soup.find("meta", attrs=attrs)
        content = (meta.get("content") or "").strip() if meta is not None else ""
        if content:
            return urljoin(base_url, content)

    for img in soup.find_all("img"):
        src = (img.get("src") or "").strip()
        lowered = src.lower()
        # Skip images with no src (would resolve to the page itself), inline
        # data: URIs (nothing to download) and site logos.
        if not src or lowered.startswith("data:") or "logo" in lowered:
            continue
        return urljoin(base_url, src)
    return None

async def save_thumb(html: str, url: str, base_dir="exrx_media") -> dict:
    """Download the page thumbnail into ``<base_dir>/thumbs``.

    Returns:
        ``{"thumb_path": <path>}`` when the file was downloaded and is at least
        200 bytes long, otherwise an empty dict.
    """
    thumb_url = first_non_logo_thumb(html, url)
    if not thumb_url:
        return {}

    # File name: last URL segment without its query string.
    last_segment = thumb_url.rsplit("/", 1)[-1]
    filename = last_segment.split("?")[0] or "thumb.bin"
    dest = os.path.join(base_dir, "thumbs", filename)

    downloaded = await download_binary(thumb_url, dest)
    if downloaded and file_ok(dest, 200):
        return {"thumb_path": dest}
    return {}

async def save_exrx_clip(html: str, token: str | None, url: str, base_dir="exrx_media") -> dict:
    """
    Download the exercise clip referenced by a page into ``<base_dir>/clips``.

    - When both ``token`` and the page's ``fID`` are available, the signed API URL is used.
    - Otherwise the ``<video>`` ``src`` (or its ``<source>`` child's ``src``) is
      resolved against ``url``; with neither available nothing is downloaded.
    - The clip is treated as HLS when its URL contains ``.m3u8`` or when the
      probed Content-Type mentions ``mpegurl``; HLS is saved to MP4 via ffmpeg,
      anything else is downloaded directly.

    Returns:
        A dict that may contain ``clip_path`` and ``clip_can_read_first_frame``;
        empty when nothing was saved.
    """
    from urllib.parse import urljoin  # local import keeps this cell self-contained

    out = {}
    fid = parse_fid_from_script(html)
    if token and fid:
        clip_url = f"https://exrx.glorb.com/api/video/{token}/{fid}"
    else:
        # Fallback: look for a <video> / <source> src in the markup.
        soup = BeautifulSoup(html, "lxml")
        vid = soup.find("video")
        src = None
        if vid:
            source_tag = vid.find("source")
            src = vid.get("src") or (source_tag.get("src") if source_tag is not None else None)
        if not src:
            return out
        # urljoin resolves relative and root-relative paths the way a browser does.
        clip_url = urljoin(url, src)

    # HLS detection: the URL alone may be enough; otherwise probe the Content-Type.
    is_hls = ".m3u8" in clip_url.lower()
    if not is_hls:
        content_type = ""
        probe_timeout = aiohttp.ClientTimeout(total=20)
        try:
            async with aiohttp.ClientSession(timeout=probe_timeout) as sess:
                try:
                    async with sess.head(clip_url) as r:
                        r.raise_for_status()
                        content_type = r.headers.get("Content-Type", "")
                except Exception:
                    # Some servers do not answer HEAD: fall back to GET (headers only).
                    async with sess.get(clip_url) as r:
                        content_type = r.headers.get("Content-Type", "")
        except Exception as e:
            # Best effort: without a Content-Type the clip is handled as a direct download.
            print("save_exrx_clip probe error:", e)
        is_hls = "mpegurl" in content_type.lower()

    Path(os.path.join(base_dir, "clips")).mkdir(parents=True, exist_ok=True)
    if is_hls:
        # Requires ffmpeg to be installed.
        out_path = os.path.join(base_dir, "clips", f"{fid or 'clip'}.mp4")
        try:
            subprocess.run(
                ["ffmpeg", "-y", "-i", clip_url, "-c", "copy", out_path],
                check=True,
                stdout=subprocess.PIPE, stderr=subprocess.PIPE
            )
            if file_ok(out_path, 50_000):
                out["clip_path"] = out_path
                out["clip_can_read_first_frame"] = can_read_first_frame(out_path)
        except FileNotFoundError:
            print("ffmpeg non trovato: installalo o aggiungilo al PATH per salvare stream HLS.")
        except subprocess.CalledProcessError as e:
            print("ffmpeg errore:", e)
    else:
        # Direct download attempt.
        name = clip_url.rsplit("/", 1)[-1].split("?")[0] or "clip.bin"
        out_path = os.path.join(base_dir, "clips", name)
        ok = await download_binary(clip_url, out_path)
        if ok and file_ok(out_path, 50_000):
            out["clip_path"] = out_path
            if out_path.lower().endswith((".mp4", ".webm")):
                out["clip_can_read_first_frame"] = can_read_first_frame(out_path)

    return out

In [33]:
async def scrape_exrx_one(url: str):
    """Fetch one exercise page, parse it and attach its media description.

    ``fetch_page_with_token`` may return ``(html, token)`` or a longer tuple
    (e.g. with a cookie header); only the first two items are used here.

    Returns:
        The parsed document with a ``media`` entry produced by ``build_media``.
    """
    html, token, *_extra = await fetch_page_with_token(url)
    doc = parse_exrx_exercise_v3(html, url)
    doc["media"] = build_media(html, url, token)
    return doc

In [49]:
import os, re, json, asyncio, subprocess
from pathlib import Path
import aiohttp
import cv2
from bs4 import BeautifulSoup
import shutil
from urllib.parse import urlparse

def file_ok(path: str, min_bytes: int = 1024) -> bool:
    """Return True when ``path`` exists and is at least ``min_bytes`` bytes long.

    Any filesystem or argument problem (missing file, unreadable path, ``None``
    path) yields ``False``. Only those error types are caught, so
    ``KeyboardInterrupt`` / ``SystemExit`` are no longer swallowed.
    """
    try:
        # getsize raises OSError for a missing path, so no separate exists() check is needed.
        return os.path.getsize(path) >= min_bytes
    except (OSError, TypeError, ValueError):
        return False

def can_read_first_frame(video_path: str) -> bool:
    """Return True when OpenCV can decode at least one frame from ``video_path``."""
    cap = cv2.VideoCapture(video_path)
    try:
        ok, frame = cap.read()
    finally:
        # Release the capture handle even if read() raises.
        cap.release()
    return bool(ok and frame is not None)

async def download_binary(url: str, dest: str) -> bool:
    """Download ``url`` to ``dest`` and report success.

    The body is streamed in chunks to ``<dest>.part`` and renamed onto ``dest``
    only after the whole response has been written, so a failed download never
    leaves a truncated file at ``dest``. Parent directories are created as needed.

    Returns:
        ``True`` on success; ``False`` on any error (the error is printed).
    """
    part_path = None
    try:
        dest_path = Path(dest)
        part_path = dest_path.with_name(dest_path.name + ".part")
        dest_path.parent.mkdir(parents=True, exist_ok=True)
        # aiohttp expects a ClientTimeout object; bare numbers are deprecated.
        timeout = aiohttp.ClientTimeout(total=120)
        async with aiohttp.ClientSession(timeout=timeout) as sess:
            async with sess.get(url) as r:
                r.raise_for_status()
                with part_path.open("wb") as fh:
                    async for chunk in r.content.iter_chunked(64 * 1024):
                        fh.write(chunk)
        part_path.replace(dest_path)
        return True
    except Exception as e:
        print("download_binary error:", e)
        if part_path is not None:
            try:
                part_path.unlink(missing_ok=True)
            except OSError:
                pass  # best effort: a leftover .part file is harmless
        return False

def parse_fid_from_script(html: str) -> int | None:
    m = re.search(r"const\s+fID\s*=\s*(\d+)\s*;", html)
    return int(m.group(1)) if m else None

def first_non_logo_thumb(html: str, base_url: str) -> str | None:
    """Pick a thumbnail URL for a page.

    Preference order: ``og:image`` meta tag, ``thumbnail`` meta tag, then the
    first ``<img>`` whose ``src`` does not contain "logo". Every candidate is
    resolved against ``base_url`` with ``urljoin`` so relative, root-relative
    and protocol-relative references all become absolute URLs.

    Returns:
        An absolute URL, or ``None`` when the page offers no usable image.
    """
    from urllib.parse import urljoin  # this cell only imports urlparse

    soup = BeautifulSoup(html, "lxml")

    for attrs in ({"property": "og:image"}, {"name": "thumbnail"}):
        meta = soup.find("meta", attrs=attrs)
        content = (meta.get("content") or "").strip() if meta is not None else ""
        if content:
            return urljoin(base_url, content)

    for img in soup.find_all("img"):
        src = (img.get("src") or "").strip()
        lowered = src.lower()
        # Skip images with no src (would resolve to the page itself), inline
        # data: URIs (nothing to download) and site logos.
        if not src or lowered.startswith("data:") or "logo" in lowered:
            continue
        return urljoin(base_url, src)
    return None

async def save_thumb(html: str, url: str, base_dir="exrx_media") -> dict:
    """Download the page thumbnail into ``<base_dir>/thumbs``.

    Returns:
        ``{"thumb_path": <path>}`` when the file was downloaded and is at least
        200 bytes long, otherwise an empty dict.
    """
    thumb_url = first_non_logo_thumb(html, url)
    if not thumb_url:
        return {}

    # File name: last URL segment without its query string.
    last_segment = thumb_url.rsplit("/", 1)[-1]
    filename = last_segment.split("?")[0] or "thumb.bin"
    dest = os.path.join(base_dir, "thumbs", filename)

    downloaded = await download_binary(thumb_url, dest)
    if downloaded and file_ok(dest, 200):
        return {"thumb_path": dest}
    return {}

def _ffmpeg_ok() -> bool:
    """Return True when ``ffmpeg -version`` can be launched and exits with status 0."""
    try:
        probe = subprocess.run(["ffmpeg", "-version"], capture_output=True)
    except FileNotFoundError:
        # No ffmpeg executable could be launched.
        return False
    return probe.returncode == 0

def _ffmpeg_bin() -> str | None:
    return shutil.which("ffmpeg")

def _ua() -> str:
    """Return a fixed desktop Chrome (macOS) User-Agent string."""
    product_tokens = (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)",
        "AppleWebKit/537.36 (KHTML, like Gecko)",
        "Chrome/124.0.0.0 Safari/537.36",
    )
    return " ".join(product_tokens)

async def save_exrx_clip(html: str, token: str | None, url: str, base_dir="exrx_media",
                         extra_cookie_header: str | None = None) -> dict:
    """
    Save the exercise clip as ``<base_dir>/clips/<fID or 'clip'>.mp4`` using ffmpeg.

    Strategy:
      - Build the base URL ``https://exrx.glorb.com/api/video/<token>/<fID>`` when
        both values are available; otherwise fall back to the page's
        ``<video>`` / ``<source>`` ``src`` resolved against ``url``.
      - Try the base URL first, then ``.../master.m3u8``, then ``.../index.m3u8``.
      - For each candidate try a stream copy (``-c copy``) first, then a
        re-encode to H.264/AAC.

    Args:
        html: page markup (scanned for ``fID`` and ``<video>``).
        token: token used in the API URL; may be ``None``.
        url: page URL; its origin is sent as Referer/Origin.
        base_dir: root directory for saved media.
        extra_cookie_header: optional Cookie header value forwarded to ffmpeg.

    Returns:
        A dict with ``clip_path`` and ``clip_can_read_first_frame`` on success;
        an empty dict when no source is found, ffmpeg is missing, or every
        attempt fails.
    """
    from urllib.parse import urljoin  # this cell only imports urlparse

    out = {}
    fid = parse_fid_from_script(html)

    # Build base_clip (or fall back to <video><source>).
    if token and fid:
        base_clip = f"https://exrx.glorb.com/api/video/{token}/{fid}"
    else:
        soup = BeautifulSoup(html, "lxml")
        vid = soup.find("video")
        src = None
        if vid:
            source_tag = vid.find("source")
            src = vid.get("src") or (source_tag.get("src") if source_tag is not None else None)
        if not src:
            return out
        # urljoin resolves relative and root-relative paths the way a browser does.
        base_clip = urljoin(url, src)

    # Attempt order: BASE -> master.m3u8 -> index.m3u8
    candidates = [
        base_clip,
        base_clip.rstrip("/") + "/master.m3u8",
        base_clip.rstrip("/") + "/index.m3u8",
    ]

    ffmpeg = _ffmpeg_bin()
    if not ffmpeg:
        print("⚠️ ffmpeg non trovato dal kernel. Metti il path assoluto o riavvia il kernel.")
        return out

    # HTTP headers forwarded to the media server through ffmpeg.
    parsed = urlparse(url)
    referer = f"{parsed.scheme}://{parsed.netloc}"
    headers = [
        f"Referer: {referer}",
        f"Origin: {referer}",
        "Accept: */*",
    ]
    if extra_cookie_header:
        headers.append(f"Cookie: {extra_cookie_header}")
    headers_str = "\r\n".join(headers) + "\r\n"
    ua = _ua()

    clips_dir = Path(base_dir) / "clips"
    clips_dir.mkdir(parents=True, exist_ok=True)
    out_path = str(clips_dir / f"{fid or 'clip'}.mp4")

    # Arguments shared by the copy and the transcode invocations.
    common_args = [
        ffmpeg, "-y", "-loglevel", "warning",
        "-user_agent", ua,
        "-headers", headers_str,
    ]

    def try_ffmpeg(input_url: str) -> bool:
        """Run copy, then transcode, for one candidate; True when a usable file results."""
        # 1) Stream copy
        cmd_copy = [
            *common_args,
            "-i", input_url,
            "-c", "copy", "-bsf:a", "aac_adtstoasc",
            out_path
        ]
        p = subprocess.run(cmd_copy, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        if p.returncode == 0 and file_ok(out_path, 50_000):
            return True
        # 2) Compatible transcode
        cmd_transcode = [
            *common_args,
            "-i", input_url,
            "-vf", "scale=640:-2",
            "-r", "25",
            "-c:v", "libx264", "-pix_fmt", "yuv420p", "-preset", "veryfast", "-crf", "22",
            "-c:a", "aac", "-b:a", "128k",
            "-movflags", "+faststart",
            out_path
        ]
        p2 = subprocess.run(cmd_transcode, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        if p2.returncode == 0 and file_ok(out_path, 50_000):
            return True
        # Diagnostics for both attempts.
        if p.stderr:
            print("ffmpeg copy stderr:\n", p.stderr.decode("utf-8", errors="ignore"))
        if p2.stderr:
            print("ffmpeg transcode stderr:\n", p2.stderr.decode("utf-8", errors="ignore"))
        # Remove empty / too-small leftovers.
        try:
            if os.path.exists(out_path) and os.path.getsize(out_path) < 50_000:
                os.remove(out_path)
        except OSError:
            pass  # best effort: a leftover stub is overwritten by the next attempt (-y)
        return False

    ok = any(try_ffmpeg(cand) for cand in candidates)

    if not ok:
        print("❌ Download video fallito o file troppo piccolo.")
        return out

    out["clip_path"] = out_path
    out["clip_can_read_first_frame"] = can_read_first_frame(out_path)
    return out

In [50]:
async def scrape_exrx_and_download(url: str) -> dict:
    """Fetch and parse one exercise page, then save its thumbnail and clip to disk.

    ``fetch_page_with_token`` may return ``(html, token)`` or
    ``(html, token, cookie_header)``; when a cookie header is present it is
    forwarded to ``save_exrx_clip``.

    Returns:
        The parsed document with ``media`` holding the saved file paths
        (no clip URL is included in the output).
    """
    html, token, *extra = await fetch_page_with_token(url)
    cookie_header = extra[0] if extra else None
    doc = parse_exrx_exercise_v3(html, url)

    # Only pass the cookie keyword when there is one, so the call stays
    # identical to the two-value case otherwise.
    clip_kwargs = {"extra_cookie_header": cookie_header} if cookie_header else {}

    # Save thumbnail and clip (bytes on disk).
    media = {}
    media.update(await save_thumb(html, url))
    media.update(await save_exrx_clip(html, token, url, **clip_kwargs))

    doc["media"] = media
    return doc

In [51]:
# Smoke test: scrape a single exercise page, download its media and print the result.
url = "https://exrx.net/WeightExercises/GluteusMaximus/BWSquat"
doc = await scrape_exrx_and_download(url)
print(json.dumps(doc, ensure_ascii=False, indent=2))
# Quick checks:
print("Thumb:", doc.get("media", {}).get("thumb_path"))
print("Clip :", doc.get("media", {}).get("clip_path"))
print("Frame OK?:", doc.get("media", {}).get("clip_can_read_first_frame"))

{
  "name": "Squat",
  "source": {
    "site": "ExRx",
    "url": "https://exrx.net/WeightExercises/GluteusMaximus/BWSquat"
  },
  "tools": [
    {
      "type": "implement",
      "name": "bodyweight"
    }
  ],
  "sections": {
    "classification": {
      "utility": "Basic",
      "mechanics": "Compound",
      "force": "Push"
    },
    "preparation": "Stand with arms extended forward.",
    "execution": "Squat down by bending hips back while allowing knees to bend forward slightly, keeping back straight and knees pointed same direction as feet. Descend until thighs are just past parallel to floor. Squat up by extending knees and hips until legs are straight. Return and repeat.",
    "comments": "Keep head facing forward, back straight, chest high, and feet flat on surface with equal distribution of weight through forefoot and heel. Knees should point same direction as feet throughout movement. Arms positioned forward allows torso to be positioned more upright. See Squat Analysis .

In [52]:
import os, json, asyncio
from pathlib import Path
from typing import Any, Dict

# Assumption: the kernel already holds:
# - dedup (dict) with canonical keys and the per-exercise fields shown in the example cell
# - fetch_page_with_token(url): -> (html, token) or (html, token, cookie_header)
# - parse_exrx_exercise_v3(html, url): -> dict with sections/tools/muscles etc.
# - save_thumb(html, url)
# - save_exrx_clip(html, token, url, base_dir="exrx_media", extra_cookie_header=None)

# Output directory for the per-exercise JSON files; created on cell execution.
SAVE_DIR = Path("exrx_json")
SAVE_DIR.mkdir(parents=True, exist_ok=True)

def slug_from_url(url: str) -> str:
    """Return the last path segment of ``url``.

    Any fragment (``#...``) and query string (``?...``) are dropped first so
    they cannot leak into the slug, which callers use as a file name. Trailing
    slashes are ignored. URLs without a query or fragment give the same result
    as a plain split on the last ``/``.
    """
    without_fragment = url.split("#", 1)[0]
    without_query = without_fragment.split("?", 1)[0]
    return without_query.rstrip("/").rsplit("/", 1)[-1]

def validate_doc(doc: Dict[str, Any]) -> Dict[str, bool]:
    """Report which key parts of an exercise record are present.

    Each flag is the truthiness of one field: ``sections.preparation``,
    ``sections.execution``, ``main_muscles_involved`` (non-empty list),
    ``media.clip_path`` and ``media.thumb_path``.
    """
    sections = doc.get("sections", {})
    media = doc.get("media", {})
    return {
        "prep_ok": bool(sections.get("preparation")),
        "exec_ok": bool(sections.get("execution")),
        "muscles_ok": bool(doc.get("main_muscles_involved")),
        "video_ok": bool(media.get("clip_path")),
        "thumb_ok": bool(media.get("thumb_path")),
    }

async def process_one_exercise(key, meta: Dict[str, Any], sem: asyncio.Semaphore):
    """
    Fetch, parse and persist one de-duplicated exercise.

    Args:
        key: canonical key (tuple or str).
        meta: dedup metadata, e.g. {
          "preferred_url": "...",
          "all_urls": [...],
          "macro_groups": [...],
          "micro_groups": [...],
          "implements": [...],
          "name": "..."
        }
        sem: semaphore bounding how many exercises are processed at once.

    Returns:
        ``(record, flags)`` on success, or ``None`` when the entry has no URL
        or when fetching / parsing fails. Media download failures are logged
        and leave the corresponding ``media`` keys absent.

    Side effects:
        Writes ``SAVE_DIR/<slug>.json`` (UTF-8) and prints a one-line status.
    """
    async with sem:
        pref_url = meta.get("preferred_url") or (meta.get("all_urls") or [None])[0]
        if not pref_url:
            print(f"[SKIP] {key}: nessun URL.")
            return None

        # 1) Fetch page + token (+ optional cookie header).
        try:
            fetched = await fetch_page_with_token(pref_url)
            if len(fetched) == 3:
                html, token, cookie_header = fetched
            else:
                html, token = fetched
                cookie_header = None
        except Exception as e:
            print(f"[ERR] fetch_page_with_token fallito: {pref_url} → {e}")
            return None

        # 2) Parse the page.
        try:
            parsed = parse_exrx_exercise_v3(html, pref_url)
        except Exception as e:
            print(f"[ERR] parse fallito: {pref_url} → {e}")
            return None

        # 3) Save media (thumb + clip); each step is best effort.
        media = {}
        try:
            media.update(await save_thumb(html, pref_url))
        except Exception as e:
            print(f"[WARN] thumb fallita: {pref_url} → {e}")
        try:
            media.update(await save_exrx_clip(html, token, pref_url, extra_cookie_header=cookie_header))
        except Exception as e:
            print(f"[WARN] clip fallita: {pref_url} → {e}")

        # 4) Merge the dedup metadata with what the parser extracted.
        record = {
            # Tuple keys are stored as lists so the record is JSON-serialisable.
            "canonical_key": key if isinstance(key, (str, int)) else list(key),
            "name": parsed.get("name") or meta.get("name"),
            "source": {"site": "ExRx", "url": pref_url},
            "aliases": meta.get("all_urls", []),
            "macro_groups": meta.get("macro_groups", []),
            "micro_groups": meta.get("micro_groups", []),
            "implements": meta.get("implements", []),
            "tools": parsed.get("tools", []),
            "sections": parsed.get("sections", {}),
            "main_muscles_involved": parsed.get("main_muscles_involved", []),
            "stabilizers": parsed.get("stabilizers", []),
            "media": media
        }

        # 5) Validation + short log line.
        flags = validate_doc(record)
        missing = [k for k, ok in flags.items() if not ok]
        if missing:
            print(f"[INFO] {record['name'] or 'N/A'} → missing: {', '.join(missing)}")
        else:
            print(f"[OK]   {record['name'] or 'N/A'}")

        # 6) Save one JSON file per exercise. ensure_ascii=False emits raw
        #    non-ASCII characters, so the encoding is pinned to UTF-8 instead
        #    of relying on the platform default.
        fname = slug_from_url(pref_url) or "exercise"
        out_path = SAVE_DIR / f"{fname}.json"
        out_path.write_text(json.dumps(record, ensure_ascii=False, indent=2), encoding="utf-8")

        return record, flags

async def run_bulk_from_dedup(dedup: Dict[Any, Dict[str, Any]], max_concurrency: int = 3):
    """Process every entry of ``dedup`` concurrently and write a master JSON file.

    Args:
        dedup: mapping of canonical key -> exercise metadata, passed entry by
            entry to ``process_one_exercise``.
        max_concurrency: upper bound on exercises processed at once; values
            below 1 are treated as 1.

    Returns:
        ``(master, summary)`` where ``master`` is the list of saved records and
        ``summary`` counts successes (``total``), fully valid records
        (``ok_full``), each kind of missing field, and ``failed`` entries
        (skipped, errored, or raised).

    Side effects:
        Writes ``SAVE_DIR/_all_exercises.json`` (UTF-8) and prints the summary.
    """
    sem = asyncio.Semaphore(max(1, max_concurrency))
    tasks = [process_one_exercise(key, meta, sem) for key, meta in dedup.items()]

    results = await asyncio.gather(*tasks, return_exceptions=True)

    # Aggregate results + summary.
    master = []
    summary = {
        "total": 0,
        "ok_full": 0,
        "missing_prep": 0,
        "missing_exec": 0,
        "missing_muscles": 0,
        "missing_video": 0,
        "missing_thumb": 0,
        "failed": 0,
    }
    # Validation flag -> summary counter incremented when the flag is False.
    flag_to_counter = {
        "prep_ok": "missing_prep",
        "exec_ok": "missing_exec",
        "muscles_ok": "missing_muscles",
        "video_ok": "missing_video",
        "thumb_ok": "missing_thumb",
    }

    for res in results:
        if isinstance(res, BaseException):
            # gather() hands back raised exceptions (including cancellations)
            # as values; report them instead of counting them silently.
            print(f"[ERR] task exception → {res!r}")
            summary["failed"] += 1
            continue
        if res is None:
            summary["failed"] += 1
            continue
        record, flags = res
        master.append(record)
        summary["total"] += 1
        if all(flags.values()):
            summary["ok_full"] += 1
        for flag_name, counter in flag_to_counter.items():
            if not flags[flag_name]:
                summary[counter] += 1

    # Save the master file; UTF-8 is pinned because ensure_ascii=False emits
    # raw non-ASCII characters.
    master_path = SAVE_DIR / "_all_exercises.json"
    master_path.write_text(json.dumps(master, ensure_ascii=False, indent=2), encoding="utf-8")

    # Print the summary.
    print("\n==== SUMMARY ====")
    for k, v in summary.items():
        print(f"{k:>16}: {v}")
    print(f"Salvato master → {master_path}")
    return master, summary

In [53]:
# Example: a dedup mapping with a single entry (the squat case).
# In practice the full, previously computed `dedup` is passed to the call below instead.
dedup_example = {
    ("WeightExercises", "BWSquat"): {
        "preferred_url": "https://exrx.net/WeightExercises/GluteusMaximus/BWSquat",
        "all_urls": [
            "https://exrx.net/WeightExercises/GluteusMaximus/BWSquat",
            "https://exrx.net/WeightExercises/Quadriceps/BWSquat"
        ],
        "macro_groups": ["Hips", "Thighs"],
        "micro_groups": ["Gluteus Maximus", "Quadriceps"],
        "implements": ["bodyweight"],
        "name": "Squat"
    }
}

# NOTE(review): this call uses `dedup`, not `dedup_example`; `dedup` is presumably
# defined in an earlier cell — confirm it exists before running on a fresh kernel.
master, summary = await run_bulk_from_dedup(dedup)

[SKIP] ('WeightExercises', 'BBBentOverRow'): nessun URL.
[SKIP] ('WeightExercises', 'BBUnderhandBentOverRow'): nessun URL.
[SKIP] ('WeightExercises', 'CBOneArmBentoverRow'): nessun URL.
[SKIP] ('WeightExercises', 'CBOneArmStrBackHighRow'): nessun URL.
[SKIP] ('WeightExercises', 'CBSeatedRow'): nessun URL.
[SKIP] ('WeightExercises', 'CBStraightBackSeatedRow'): nessun URL.
[SKIP] ('WeightExercises', 'CBWideGripSeatedRow'): nessun URL.
[SKIP] ('WeightExercises', 'CBWideGripStrBackSeatedRow'): nessun URL.
[SKIP] ('WeightExercises', 'DBBentOverRow'): nessun URL.
[SKIP] ('WeightExercises', 'DBLyingRow'): nessun URL.
[SKIP] ('WeightExercises', 'LVNeutralGripInclineRowPL'): nessun URL.
[SKIP] ('WeightExercises', 'LVWideGripInclineRowPL'): nessun URL.
[SKIP] ('WeightExercises', 'LVNarrowGripSeatedRowH'): nessun URL.
[SKIP] ('WeightExercises', 'LVWideGripSeatedRowH'): nessun URL.
[SKIP] ('WeightExercises', 'LVWideLowGripSeatedRowH'): nessun URL.
[SKIP] ('WeightExercises', 'LVSeatedLowRow'): ness