In [1]:
import re, json
from urllib.parse import urljoin, urlparse, urldefrag
from bs4 import BeautifulSoup
from playwright.async_api import async_playwright

BASE = "https://exrx.net"

# One or more consecutive whitespace characters (spaces, tabs, newlines, ...).
SPACE = re.compile(r"\s+")


def norm(t: str) -> str:
    """Return `t` trimmed, with every whitespace run collapsed to one space.

    Falsy input (``None``, ``""``) yields the empty string.
    """
    trimmed = t.strip() if t else ""
    return SPACE.sub(" ", trimmed)

def last_path_key(u: str) -> str:
    """Return the final path segment of URL `u`, ignoring trailing slashes.

    Query string and fragment are not part of the path and are therefore
    dropped. Example: ``.../Lists/ExList/ShouldWt`` -> ``"ShouldWt"``.
    """
    path = urlparse(u).path.rstrip("/")
    _, _, tail = path.rpartition("/")
    return tail

# Stable mapping: last URL path segment -> human-readable macro group name.
PATH2MACRO = dict(
    NeckWt="Neck",
    ShouldWt="Shoulders",
    ArmWt="Upper Arms",
    ForeArmWt="Forearms",
    BackWt="Back",
    ChestWt="Chest",
    WaistWt="Waist",
    HipsWt="Hips",
    ThighWt="Thighs",
    CalfWt="Calves",
    # If "Other Exercises" pages are used in the future, add their keys here.
)
# Set of the canonical macro names, for fast membership checks.
MACRO_NAMES = {*PATH2MACRO.values()}

In [2]:
async def fetch_html(url: str, wait_ms: int = 1200) -> str:
    """Load `url` in headless Chromium and return the page's HTML.

    A fresh browser is launched for every call. The browser is closed in a
    ``finally`` block so that a navigation error or timeout does not skip the
    cleanup.

    Args:
        url: Address of the page to load.
        wait_ms: Extra pause, in milliseconds, after ``domcontentloaded``;
            a short breather for WAF checks / client-side JS.

    Returns:
        The serialized HTML as reported by ``page.content()``.

    Raises:
        Whatever Playwright raises on launch/navigation failure.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            ctx = await browser.new_context(
                user_agent=("Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                            "AppleWebKit/537.36 (KHTML, like Gecko) "
                            "Chrome/124.0.0.0 Safari/537.36")
            )
            page = await ctx.new_page()
            await page.goto(url, wait_until="domcontentloaded")
            await page.wait_for_timeout(wait_ms)
            return await page.content()
        finally:
            # Closing the browser also disposes of the context and its pages.
            await browser.close()

In [3]:
async def parse_macro_categories(dir_url=f"{BASE}/Lists/Directory"):
    """Collect the macro muscle-group list pages linked from the directory page.

    Every ``<a href>`` whose absolute URL contains ``/Lists/ExList/`` is a
    candidate; candidates are collapsed by the last path segment of the URL.

    Returns:
        A list of dicts ``{"macro_key", "macro_name", "list_url"}`` sorted
        case-insensitively by ``macro_name``.
    """
    page_html = await fetch_html(dir_url)
    soup = BeautifulSoup(page_html, "lxml")

    by_key = {}
    for link in soup.select("a[href]"):
        href = link.get("href")
        if not href:
            continue
        absolute = urljoin(dir_url, href)
        if "/Lists/ExList/" not in absolute:
            continue

        key = last_path_key(absolute)              # e.g. ShouldWt
        list_url, _fragment = urldefrag(absolute)  # URL without #fragment
        # Canonical name when the key is mapped, otherwise the link's own text.
        display = PATH2MACRO.get(key) or norm(link.get_text())

        entry = by_key.get(key)
        if entry is None:
            by_key[key] = {"macro_key": key, "macro_name": display, "list_url": list_url}
            continue

        # Upgrade a non-canonical name if a later link carries an official one.
        if entry["macro_name"] not in MACRO_NAMES and display in MACRO_NAMES:
            entry["macro_name"] = display
        # The most recent link wins for the (fragment-free) URL.
        entry["list_url"] = list_url

    # Alphabetical by macro name.
    return sorted(by_key.values(), key=lambda d: d["macro_name"].lower())

In [4]:
# Fetch the directory page and list the macro categories that were found.
# `macros` is a notebook-level name; later cells rebind it.
macros = await parse_macro_categories()
print("Macrocategorie trovate:", len(macros))
for m in macros:
    # Name padded to 12 columns so the arrows line up.
    print(f"- {m['macro_name']:12s} → {m['list_url']}")

Macrocategorie trovate: 10
- Back         → https://exrx.net/Lists/ExList/BackWt
- Calves       → https://exrx.net/Lists/ExList/CalfWt
- Chest        → https://exrx.net/Lists/ExList/ChestWt
- Forearms     → https://exrx.net/Lists/ExList/ForeArmWt
- Hips         → https://exrx.net/Lists/ExList/HipsWt
- Neck         → https://exrx.net/Lists/ExList/NeckWt
- Shoulders    → https://exrx.net/Lists/ExList/ShouldWt
- Thighs       → https://exrx.net/Lists/ExList/ThighWt
- Upper Arms   → https://exrx.net/Lists/ExList/ArmWt
- Waist        → https://exrx.net/Lists/ExList/WaistWt


In [5]:
import re, json
from urllib.parse import urljoin, urldefrag
from bs4 import BeautifulSoup

# Re-declared here (identical to the first cell) so this cell runs on its own.
SPACE = re.compile(r"\s+")
def norm(t: str) -> str:
    """Trim `t` and collapse whitespace runs to one space; falsy -> ""."""
    return SPACE.sub(" ", (t or "").strip())

# Lower-case label fragment -> canonical implement name.
IMPLE_MAP = {
    "barbell": "barbell",
    "body weight": "bodyweight",
    "bodyweight": "bodyweight",
    "dumbbell": "dumbbell",
    "kettlebell": "kettlebell",
    "cable": "cable",
    "lever": "lever",
    "machine": "machine",
    "smith": "smith machine",
    "smith machine": "smith machine",
    "suspended": "suspended",
    "self-assisted": "self-assisted",
    "stretch": "stretch",
    "dynamic stretch": "dynamic stretch",
    "plyometrics": "plyometrics",
    "band": "band",
    "medicine ball": "medicine ball",
    "sandbag": "sandbag",
    "sled": "sled",
    "trap bar": "trap bar",
    "trx": "suspended",
}
# Keys tried longest-first: with plain insertion order "machine" shadowed
# "smith machine" and "stretch" shadowed "dynamic stretch". sorted() is
# stable, so equal-length keys keep their insertion order.
_IMPLE_KEYS_LONGEST_FIRST = sorted(IMPLE_MAP, key=len, reverse=True)

def norm_implement(label: str) -> str:
    """Map a free-text implement label to its canonical name.

    The label is whitespace-normalized and lower-cased. Anything starting
    with "lever" (e.g. "lever (selectorized / plate loaded)") is "lever".
    Otherwise the longest IMPLE_MAP key contained in the label wins. Labels
    matching no key are returned normalized; empty input gives "unknown".
    """
    l = norm(label).lower()
    if l.startswith("lever"):  # lever (selectorized / plate loaded)
        return "lever"
    for k in _IMPLE_KEYS_LONGEST_FIRST:
        if k in l:
            return IMPLE_MAP[k]
    return l or "unknown"

async def parse_macro_page(list_url: str, macro_name: str) -> list[dict]:
    """Scrape one muscle-group list page into flat exercise records.

    The page is walked in document order while two pieces of state are
    tracked: the current "micro" group (set by an h1-h3 heading containing a
    muscle keyword) and the current implement (set by an implement heading
    or by an ``<li>`` whose own text is an implement label). Exercises are
    the anchors found in the list items below them; nested lists produce
    variant names of the form ``"<base> — <variant>"``.

    Args:
        list_url: URL of the list page; also used to resolve relative links.
        macro_name: Macro group label copied onto every record.

    Returns:
        Records with keys ``macro_group``, ``micro_group``, ``implement``,
        ``exercise_name``, ``exercise_url``, ``list_url``, de-duplicated on
        (exercise_url, micro_group, implement, exercise_name), first-seen order.
    """
    html = await fetch_html(list_url)
    soup = BeautifulSoup(html, "lxml")

    # Longest labels first (ties alphabetical) so substring matching is
    # deterministic: "dynamic stretch" must win over "stretch". Iterating a
    # bare set made the winner depend on hash ordering.
    IMPLEMENT_HEADINGS = sorted(
        {
            "barbell","body weight","bodyweight","cable","dumbbell","kettlebell",
            "lever","machine","smith","smith machine","suspended","self-assisted",
            "stretch","dynamic stretch","plyometrics","band","medicine ball","sandbag",
            "sled","trap bar","trx"
        },
        key=lambda k: (-len(k), k),
    )
    # Substrings that mark an h1-h3 heading as a muscle ("micro") title.
    MICRO_KEYWORDS = (
        "deltoid","quadriceps","hamstrings","gluteus","pectoralis","abdominis",
        "obliques","trapezius","rhomboids","erector","calves","soleus",
        "gastrocnemius","forearm","biceps","triceps","back","hip","waist","neck"
    )

    def is_implement_heading(text: str) -> bool:
        """True if the normalized text names (or contains) an implement."""
        t = norm(text).lower()
        if t.startswith("lever"):  # lever (selectorized/plate loaded)
            return True
        return any(k in t for k in IMPLEMENT_HEADINGS)

    def norm_impl(label: str) -> str:
        """Canonical implement name for a heading/label."""
        l = norm(label).lower()
        if l.startswith("lever"):
            return "lever"
        if "smith" in l:
            return "smith machine"
        if "body weight" in l or l == "bodyweight":
            return "bodyweight"
        for k in IMPLEMENT_HEADINGS:
            if k in l:
                return k
        return l or "unknown"

    def anchor(li):
        """(text, absolute fragment-free URL) of the first link inside `li`,
        or (None, None) when there is none."""
        a = li.find("a", href=True)
        if not a:
            return None, None
        return norm(a.get_text()), urldefrag(urljoin(list_url, a["href"]))[0]

    def direct_text(el) -> str:
        """Text that belongs to `el` itself, excluding text of child tags."""
        if el is None:
            return ""
        txts = [t for t in el.find_all(string=True, recursive=False)]
        return norm(" ".join(txts))

    records = []
    current_micro = None
    current_impl  = None
    # ids of <ul> tags already handled, so a list is never emitted twice.
    processed_uls = set()

    def emit(name, url, micro, impl):
        """Append one record; silently skips entries without name or URL."""
        if not name or not url:
            return
        records.append({
            "macro_group": macro_name,
            "micro_group": micro or "",
            "implement": impl or "",
            "exercise_name": name,
            "exercise_url": url,
            "list_url": list_url,
        })

    def walk_variants(parent_li, base_name, base_url, micro, impl):
        """
        Walk the variants nested under parent_li:
        - every child li with an anchor -> composed name "base — child"
        - if a child has its own sub-ul, continue recursively
        """
        ul = parent_li.find("ul")
        if not ul:
            return
        if id(ul) in processed_uls:
            return
        processed_uls.add(id(ul))

        for child in ul.find_all("li", recursive=False):
            vname, vurl = anchor(child)
            # Computed up front so the recursive call never sees an unbound
            # (or stale, from a previous sibling) composed name.
            child_base = f"{base_name} — {vname}" if vname else base_name
            if vname and vurl:
                emit(child_base, vurl, micro, impl)
            # recurse into deeper levels
            if child.find("ul"):
                walk_variants(child, child_base, vurl or base_url, micro, impl)

    # sequential scan in document order
    for node in soup.find_all(["h1","h2","h3","h4","strong","ul","li"]):
        tag  = node.name.lower()
        text = norm(node.get_text())

        # micro group (muscle titles); a new muscle resets the implement
        if tag in ("h1","h2","h3") and text:
            if any(w in text.lower() for w in MICRO_KEYWORDS):
                current_micro = text
                current_impl  = None
                continue

        # implement given as a header
        if tag in ("h3","h4","strong") and text and is_implement_heading(text):
            current_impl = norm_impl(text)
            continue

        # implement given as a container LI: <li>Barbell<ul>...</ul></li>
        if tag == "li" and node.find("ul"):
            impl_label = direct_text(node)
            if is_implement_heading(impl_label):
                current_impl = norm_impl(impl_label)
                # every *direct child* li of this ul is an exercise or a sub-category
                ul = node.find("ul")
                if id(ul) not in processed_uls:
                    processed_uls.add(id(ul))
                    for li in ul.find_all("li", recursive=False):
                        name, url = anchor(li)
                        if name and url:
                            emit(name, url, current_micro, current_impl)
                            # handle variants below this li (if any)
                            if li.find("ul"):
                                walk_variants(li, name, url, current_micro, current_impl)
                continue

        # "free" UL that follows an implement header
        if tag == "ul" and current_micro:
            if node.find_parent("li") is not None:
                continue
            prev_hdr = node.find_previous(lambda t: t.name in ("h3","h4","strong","h2","h1"))
            if not (prev_hdr and is_implement_heading(prev_hdr.get_text())):
                continue
            current_impl = norm_impl(prev_hdr.get_text())
            if id(node) in processed_uls:
                continue
            processed_uls.add(id(node))
            for li in node.find_all("li", recursive=False):
                name, url = anchor(li)
                if name and url:
                    emit(name, url, current_micro, current_impl)
                    if li.find("ul"):
                        walk_variants(li, name, url, current_micro, current_impl)

    # final de-duplication on (url, micro, implement, name), keeping first-seen order
    seen, out = set(), []
    for r in records:
        key = (r["exercise_url"], r["micro_group"], r["implement"], r["exercise_name"])
        if key in seen:
            continue
        seen.add(key)
        out.append(r)
    return out

In [6]:
# Example: Shoulders
# Re-fetch the macro list, pick the "Shoulders" entry and parse its list page.
macros = await parse_macro_categories()
# next() without a default: raises StopIteration if no macro is named "Shoulders".
shoulders = next(m for m in macros if m["macro_name"]=="Shoulders")
recs = await parse_macro_page(shoulders["list_url"], macro_name=shoulders["macro_name"])
print("Esercizi estratti (Shoulders):", len(recs))
# Show only the first 15 records as a sanity check.
for r in recs[:15]:
    print(f"- [{r['micro_group']}] {r['implement'] or 'unknown'} | {r['exercise_name']} → {r['exercise_url']}")

Esercizi estratti (Shoulders): 70
- [Anterior Deltoid] barbell | Front Raise → https://exrx.net/WeightExercises/DeltoidAnterior/BBFrontRaise
- [Anterior Deltoid] barbell | Military Press → https://exrx.net/WeightExercises/DeltoidAnterior/BBMilitaryPress
- [Anterior Deltoid] barbell | Military Press — Seated → https://exrx.net/WeightExercises/DeltoidAnterior/BBSeatedMilitaryPress
- [Anterior Deltoid] cable | Front Raise → https://exrx.net/WeightExercises/DeltoidAnterior/CBSeatedFrontRaise
- [Anterior Deltoid] cable | Front Raise — Alternating → https://exrx.net/WeightExercises/DeltoidAnterior/CBAlternatingFrontRaise
- [Anterior Deltoid] cable | Front Raise — One Arm → https://exrx.net/WeightExercises/DeltoidAnterior/CBFrontRaise
- [Anterior Deltoid] cable | Shoulder Press → https://exrx.net/WeightExercises/DeltoidAnterior/CBStandingShoulderPress
- [Anterior Deltoid] cable | Shoulder Press — Seated → https://exrx.net/WeightExercises/DeltoidAnterior/CBShoulderPress
- [Anterior Deltoid] du

In [7]:
import asyncio, time

async def crawl_all_macros(delay_sec: float = 0.8, limit: int | None = None):
    macros = await parse_macro_categories()
    if limit:
        macros = macros[:limit]

    all_rows = []
    per_macro_counts = {}

    for i, m in enumerate(macros, 1):
        t0 = time.time()
        try:
            rows = await parse_macro_page(m["list_url"], macro_name=m["macro_name"])
        except Exception as e:
            print(f"[{i}/{len(macros)}] ERRORE {m['macro_name']}: {e}")
            rows = []
        all_rows.extend(rows)
        per_macro_counts[m["macro_name"]] = len(rows)

        dt = time.time() - t0
        # rate limit gentile
        if dt < delay_sec:
            await asyncio.sleep(delay_sec - dt)

        print(f"[{i}/{len(macros)}] {m['macro_name']:<12} → {len(rows):3d} righe (tot {len(all_rows)})")

    return macros, all_rows, per_macro_counts

# RUN (you can start with limit=3 for a quick test)
# Rebinds the notebook-level `macros` and defines `rows` / `per_macro`,
# which the following cells consume.
macros, rows, per_macro = await crawl_all_macros(delay_sec=0.8, limit=None)
print("\nTot righe:", len(rows))
print("Per macro:", per_macro)
# Only the first three records, as a shape check.
print("Esempio:", rows[:3])

[1/10] Back         → 109 righe (tot 109)
[2/10] Calves       →  59 righe (tot 168)
[3/10] Chest        →  72 righe (tot 240)
[4/10] Forearms     →  28 righe (tot 268)
[5/10] Hips         → 148 righe (tot 416)
[6/10] Neck         →  17 righe (tot 433)
[7/10] Shoulders    →  70 righe (tot 503)
[8/10] Thighs       → 125 righe (tot 628)
[9/10] Upper Arms   →  60 righe (tot 688)
[10/10] Waist        → 101 righe (tot 789)

Tot righe: 789
Per macro: {'Back': 109, 'Calves': 59, 'Chest': 72, 'Forearms': 28, 'Hips': 148, 'Neck': 17, 'Shoulders': 70, 'Thighs': 125, 'Upper Arms': 60, 'Waist': 101}
Esempio: [{'macro_group': 'Back', 'micro_group': 'General Back', 'implement': 'barbell', 'exercise_name': 'Bent-over Row', 'exercise_url': 'https://exrx.net/WeightExercises/BackGeneral/BBBentOverRow', 'list_url': 'https://exrx.net/Lists/ExList/BackWt'}, {'macro_group': 'Back', 'micro_group': 'General Back', 'implement': 'barbell', 'exercise_name': 'Bent-over Row — Underhand', 'exercise_url': 'https://ex

In [8]:
from collections import defaultdict

def build_catalog(rows: list[dict]) -> dict:
    """
    Fold the flat rows (macro, micro, implement, exercise_name, exercise_url)
    into an index keyed by exercise_url. Each entry holds:
      - name (the longest, i.e. most descriptive, name seen for that URL)
      - exercise_url
      - implements        (sorted list)
      - macro_groups      (sorted list)
      - micro_groups      (sorted list)
      - variants_by_impl  (implement -> sorted list of full names; rows with
                           no implement are filed under "unknown")
      - list_urls         (sorted list)
    """
    by_url = {}
    for row in rows:
        # Read every field first so a malformed row fails before any update.
        url = row["exercise_url"]
        name = row["exercise_name"]
        impl = row["implement"] or ""
        micro = row["micro_group"] or ""
        macro = row["macro_group"] or ""
        source_list = row["list_url"]

        entry = by_url.get(url)
        if entry is None:
            entry = by_url[url] = {
                "name": name,
                "exercise_url": url,
                "implements": set(),
                "macro_groups": set(),
                "micro_groups": set(),
                "variants_by_impl": {},
                "list_urls": set(),
            }
        elif len(name) > len(entry["name"]):
            # keep the "richest" name
            entry["name"] = name

        if impl:
            entry["implements"].add(impl)
        entry["variants_by_impl"].setdefault(impl or "unknown", set()).add(name)

        for field, value in (
            ("macro_groups", macro),
            ("micro_groups", micro),
            ("list_urls", source_list),
        ):
            if value:
                entry[field].add(value)

    # sets -> sorted lists (keys already exist, so key order is unchanged)
    for entry in by_url.values():
        for field in ("implements", "macro_groups", "micro_groups", "list_urls"):
            entry[field] = sorted(entry[field])
        entry["variants_by_impl"] = {
            impl: sorted(names) for impl, names in entry["variants_by_impl"].items()
        }
    return by_url

# Build the per-URL catalog from the crawled rows (`rows` comes from the crawl cell).
catalog = build_catalog(rows)
print("Esercizi unici:", len(catalog))
# print 3 examples
for i, (u, d) in enumerate(catalog.items()):
    if i >= 3: break
    print("\nURL:", u)
    print("  name:         ", d["name"])
    print("  implements:   ", d["implements"])
    print("  macro_groups: ", d["macro_groups"])
    print("  micro_groups: ", d["micro_groups"])
    # At most three variant names per implement, to keep the output short.
    print("  variants_by_impl sample:", {k: v[:3] for k,v in d["variants_by_impl"].items()})

Esercizi unici: 745

URL: https://exrx.net/WeightExercises/BackGeneral/BBBentOverRow
  name:          Bent-over Row
  implements:    ['barbell']
  macro_groups:  ['Back']
  micro_groups:  ['General Back']
  variants_by_impl sample: {'barbell': ['Bent-over Row']}

URL: https://exrx.net/WeightExercises/BackGeneral/BBUnderhandBentOverRow
  name:          Bent-over Row — Underhand
  implements:    ['barbell']
  macro_groups:  ['Back']
  micro_groups:  ['General Back']
  variants_by_impl sample: {'barbell': ['Bent-over Row — Underhand']}

URL: https://exrx.net/WeightExercises/BackGeneral/CBOneArmBentoverRow
  name:          One Arm Bent-over Row
  implements:    ['cable']
  macro_groups:  ['Back']
  micro_groups:  ['General Back']
  variants_by_impl sample: {'cable': ['One Arm Bent-over Row']}


In [9]:
# Preview only the first few entries: echoing the whole catalog floods the
# cell output and bloats the saved notebook.
dict(list(catalog.items())[:3])

{'https://exrx.net/WeightExercises/BackGeneral/BBBentOverRow': {'name': 'Bent-over Row',
  'exercise_url': 'https://exrx.net/WeightExercises/BackGeneral/BBBentOverRow',
  'implements': ['barbell'],
  'macro_groups': ['Back'],
  'micro_groups': ['General Back'],
  'variants_by_impl': {'barbell': ['Bent-over Row']},
  'list_urls': ['https://exrx.net/Lists/ExList/BackWt']},
 'https://exrx.net/WeightExercises/BackGeneral/BBUnderhandBentOverRow': {'name': 'Bent-over Row — Underhand',
  'exercise_url': 'https://exrx.net/WeightExercises/BackGeneral/BBUnderhandBentOverRow',
  'implements': ['barbell'],
  'macro_groups': ['Back'],
  'micro_groups': ['General Back'],
  'variants_by_impl': {'barbell': ['Bent-over Row — Underhand']},
  'list_urls': ['https://exrx.net/Lists/ExList/BackWt']},
 'https://exrx.net/WeightExercises/BackGeneral/CBOneArmBentoverRow': {'name': 'One Arm Bent-over Row',
  'exercise_url': 'https://exrx.net/WeightExercises/BackGeneral/CBOneArmBentoverRow',
  'implements': ['cab

In [10]:
import re, hashlib
from urllib.parse import urlparse

def exrx_canonical_key(ex_url: str):
    """
    Return ``(first_path_segment, last_path_segment)`` for a URL, or ``None``
    when the URL has no path segments at all.

    Example: https://exrx.net/WeightExercises/GluteusMaximus/BWSquat
             -> ('WeightExercises', 'BWSquat')
    """
    # Drop the empty pieces produced by leading, trailing or doubled slashes.
    segments = list(filter(None, urlparse(ex_url).path.split("/")))
    return (segments[0], segments[-1]) if segments else None

def merge_docs(a: dict, b: dict) -> dict:
    """Merge two catalog entries that describe the same exercise.

    Neither input is modified. The result starts as a shallow copy of `a`
    (so keys present only in `a` are carried over) and then:
      - ``name``: the longer of the two names is kept (ties keep `a`'s);
      - ``implements``, ``macro_groups``, ``micro_groups``, ``list_urls``:
        sorted union of both lists;
      - ``variants_by_impl``: per-implement sorted union of names, with
        `a`'s implements first (in `a`'s order) followed by those found only
        in `b`, so the key order is reproducible from run to run.

    Missing or ``None`` fields are treated as empty.
    """
    out = dict(a)
    # name: keep the richest one
    if len(b.get("name") or "") > len(out.get("name") or ""):
        out["name"] = b["name"]

    # set-like fields
    for k in ("implements","macro_groups","micro_groups","list_urls"):
        out[k] = sorted(set(out.get(k) or ()) | set(b.get(k) or ()))

    # variants_by_impl: union per implement, in deterministic key order
    vbi_a = out.get("variants_by_impl") or {}
    vbi_b = b.get("variants_by_impl") or {}
    ordered_impls = list(vbi_a) + [impl for impl in vbi_b if impl not in vbi_a]
    out["variants_by_impl"] = {
        impl: sorted(set(vbi_a.get(impl, ())) | set(vbi_b.get(impl, ())))
        for impl in ordered_impls
    }

    return out

In [11]:
def dedupe_by_slug(catalog: dict) -> dict:
    """
    Collapse the catalog by canonical key ``(first path segment, slug)``.

    Returns a new dict indexed by the *canonical key* (no longer by the
    individual URL). The first URL seen for a key becomes that entry's
    ``exercise_url``; every URL merged into it is listed, sorted, under
    ``_all_urls``.
    """
    merged_by_key = {}

    for source_url, entry in catalog.items():
        canon = exrx_canonical_key(source_url)
        if canon is None:
            # URLs without a usable path are filed under a separate "other" bucket.
            canon = ("other", source_url.rsplit("/", 1)[-1])

        current = merged_by_key.get(canon)
        if current is None:
            merged_by_key[canon] = {
                **entry,
                "exercise_url": source_url,   # keep one as the "preferred" URL
                "_all_urls": [source_url],    # trace of every URL merged here
            }
            continue

        combined = merge_docs(current, entry)
        combined["_all_urls"] = sorted({*combined.get("_all_urls", []), source_url})
        merged_by_key[canon] = combined

    return merged_by_key

# Collapse the per-URL catalog by canonical key and report the size change.
dedup = dedupe_by_slug(catalog)
print("Prima:", len(catalog), "→ Dopo (per slug):", len(dedup))
# Example: look up BWSquat
for k, d in dedup.items():
    # k is the canonical key tuple; k[1] is the slug (last path segment).
    if k[1].lower() == "bwsquat":
        print("\nChiave canonica:", k)
        print("URL preferito:  ", d["exercise_url"])
        print("Tutti gli URL:  ", d["_all_urls"])
        print("Macro groups:   ", d["macro_groups"])
        print("Micro groups:   ", d["micro_groups"])
        print("Implements:     ", d["implements"])
        print("Name:           ", d["name"])
        break

Prima: 745 → Dopo (per slug): 653

Chiave canonica: ('WeightExercises', 'BWSquat')
URL preferito:   https://exrx.net/WeightExercises/GluteusMaximus/BWSquat
Tutti gli URL:   ['https://exrx.net/WeightExercises/GluteusMaximus/BWSquat', 'https://exrx.net/WeightExercises/Quadriceps/BWSquat']
Macro groups:    ['Hips', 'Thighs']
Micro groups:    ['Gluteus Maximus', 'Quadriceps']
Implements:      ['bodyweight']
Name:            Squat
