#### Cell 1 — Project root + imports

In [1]:
# --- Project Root Setup (run first in every notebook) ---
from pathlib import Path
import sys
root = Path("..").resolve()
sys.path.append(str(root / "src"))
print("Project root:", root)


Project root: /home/falasoul/notebooks/USD/AAI-590/Capstone/AAI-590-G3-ASL


#### Cell 2 — Load flattened JSON, compute top-100 glosses, save artifacts
This builds stable label maps and a filtered JSON just for WLASL100.


In [2]:
import json, yaml
from collections import Counter

# Load config & flat JSON
with open(root / "configs" / "wlasl100.yaml", "r") as f:
    C = yaml.safe_load(f)
flat_path = root / "data" / "metadata" / "WLASL_flat.json"
data = json.load(open(flat_path))

# Count per-gloss frequency and pick top-K
top_k = C["wlasl"]["top_k"]
gloss_counts = Counter(d["gloss"] for d in data)
most_common = gloss_counts.most_common(top_k)

# Tie-break: frequency desc, then alphabetical asc for determinism
most_common_sorted = sorted(most_common, key=lambda x: (-x[1], x[0]))
top_glosses = [g for g, _ in most_common_sorted]

# Build label maps (stable)
gloss2idx = {g: i for i, g in enumerate(top_glosses)}
idx2gloss = {i: g for g, i in gloss2idx.items()}

# Filter instances to only top-100
wl100 = [d for d in data if d["gloss"] in gloss2idx]

# Save artifacts
meta_dir = root / "data" / "metadata"
meta_dir.mkdir(parents=True, exist_ok=True)
(json.dumps(top_glosses, indent=2))
with open(meta_dir / "wlasl100_glosses.txt", "w") as f:
    f.write("\n".join(top_glosses))
with open(meta_dir / "wlasl100_label_map.json", "w") as f:
    json.dump({"gloss2idx": gloss2idx, "idx2gloss": idx2gloss}, f, indent=2)
with open(meta_dir / "WLASL100_flat.json", "w") as f:
    json.dump(wl100, f, indent=2)

print(f"Top-{top_k} glosses saved to wlasl100_glosses.txt")
print(f"Label map saved to wlasl100_label_map.json")
print(f"WLASL100 instances: {len(wl100)}")


Top-100 glosses saved to wlasl100_glosses.txt
Label map saved to wlasl100_label_map.json
WLASL100 instances: 2038


#### Cell 3 — Quick sanity: counts per split for WLASL100

In [3]:
from collections import Counter, defaultdict

split_counts = Counter(d["split"] for d in wl100)
per_gloss = defaultdict(lambda: Counter())
for d in wl100:
    per_gloss[d["gloss"]][d["split"]] += 1

print("WLASL100 split counts:", dict(split_counts))
print("Examples per gloss (min/avg/max):",
      min(sum(c.values()) for c in per_gloss.values()),
      sum(sum(c.values()) for c in per_gloss.values())/len(per_gloss),
      max(sum(c.values()) for c in per_gloss.values()))


WLASL100 split counts: {'train': 1442, 'val': 338, 'test': 258}
Examples per gloss (min/avg/max): 18 20.38 40


#### Cell 4 — Prepare download list + outputs
We’ll download to data/raw/{video_id}.mp4, create a missing.txt, and remux non-mp4 inputs to mp4 if needed.

In [4]:
import os, subprocess, shutil
from tqdm import tqdm
from urllib.parse import urlparse

raw_dir = root / "data" / "raw"
raw_dir.mkdir(parents=True, exist_ok=True)
missing = []
downloaded = 0

def ensure_mp4(path_in: Path) -> Path:
    """
    If the downloaded file is not .mp4 or not H.264/AAC container,
    remux/re-encode to mp4 using ffmpeg (stream copy if possible).
    """
    if path_in.suffix.lower() == ".mp4":
        return path_in
    path_out = path_in.with_suffix(".mp4")
    cmd = ["ffmpeg", "-y", "-i", str(path_in), "-c", "copy", str(path_out)]
    try:
        subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        path_in.unlink(missing_ok=True)
        return path_out
    except subprocess.CalledProcessError:
        # fallback: re-encode to guaranteed mp4 (H.264 + AAC)
        cmd = ["ffmpeg", "-y", "-i", str(path_in), "-c:v", "libx264", "-c:a", "aac", "-movflags", "+faststart", str(path_out)]
        subprocess.run(cmd, check=True)
        path_in.unlink(missing_ok=True)
        return path_out

def download_one(url: str, vid: str) -> bool:
    # target path (without extension yet)
    target = raw_dir / f"{vid}.mp4"
    if target.exists():
        return True
    # temp path (let yt-dlp choose extension)
    tmp = raw_dir / f"{vid}.%(ext)s"
    cmd = [
        "yt-dlp",
        "-o", str(tmp),
        "-f", "mp4/best",   # prefer mp4; fallback to best
        url
    ]
    try:
        subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        # Find the actual file that was saved (match vid.*)
        candidates = list(raw_dir.glob(f"{vid}.*"))
        if not candidates:
            return False
        final_path = ensure_mp4(candidates[0])
        return final_path.exists()
    except subprocess.CalledProcessError:
        return False

# Build a unique set of (video_id, url) to avoid duplicates
pairs = {}
for d in wl100:
    vid, url = d["video_id"], d["url"]
    if vid and url:
        pairs[vid] = url

print(f"Prepared {len(pairs)} unique videos to download.")


Prepared 2038 unique videos to download.


#### Cell 5 — Download with retries (YouTube + direct HTTP both work in yt-dlp)

In [5]:
import sys, subprocess, shutil

# Check if yt-dlp exists in PATH
print("yt-dlp path:", shutil.which("yt-dlp"))

# Install or upgrade inside current ai-env if missing
if shutil.which("yt-dlp") is None:
    subprocess.run([sys.executable, "-m", "pip", "install", "--upgrade", "yt-dlp"], check=True)

# Re-check
print("yt-dlp path after install:", shutil.which("yt-dlp"))


yt-dlp path: None
yt-dlp path after install: None


In [8]:
# Preflight: fast reachability filter to avoid long stalls
from pathlib import Path
import json, yaml, sys, time
from urllib.parse import urlparse
import requests
from tqdm import tqdm

root = Path("..").resolve()
C = yaml.safe_load(open(root / "configs" / "wlasl100.yaml"))
meta = json.load(open(root / "data" / "metadata" / "WLASL100_flat.json"))

# Build unique (video_id -> url)
pairs = {}
for d in meta:
    vid, url = d.get("video_id"), d.get("url")
    if vid and url and vid not in pairs:
        pairs[vid] = url

# Domains we know are frequently down/slow — skip for now (we'll request these)
BLOCKLIST = {"www.aslpro.com"}  # add if you see repeated timeouts: {'aslsignbank.haskins.yale.edu', ...}

def host(url: str) -> str:
    try:
        return urlparse(url).netloc.lower()
    except Exception:
        return ""

# Quick HEAD probe with short timeout; we only need to know if host is responsive at all
def is_reachable(url: str, timeout=4):
    h = host(url)
    if h in BLOCKLIST:
        return False
    try:
        # some servers don't support HEAD; allow GET fallback
        r = requests.head(url, timeout=timeout, allow_redirects=True)
        if 200 <= r.status_code < 400:
            return True
        # retry with GET for HEAD-hostile servers
        r = requests.get(url, stream=True, timeout=timeout, allow_redirects=True)
        return 200 <= r.status_code < 400
    except Exception:
        return False

reachable, unreachable = {}, []

for vid, url in tqdm(pairs.items(), desc="Preflight reachability"):
    if is_reachable(url):
        reachable[vid] = url
    else:
        unreachable.append(vid)

meta_dir = root / "data" / "metadata"
meta_dir.mkdir(parents=True, exist_ok=True)
with open(meta_dir / "preflight_unreachable.txt", "w") as f:
    f.write("\n".join(unreachable))

print(f"Reachable: {len(reachable)} / {len(pairs)}")
print(f"Preflight unreachable saved to: {meta_dir / 'preflight_unreachable.txt'}")

# Use only reachable URLs for the heavy yt-dlp step:
pairs = reachable


Preflight reachability: 100%|██████████| 2038/2038 [09:38<00:00,  3.52it/s]

Reachable: 1480 / 2038
Preflight unreachable saved to: /home/falasoul/notebooks/USD/AAI-590/Capstone/AAI-590-G3-ASL/data/metadata/preflight_unreachable.txt





In [9]:
from yt_dlp import YoutubeDL
from pathlib import Path
import json, yaml, shutil, subprocess, sys
from time import sleep
from urllib.parse import urlparse, urlunparse
from tqdm import tqdm

root = Path("..").resolve()
with open(root / "configs" / "wlasl100.yaml", "r") as f:
    C = yaml.safe_load(f)

meta = json.load(open(root / "data" / "metadata" / "WLASL100_flat.json"))
raw_dir = root / "data" / "raw"
raw_dir.mkdir(parents=True, exist_ok=True)

# ---------- helpers ----------
def normalize_url(u: str) -> str:
    # Yale Signbank has TLS hostname mismatch; prefer http
    try:
        p = urlparse(u)
        host = (p.netloc or "").lower()
        if "aslsignbank.haskins.yale.edu" in host:
            p = p._replace(scheme="http")
            return urlunparse(p)
        return u
    except Exception:
        return u

def ensure_mp4(path_in: Path) -> Path:
    if path_in.suffix.lower() == ".mp4":
        return path_in
    dst = path_in.with_suffix(".mp4")
    ffmpeg = shutil.which("ffmpeg")
    if not ffmpeg:
        return path_in  # can't remux
    # try stream copy then re-encode
    try:
        subprocess.run(["ffmpeg","-y","-i",str(path_in),"-c","copy","-movflags","+faststart",str(dst)],
                       check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        path_in.unlink(missing_ok=True)
        return dst
    except subprocess.CalledProcessError:
        subprocess.run(["ffmpeg","-y","-i",str(path_in),"-c:v","libx264","-c:a","aac","-movflags","+faststart",str(dst)],
                       check=True)
        path_in.unlink(missing_ok=True)
        return dst

def is_youtube(url: str) -> bool:
    n = urlparse(url).netloc.lower()
    return "youtube.com" in n or "youtu.be" in n

# unique (video_id -> url)
pairs = {}
for d in meta:
    vid, url = d.get("video_id"), d.get("url")
    if vid and url and vid not in pairs:
        pairs[vid] = normalize_url(url)

ffmpeg_path = shutil.which("ffmpeg")
print("ffmpeg:", ffmpeg_path or "NOT FOUND (install with apt)")

# ---- NEW: filter to preflight reachable only (if file exists) ----
preflight_path = root / "data" / "metadata" / "preflight_unreachable.txt"
if preflight_path.exists():
    unreachable_ids = set(x.strip() for x in preflight_path.read_text().splitlines() if x.strip())
    before = len(pairs)
    pairs = {vid: url for vid, url in pairs.items() if vid not in unreachable_ids}
    print(f"Filtered out {before - len(pairs)} preflight-unreachable; {len(pairs)} remain.")
else:
    print("No preflight_unreachable.txt found; using all pairs.")

# Base options (no Android by default)
base_opts = {
    "outtmpl": str(raw_dir / "%(id)s.%(ext)s"),
    "format": "mp4/bestvideo+bestaudio/best",
    "merge_output_format": "mp4",
    "noprogress": True,
    "quiet": True,
    "geo_bypass": True,
}

# ---- NEW: use cookies only if present ----
cookiefile = root / "data" / "metadata" / "youtube_cookies.txt"
if cookiefile.exists():
    base_opts["cookiefile"] = str(cookiefile)
    print(f"Using YouTube cookies file: {cookiefile}")
else:
    print("No YouTube cookies file found; proceeding unauthenticated.")

# Try multiple yt client profiles (avoid Android PO token unless last resort)
yt_client_profiles = [
    {"youtube": {"player_client": ["web"]}},
    {"youtube": {"player_client": ["ios"]}},
    {"youtube": {"player_client": ["tvhtml5"]}},
    {"youtube": {"player_client": ["android"]}},  # last resort
]

def try_download_one(vid: str, url: str, retries=1) -> str:
    """Return 'ok' | 'skip' | 'fail'."""
    if (raw_dir / f"{vid}.mp4").exists():
        return "ok"

    # non-YouTube
    if not is_youtube(url):
        opts = dict(base_opts)
        netloc = urlparse(url).netloc.lower()
        if "aslsignbank.haskins.yale.edu" in netloc:
            opts["nocheckcertificate"] = True  # per-host only
        for _ in range(retries + 1):
            try:
                with YoutubeDL(opts) as ydl:
                    ydl.download([url])
                cand = list(raw_dir.glob(f"{vid}.*"))
                if cand:
                    ensure_mp4(cand[0])
                if (raw_dir / f"{vid}.mp4").exists():
                    return "ok"
            except Exception as e:
                msg = str(e).lower()
                if any(x in msg for x in ["404", "not found", "removed", "unavailable"]):
                    return "skip"
                sleep(1.0)
        return "fail"

    # YouTube
    for client_args in yt_client_profiles:
        opts = dict(base_opts)
        opts["extractor_args"] = {"youtube": client_args["youtube"]}
        try:
            with YoutubeDL(opts) as ydl:
                ydl.download([url])
            cand = list(raw_dir.glob(f"{vid}.*"))
            if cand:
                ensure_mp4(cand[0])
            if (raw_dir / f"{vid}.mp4").exists():
                return "ok"
        except Exception as e:
            msg = str(e).lower()
            unavailable = [
                "video unavailable", "private video", "copyright",
                "has been removed", "not available", "410", "403", "404"
            ]
            if any(x in msg for x in unavailable):
                return "skip"
            continue
    return "fail"

# Download loop: non-YouTube first
order = list(pairs.items())
non_yt = [(v,u) for v,u in order if not is_youtube(u)]
yt = [(v,u) for v,u in order if is_youtube(u)]
sequence = non_yt + yt

ok = 0; skipped = []; failed = []
for vid, url in tqdm(sequence, desc="Downloading"):
    status = try_download_one(vid, url)
    if status == "ok":
        ok += 1
    elif status == "skip":
        skipped.append(vid)
    else:
        failed.append(vid)

meta_dir = root / "data" / "metadata"
meta_dir.mkdir(parents=True, exist_ok=True)
with open(meta_dir / "skipped_unavailable.txt", "w") as f:
    f.write("\n".join(skipped))
with open(meta_dir / "missing.txt", "w") as f:
    f.write("\n".join(failed))

print(f"✅ OK: {ok}")
print(f"⚠️ Skipped (unavailable): {len(skipped)}  -> data/metadata/skipped_unavailable.txt")
print(f"❌ Failed (retry/request): {len(failed)}   -> data/metadata/missing.txt")


ffmpeg: /usr/bin/ffmpeg
Filtered out 558 preflight-unreachable; 1480 remain.
Using YouTube cookies file: /home/falasoul/notebooks/USD/AAI-590/Capstone/AAI-590-G3-ASL/data/metadata/youtube_cookies.txt


[download] Got error: HTTPSConnectionPool(host='www.signingsavvy.com', port=443): Read timed out. (read timeout=20.0)
Downloading:  11%|█         | 158/1480 [07:14<2:28:05,  6.72s/it]ERROR: [generic] Unable to download webpage: HTTPSConnectionPool(host='media.asldeafined.com', port=443): Read timed out. (read timeout=20.0) (caused by TransportError("HTTPSConnectionPool(host='media.asldeafined.com', port=443): Read timed out. (read timeout=20.0)"))
[download] Got error: 2332456 bytes read, 753835 more expected]ERROR: 
Downloading:  48%|████▊     | 711/1480 [36:37<09:35,  1.34it/s]ERROR: [generic] Unable to download webpage: HTTPSConnectionPool(host='www.signingsavvy.com', port=443): Read timed out. (read timeout=20.0) (caused by TransportError("HTTPSConnectionPool(host='www.signingsavvy.com', port=443): Read timed out. (read timeout=20.0)"))
ERROR: [youtube] 1QOYOZ3g-aY: Video unavailable
         player = https://www.youtube.com/s/player/c6d7bdc9/player_ias.vflset/en_US/base.js
       

✅ OK: 7
⚠️ Skipped (unavailable): 611  -> data/metadata/skipped_unavailable.txt
❌ Failed (retry/request): 862   -> data/metadata/missing.txt
