Now we’ll build 02_preprocess_segments.ipynb, which will:

Load your WLASL100_flat.json metadata.

For each downloaded clip in data/raw/, extract the exact sign segment using ffmpeg based on frame_start / frame_end.

Save it under:

data/videos/{split}/{gloss}/{video_id}.mp4

Optionally verify a few clips and count per split.

### Cell 1 — Project root & imports

In [1]:
# --- Project Root Setup (run first in every notebook) ---
from pathlib import Path
import sys

root = Path("..").resolve()
sys.path.append(str(root / "src"))

print("Project root:", root)
print("Notebook cwd :", Path.cwd())


Project root: /home/falasoul/notebooks/USD/AAI-590/Capstone/AAI-590-G3-ASL
Notebook cwd : /home/falasoul/notebooks/USD/AAI-590/Capstone/AAI-590-G3-ASL/notebooks


#### Cell 2 — Config & libraries

In [2]:
import json, yaml, shutil, subprocess, math, os
from collections import Counter, defaultdict
from tqdm import tqdm

# Load config
with open(root / "configs" / "wlasl100.yaml", "r") as f:
    C = yaml.safe_load(f)

RAW_DIR     = root / C["paths"]["raw_dir"]        # data/raw
OUT_DIR     = root / C["paths"]["videos_dir"]     # data/videos
META_DIR    = root / C["paths"]["metadata_dir"]   # data/metadata
FPS_DEFAULT = C["fps"]                            # 25
OUT_DIR.mkdir(parents=True, exist_ok=True)

print("RAW_DIR:", RAW_DIR)
print("OUT_DIR:", OUT_DIR)


RAW_DIR: /home/falasoul/notebooks/USD/AAI-590/Capstone/AAI-590-G3-ASL/data/raw
OUT_DIR: /home/falasoul/notebooks/USD/AAI-590/Capstone/AAI-590-G3-ASL/data/videos


### Cell 3 — Load WLASL100 metadata & label map

In [3]:
# WLASL100 instances (flattened)
wl100 = json.load(open(META_DIR / "WLASL100_flat.json"))
print("WLASL100 instances:", len(wl100))

# Stable label map (created in 01_download_wlasl100)
label_map_path = META_DIR / "wlasl100_label_map.json"
if label_map_path.exists():
    lm = json.load(open(label_map_path))
    gloss2idx = lm["gloss2idx"]
    idx2gloss = {int(k): v for k, v in lm["idx2gloss"].items()}
else:
    # fallback: derive from wl100 present set (should match prior step)
    glosses = sorted(list({d["gloss"] for d in wl100}))
    gloss2idx = {g:i for i,g in enumerate(glosses)}
    idx2gloss = {i:g for g,i in gloss2idx.items()}

print("Classes:", len(gloss2idx))


WLASL100 instances: 2038
Classes: 100


#### map downloaded files back to their correct instances by matching URLs

In [11]:
import os

# Build url → filename mapping from what was downloaded
downloaded_files = list(RAW_DIR.glob("*.mp4"))
file_map = {f.name.lower(): f for f in downloaded_files}

# Build reverse map by last URL component (basename)
url_to_file = {}
for f in downloaded_files:
    url_to_file[f.name.lower()] = f

# Helper: extract basename from URL
def url_basename(u: str) -> str:
    return os.path.basename(u).lower()

items = []
for d in wl100:
    basename = url_basename(d["url"])
    if basename in url_to_file:
        d["local_path"] = str(url_to_file[basename])
        items.append(d)

print("Instances matched by URL basename:", len(items))
print("Split counts:", Counter(d["split"] for d in items))


Instances matched by URL basename: 743
Split counts: Counter({'train': 540, 'val': 123, 'test': 80})


### Cell 4 — Filter to clips we actually downloaded

In [12]:
# Cell 4 — Match metadata to downloaded files by URL basename
import os
from collections import Counter

downloaded_mp4s = list(RAW_DIR.glob("*.mp4"))
print("Downloaded .mp4 files found:", len(downloaded_mp4s))

# Map basename -> Path
basename_to_path = {p.name.lower(): p for p in downloaded_mp4s}

def url_basename(u: str) -> str:
    try:
        return os.path.basename(u).lower()
    except Exception:
        return ""

# Attach local_path to items that we can locate on disk
items = []
collisions = {}
for d in wl100:
    bname = url_basename(d["url"])
    p = basename_to_path.get(bname)
    if p:
        d2 = dict(d)
        d2["local_path"] = str(p)
        items.append(d2)
        # detect basename collisions (rare)
        collisions.setdefault(bname, set()).add(d2["video_id"])

print("Instances matched by URL basename:", len(items))
print("Split counts:", Counter(d["split"] for d in items))
collisions = {k:v for k,v in collisions.items() if len(v) > 1}
if collisions:
    print("⚠️ Basename collisions detected (same file used by multiple video_ids):", len(collisions))


Downloaded .mp4 files found: 804
Instances matched by URL basename: 743
Split counts: Counter({'train': 540, 'val': 123, 'test': 80})
⚠️ Basename collisions detected (same file used by multiple video_ids): 99


### Cell 5 — Trimming helpers (ffmpeg)

In [13]:
def frames_to_seconds(frame_idx: int, fps: int) -> float:
    # WLASL frames are 1-based; start frame 1 => time 0.0s
    return max(0.0, (frame_idx - 1) / float(fps))

def out_path_for(d: dict) -> Path:
    # data/videos/{split}/{gloss}/{video_id}.mp4
    return OUT_DIR / d["split"] / d["gloss"] / f'{d["video_id"]}.mp4'

def ensure_parent(p: Path):
    p.parent.mkdir(parents=True, exist_ok=True)

def trim_with_ffmpeg(src: Path, dst: Path, start_s: float, end_s: float | None):
    """
    Accurate segment trim using re-encode (ensures precise cut even when sources are quirky).
    - If end_s is None, trim from start to end of file.
    """
    ensure_parent(dst)
    # Build ffmpeg command
    if end_s is None:
        cmd = [
            "ffmpeg", "-y",
            "-ss", f"{start_s}",
            "-i", str(src),
            "-c:v", "libx264", "-c:a", "aac",
            "-movflags", "+faststart",
            str(dst)
        ]
    else:
        duration = max(0.04, end_s - start_s)
        cmd = [
            "ffmpeg", "-y",
            "-ss", f"{start_s}",
            "-i", str(src),
            "-t", f"{duration}",
            "-c:v", "libx264", "-c:a", "aac",
            "-movflags", "+faststart",
            str(dst)
        ]
    # Run ffmpeg quietly but fail on error
    subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)


#### Cell 6 — Build task list & process (with resume/skip)

In [14]:
# Cell 6 — Build task list & process (with resume/skip)
OVERWRITE = False
VERIFY_BYTES_MIN = 5_000
ERRORS = []

def item_to_times(d: dict) -> tuple[float, float | None]:
    fps = d.get("fps", FPS_DEFAULT) or FPS_DEFAULT
    fs, fe = d.get("frame_start", 1), d.get("frame_end", -1)
    start_s = max(0.0, (int(fs) - 1) / float(fps))
    end_s = None if int(fe) == -1 else max(0.0, (int(fe) - 1) / float(fps))
    if end_s is not None and end_s <= start_s:
        end_s = start_s + 1.0 / float(fps)
    return start_s, end_s

def out_path_for(d: dict) -> Path:
    # data/videos/{split}/{gloss}/{video_id}.mp4  (keep output standardized by video_id)
    return OUT_DIR / d["split"] / d["gloss"] / f'{d["video_id"]}.mp4'

tasks = []
missing_src = 0
for d in items:
    src = Path(d["local_path"])
    if not src.exists():
        missing_src += 1
        continue
    dst = out_path_for(d)
    tasks.append((d, src, dst))

print("Trim tasks:", len(tasks))
print("Missing local file despite match:", missing_src)


Trim tasks: 743
Missing local file despite match: 0


### Cell 7 — Run trimming (progress bar)

In [15]:
ok, skipped = 0, 0
for d, src, dst in tqdm(tasks, desc="Trimming"):
    try:
        if not src.exists():
            skipped += 1
            continue
        if dst.exists() and not OVERWRITE:
            # lightweight validity check
            if dst.stat().st_size >= VERIFY_BYTES_MIN:
                ok += 1
                continue
            else:
                # re-trim if file too small
                dst.unlink(missing_ok=True)

        start_s, end_s = item_to_times(d)
        trim_with_ffmpeg(src, dst, start_s, end_s)

        if dst.exists() and dst.stat().st_size >= VERIFY_BYTES_MIN:
            ok += 1
        else:
            ERRORS.append((d["video_id"], "too_small_or_missing"))

    except subprocess.CalledProcessError as e:
        ERRORS.append((d["video_id"], "ffmpeg_error"))
    except Exception as e:
        ERRORS.append((d["video_id"], f"exception:{type(e).__name__}"))

print(f"✅ Trimmed OK: {ok}")
print(f"⏭️ Skipped (no raw / already valid): {skipped}")
print(f"❌ Errors: {len(ERRORS)}")


Trimming: 100%|██████████| 743/743 [02:47<00:00,  4.44it/s]

✅ Trimmed OK: 743
⏭️ Skipped (no raw / already valid): 0
❌ Errors: 0





### Cell 8 — Save error report (if any)

In [16]:
err_path = META_DIR / "preprocess_errors.csv"
if ERRORS:
    with open(err_path, "w") as f:
        f.write("video_id,error\n")
        for vid, msg in ERRORS:
            f.write(f"{vid},{msg}\n")
    print("Error log:", err_path)
else:
    print("No errors.")


No errors.


### Cell 9 — Coverage summary (per split & per gloss)

In [17]:
# Count processed clips under data/videos/
processed = list(OUT_DIR.rglob("*.mp4"))
print("Processed files:", len(processed))

# Per split counts
per_split = Counter(p.parts[-3] for p in processed)  # {split}/{gloss}/{vid}.mp4
print("Per split:", dict(per_split))

# Per gloss (top 10)
per_gloss = Counter(p.parts[-2] for p in processed)
top10 = per_gloss.most_common(10)
print("Top 10 gloss counts:", top10)

# Compare available vs processed
avail_by_split = Counter(d["split"] for d in items)
print("Available raw by split:", dict(avail_by_split))


Processed files: 752
Per split: {'test': 81, 'val': 124, 'train': 547}
Top 10 gloss counts: [('go', 13), ('thanksgiving', 11), ('before', 11), ('shirt', 11), ('cousin', 11), ('drink', 11), ('help', 11), ('no', 10), ('how', 10), ('yes', 10)]
Available raw by split: {'train': 540, 'val': 123, 'test': 80}


#### Cell 10 — (Optional) Quick integrity probe with decord

In [18]:
# Optional: verify a few random processed clips can be opened
try:
    import random
    import decord
    decord.bridge.set_bridge('torch')

    samples = random.sample(processed, min(5, len(processed)))
    for p in samples:
        vr = decord.VideoReader(str(p))
        n = len(vr)
        print(p.name, "frames:", n)
except Exception as e:
    print("Decord probe skipped or failed:", e)


42967.mp4 frames: 43
24641.mp4 frames: 52
57945.mp4 frames: 90
24948.mp4 frames: 67
17026.mp4 frames: 109


What your counts mean

Available raw by split: train=540, val=123, test=80 → 743 instances where we found a local file by URL basename.

Processed files: train=547, val=124, test=81 → 752 trimmed clips written to data/videos/....

That small bump (752 > 743) usually happens because multiple metadata rows can point to the same source file/basename (e.g., variants or duplicate URLs across instances). Since we output by {video_id}.mp4, each distinct instance becomes its own trimmed clip—even if they originated from the same raw file—so totals can increase. This is fine and expected.

Top gloss counts also look balanced enough to proceed (some common signs have ~10–13 clips).

#### Next: create a training manifest (CSV) for PyTorch

We’ll index all processed clips with their split, gloss, label id, and the absolute path. Do this in your 02_preprocess_segments.ipynb at the end.



#### Notebook cell — build wlasl100_manifest.csv

In [19]:
# Build a manifest of processed videos: split, gloss, label, video_id, path
from pathlib import Path
import json, yaml, csv

manifest_path = root / "data" / "metadata" / "wlasl100_manifest.csv"

# Load label map created earlier
label_map_path = root / "data" / "metadata" / "wlasl100_label_map.json"
lm = json.load(open(label_map_path))
gloss2idx = lm["gloss2idx"]

rows = []
for split_dir in (root / "data" / "videos").iterdir():
    if not split_dir.is_dir():
        continue
    split = split_dir.name  # train/val/test
    for gloss_dir in split_dir.iterdir():
        if not gloss_dir.is_dir():
            continue
        gloss = gloss_dir.name
        label = gloss2idx.get(gloss)
        if label is None:
            # gloss not in top-100 map; skip (should be rare)
            continue
        for mp4 in gloss_dir.glob("*.mp4"):
            video_id = mp4.stem
            rows.append([split, gloss, label, video_id, str(mp4.resolve())])

print("Manifest rows:", len(rows))

with open(manifest_path, "w", newline="") as f:
    w = csv.writer(f)
    w.writerow(["split","gloss","label","video_id","path"])
    w.writerows(rows)

print("Saved manifest:", manifest_path)


Manifest rows: 752
Saved manifest: /home/falasoul/notebooks/USD/AAI-590/Capstone/AAI-590-G3-ASL/data/metadata/wlasl100_manifest.csv


#### Notebook cell — quick QA on the manifest

In [20]:
import pandas as pd
m = pd.read_csv(root / "data" / "metadata" / "wlasl100_manifest.csv")
print("Total:", len(m))
print("By split:\n", m.groupby("split").size())
print("Classes:", m["label"].nunique())
m.head()


Total: 752
By split:
 split
test      81
train    547
val      124
dtype: int64
Classes: 100


Unnamed: 0,split,gloss,label,video_id,path
0,test,thanksgiving,29,57640,/home/falasoul/notebooks/USD/AAI-590/Capstone/...
1,test,brown,79,69252,/home/falasoul/notebooks/USD/AAI-590/Capstone/...
2,test,time,73,58502,/home/falasoul/notebooks/USD/AAI-590/Capstone/...
3,test,bird,53,6335,/home/falasoul/notebooks/USD/AAI-590/Capstone/...
4,test,dog,36,17097,/home/falasoul/notebooks/USD/AAI-590/Capstone/...


Optional (recommended): clip duration distribution + min-per-class check

This helps you see if any gloss has too few samples.

#### Notebook cell — durations & per-class counts

In [21]:
import pandas as pd, decord, numpy as np
from collections import Counter

decord.bridge.set_bridge('torch')
m = pd.read_csv(root / "data" / "metadata" / "wlasl100_manifest.csv")

# Count per class
counts = m.groupby(["label","gloss"]).size().reset_index(name="n")
print("Per-class stats (head):")
print(counts.sort_values("n", ascending=False).head(10))
print("Min/Median/Max per class:", counts["n"].min(), counts["n"].median(), counts["n"].max())

# (Optional) durations for a sample subset (durations can be slow to compute)
sampled = m.sample(min(200, len(m)), random_state=42)
def get_frames(fp):
    try:
        vr = decord.VideoReader(fp)
        return len(vr)
    except Exception:
        return np.nan

fps = 25
sampled["n_frames"] = sampled["path"].apply(get_frames)
sampled["sec"] = sampled["n_frames"] / fps
print("Duration sec — mean/median:", sampled["sec"].mean(), sampled["sec"].median())
print("Duration sec — 5th/95th pct:", sampled["sec"].quantile(0.05), sampled["sec"].quantile(0.95))


Per-class stats (head):
    label         gloss   n
5       5            go  13
1       1         drink  11
9       9        cousin  11
3       3        before  11
12     12          help  11
29     29  thanksgiving  11
46     46         shirt  11
17     17           yes  10
86     86           how  10
13     13            no  10
Min/Median/Max per class: 3 8.0 13
Duration sec — mean/median: 2.7612 2.76
Duration sec — 5th/95th pct: 1.278 4.241999999999999


These stats look healthy and exactly what we hoped to see:

Per-class counts: min = 3, median = 8, max = 13 → mild imbalance (totally manageable).

Durations: mean/median ≈ 2.76 s @ 25 fps → ~69 frames per clip on average (5th–95th ≈ 1.3–4.24 s).

What this means for training

Clip sampling: Use 32 frames with stride=2 (i.e., sample 32 frames across ~64 original frames ≈ 2.5 s).
For shorter clips, edge-pad/loop-pad; for longer clips, uniform subsample.

Transforms: Keep it simple and safe for ASL—no horizontal flip (can invert handedness), light resize to 112×112, center crop, per-channel normalize.

Class imbalance: Use a WeightedRandomSampler on the train split (weights ∝ 1 / class frequency). No need to drop rare classes (min=3 is fine). Optionally try FocalLoss later.

Checkpoints: Save checkpoints/last.pt each epoch and checkpoints/best.pt on best val_top1.