### Cell A: Building Raw 2k manifest
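Goal: parse nslt_2000.json + WLASL_v0.3.json, map each NSLT video id to its WLASL gloss and to a file under videos/, take the train/val/test split from NSLT (falling back to the WLASL split when NSLT does not give one), and write manifest_nslt2000_raw.csv with columns: path,gloss,label,split,video_id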

In [11]:
from pathlib import Path
import json
import pandas as pd

# Repository root, resolved relative to the working directory
# (the notebook is expected to run from notebooks/, one level below the root).
root = Path("..").resolve()
data_dir = root.joinpath("data", "wlasl_preprocessed")
videos_dir = data_dir.joinpath("videos")

# nslt_2000.json  -> dict: { "05798": {"subset": "...", "action": [...]} }
# WLASL_v0.3.json -> list: [{gloss, instances: [{video_id, video_path, split}]}]
nslt_path = data_dir.joinpath("nslt_2000.json")
wlasl_path = data_dir.joinpath("WLASL_v0.3.json")

# Report where we are looking and whether each input is actually present.
print("Root:", root)
for tag, json_path in (("NSLT:", nslt_path), ("WLASL:", wlasl_path)):
    print(tag, json_path.exists(), json_path)
print("Videos dir exists:", videos_dir.exists())


Root: /home/falasoul/notebooks/USD/AAI-590/Capstone/AAI-590-G3-ASL
NSLT: True /home/falasoul/notebooks/USD/AAI-590/Capstone/AAI-590-G3-ASL/data/wlasl_preprocessed/nslt_2000.json
WLASL: True /home/falasoul/notebooks/USD/AAI-590/Capstone/AAI-590-G3-ASL/data/wlasl_preprocessed/WLASL_v0.3.json
Videos dir exists: True


In [12]:
def norm_split(s: str):
    """Map a split/subset name onto one of "train", "val" or "test".

    Matching is case-insensitive and prefix based: "tra*" -> "train",
    "val*" or "dev*" -> "val", "tes*" -> "test".

    Args:
        s: Raw split name; a falsy value (None, "") is treated as "".

    Returns:
        The canonical split name, or the lower-cased input unchanged when no
        prefix matches (so "" for falsy input).
    """
    lowered = s.lower() if s else ""
    prefix_table = (
        (("tra",), "train"),
        (("val", "dev"), "val"),
        (("tes",), "test"),
    )
    for prefixes, canonical in prefix_table:
        if lowered.startswith(prefixes):
            return canonical
    return lowered

# Load the WLASL metadata: a list of {gloss, instances: [...]} entries.
with open(wlasl_path, "r", encoding="utf-8") as f:
    wlasl = json.load(f)

# video_id -> {gloss, url_path, split_wlasl}
vid2wlasl = {}
# gloss -> [video_id, ...] in file order
gloss2ids = {}

for entry in wlasl:
    # Accept a few alternative key spellings for the class name.
    gloss = entry.get("gloss") or entry.get("label") or entry.get("name")
    if not gloss:
        continue
    for inst in entry.get("instances", []):
        raw_id = inst.get("video_id") or inst.get("id")
        # Zero-pad only a real id: "".zfill(5) is "00000", which is truthy and
        # would slip past the emptiness check below as a bogus video id.
        vid  = str(raw_id).zfill(5) if raw_id else ""
        vrel = inst.get("video_path") or inst.get("path") or inst.get("url")
        sp   = norm_split(inst.get("split") or inst.get("subset") or inst.get("phase"))
        if not (vid and vrel):
            # Instance has no id or no source location: nothing to index.
            continue
        vid2wlasl[vid] = {"gloss": gloss, "url_path": vrel, "split_wlasl": sp}
        gloss2ids.setdefault(gloss, []).append(vid)

print(f"WLASL index videos: {len(vid2wlasl)} | glosses: {len(gloss2ids)}")
print("Sample:", list(vid2wlasl.items())[:1])


WLASL index videos: 21083 | glosses: 2000
Sample: [('69241', {'gloss': 'book', 'url_path': 'http://aslbricks.org/New/ASL-Videos/book.mp4', 'split_wlasl': 'train'})]


Cell C — Join NSLT (authoritative split) → Manifest (includes video_id)

In [13]:
# Load the NSLT-2000 subset definition; its "subset" field is used as the
# authoritative train/val/test split.
with open(nslt_path, "r", encoding="utf-8") as f:
    nslt = json.load(f)   # dict keyed by video_id

rows, missing = [], []
for vid_key, meta in nslt.items():
    vid = str(vid_key).zfill(5)
    split = norm_split(meta.get("subset"))
    w = vid2wlasl.get(vid)
    if not w:
        # NSLT lists a video that the WLASL index does not contain; reported below.
        missing.append(vid)
        continue
    # Build a *candidate* local path (Cell D later keeps only paths that exist).
    # The WLASL source is usually a remote URL; joining a URL onto videos/ gives
    # a meaningless path such as videos/http:/host/..., so URL sources map to
    # videos/<video_id>.mp4. A plain relative path is still joined as given.
    source = str(w["url_path"])
    if "://" in source:
        candidate = videos_dir / f"{vid}.mp4"
    else:
        candidate = videos_dir / source
    rows.append({
        "video_id": vid,
        "path": candidate.as_posix(),
        "gloss": w["gloss"],
        # Prefer the NSLT split; fall back to the WLASL split, then to "train".
        "split": split if split in ("train","val","test") else (w["split_wlasl"] or "train"),
    })

# Declaring the columns keeps the frame well-formed when no row matched;
# otherwise drop_duplicates(subset=["video_id"]) raises KeyError on an empty frame.
df_raw = (
    pd.DataFrame(rows, columns=["video_id", "path", "gloss", "split"])
    .drop_duplicates(subset=["video_id"])
)

# Integer class labels follow the alphabetical order of the glosses present.
glosses = sorted(df_raw["gloss"].unique())
g2id = {g:i for i,g in enumerate(glosses)}
df_raw["label"] = df_raw["gloss"].map(g2id).astype(int)

# Save raw (includes video_id)
man_raw = data_dir / "manifest_nslt2000_raw.csv"
df_raw[["path","gloss","label","split","video_id"]].to_csv(man_raw, index=False)

# Persist the gloss <-> label mapping alongside the manifest.
pd.DataFrame({"gloss":glosses, "label":[g2id[g] for g in glosses]}).to_csv(
    data_dir/"class_map_nslt2000.csv", index=False
)

print(f"[manifest_nslt2000_raw.csv] rows={len(df_raw)} "
      f"| train={(df_raw['split']=='train').sum()} "
      f"val={(df_raw['split']=='val').sum()} "
      f"test={(df_raw['split']=='test').sum()} "
      f"| classes={len(glosses)}")
if missing:
    print(f"⚠️ Missing {len(missing)} video_ids in WLASL index (skipped). e.g. {missing[:10]}")
print(df_raw.head(3))


[manifest_nslt2000_raw.csv] rows=21083 | train=14289 val=3916 test=2878 | classes=2000
⚠️ Missing 12 video_ids in WLASL index (skipped). e.g. ['12209', '39347', '60721', '16096', '13422', '57839', '47639', '51153', '20138', '48251']
  video_id                                               path    gloss  split  \
0    65097  /home/falasoul/notebooks/USD/AAI-590/Capstone/...      art  train   
1    35544  /home/falasoul/notebooks/USD/AAI-590/Capstone/...       me  train   
2    32962  /home/falasoul/notebooks/USD/AAI-590/Capstone/...  lettuce    val   

   label  
0     98  
1   1071  
2   1017  


Cell D — Strict local manifest (only keep videos/<video_id>.mp4)

In [14]:
# keep rows where videos/<video_id>.mp4 exists on disk; drop everything else
def id_path_on_disk(vid):
    """Return videos_dir/<video_id>.mp4 if that path exists, else None.

    The id is converted to a string and left-padded with zeros to five
    characters before the file name is built.
    """
    candidate = videos_dir / (str(vid).zfill(5) + ".mp4")
    if candidate.exists():
        return candidate
    return None

# Resolve every manifest row to videos/<video_id>.mp4 (None when the file is absent).
local_paths = df_raw["video_id"].map(id_path_on_disk)
found = local_paths.notna()

# Keep only rows whose file exists and point "path" at the verified location.
df_strict = df_raw.loc[found].copy()
df_strict["path"] = local_paths.loc[found].map(lambda p: p.as_posix())
df_strict = df_strict.drop_duplicates(subset=["video_id"])

man_local = data_dir / "manifest_nslt2000_localid.csv"
df_strict[["path","gloss","label","split","video_id"]].to_csv(man_local, index=False)

print("videos/*.mp4 on disk:", len(list(videos_dir.glob("*.mp4"))))
print("[manifest_nslt2000_localid.csv] rows:", len(df_strict),
      "| unique paths:", df_strict["path"].nunique(),
      "| classes:", df_strict["gloss"].nunique())
print("Splits:",
      {s:int((df_strict['split']==s).sum()) for s in ('train','val','test')})
# iloc[0] raises IndexError on an empty frame, so only show an example row
# when at least one manifest video was found on disk.
if df_strict.empty:
    print("Example: none (no manifest video_id has a matching videos/<video_id>.mp4)")
else:
    print("Example:", df_strict.iloc[0][["video_id","path","gloss","label","split"]].to_dict())


videos/*.mp4 on disk: 11980
[manifest_nslt2000_localid.csv] rows: 11980 | unique paths: 11980 | classes: 2000
Splits: {'train': 8313, 'val': 2253, 'test': 1414}
Example: {'video_id': '65097', 'path': '/home/falasoul/notebooks/USD/AAI-590/Capstone/AAI-590-G3-ASL/data/wlasl_preprocessed/videos/65097.mp4', 'gloss': 'art', 'label': 98, 'split': 'train'}


Cell E — Quick distribution sanity check (optional but recommended)

In [15]:
# per-class counts (train split)
train_counts = (df_strict[df_strict["split"]=="train"]
                .groupby(["label","gloss"])["video_id"].count()
                .rename("count").reset_index()
                .sort_values("count", ascending=False))

print("Top 10 classes by train samples:")
print(train_counts.head(10).to_string(index=False))

print("\nLowest 10 classes by train samples:")
print(train_counts.tail(10).to_string(index=False))


Top 10 classes by train samples:
 label    gloss  count
   168   before     12
   164      bed     12
   789       go     11
   418     cool     11
  1797     thin     11
   568    drink     11
  1843    trade     10
   392 computer     10
   678   family     10
  1751     tall     10

Lowest 10 classes by train samples:
 label      gloss  count
   250        bug      1
   562     dragon      1
  1043    look at      1
  1867         tv      1
  1922  wash face      1
  1094       milk      1
  1410    realize      1
  1363 propaganda      1
  1361    promote      1
   448       cuba      1
