#### Full Data set

### Cell A — Imports & paths

In [1]:
# === Cell A — Imports & paths ===
from pathlib import Path
import pandas as pd
import numpy as np

root = Path("..").resolve()
data_dir = root / "data" / "wlasl_preprocessed"

orig_manifest = data_dir / "manifest_nslt2000_roi_final_clean.csv"
assert orig_manifest.exists(), f"Missing: {orig_manifest}"

df = pd.read_csv(orig_manifest)
print("Loaded:", len(df), "rows")
print("Columns:", df.columns.tolist())
print("Unique glosses:", df["gloss"].nunique())


Loaded: 11901 rows
Columns: ['video_id', 'path', 'gloss', 'label', 'split', 'exists']
Unique glosses: 1999


### Cell B — Per-gloss 70/15/15 split on full RO

In [2]:
# === Cell B — Per-gloss 70/15/15 split on full ROI ===

# keep only glosses with at least MIN_SAMPLES total clips
MIN_SAMPLES = 7   # tweak if you want more/less strict
vc = df["gloss"].value_counts()
good_gloss = vc[vc >= MIN_SAMPLES].index
df = df[df["gloss"].isin(good_gloss)].copy()

print("After MIN_SAMPLES filter:")
print("  rows:", len(df))
print("  glosses:", df["gloss"].nunique())

dfs = []
rng = np.random.RandomState(42)

for gloss, gdf in df.groupby("gloss"):
    gdf = gdf.sample(frac=1.0, random_state=42).reset_index(drop=True)

    n = len(gdf)
    n_train = max(1, int(0.7 * n))
    n_val   = max(1, int(0.15 * n))
    n_test  = n - n_train - n_val
    if n_test <= 0:
        n_test = 1
        if n_train > 1:
            n_train -= 1

    gdf.loc[:n_train-1,         "split"] = "train"
    gdf.loc[n_train:n_train+n_val-1, "split"] = "val"
    gdf.loc[n_train+n_val:,     "split"] = "test"

    dfs.append(gdf)

df_split = pd.concat(dfs, ignore_index=True)
print("Split counts:", df_split["split"].value_counts().to_dict())


After MIN_SAMPLES filter:
  rows: 5292
  glosses: 647
Split counts: {'train': 3286, 'test': 1350, 'val': 656}


#### Cell C — Build contiguous labels & save new manifest

In [3]:
# === Cell C — Build label_new and save ===

classes = sorted(df_split["gloss"].unique())
gloss_to_new = {g: i for i, g in enumerate(classes)}
df_split["label_new"] = df_split["gloss"].map(gloss_to_new)

print("Final stats:")
print("  rows:", len(df_split))
print("  classes:", len(classes))
print("  split counts:", df_split["split"].value_counts().to_dict())

new_manifest = data_dir / "manifest_nslt2000_roi_full_resplit_70_15_15_min7.csv"
df_split.to_csv(new_manifest, index=False)
print("Saved new manifest:", new_manifest)


Final stats:
  rows: 5292
  classes: 647
  split counts: {'train': 3286, 'test': 1350, 'val': 656}
Saved new manifest: /home/falasoul/notebooks/USD/AAI-590/Capstone/AAI-590-G3-ASL/data/wlasl_preprocessed/manifest_nslt2000_roi_full_resplit_70_15_15_min7.csv
