Goals: (a) audit codec/fps/pix_fmt & decode checks, (b) generate and run re-encode script to videos_clean/ (H.264/yuv420p/30fps), (c) auto-trim active window to videos_trim/, (d) build updated manifests (clean/trim).

In [4]:
# Cell A — Audit container metadata with ffprobe (codec / pix_fmt / fps / audio); no Decord decode check is run in this cell
from pathlib import Path
import subprocess

import json, pandas as pd, numpy as np

# Project root is the parent of the notebook's working directory.
root = Path("..").resolve()
data_dir = root.joinpath("data", "wlasl_preprocessed")
# Manifest variant with local ids (per its filename); its "path" column drives the audit.
man_raw = data_dir.joinpath("manifest_nslt2000_localid.csv")
df = pd.read_csv(man_raw)


def ffprobe_json(p):
    """Probe a media file with ffprobe and return its parsed JSON report.

    Parameters
    ----------
    p : str or path-like
        File to inspect; converted with ``str()`` before being handed to ffprobe.

    Returns
    -------
    dict
        The decoded ``-show_streams -show_format`` report on success, or
        ``{"error": <message>}`` when ffprobe cannot be launched, exits
        non-zero, or prints something that is not valid JSON. These failure
        modes never raise.
    """
    cmd = ["ffprobe", "-v", "error", "-print_format", "json",
           "-show_streams", "-show_format", str(p)]
    try:
        # Keep stderr out of stdout: with stderr=STDOUT any diagnostic line that
        # ffprobe prints gets mixed into the JSON text, json.loads() fails, and a
        # perfectly readable file is reported as a probe failure.
        proc = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True)
        return json.loads(proc.stdout.decode("utf-8", errors="replace"))
    except subprocess.CalledProcessError as e:
        # Prefer ffprobe's own message over the generic "returned non-zero exit status".
        detail = (e.stderr or b"").decode("utf-8", errors="replace").strip()
        return {"error": detail or str(e)}
    except (OSError, subprocess.SubprocessError, ValueError) as e:
        # OSError: ffprobe binary missing / not executable; ValueError: invalid JSON.
        return {"error": str(e)}

# Probe each unique manifest path once and keep one summary record per file.
rows = []
for p_rel in df["path"].unique():
    p = Path(p_rel)
    j = ffprobe_json(p)
    codec = pix = fps = has_audio = None
    ok = False
    if "error" not in j:
        # .get(): a report without a "streams" key counts as "no video stream"
        # (probe_ok stays False) instead of raising KeyError mid-audit.
        streams = j.get("streams") or []
        v = [s for s in streams if s.get("codec_type") == "video"]
        a = [s for s in streams if s.get("codec_type") == "audio"]
        if v:
            v = v[0]  # only the first video stream is audited
            codec = v.get("codec_name")
            pix = v.get("pix_fmt")
            r = v.get("avg_frame_rate", "0/0")  # "num/den" string
            try:
                n, d = r.split("/")
                fps = float(n) / float(d) if float(d) != 0 else None
            except (AttributeError, TypeError, ValueError):
                # Malformed rate string. A bare `except:` here would also swallow
                # KeyboardInterrupt, making this long loop impossible to stop cleanly.
                fps = None
            has_audio = 1 if a else 0
            ok = True
    rows.append({"path": p_rel, "probe_ok": ok, "codec": codec, "pix_fmt": pix, "fps": fps, "has_audio": has_audio})
aud = pd.DataFrame(rows)

audit_csv = root / "runs" / "nslt2000_audit.csv"
audit_csv.parent.mkdir(parents=True, exist_ok=True)  # runs/ may not exist on a fresh checkout
aud.to_csv(audit_csv, index=False)
print("Audit saved to runs/nslt2000_audit.csv")

# Decide which files need a re-encode.
# First coerce every audit column to a predictable dtype so None / string values
# cannot break the comparisons below.
aud["fps"] = pd.to_numeric(aud["fps"], errors="coerce")
for text_col in ("codec", "pix_fmt"):
    aud[text_col] = aud[text_col].astype(str)
aud["has_audio"] = pd.to_numeric(aud["has_audio"], errors="coerce").fillna(0).astype(int)
aud["probe_ok"] = aud["probe_ok"].fillna(False).astype(bool)


def _fps_close_to_30(x):
    """True when x is a number within 0.6 of 30 (accepts 29.97 and 30.0); missing values are False."""
    return pd.notna(x) and abs(x - 30.0) <= 0.6


fps_ok = aud["fps"].apply(_fps_close_to_30)

# A file is fine only if every check passes; anything else is queued for re-encode.
is_h264 = aud["codec"].str.lower().eq("h264")
is_yuv420p = aud["pix_fmt"].str.lower().eq("yuv420p")
no_audio = aud["has_audio"].ne(1)
need = ~(aud["probe_ok"] & is_h264 & is_yuv420p & fps_ok & no_audio)

todo = aud[need.fillna(True)]
print("Need re-encode:", len(todo), "of", len(aud))
# --- A.1 Sanity: the audit should cover every manifest path exactly once ---
print("\n[Sanity]")
for label, value in (
    ("Rows in manifest:", len(df)),
    ("Unique paths in manifest:", df["path"].nunique()),
    ("Audited rows:", len(aud)),
    ("Unique audited paths:", aud["path"].nunique()),
):
    print(label, value)

# --- A.2 Reason breakdown: one boolean column per failed check ---
reason_flags = {
    "reason_codec": ~aud["codec"].str.lower().eq("h264"),
    "reason_pixfmt": ~aud["pix_fmt"].str.lower().eq("yuv420p"),
    "reason_fps": ~fps_ok,
    "reason_audio": aud["has_audio"].eq(1),
    "reason_probe": ~aud["probe_ok"],
}
for flag_name, flag_values in reason_flags.items():
    aud[flag_name] = flag_values

print("\n[Breakdown of reasons]")
for col in ("reason_probe", "reason_codec", "reason_pixfmt", "reason_fps", "reason_audio"):
    n_flagged = int(aud[col].sum())
    print(f"{col}: {n_flagged}")

codec_counts = aud["codec"].value_counts(dropna=False).head(10)
pixfmt_counts = aud["pix_fmt"].value_counts(dropna=False).head(10)
print("\nTop codec values:\n", codec_counts)
print("\nTop pix_fmt values:\n", pixfmt_counts)

# Optional: show up to five flagged rows (fixed seed so the sample is repeatable)
print("\nSample flagged rows:")
flagged = aud[need]
display(flagged.sample(min(5, len(flagged)), random_state=0))



Audit saved to runs/nslt2000_audit.csv
Need re-encode: 8905 of 11980

[Sanity]
Rows in manifest: 11980
Unique paths in manifest: 11980
Audited rows: 11980
Unique audited paths: 11980

[Breakdown of reasons]
reason_probe: 0
reason_codec: 161
reason_pixfmt: 0
reason_fps: 3494
reason_audio: 7020

Top codec values:
 codec
h264     11819
mpeg4      161
Name: count, dtype: int64

Top pix_fmt values:
 pix_fmt
yuv420p    11980
Name: count, dtype: int64

Sample flagged rows:


Unnamed: 0,path,probe_ok,codec,pix_fmt,fps,has_audio,reason_codec,reason_pixfmt,reason_fps,reason_audio,reason_probe
8510,/home/falasoul/notebooks/USD/AAI-590/Capstone/...,True,h264,yuv420p,29.97003,1,False,False,False,True,False
6265,/home/falasoul/notebooks/USD/AAI-590/Capstone/...,True,h264,yuv420p,25.0,0,False,False,True,False,False
3399,/home/falasoul/notebooks/USD/AAI-590/Capstone/...,True,h264,yuv420p,29.97003,1,False,False,False,True,False
8454,/home/falasoul/notebooks/USD/AAI-590/Capstone/...,True,h264,yuv420p,30.003371,1,False,False,False,True,False
7011,/home/falasoul/notebooks/USD/AAI-590/Capstone/...,True,h264,yuv420p,25.0,0,False,False,True,False,False


In [5]:
#Cell B — Generate re-encode script (normalize codec/fps, strip audio)
#
#Creates videos_clean/ and a shell script you can run with GNU parallel.

# === Cell B — Generate re-encode script for flagged files ===
import shlex
from pathlib import Path

root = Path("..").resolve()
data_dir = root / "data" / "wlasl_preprocessed"
vclean = data_dir / "videos_clean"
vclean.mkdir(parents=True, exist_ok=True)

script = (root / "runs" / "nslt2000_reencode.sh")
script.parent.mkdir(parents=True, exist_ok=True)  # runs/ may not exist on a fresh checkout

# Scale shorter side to 256 (keeps aspect), force 30fps.
# -2 (not -1) makes ffmpeg round the computed side to an even number: libx264 with
# yuv420p rejects odd widths/heights, which -1 can produce for some aspect ratios.
# The filter is identical for every file, so it is built once outside the loop.
vf = "scale='if(gt(iw,ih),-2,256)':'if(gt(iw,ih),256,-2)',fps=30"

n_written = 0
missing = []  # flagged sources that no longer exist on disk
with open(script, "w", encoding="utf-8") as f:
    f.write("#!/usr/bin/env bash\nset -euo pipefail\n\n")
    for p_rel in todo["path"]:
        src = Path(p_rel)
        if not src.exists():
            missing.append(p_rel)
            continue
        # NOTE(review): outputs are keyed by basename only, so two sources with the
        # same file name in different folders would overwrite each other — confirm
        # the manifest never contains such duplicates.
        dst = vclean / src.name
        # One self-contained command per line: strip audio, H.264 yuv420p
        cmd = (
            f"ffmpeg -y -loglevel error -i {shlex.quote(str(src))} "
            f"-vf {shlex.quote(vf)} -an -c:v libx264 -pix_fmt yuv420p "
            f"-preset veryfast -crf 23 -movflags +faststart {shlex.quote(str(dst))}"
        )
        f.write(cmd + "\n")
        n_written += 1

print('Wrote:', script)
# Report skipped sources instead of dropping them silently.
print(f"Commands written: {n_written} | missing sources skipped: {len(missing)}")
print('Run with:\n  chmod +x runs/nslt2000_reencode.sh\n  parallel -j 8 < runs/nslt2000_reencode.sh')



Wrote: /home/falasoul/notebooks/USD/AAI-590/Capstone/AAI-590-G3-ASL/runs/nslt2000_reencode.sh
Run with:
  chmod +x runs/nslt2000_reencode.sh
  parallel -j 8 < runs/nslt2000_reencode.sh


Why we re-encode (and those flags)

Consistency → better training. The raw set mixes 25/29.97/30 fps clips, some files carry audio tracks, and a few are encoded with MPEG-4 instead of H.264.

Standardizing to H.264 + yuv420p + ~30 fps + no audio removes decode hiccups and gives the model clean, uniform clips.

CRF 23 (quality/size tradeoff). With libx264, -crf controls quality: lower = higher quality/larger files. 23 is a proven sweet spot; 24–26 trims size if disk is tight. (NVENC uses -cq, not -crf.)

Scale short side to 256. Keeps the aspect ratio, reduces the decode and memory cost of every frame, and makes augmentation/resizing more stable.

Strip audio. Audio is not used for this task, so it only wastes bytes, and it sometimes causes ffprobe/loader stalls.

## Re-running the audit after re-encoding

In [7]:
# === Cell C — Post-Reencode Audit (merged: prefer cleaned if present) ===
from pathlib import Path
import subprocess, json, pandas as pd, numpy as np

# Project root is the parent of the notebook's working directory.
root = Path("..").resolve()
data_dir = root.joinpath("data", "wlasl_preprocessed")
# original manifest (points to videos/)
manifest = data_dir.joinpath("manifest_nslt2000_localid.csv")
df0 = pd.read_csv(manifest)

# --- Build effective path: prefer videos_clean/<file> if it exists ---
def prefer_clean_path(p_str: str) -> str:
    """Return the re-encoded copy of a video when a usable one exists, else the original path.

    The candidate is ``data_dir / "videos_clean" / <basename of p_str>``.

    Parameters
    ----------
    p_str : str
        Path of the original video as listed in the manifest.

    Returns
    -------
    str
        The path under ``videos_clean/`` if that file exists and is non-empty,
        otherwise ``p_str`` unchanged (as a string).
    """
    clean = data_dir / "videos_clean" / Path(p_str).name
    # A failed or interrupted ffmpeg run can leave a zero-byte output behind;
    # such a file must not shadow a perfectly good original.
    if clean.is_file() and clean.stat().st_size > 0:
        return str(clean)
    return str(p_str)

# Resolve every manifest row to the file that will actually be audited,
# leaving the original manifest frame (df0) untouched.
df = df0.assign(path_effective=df0["path"].map(prefer_clean_path))

picked_clean = df["path_effective"].str.contains("/videos_clean/")
used_clean = picked_clean.sum()
used_raw = len(df) - used_clean

print(f"[Path selection] using videos_clean: {used_clean} | using original videos: {used_raw} | total: {len(df)}")

def ffprobe_json(p):
    """Probe a media file with ffprobe and return its parsed JSON report.

    Parameters
    ----------
    p : str or path-like
        File to inspect; converted with ``str()`` before being handed to ffprobe.

    Returns
    -------
    dict
        The decoded ``-show_streams -show_format`` report on success, or
        ``{"error": <message>}`` when ffprobe cannot be launched, exits
        non-zero, or prints something that is not valid JSON. These failure
        modes never raise.
    """
    cmd = ["ffprobe", "-v", "error", "-print_format", "json",
           "-show_streams", "-show_format", str(p)]
    try:
        # Keep stderr out of stdout: with stderr=STDOUT any diagnostic line that
        # ffprobe prints gets mixed into the JSON text, json.loads() fails, and a
        # perfectly readable file is reported as a probe failure.
        proc = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True)
        return json.loads(proc.stdout.decode("utf-8", errors="replace"))
    except subprocess.CalledProcessError as e:
        # Prefer ffprobe's own message over the generic "returned non-zero exit status".
        detail = (e.stderr or b"").decode("utf-8", errors="replace").strip()
        return {"error": detail or str(e)}
    except (OSError, subprocess.SubprocessError, ValueError) as e:
        # OSError: ffprobe binary missing / not executable; ValueError: invalid JSON.
        return {"error": str(e)}

# --- Collect audit over effective paths (one ffprobe call per unique file) ---
rows = []
uniq_paths = df["path_effective"].unique()
for p_rel in uniq_paths:
    j = ffprobe_json(p_rel)
    codec = pix = fps = has_audio = None
    ok = False
    if "error" not in j:
        # .get(): a report without a "streams" key counts as "no video stream"
        # (probe_ok stays False) instead of raising KeyError mid-audit.
        streams = j.get("streams") or []
        v = [s for s in streams if s.get("codec_type") == "video"]
        a = [s for s in streams if s.get("codec_type") == "audio"]
        if v:
            v = v[0]  # only the first video stream is audited
            codec = v.get("codec_name")
            pix = v.get("pix_fmt")
            r = v.get("avg_frame_rate", "0/0")  # "num/den" string
            try:
                n, d = r.split("/")
                fps = float(n) / float(d) if float(d) != 0 else None
            except (AttributeError, TypeError, ValueError):
                # Malformed rate string. A bare `except:` here would also swallow
                # KeyboardInterrupt, making this long loop impossible to stop cleanly.
                fps = None
            has_audio = 1 if a else 0
            ok = True
    rows.append({"path": p_rel, "probe_ok": ok, "codec": codec, "pix_fmt": pix, "fps": fps, "has_audio": has_audio})

aud_after = pd.DataFrame(rows)

# --- Save artifacts for reporting: the audit table plus the two path lists ---
out_audit = root / "runs" / "nslt2000_audit_after_merged.csv"
aud_after.to_csv(out_audit, index=False)

# Split the effective paths by whether they point into videos_clean/ or not.
from_clean = df["path_effective"].str.contains("/videos_clean/")
for row_mask, list_name in (
    (from_clean, "nslt2000_used_clean_paths.csv"),
    (~from_clean, "nslt2000_used_raw_paths.csv"),
):
    unique_paths = df.loc[row_mask, "path_effective"].unique()
    pd.DataFrame({"path": unique_paths}).to_csv(root / "runs" / list_name, index=False)

print(f"✅ Saved: {out_audit.name}, nslt2000_used_clean_paths.csv, nslt2000_used_raw_paths.csv")

# --- Evaluate cleanliness ---
# Coerce every audit column to a predictable dtype before comparing.
aud_after["fps"] = pd.to_numeric(aud_after["fps"], errors="coerce")
for text_col in ("codec", "pix_fmt"):
    aud_after[text_col] = aud_after[text_col].astype(str)
aud_after["has_audio"] = pd.to_numeric(aud_after["has_audio"], errors="coerce").fillna(0).astype(int)
aud_after["probe_ok"] = aud_after["probe_ok"].fillna(False).astype(bool)


def _fps_close_to_30_after(x):
    """True when x is a number within 0.6 of 30 (accepts 29.97 and 30.0); missing values are False."""
    return pd.notna(x) and abs(x - 30.0) <= 0.6


fps_ok = aud_after["fps"].apply(_fps_close_to_30_after)

# One boolean series per failed check; a file needs work if any of them is set.
bad_probe = ~aud_after["probe_ok"]
bad_codec = ~aud_after["codec"].str.lower().eq("h264")
bad_pixfmt = ~aud_after["pix_fmt"].str.lower().eq("yuv420p")
bad_fps = ~fps_ok
carries_audio = aud_after["has_audio"].eq(1)
need = bad_probe | bad_codec | bad_pixfmt | bad_fps | carries_audio

print("\n[AFTER RE-ENCODE (merged paths)]")
print(f"Total unique files audited: {len(aud_after)}")
print(f"Still needing re-encode: {int(need.sum())}")

print("\n[Breakdown of reasons]")
aud_after["reason_codec"] = bad_codec
aud_after["reason_pixfmt"] = bad_pixfmt
aud_after["reason_fps"] = bad_fps
aud_after["reason_audio"] = carries_audio
aud_after["reason_probe"] = bad_probe
for col in ("reason_probe", "reason_codec", "reason_pixfmt", "reason_fps", "reason_audio"):
    n_flagged = int(aud_after[col].sum())
    print(f"{col}: {n_flagged}")

codec_counts = aud_after["codec"].value_counts(dropna=False).head(10)
pixfmt_counts = aud_after["pix_fmt"].value_counts(dropna=False).head(10)
print("\nTop codec values:\n", codec_counts)
print("\nTop pix_fmt values:\n", pixfmt_counts)

# --- Optional: put the first ("before") audit next to this one, if it was saved ---
before_csv = root / "runs" / "nslt2000_audit.csv"
if not before_csv.exists():
    print("\nNo previous 'before' audit file found for comparison.")
else:
    aud_before = pd.read_csv(before_csv)
    n_before = len(aud_before)
    n_after = len(aud_after)
    n_still_needed = int(need.sum())
    print("\n[Comparison — before vs AFTER (merged)]")
    print(f"Before: {n_before} files (all original paths)")
    print(f"After:  {n_after} files | need re-encode now: {n_still_needed}")


[Path selection] using videos_clean: 8905 | using original videos: 3075 | total: 11980
✅ Saved: nslt2000_audit_after_merged.csv, nslt2000_used_clean_paths.csv, nslt2000_used_raw_paths.csv

[AFTER RE-ENCODE (merged paths)]
Total unique files audited: 11980
Still needing re-encode: 0

[Breakdown of reasons]
reason_probe: 0
reason_codec: 0
reason_pixfmt: 0
reason_fps: 0
reason_audio: 0

Top codec values:
 codec
h264    11980
Name: count, dtype: int64

Top pix_fmt values:
 pix_fmt
yuv420p    11980
Name: count, dtype: int64

[Comparison — before vs AFTER (merged)]
Before: 11980 files (all original paths)
After:  11980 files | need re-encode now: 0


### Merging Dataset after cleaning

Save a manifest whose `path` column already points to the cleaned file wherever one exists.



In [8]:
# Write a manifest identical to the original except that each "path" is passed
# through prefer_clean_path, so downstream loaders need no path logic of their own.
merged_manifest = data_dir / "manifest_nslt2000_merged.csv"
df_out = df0.assign(path=df0["path"].map(prefer_clean_path))
df_out.to_csv(merged_manifest, index=False)
print("✅ Wrote:", merged_manifest)

✅ Wrote: /home/falasoul/notebooks/USD/AAI-590/Capstone/AAI-590-G3-ASL/data/wlasl_preprocessed/manifest_nslt2000_merged.csv
