In [1]:
from pathlib import Path

NOTEBOOK_DIR = Path("")

DATASET_ROOT = NOTEBOOK_DIR / "UCF_Crime"
PROCESSED_DIR = DATASET_ROOT / "processed"

print("Dataset root:", DATASET_ROOT, DATASET_ROOT.exists())
print("Processed dir:", PROCESSED_DIR, PROCESSED_DIR.exists())
print("Index exists:", (PROCESSED_DIR / "preprocess_index.csv").exists())


Dataset root: /home/diego/Escritorio/Pruebas/tesispython/UCF_Crime False
Processed dir: /home/diego/Escritorio/Pruebas/tesispython/UCF_Crime/processed False
Index exists: False


# Paso 1 — Obtención y validación de datos (UCF-Crime)

## Objetivo
Validar que el dataset UCF-Crime está correctamente ubicado y estructurado en disco, y que las carpetas/clases contienen videos legibles antes de ejecutar cualquier preprocesamiento, extracción de embeddings o entrenamiento.

## Estructura esperada
La estructura de referencia del dataset (según README oficial) se organiza como:

`UCF_CRIME/Videos/<Clase>/video.ext`

En esta etapa se trabaja con:
- **Fase 1 (Validación técnica):** subconjunto de anomalías (Abuse, Arrest, Arson, Assault) + normal provisional.
- **Fase 2 (Experimento completo):** anomalías completas + normales oficiales (`Training_Normal_Videos_Anomaly` y `Testing_Normal_Videos_Anomaly`) + splits oficiales.

## Resultado esperado
El notebook debe imprimir mensajes del tipo:
- `Abuse: cargado correctamente (N videos)`
- `Arrest: cargado correctamente (N videos)`
y generar un reporte reproducible.


## Paso 1.1 — Configuración (Código)

In [34]:
from pathlib import Path
from datetime import datetime
import json

# =========================
# CONFIGURACIÓN PRINCIPAL
# =========================
DATASET_ROOT = Path("UCF_Crime")
VIDEOS_DIR = DATASET_ROOT / "Videos"

# Extensiones soportadas
VIDEO_EXTS = {".mp4", ".avi", ".mkv", ".mov", ".webm"}

# Clases oficiales UCF-Crime (13 anomalías)
ANOMALY_FOLDERS = [
    "Abuse", "Arrest", "Arson", "Assault", "Burglary", "Explosion",
    "Fighting", "Robbery", "RoadAccidents", "Shooting", "Shoplifting",
    "Stealing", "Vandalism"
]

# Normales oficiales (anomaly detection)
NORMAL_OFFICIAL = ["Training_Normal_Videos_Anomaly", "Testing_Normal_Videos_Anomaly"]

# Normal opcional (event recognition) — puede usarse como placeholder en Fase 1
NORMAL_OPTIONAL = ["Normal_Videos_event"]

print("DATASET_ROOT:", DATASET_ROOT)
print("VIDEOS_DIR:", VIDEOS_DIR)


DATASET_ROOT: UCF_Crime
VIDEOS_DIR: UCF_Crime/Videos


## Paso 1.2 — Funciones de validación (Código)

In [35]:
def list_videos(folder: Path):
    """Lista videos válidos dentro de una carpeta (recursivo)."""
    if not folder.exists() or not folder.is_dir():
        return []
    vids = []
    for p in folder.rglob("*"):
        if p.is_file() and p.suffix.lower() in VIDEO_EXTS:
            vids.append(p)
    return sorted(vids)

def check_folder(name: str, folder: Path, required: bool):
    """
    Revisa existencia + conteo de videos.
    Retorna dict con estado para reporte.
    """
    info = {
        "name": name,
        "path": str(folder),
        "exists": folder.exists(),
        "video_count": 0,
        "status": None
    }

    if not folder.exists():
        info["status"] = "MISSING" if required else "NOT_PRESENT_OPTIONAL"
        return info

    vids = list_videos(folder)
    info["video_count"] = len(vids)
    if len(vids) == 0:
        info["status"] = "EMPTY"
    else:
        info["status"] = "OK"

    return info

def print_status(info):
    """Imprime estado en formato 'presentable'."""
    if info["status"] == "OK":
        print(f"{info['name']}: cargado correctamente ({info['video_count']} videos)")
    elif info["status"] == "EMPTY":
        print(f"{info['name']}: VACÍA (0 videos)")
    elif info["status"] == "MISSING":
        print(f"{info['name']}: FALTA (no existe carpeta)")
    else:
        print(f"{info['name']}: NO PRESENTE (opcional)")


## Paso 1.3 — Validación de estructura base (Código)

In [36]:
print("=== Paso 1.3 — Validación de estructura base ===")

if not VIDEOS_DIR.exists():
    raise FileNotFoundError(f"No existe la carpeta esperada: {VIDEOS_DIR}")

print(f"OK: carpeta 'Videos' encontrada en: {VIDEOS_DIR}")


=== Paso 1.3 — Validación de estructura base ===
OK: carpeta 'Videos' encontrada en: UCF_Crime/Videos


## Paso 1.4 — Validación de clases (subconjunto y detección global) (Código)

In [37]:
print("\n=== Paso 1.4 — Validación de clases disponibles ===")

report = {
    "timestamp": datetime.now().isoformat(timespec="seconds"),
    "dataset_root": str(DATASET_ROOT),
    "videos_dir": str(VIDEOS_DIR),
    "anomalies": [],
    "normals_official": [],
    "normals_optional": [],
}

# 1) Anomalías (listadas como opcionales porque puede faltar parte del dataset aún)
print("\n---- Anomalías (oficiales) ----")
for cls in ANOMALY_FOLDERS:
    info = check_folder(cls, VIDEOS_DIR / cls, required=False)
    report["anomalies"].append(info)
    print_status(info)

# 2) Normales oficiales
print("\n---- Normales (oficiales anomaly detection) ----")
for nf in NORMAL_OFFICIAL:
    info = check_folder(nf, VIDEOS_DIR / nf, required=False)
    report["normals_official"].append(info)
    print_status(info)

# 3) Normal opcional (placeholder)
print("\n---- Normales (opcionales / placeholder) ----")
for nf in NORMAL_OPTIONAL:
    info = check_folder(nf, VIDEOS_DIR / nf, required=False)
    report["normals_optional"].append(info)
    print_status(info)



=== Paso 1.4 — Validación de clases disponibles ===

---- Anomalías (oficiales) ----
Abuse: cargado correctamente (50 videos)
Arrest: cargado correctamente (50 videos)
Arson: cargado correctamente (50 videos)
Assault: cargado correctamente (50 videos)
Burglary: NO PRESENTE (opcional)
Explosion: NO PRESENTE (opcional)
Fighting: NO PRESENTE (opcional)
Robbery: NO PRESENTE (opcional)
RoadAccidents: NO PRESENTE (opcional)
Shooting: NO PRESENTE (opcional)
Shoplifting: NO PRESENTE (opcional)
Stealing: NO PRESENTE (opcional)
Vandalism: NO PRESENTE (opcional)

---- Normales (oficiales anomaly detection) ----
Training_Normal_Videos_Anomaly: NO PRESENTE (opcional)
Testing_Normal_Videos_Anomaly: NO PRESENTE (opcional)

---- Normales (opcionales / placeholder) ----
Normal_Videos_event: cargado correctamente (50 videos)


## Paso 1.5 — Checklist Fase 1 vs Fase 2 (Código)

In [38]:
print("\n=== Paso 1.5 — Checklist de preparación ===")

# Conteos
anomaly_ok = [x for x in report["anomalies"] if x["status"] == "OK"]
normal_off_ok = [x for x in report["normals_official"] if x["status"] == "OK"]
normal_opt_ok = [x for x in report["normals_optional"] if x["status"] == "OK"]

total_anomaly_videos = sum(x["video_count"] for x in anomaly_ok)
total_normal_off = sum(x["video_count"] for x in normal_off_ok)
total_normal_opt = sum(x["video_count"] for x in normal_opt_ok)

print(f"Anomalías presentes: {len(anomaly_ok)} / {len(ANOMALY_FOLDERS)} (videos: {total_anomaly_videos})")
print(f"Normales oficiales presentes: {len(normal_off_ok)} / 2 (videos: {total_normal_off})")
print(f"Normales placeholder presentes: {len(normal_opt_ok)} / 1 (videos: {total_normal_opt})")

# Fase 1: al menos 1 anomalía + algún normal
fase1_ok = (len(anomaly_ok) > 0) and ((total_normal_off + total_normal_opt) > 0)

# Fase 2: normales oficiales ambos presentes
fase2_ok = (len(normal_off_ok) == 2)

print("\nResultado:")
print("FASE 1 (validación pipeline):", "OK" if fase1_ok else "NO OK")
print("FASE 2 (experimento completo):", "OK" if fase2_ok else "PENDIENTE (faltan normales oficiales)")



=== Paso 1.5 — Checklist de preparación ===
Anomalías presentes: 4 / 13 (videos: 200)
Normales oficiales presentes: 0 / 2 (videos: 0)
Normales placeholder presentes: 1 / 1 (videos: 50)

Resultado:
FASE 1 (validación pipeline): OK
FASE 2 (experimento completo): PENDIENTE (faltan normales oficiales)


## Paso 1.6 — Validación explícita de tu subconjunto + normal elegido (Código)

In [39]:
print("\n=== Paso 1.6 — Subconjunto de trabajo (Fase 1) ===")

SUBSET = ["Abuse", "Arrest", "Arson", "Assault"]

subset_infos = []
subset_ok = True

for cls in SUBSET:
    info = check_folder(cls, VIDEOS_DIR / cls, required=True)
    subset_infos.append(info)
    print_status(info)
    subset_ok = subset_ok and (info["status"] == "OK")

# Elegir normal (prioriza oficial si existe; si no, placeholder)
normal_choice = None
for candidate in ["Training_Normal_Videos_Anomaly", "Normal_Videos_event"]:
    info = check_folder(candidate, VIDEOS_DIR / candidate, required=False)
    if info["status"] == "OK":
        normal_choice = candidate
        print(f"\nNormal seleccionado para esta fase: {normal_choice}")
        break

if subset_ok and normal_choice is not None:
    print("\nSubconjunto: LISTO para pasar a preprocesamiento y extracción de embeddings.")
else:
    print("\nSubconjunto: NO listo. Revisa carpetas faltantes/vacías o agrega una carpeta de normales.")



=== Paso 1.6 — Subconjunto de trabajo (Fase 1) ===
Abuse: cargado correctamente (50 videos)
Arrest: cargado correctamente (50 videos)
Arson: cargado correctamente (50 videos)
Assault: cargado correctamente (50 videos)

Normal seleccionado para esta fase: Normal_Videos_event

Subconjunto: LISTO para pasar a preprocesamiento y extracción de embeddings.


## Paso 1.7 — Guardar reporte como evidencia (Código)

In [40]:
output_path = DATASET_ROOT / "report_dataset_validation.json"
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(report, f, indent=2, ensure_ascii=False)

print(f"\nReporte guardado en: {output_path}")



Reporte guardado en: UCF_Crime/report_dataset_validation.json


# Paso 2 — Generación de splits (train/val/test) y etiquetas

## Objetivo
Construir un índice reproducible de videos y generar particiones `train/val/test` para una formulación binaria:
- **1 = anomalía** (todas las clases anómalas)
- **0 = normal**

En esta fase se trabaja con el subconjunto:
- Anomalías: Abuse, Arrest, Arson, Assault
- Normal: Normal_Videos_event (placeholder)

Se guardan:
- `metadata.csv`
- `splits/train.txt`, `splits/val.txt`, `splits/test.txt`


## Paso 2.1 — Parámetros de split (Código)

In [41]:
import random
import pandas as pd

# Reproducibilidad
SEED = 42
random.seed(SEED)

# Subconjunto actual (Fase 1)
ANOMALY_SUBSET = ["Abuse", "Arrest", "Arson", "Assault"]
NORMAL_FOLDER = "Normal_Videos_event"  # placeholder actual

# Proporciones de split
TRAIN_RATIO = 0.70
VAL_RATIO   = 0.15
TEST_RATIO  = 0.15

assert abs((TRAIN_RATIO + VAL_RATIO + TEST_RATIO) - 1.0) < 1e-9
print("Parámetros OK:", TRAIN_RATIO, VAL_RATIO, TEST_RATIO, "SEED:", SEED)


Parámetros OK: 0.7 0.15 0.15 SEED: 42


## Paso 2.2 — Construir índice de videos (Código)

In [42]:
from pathlib import Path

def collect_videos_from_class(class_name: str, label: int):
    folder = VIDEOS_DIR / class_name
    vids = list_videos(folder)
    rows = []
    for v in vids:
        rows.append({
            "video_id": v.stem,
            "path": str(v.relative_to(DATASET_ROOT).as_posix()),  # path relativo (portable)
            "original_class": class_name,
            "label": label
        })
    return rows

rows = []

# Anomalías -> label 1
for cls in ANOMALY_SUBSET:
    rows.extend(collect_videos_from_class(cls, label=1))

# Normal -> label 0
rows.extend(collect_videos_from_class(NORMAL_FOLDER, label=0))

df = pd.DataFrame(rows)

print("Total videos indexados:", len(df))
print(df["original_class"].value_counts())
print(df["label"].value_counts())
df.head()


Total videos indexados: 250
original_class
Abuse                  50
Arrest                 50
Arson                  50
Assault                50
Normal_Videos_event    50
Name: count, dtype: int64
label
1    200
0     50
Name: count, dtype: int64


Unnamed: 0,video_id,path,original_class,label
0,Abuse001_x264,Videos/Abuse/Abuse001_x264.mp4,Abuse,1
1,Abuse002_x264,Videos/Abuse/Abuse002_x264.mp4,Abuse,1
2,Abuse003_x264,Videos/Abuse/Abuse003_x264.mp4,Abuse,1
3,Abuse004_x264,Videos/Abuse/Abuse004_x264.mp4,Abuse,1
4,Abuse005_x264,Videos/Abuse/Abuse005_x264.mp4,Abuse,1


## Paso 2.3 — Split estratificado por clase (recomendado para Fase 1) (Código)

In [43]:
def stratified_split(df_in: pd.DataFrame, group_col: str):
    train_parts, val_parts, test_parts = [], [], []

    for group, gdf in df_in.groupby(group_col):
        idx = list(gdf.index)
        random.shuffle(idx)

        n = len(idx)
        n_train = int(round(n * TRAIN_RATIO))
        n_val   = int(round(n * VAL_RATIO))
        n_test  = n - n_train - n_val  # asegura suma exacta

        train_idx = idx[:n_train]
        val_idx   = idx[n_train:n_train+n_val]
        test_idx  = idx[n_train+n_val:]

        train_parts.append(df_in.loc[train_idx])
        val_parts.append(df_in.loc[val_idx])
        test_parts.append(df_in.loc[test_idx])

    train_df = pd.concat(train_parts).sample(frac=1, random_state=SEED).reset_index(drop=True)
    val_df   = pd.concat(val_parts).sample(frac=1, random_state=SEED).reset_index(drop=True)
    test_df  = pd.concat(test_parts).sample(frac=1, random_state=SEED).reset_index(drop=True)

    return train_df, val_df, test_df

train_df, val_df, test_df = stratified_split(df, group_col="original_class")

print("Tamaños:", len(train_df), len(val_df), len(test_df))
print("\nTrain label counts:\n", train_df["label"].value_counts())
print("\nVal label counts:\n", val_df["label"].value_counts())
print("\nTest label counts:\n", test_df["label"].value_counts())


Tamaños: 175 40 35

Train label counts:
 label
1    140
0     35
Name: count, dtype: int64

Val label counts:
 label
1    32
0     8
Name: count, dtype: int64

Test label counts:
 label
1    28
0     7
Name: count, dtype: int64


## Paso 2.4 — Validación del split (Código)

In [44]:
def split_report(name, sdf):
    print(f"\n--- {name} ---")
    print("Total:", len(sdf))
    print("Por clase:\n", sdf["original_class"].value_counts())
    print("Binario:\n", sdf["label"].value_counts())

split_report("TRAIN", train_df)
split_report("VAL", val_df)
split_report("TEST", test_df)



--- TRAIN ---
Total: 175
Por clase:
 original_class
Normal_Videos_event    35
Arson                  35
Assault                35
Arrest                 35
Abuse                  35
Name: count, dtype: int64
Binario:
 label
1    140
0     35
Name: count, dtype: int64

--- VAL ---
Total: 40
Por clase:
 original_class
Arson                  8
Arrest                 8
Assault                8
Abuse                  8
Normal_Videos_event    8
Name: count, dtype: int64
Binario:
 label
1    32
0     8
Name: count, dtype: int64

--- TEST ---
Total: 35
Por clase:
 original_class
Assault                7
Arrest                 7
Arson                  7
Normal_Videos_event    7
Abuse                  7
Name: count, dtype: int64
Binario:
 label
1    28
0     7
Name: count, dtype: int64


## Paso 2.5 — Guardar metadata.csv y archivos splits/*.txt (Código)

In [45]:
import os

OUT_META = DATASET_ROOT / "metadata.csv"
SPLITS_DIR = DATASET_ROOT / "splits"
SPLITS_DIR.mkdir(parents=True, exist_ok=True)

# Guardar metadata completa
df.to_csv(OUT_META, index=False)
print("Guardado:", OUT_META)

# Guardar splits como listas de paths (uno por línea)
(train_df["path"]).to_csv(SPLITS_DIR / "train.txt", index=False, header=False)
(val_df["path"]).to_csv(SPLITS_DIR / "val.txt",   index=False, header=False)
(test_df["path"]).to_csv(SPLITS_DIR / "test.txt",  index=False, header=False)

print("Splits guardados en:", SPLITS_DIR)
print(" -", SPLITS_DIR / "train.txt")
print(" -", SPLITS_DIR / "val.txt")
print(" -", SPLITS_DIR / "test.txt")


Guardado: UCF_Crime/metadata.csv
Splits guardados en: UCF_Crime/splits
 - UCF_Crime/splits/train.txt
 - UCF_Crime/splits/val.txt
 - UCF_Crime/splits/test.txt


## Evidencia generada
- `metadata.csv`: índice completo de videos con clase original y etiqueta binaria.
- `splits/train.txt`, `splits/val.txt`, `splits/test.txt`: particiones reproducibles basadas en paths relativos.


# Parte 3 — Preprocesamiento por split (train/val/test)

## Objetivo
Convertir cada video listado en `splits/train.txt`, `splits/val.txt`, `splits/test.txt` en un clip estándar de `N_FRAMES` frames, respetando los splits para evitar fuga de información.

## Decisiones (Fase 1)
- FPS objetivo: 30 (según README del dataset)
- Clip por video: 1 (representativo)
- Frames por clip: `N_FRAMES` (ej. 32)
- Muestreo: uniforme en la duración del video (evita sesgo por inicio/final)

## Salidas
- `processed/<split>/frames/<video_id>/frame_0001.jpg ...`
- `processed/preprocess_index.csv` (evidencia reproducible)


## Celda 3.1 — Parámetros y carpetas de salida

In [46]:
from pathlib import Path
import pandas as pd
import numpy as np
import os
from datetime import datetime

# Parámetros fase 1
FPS_TARGET = 30
N_FRAMES = 32

PROCESSED_DIR = DATASET_ROOT / "processed"
for sp in ["train", "val", "test"]:
    (PROCESSED_DIR / sp / "frames").mkdir(parents=True, exist_ok=True)

print("PROCESSED_DIR:", PROCESSED_DIR)
print("FPS_TARGET:", FPS_TARGET, "N_FRAMES:", N_FRAMES)


PROCESSED_DIR: UCF_Crime/processed
FPS_TARGET: 30 N_FRAMES: 32


## Celda 3.2 — Cargar splits (train/val/test)

In [47]:
def load_split_paths(split_name: str):
    p = DATASET_ROOT / "splits" / f"{split_name}.txt"
    paths = [line.strip() for line in p.read_text(encoding="utf-8").splitlines() if line.strip()]
    return paths

split_paths = {sp: load_split_paths(sp) for sp in ["train", "val", "test"]}
{k: len(v) for k, v in split_paths.items()}


{'train': 175, 'val': 40, 'test': 35}

## Celda 3.3 — Backend de lectura de video (OpenCV) + verificación

In [48]:
try:
    import cv2
    cv2_available = True
except Exception as e:
    cv2_available = False
    cv2_err = str(e)

print("OpenCV disponible:", cv2_available)
if not cv2_available:
    print("Error OpenCV:", cv2_err)


OpenCV disponible: True


## Celda 3.4 — Verificar FFmpeg/FFprobe (fallback técnico)

In [49]:
import subprocess

def ffmpeg_exists():
    try:
        subprocess.run(["ffmpeg", "-version"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True)
        subprocess.run(["ffprobe", "-version"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True)
        return True
    except Exception:
        return False

print("FFmpeg/FFprobe disponible:", ffmpeg_exists())


FFmpeg/FFprobe disponible: True


## Celda 3.5 — Helpers de rutas e ID reproducible por video

In [50]:
def resolve_video_abs(rel_path: str) -> Path:
    # split guarda rutas relativas a DATASET_ROOT
    return (DATASET_ROOT / rel_path).resolve()

def make_video_id(rel_path: str) -> str:
    # ID reproducible basado en la ruta (sin extensión)
    p = Path(rel_path)
    no_ext = p.with_suffix("")
    return "__".join(no_ext.parts)


## Celda 3.6 — Funciones OpenCV: muestreo uniforme por índice + guardado JPG

In [51]:
def get_video_info_cv2(video_path: Path):
    cap = cv2.VideoCapture(str(video_path))
    if not cap.isOpened():
        cap.release()
        return None
    frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    fps = cap.get(cv2.CAP_PROP_FPS)
    cap.release()
    if frame_count <= 0:
        return None
    return {"frame_count": frame_count, "fps": float(fps) if fps else None}

def read_frame_at_index_cv2(video_path: Path, idx: int):
    cap = cv2.VideoCapture(str(video_path))
    if not cap.isOpened():
        cap.release()
        return None
    cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
    ok, frame_bgr = cap.read()
    cap.release()
    if (not ok) or (frame_bgr is None):
        return None
    return frame_bgr  # BGR

def extract_uniform_clip_cv2(video_path: Path, n_frames: int):
    info = get_video_info_cv2(video_path)
    if info is None:
        return None, {"ok": False, "reason": "cannot_open_or_probe"}

    T = info["frame_count"]
    if T < 2:
        return None, {"ok": False, "reason": "too_few_frames", "frame_count": T}

    indices = np.linspace(0, T - 1, n_frames).astype(int).tolist()

    frames_bgr = []
    for idx in indices:
        fr = read_frame_at_index_cv2(video_path, idx)
        if fr is None:
            return None, {
                "ok": False,
                "reason": "frame_read_failed",
                "failed_index": idx,
                "frame_count": T,
                "source_fps": info["fps"],
                "indices": indices
            }
        frames_bgr.append(fr)

    return frames_bgr, {
        "ok": True,
        "frame_count": T,
        "source_fps": info["fps"],
        "indices": indices
    }

def save_frames_as_jpg_bgr(frames_bgr, out_dir: Path):
    out_dir.mkdir(parents=True, exist_ok=True)
    saved = 0
    for i, fr_bgr in enumerate(frames_bgr, start=1):
        out_path = out_dir / f"frame_{i:04d}.jpg"
        if cv2.imwrite(str(out_path), fr_bgr):
            saved += 1
    return saved

print("Funciones OpenCV listas.")


Funciones OpenCV listas.


## Celda 3.7 — Fallback FFmpeg: extracción simple (solo si OpenCV falla)

In [52]:
def extract_frames_ffmpeg_fallback(video_abs: Path, out_dir: Path, n_frames: int, fps: int):
    """
    Fallback simple: extrae n_frames a fps desde el inicio, clip de duración n_frames/fps.
    """
    out_dir.mkdir(parents=True, exist_ok=True)
    out_pattern = str(out_dir / "frame_%04d.jpg")
    clip_seconds = n_frames / fps

    cmd = [
        "ffmpeg", "-y",
        "-i", str(video_abs),
        "-vf", f"fps={fps}",
        "-t", f"{clip_seconds:.3f}",
        "-vframes", str(n_frames),
        "-q:v", "2",
        out_pattern
    ]
    proc = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
    if proc.returncode != 0:
        return {"ok": False, "reason": "ffmpeg_failed", "stderr": proc.stderr[-400:]}

    frames = sorted(out_dir.glob("frame_*.jpg"))
    if len(frames) < n_frames:
        return {"ok": False, "reason": "insufficient_frames", "extracted": len(frames)}

    if len(frames) > n_frames:
        for f in frames[n_frames:]:
            f.unlink()

    return {"ok": True, "extracted": n_frames}

print("Fallback FFmpeg listo.")


Fallback FFmpeg listo.


## Celda 3.8 — Preprocesamiento por split (train/val/test) + registro reproducible (CSV)

In [53]:
rows = []
run_ts = datetime.now().isoformat(timespec="seconds")

for split_name in ["train", "val", "test"]:
    rel_list = split_paths[split_name]
    print(f"\n== Preprocesando {split_name} | videos: {len(rel_list)} ==")

    for i, rel_path in enumerate(rel_list, start=1):
        video_abs = resolve_video_abs(rel_path)
        video_id = make_video_id(rel_path)
        out_dir = PROCESSED_DIR / split_name / "frames" / video_id

        # 1) Skip si ya existe completo
        existing = sorted(out_dir.glob("frame_*.jpg"))
        if len(existing) >= N_FRAMES:
            rows.append({
                "run_ts": run_ts,
                "split": split_name,
                "rel_path": rel_path,
                "video_abs": str(video_abs),
                "video_id": video_id,
                "method": "skip_exists",
                "status": "ok",
                "saved": len(existing),
                "source_fps": None,
                "frame_count": None,
                "failed_index": None,
                "reason": None,
                "stderr_tail": None,
            })
            continue

        # 2) Verificar existencia del archivo
        if not video_abs.exists():
            rows.append({
                "run_ts": run_ts,
                "split": split_name,
                "rel_path": rel_path,
                "video_abs": str(video_abs),
                "video_id": video_id,
                "method": "none",
                "status": "fail",
                "saved": 0,
                "source_fps": None,
                "frame_count": None,
                "failed_index": None,
                "reason": "missing_file",
                "stderr_tail": None,
            })
            continue

        # 3) Intento OpenCV (uniforme por índices)
        frames_bgr, meta = extract_uniform_clip_cv2(video_abs, N_FRAMES)
        if meta["ok"]:
            saved = save_frames_as_jpg_bgr(frames_bgr, out_dir)
            status = "ok" if saved == N_FRAMES else "fail"

            rows.append({
                "run_ts": run_ts,
                "split": split_name,
                "rel_path": rel_path,
                "video_abs": str(video_abs),
                "video_id": video_id,
                "method": "opencv_uniform",
                "status": status,
                "saved": saved,
                "source_fps": meta.get("source_fps"),
                "frame_count": meta.get("frame_count"),
                "failed_index": None,
                "reason": None if status == "ok" else "save_failed",
                "stderr_tail": None,
            })

        else:
            # 4) Fallback FFmpeg si OpenCV falla
            fb = extract_frames_ffmpeg_fallback(video_abs, out_dir, N_FRAMES, FPS_TARGET)
            rows.append({
                "run_ts": run_ts,
                "split": split_name,
                "rel_path": rel_path,
                "video_abs": str(video_abs),
                "video_id": video_id,
                "method": "ffmpeg_fallback",
                "status": "ok" if fb.get("ok") else "fail",
                "saved": fb.get("extracted", 0),
                "source_fps": meta.get("source_fps"),
                "frame_count": meta.get("frame_count"),
                "failed_index": meta.get("failed_index"),
                "reason": meta.get("reason") if not fb.get("ok") else "opencv_failed_used_fallback",
                "stderr_tail": fb.get("stderr"),
            })

        if i % 25 == 0:
            print(f"  Progreso {split_name}: {i}/{len(rel_list)}")

index_df = pd.DataFrame(rows)
index_csv = PROCESSED_DIR / "preprocess_index.csv"
index_df.to_csv(index_csv, index=False, encoding="utf-8")

print("\nListo. CSV:", index_csv)
index_df.head()



== Preprocesando train | videos: 175 ==

== Preprocesando val | videos: 40 ==

== Preprocesando test | videos: 35 ==

Listo. CSV: UCF_Crime/processed/preprocess_index.csv


Unnamed: 0,run_ts,split,rel_path,video_abs,video_id,method,status,saved,source_fps,frame_count,failed_index,reason,stderr_tail
0,2026-01-04T16:38:03,train,Videos/Normal_Videos_event/Normal_Videos_597_x...,/home/diego/Escritorio/Pruebas/tesispython/UCF...,Videos__Normal_Videos_event__Normal_Videos_597...,skip_exists,ok,32,,,,,
1,2026-01-04T16:38:03,train,Videos/Normal_Videos_event/Normal_Videos_603_x...,/home/diego/Escritorio/Pruebas/tesispython/UCF...,Videos__Normal_Videos_event__Normal_Videos_603...,skip_exists,ok,32,,,,,
2,2026-01-04T16:38:03,train,Videos/Arson/Arson046_x264.mp4,/home/diego/Escritorio/Pruebas/tesispython/UCF...,Videos__Arson__Arson046_x264,skip_exists,ok,32,,,,,
3,2026-01-04T16:38:03,train,Videos/Assault/Assault002_x264.mp4,/home/diego/Escritorio/Pruebas/tesispython/UCF...,Videos__Assault__Assault002_x264,skip_exists,ok,32,,,,,
4,2026-01-04T16:38:03,train,Videos/Normal_Videos_event/Normal_Videos_656_x...,/home/diego/Escritorio/Pruebas/tesispython/UCF...,Videos__Normal_Videos_event__Normal_Videos_656...,skip_exists,ok,32,,,,,


## Celda 3.9 — Validación rápida (sanity checks)

In [54]:
# Resumen por split / método / estado
summary = index_df.groupby(["split", "method", "status"]).size().reset_index(name="count")
summary.sort_values(["split", "status", "method"])


Unnamed: 0,split,method,status,count
0,test,skip_exists,ok,35
1,train,skip_exists,ok,175
2,val,skip_exists,ok,40


## Celda 3.10 — Inspeccionar fallos (si existen)

In [55]:
bad = index_df[index_df["status"] != "ok"]
print("Fallos:", len(bad))
bad.head(20)


Fallos: 0


Unnamed: 0,run_ts,split,rel_path,video_abs,video_id,method,status,saved,source_fps,frame_count,failed_index,reason,stderr_tail


## Celda 3.11 — Verificar que un par de carpetas tengan N_FRAMES

In [56]:
def count_frames_in_dir(d: Path):
    return len(list(d.glob("frame_*.jpg")))

sample_ok = index_df[index_df["status"]=="ok"].head(5)
for _, r in sample_ok.iterrows():
    d = PROCESSED_DIR / r["split"] / "frames" / r["video_id"]
    print(r["split"], r["video_id"], "frames:", count_frames_in_dir(d))


train Videos__Normal_Videos_event__Normal_Videos_597_x264 frames: 32
train Videos__Normal_Videos_event__Normal_Videos_603_x264 frames: 32
train Videos__Arson__Arson046_x264 frames: 32
train Videos__Assault__Assault002_x264 frames: 32
train Videos__Normal_Videos_event__Normal_Videos_656_x264 frames: 32


# Parte 4 — Dataset (clips desde frames) + preparación para Codificador/LoRA

En esta parte conectamos el preprocesamiento con el modelo:

1. Definimos configuración e imports.
2. Construimos un `Dataset` que carga **clips de 32 frames** desde `processed/{split}/frames/{video_id}`.
3. Cargamos el índice maestro (`processed/preprocess_index.csv`) y los splits (`splits/train.txt`).
4. Normalizamos IDs para que los splits coincidan con el `video_id` interno.
5. Inferimos `label` binario (normal vs anomalía) para VAD.
6. Hacemos un sanity check: shapes y etiqueta.


## 4.1 Imports y configuración global

Definimos el dispositivo, tamaño del clip (número de frames) y rutas base del proyecto.


In [57]:
# ================================
# Parte 4.1 — Imports y Config
# ================================
import torch
import pandas as pd
from pathlib import Path
from PIL import Image
from torchvision import transforms
from torch.utils.data import Dataset

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print("Usando dispositivo:", DEVICE)

# Cantidad de frames por clip (debe coincidir con tu Parte 3)
N_FRAMES = 32

# Rutas (ajustadas a tu estructura real)
NOTEBOOK_DIR = Path("/home/diego/Escritorio/Pruebas/tesispython")
DATASET_ROOT = NOTEBOOK_DIR / "UCF_Crime"
PROCESSED_DIR = DATASET_ROOT / "processed"

print("DATASET_ROOT:", DATASET_ROOT, "exists:", DATASET_ROOT.exists())
print("PROCESSED_DIR:", PROCESSED_DIR, "exists:", PROCESSED_DIR.exists())
print("Index exists:", (PROCESSED_DIR / "preprocess_index.csv").exists())



Usando dispositivo: cpu
DATASET_ROOT: /home/diego/Escritorio/Pruebas/tesispython/UCF_Crime exists: True
PROCESSED_DIR: /home/diego/Escritorio/Pruebas/tesispython/UCF_Crime/processed exists: True
Index exists: True


## 4.2 Dataset de frames (clip de T frames)

Este `Dataset` carga un clip de `N_FRAMES` imágenes desde:

`processed/{split}/frames/{video_id}/frame_0001.jpg ... frame_0032.jpg`

Devuelve:
- `video_id`
- `clip`: tensor `[T, C, H, W]`
- `label`: entero (0 normal, 1 anomalía)


In [58]:
# ================================
# Parte 4.2 — Dataset de Frames
# ================================
class FrameClipDataset(Dataset):
    def __init__(self, df, processed_dir, split, n_frames=32, transform=None):
        self.df = df.reset_index(drop=True)
        self.processed_dir = processed_dir
        self.split = split
        self.n_frames = n_frames
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        video_id = row["video_id"]
        label = int(row["label"]) if "label" in row else -1

        frames_dir = self.processed_dir / self.split / "frames" / video_id
        frame_paths = [frames_dir / f"frame_{i:04d}.jpg" for i in range(1, self.n_frames + 1)]

        frames = []
        for p in frame_paths:
            img = Image.open(p).convert("RGB")
            if self.transform:
                img = self.transform(img)
            frames.append(img)

        clip = torch.stack(frames, dim=0)  # [T, C, H, W]
        return {"video_id": video_id, "clip": clip, "label": label}



## 4.3 Construcción del `train_df` desde índice + splits

- Cargamos el índice maestro desde: `processed/preprocess_index.csv`
- Cargamos el split desde: `splits/train.txt`
- Normalizamos los IDs del split para que coincidan con el formato interno `video_id` del índice:

Split original:
`Videos/Arrest/Arrest001_x264.mp4`

Formato interno:
`Videos__Arrest__Arrest001_x264`


In [59]:
# ================================
# Parte 4.3 — Índice + split + normalización
# ================================

# 1) Índice maestro
index_df = pd.read_csv(PROCESSED_DIR / "preprocess_index.csv")
print("Index cargado:", index_df.shape)

# 2) Split train (txt)
with open(DATASET_ROOT / "splits" / "train.txt") as f:
    train_ids = [l.strip() for l in f if l.strip()]

print("Train IDs:", len(train_ids))

# 3) Normalización de IDs (split path -> video_id interno)
def split_path_to_index_id(p):
    """
    Convierte:
    Videos/Arrest/Arrest001_x264.mp4
    →
    Videos__Arrest__Arrest001_x264
    """
    p = Path(p)
    return f"{p.parts[0]}__{p.parts[1]}__{p.stem}"

train_ids_norm = [split_path_to_index_id(x) for x in train_ids]
print("Ejemplo normalizado:", train_ids_norm[:5])

# 4) Filtrar índice usando IDs normalizados
train_df = index_df[index_df["video_id"].isin(train_ids_norm)].reset_index(drop=True)
print("Train DF:", train_df.shape)

train_df.head()


Index cargado: (250, 13)
Train IDs: 175
Ejemplo normalizado: ['Videos__Normal_Videos_event__Normal_Videos_597_x264', 'Videos__Normal_Videos_event__Normal_Videos_603_x264', 'Videos__Arson__Arson046_x264', 'Videos__Assault__Assault002_x264', 'Videos__Normal_Videos_event__Normal_Videos_656_x264']
Train DF: (175, 13)


Unnamed: 0,run_ts,split,rel_path,video_abs,video_id,method,status,saved,source_fps,frame_count,failed_index,reason,stderr_tail
0,2026-01-04T16:38:03,train,Videos/Normal_Videos_event/Normal_Videos_597_x...,/home/diego/Escritorio/Pruebas/tesispython/UCF...,Videos__Normal_Videos_event__Normal_Videos_597...,skip_exists,ok,32,,,,,
1,2026-01-04T16:38:03,train,Videos/Normal_Videos_event/Normal_Videos_603_x...,/home/diego/Escritorio/Pruebas/tesispython/UCF...,Videos__Normal_Videos_event__Normal_Videos_603...,skip_exists,ok,32,,,,,
2,2026-01-04T16:38:03,train,Videos/Arson/Arson046_x264.mp4,/home/diego/Escritorio/Pruebas/tesispython/UCF...,Videos__Arson__Arson046_x264,skip_exists,ok,32,,,,,
3,2026-01-04T16:38:03,train,Videos/Assault/Assault002_x264.mp4,/home/diego/Escritorio/Pruebas/tesispython/UCF...,Videos__Assault__Assault002_x264,skip_exists,ok,32,,,,,
4,2026-01-04T16:38:03,train,Videos/Normal_Videos_event/Normal_Videos_656_x...,/home/diego/Escritorio/Pruebas/tesispython/UCF...,Videos__Normal_Videos_event__Normal_Videos_656...,skip_exists,ok,32,,,,,


## 4.3.b Etiquetado binario (VAD)

Convertimos UCF-Crime a **VAD binario**:

- `Normal_Videos_event` → label = 0 (normal)
- resto de clases → label = 1 (anomalía)

La etiqueta se infiere desde el `video_id` interno.


In [60]:
# ================================
# Parte 4.3.b — Inferir label binario
# ================================
def infer_label_from_video_id(video_id):
    if "__Normal_Videos_event__" in video_id:
        return 0
    else:
        return 1

train_df["label"] = train_df["video_id"].apply(infer_label_from_video_id)

print("Distribución de labels:")
print(train_df["label"].value_counts())


Distribución de labels:
label
1    140
0     35
Name: count, dtype: int64


## 4.4 Transform y sanity check (shape, label)

Definimos una transformación base para frames.
En esta etapa solo validamos que:

- el clip tiene shape `[32, 3, 384, 384]`
- el label es 0 o 1


In [61]:
# ================================
# Parte 4.4 — Transform + Dataset + Test
# ================================
transform = transforms.Compose([
    transforms.Resize((384, 384)),  # SigLIP suele trabajar bien con 384
    transforms.ToTensor(),
])

dataset = FrameClipDataset(
    train_df,
    PROCESSED_DIR,
    split="train",
    n_frames=N_FRAMES,
    transform=transform
)

sample = dataset[0]
print("Video ID:", sample["video_id"])
print("Clip shape:", sample["clip"].shape)  # esperado: [32, 3, 384, 384]
print("Label:", sample["label"])            # esperado: 0 o 1


Video ID: Videos__Normal_Videos_event__Normal_Videos_597_x264
Clip shape: torch.Size([32, 3, 384, 384])
Label: 0


## 4.5 Carga del codificador visual (SigLIP)

Se utiliza SigLIP como codificador visual preentrenado.
En esta etapa **NO se entrena** el modelo: solo se valida la extracción
de embeddings a partir de clips de frames.


In [62]:
# ================================
# Parte 4.5 — Cargar SigLIP
# ================================
from transformers import AutoProcessor, AutoModel

SIGLIP_NAME = "google/siglip-so400m-patch14-384"

processor = AutoProcessor.from_pretrained(SIGLIP_NAME)
encoder = AutoModel.from_pretrained(SIGLIP_NAME).to(DEVICE)
encoder.eval()

print("SigLIP cargado correctamente")


SigLIP cargado correctamente


## 4.6 Extracción de embedding de video (sin LoRA)

Cada frame del clip se procesa individualmente con SigLIP.
Luego, los embeddings temporales se agregan mediante **average pooling**
para obtener una representación única por video.


In [63]:
# ================================
# Parte 4.6 — Encoding del clip
# ================================
@torch.no_grad()
def encode_clip_siglip(encoder, processor, clip):
    """
    clip: torch.Tensor [T, C, H, W]
    return: torch.Tensor [1, D]
    """
    # Convertir frames a PIL
    imgs = [transforms.ToPILImage()(clip[t]) for t in range(clip.shape[0])]

    # Preprocesamiento SigLIP
    inputs = processor(images=imgs, return_tensors="pt").to(DEVICE)

    # Forward
    feats = encoder.get_image_features(**inputs)  # [T, D]

    # Pooling temporal
    video_emb = feats.mean(dim=0, keepdim=True)   # [1, D]
    return video_emb

# Test con un sample
emb = encode_clip_siglip(encoder, processor, sample["clip"])
print("Embedding shape:", emb.shape)


Embedding shape: torch.Size([1, 1152])


## 4.7 Congelamiento del codificador

Antes de aplicar LoRA, se congelan todos los parámetros del codificador
para asegurar que solo los adaptadores entrenables modifiquen el modelo.


In [64]:
# ================================
# Parte 4.7 — Freeze del encoder
# ================================
for p in encoder.parameters():
    p.requires_grad = False

print("Encoder congelado")


Encoder congelado


## 4.8 Adaptación del codificador mediante LoRA

Se emplea LoRA (Low-Rank Adaptation) para adaptar de forma eficiente
ciertas capas del mecanismo de atención, reduciendo el número de
parámetros entrenables y el costo computacional.


In [65]:
# ================================
# Parte 4.8 — LoRA (PEFT)
# ================================
from peft import LoraConfig, get_peft_model

lora_cfg = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    target_modules=["q_proj", "v_proj"]
)

encoder_lora = get_peft_model(encoder, lora_cfg).to(DEVICE)
encoder_lora.train()

# Contar parámetros
trainable = sum(p.numel() for p in encoder_lora.parameters() if p.requires_grad)
total = sum(p.numel() for p in encoder_lora.parameters())

print(f"Parámetros entrenables: {trainable:,}")
print(f"Parámetros totales:    {total:,}")
print(f"Porcentaje entrenable: {100 * trainable / total:.4f}%")


Parámetros entrenables: 1,990,656
Parámetros totales:    879,951,154
Porcentaje entrenable: 0.2262%


## 4.9 Validación del forward con SigLIP + LoRA

Se verifica que la incorporación de LoRA no altera la dimensionalidad
del embedding y que el forward pass es estable.


In [66]:
# ================================
# Parte 4.9 — Test forward con LoRA
# ================================
emb_lora = encode_clip_siglip(encoder_lora, processor, sample["clip"])
print("Embedding LoRA shape:", emb_lora.shape)


Embedding LoRA shape: torch.Size([1, 1152])


# Parte 5 — Extracción de embeddings (Baseline)

En esta etapa se extraen embeddings de video utilizando el codificador
SigLIP **preentrenado y congelado**, sin adaptación adicional (LoRA).

Estos embeddings constituyen el **baseline** del sistema y serán usados
para entrenar y evaluar un clasificador MLP en la Parte 6.


## 5.1 Construcción de DataFrames por split (train / val / test)

Se reutiliza el índice maestro y los splits originales, normalizando IDs
y asignando etiquetas binarias de VAD.


In [67]:
# ================================
# Parte 5.1 — DataFrames por split
# ================================

def build_split_df(split_name):
    # Leer IDs del split
    with open(DATASET_ROOT / "splits" / f"{split_name}.txt") as f:
        ids = [l.strip() for l in f if l.strip()]

    # Normalizar IDs
    ids_norm = [split_path_to_index_id(x) for x in ids]

    # Filtrar índice
    df = index_df[index_df["video_id"].isin(ids_norm)].reset_index(drop=True)

    # Inferir label binario
    df["label"] = df["video_id"].apply(infer_label_from_video_id)

    print(f"{split_name}: {df.shape}")
    return df

train_df = build_split_df("train")
val_df   = build_split_df("val")
test_df  = build_split_df("test")


train: (175, 14)
val: (40, 14)
test: (35, 14)


## 5.2 Organización de salida de embeddings

Los embeddings se almacenan en disco para evitar reprocesar frames
en experimentos posteriores.


In [68]:
# ================================
# Parte 5.2 — Directorios de salida
# ================================
FEATURE_ROOT = NOTEBOOK_DIR / "features" / "siglip_baseline"
FEATURE_ROOT.mkdir(parents=True, exist_ok=True)

for split in ["train", "val", "test"]:
    (FEATURE_ROOT / split).mkdir(exist_ok=True)

print("Feature root:", FEATURE_ROOT)


Feature root: /home/diego/Escritorio/Pruebas/tesispython/features/siglip_baseline


## 5.3 Extracción y guardado de embeddings

Para cada video:
1. Se cargan los frames (clip).
2. Se obtiene el embedding de video con SigLIP.
3. Se guarda un archivo `.npy` por video.
4. Se registra `video_id` y `label` en un CSV.


In [69]:
# ================================
# Parte 5.3 — Extracción de embeddings
# ================================
import numpy as np
from tqdm import tqdm

def extract_and_save_embeddings(df, split_name, encoder, processor):
    rows = []

    dataset = FrameClipDataset(
        df,
        PROCESSED_DIR,
        split=split_name,
        n_frames=N_FRAMES,
        transform=transform
    )

    out_dir = FEATURE_ROOT / split_name

    for i in tqdm(range(len(dataset)), desc=f"Extract {split_name}"):
        sample = dataset[i]
        video_id = sample["video_id"]
        label = sample["label"]
        clip = sample["clip"]

        emb = encode_clip_siglip(encoder, processor, clip)  # [1, D]
        emb = emb.squeeze(0).cpu().numpy()                  # [D]

        np.save(out_dir / f"{video_id}.npy", emb)

        rows.append({
            "video_id": video_id,
            "label": label,
            "feature_path": str(out_dir / f"{video_id}.npy")
        })

    # Guardar índice de features
    pd.DataFrame(rows).to_csv(out_dir / "labels.csv", index=False)
    print(f"Guardado {split_name}: {len(rows)} embeddings")


## 5.4 Ejecución de la extracción (Baseline)

Se extraen embeddings para los conjuntos de entrenamiento,
validación y prueba.


In [70]:
# ================================
# Parte 5.4 — Run extracción baseline
# ================================
encoder.eval()  # asegurarse que está congelado

extract_and_save_embeddings(train_df, "train", encoder, processor)
extract_and_save_embeddings(val_df,   "val",   encoder, processor)
extract_and_save_embeddings(test_df,  "test",  encoder, processor)


Extract train:  20%|██        | 35/175 [1:02:48<4:11:15, 107.68s/it]


KeyboardInterrupt: 

## 5.5 Verificación de embeddings guardados


In [71]:
# ================================
# Parte 5.5 — Check rápido
# ================================
sample_row = pd.read_csv(FEATURE_ROOT / "train" / "labels.csv").iloc[0]
emb = np.load(sample_row["feature_path"])

print("Embedding shape:", emb.shape)
print("Label:", sample_row["label"])


FileNotFoundError: [Errno 2] No such file or directory: '/home/diego/Escritorio/Pruebas/tesispython/features/siglip_baseline/train/labels.csv'

# Parte 5B — Extracción rápida (Subset)

Objetivo:
- Validar extremo a extremo que el pipeline funciona (frames → SigLIP → .npy + labels.csv)
- Evitar tiempos largos ejecutando solo un subconjunto (p. ej., 10–20 videos)
- Dejar el sistema listo para correr el conjunto completo en otro momento


## 5B.1 Subset de entrenamiento y directorio de salida

Se seleccionan N videos del train para validar la extracción.


In [None]:
# ================================
# Parte 5B.1 — Subset + Output dir
# ================================
DEBUG_N = 10   # súbelo a 20 si quieres, pero 10 es suficiente para validar

train_df_debug = train_df.iloc[:DEBUG_N].copy()

FEATURE_DEBUG_ROOT = NOTEBOOK_DIR / "features" / "siglip_baseline_debug"
(FEATURE_DEBUG_ROOT / "train_debug").mkdir(parents=True, exist_ok=True)

print("DEBUG_N:", DEBUG_N)
print("Salida:", FEATURE_DEBUG_ROOT / "train_debug")
print("train_df_debug shape:", train_df_debug.shape)


DEBUG_N: 10
Salida: /home/diego/Escritorio/Pruebas/tesispython/features/siglip_baseline_debug/train_debug
train_df_debug shape: (10, 14)


## 5B.2 Extracción de embeddings para el subset

Se guarda un `.npy` por video y un `labels.csv` para el split debug.


In [None]:
# ================================
# Parte 5B.2 — Extracción subset
# ================================
import numpy as np
from tqdm import tqdm

def extract_and_save_embeddings_debug(df, split_name, encoder, processor, out_root):
    rows = []

    dataset = FrameClipDataset(
        df,
        PROCESSED_DIR,
        split="train",           # usamos frames del split train
        n_frames=N_FRAMES,
        transform=transform
    )

    out_dir = out_root / split_name
    out_dir.mkdir(parents=True, exist_ok=True)

    encoder.eval()

    for i in tqdm(range(len(dataset)), desc=f"Extract {split_name}"):
        sample = dataset[i]
        video_id = sample["video_id"]
        label = sample["label"]
        clip = sample["clip"]

        emb = encode_clip_siglip(encoder, processor, clip)  # [1, D]
        emb = emb.squeeze(0).cpu().numpy()                  # [D]

        np.save(out_dir / f"{video_id}.npy", emb)

        rows.append({
            "video_id": video_id,
            "label": label,
            "feature_path": str(out_dir / f"{video_id}.npy")
        })

    pd.DataFrame(rows).to_csv(out_dir / "labels.csv", index=False)
    print(f"Guardado {split_name}: {len(rows)} embeddings")
    return out_dir

debug_out_dir = extract_and_save_embeddings_debug(
    train_df_debug,
    split_name="train_debug",
    encoder=encoder,
    processor=processor,
    out_root=FEATURE_DEBUG_ROOT
)


Extract train_debug: 100%|██████████| 10/10 [19:37<00:00, 117.76s/it]

Guardado train_debug: 10 embeddings





## 5B.3 Verificación de archivos generados

Se valida que:
- existe `labels.csv`
- existen archivos `.npy`
- el embedding tiene dimensión esperada (1152)


In [None]:
# ================================
# Parte 5B.3 — Check outputs
# ================================
labels_path = debug_out_dir / "labels.csv"
print("labels.csv exists:", labels_path.exists(), "-", labels_path)

labels_df_debug = pd.read_csv(labels_path)
print("labels_df_debug shape:", labels_df_debug.shape)
display(labels_df_debug.head(3))

first_path = labels_df_debug.iloc[0]["feature_path"]
emb = np.load(first_path)
print("Primer embedding shape:", emb.shape)
print("Primer label:", labels_df_debug.iloc[0]["label"])


labels.csv exists: True - /home/diego/Escritorio/Pruebas/tesispython/features/siglip_baseline_debug/train_debug/labels.csv
labels_df_debug shape: (10, 3)


Unnamed: 0,video_id,label,feature_path
0,Videos__Normal_Videos_event__Normal_Videos_597...,0,/home/diego/Escritorio/Pruebas/tesispython/fea...
1,Videos__Normal_Videos_event__Normal_Videos_603...,0,/home/diego/Escritorio/Pruebas/tesispython/fea...
2,Videos__Arson__Arson046_x264,1,/home/diego/Escritorio/Pruebas/tesispython/fea...


Primer embedding shape: (1152,)
Primer label: 0


# Parte 6 — Clasificador MLP (modo debug)

En esta etapa se entrena un clasificador MLP simple sobre los embeddings
extraídos en la Parte 5B (subset), con el objetivo de:

- Validar el pipeline de clasificación
- Verificar métricas y pérdidas
- Preparar el código para el entrenamiento completo posterior


## 6.1 Carga de embeddings y etiquetas

Se cargan los embeddings `.npy` y las etiquetas desde `labels.csv`.


In [None]:
# ================================
# Parte 6.1 — Cargar features
# ================================
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

DEBUG_FEATURE_DIR = FEATURE_DEBUG_ROOT / "train_debug"
labels_df = pd.read_csv(DEBUG_FEATURE_DIR / "labels.csv")

X = []
y = []

for _, row in labels_df.iterrows():
    emb = np.load(row["feature_path"])
    X.append(emb)
    y.append(row["label"])

X = np.stack(X)   # [N, 1152]
y = np.array(y)   # [N]

print("X shape:", X.shape)
print("y shape:", y.shape)


X shape: (10, 1152)
y shape: (10,)


## 6.2 Split entrenamiento / validación (debug)

Dado el tamaño reducido, se utiliza un split simple solo para validar el flujo.


In [None]:
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

print("Train:", X_train.shape, y_train.shape)
print("Val:",   X_val.shape,   y_val.shape)


Train: (7, 1152) (7,)
Val: (3, 1152) (3,)


## 6.3 Definición del clasificador MLP

Se utiliza un MLP simple con una capa oculta.
Este modelo se entrenará únicamente sobre embeddings.


In [None]:
# ================================
# Parte 6.3 — MLP
# ================================
import torch
import torch.nn as nn

class MLPClassifier(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, 1)
        )

    def forward(self, x):
        return self.net(x).squeeze(1)

model = MLPClassifier(input_dim=1152).to(DEVICE)
print(model)


MLPClassifier(
  (net): Sequential(
    (0): Linear(in_features=1152, out_features=256, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.3, inplace=False)
    (3): Linear(in_features=256, out_features=1, bias=True)
  )
)


## 6.4 Preparación de entrenamiento


In [None]:
# ================================
# Parte 6.4 — Setup entrenamiento
# ================================
X_train_t = torch.tensor(X_train, dtype=torch.float32).to(DEVICE)
y_train_t = torch.tensor(y_train, dtype=torch.float32).to(DEVICE)

X_val_t = torch.tensor(X_val, dtype=torch.float32).to(DEVICE)
y_val_t = torch.tensor(y_val, dtype=torch.float32).to(DEVICE)

criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)


## 6.5 Entrenamiento del MLP (debug)


In [None]:
# ================================
# Parte 6.5 — Train loop
# ================================
EPOCHS = 20

for epoch in range(EPOCHS):
    model.train()
    optimizer.zero_grad()

    logits = model(X_train_t)
    loss = criterion(logits, y_train_t)
    loss.backward()
    optimizer.step()

    # Validación
    model.eval()
    with torch.no_grad():
        val_logits = model(X_val_t)
        val_loss = criterion(val_logits, y_val_t)

    if epoch % 5 == 0 or epoch == EPOCHS - 1:
        print(f"Epoch {epoch:02d} | train loss: {loss.item():.4f} | val loss: {val_loss.item():.4f}")


Epoch 00 | train loss: 0.6679 | val loss: 0.5481
Epoch 05 | train loss: 0.1228 | val loss: 0.3042
Epoch 10 | train loss: 0.0349 | val loss: 0.2026
Epoch 15 | train loss: 0.0044 | val loss: 0.1781
Epoch 19 | train loss: 0.0016 | val loss: 0.1676


## 6.6 Métricas (debug)


In [None]:
# ================================
# Parte 6.6 — Métricas
# ================================
from sklearn.metrics import accuracy_score, f1_score

model.eval()
with torch.no_grad():
    probs = torch.sigmoid(model(X_val_t)).cpu().numpy()
    preds = (probs > 0.5).astype(int)

acc = accuracy_score(y_val, preds)
f1  = f1_score(y_val, preds, zero_division=0)

print("Accuracy:", acc)
print("F1:", f1)


Accuracy: 1.0
F1: 1.0


# Parte 7 — LoRA entrenado y evaluación del codificador

Objetivo:
1. Entrenar LoRA sobre el codificador SigLIP usando un objetivo binario (normal vs anomalía).
2. Extraer embeddings con el codificador **ya adaptado** (SigLIP + LoRA entrenado).
3. Entrenar y evaluar un MLP sobre esos embeddings para cuantificar el impacto de LoRA.

Nota: En esta versión (debug) se usa batch_size=1 para simplificar el procesamiento de frames.
Luego se puede escalar.


## 7.1 Subsets debug para entrenamiento de LoRA

Se usa un subconjunto pequeño para validar el pipeline de entrenamiento.


In [78]:
# ================================
# Parte 7.1 — Debug subsets
# ================================
DEBUG_TRAIN_N = 10
DEBUG_VAL_N = 10

train_df_debug = train_df.iloc[:DEBUG_TRAIN_N].copy()
val_df_debug   = val_df.iloc[:DEBUG_VAL_N].copy()

print("train_df_debug:", train_df_debug.shape, train_df_debug["label"].value_counts().to_dict())
print("val_df_debug:",   val_df_debug.shape,   val_df_debug["label"].value_counts().to_dict())


train_df_debug: (10, 14) {1: 7, 0: 3}
val_df_debug: (10, 14) {1: 8, 0: 2}


## 7.2 DataLoaders (batch_size=1)

Para simplificar el uso de `processor` (HuggingFace) sobre listas de imágenes,
en debug usamos `batch_size=1`.


In [79]:
# ================================
# Parte 7.2 — Loaders (batch_size=1)
# ================================
from torch.utils.data import DataLoader

train_ds_lora = FrameClipDataset(
    train_df_debug, PROCESSED_DIR, split="train",
    n_frames=N_FRAMES, transform=transform
)
val_ds_lora = FrameClipDataset(
    val_df_debug, PROCESSED_DIR, split="val",
    n_frames=N_FRAMES, transform=transform
)

train_loader = DataLoader(train_ds_lora, batch_size=1, shuffle=True)
val_loader   = DataLoader(val_ds_lora, batch_size=1, shuffle=False)

print("train batches:", len(train_loader), "val batches:", len(val_loader))


train batches: 10 val batches: 10


## 7.3 Modelo de entrenamiento: SigLIP + LoRA + cabeza de clasificación

Se añade una cabeza lineal binaria sobre el embedding promedio del clip.
Se entrenan únicamente:
- parámetros LoRA
- parámetros de la cabeza (clasificador)


In [80]:
# ================================
# Parte 7.3 — Wrapper con head
# ================================
import torch.nn as nn

# Asegurar: encoder_lora ya existe (Parte 4.8) y peft está instalado.
# Si no, vuelve a ejecutar Parte 4.7–4.9.

class SiglipLoraWithHead(nn.Module):
    def __init__(self, encoder_lora, processor, emb_dim=1152):
        super().__init__()
        self.encoder_lora = encoder_lora
        self.processor = processor
        self.head = nn.Linear(emb_dim, 1)

    def forward(self, clip_tensor):
        """
        clip_tensor: torch.Tensor [T, C, H, W] (batch_size=1 afuera)
        output: logits [1]
        """
        # clip_tensor viene sin batch; convertir frames a PIL
        imgs = [transforms.ToPILImage()(clip_tensor[t].cpu()) for t in range(clip_tensor.shape[0])]
        inputs = self.processor(images=imgs, return_tensors="pt").to(DEVICE)

        feats = self.encoder_lora.get_image_features(**inputs)  # [T, D]
        video_emb = feats.mean(dim=0, keepdim=True)            # [1, D]
        logits = self.head(video_emb).squeeze(1)               # [1]
        return logits, video_emb

model_lora_cls = SiglipLoraWithHead(encoder_lora, processor, emb_dim=1152).to(DEVICE)
print("Modelo LoRA+Head listo")


Modelo LoRA+Head listo


## 7.4 Configuración de entrenamiento (LoRA + head)

Se entrena con BCEWithLogitsLoss.


In [81]:
# ================================
# Parte 7.4 — Optimizer & loss
# ================================
import torch

criterion = nn.BCEWithLogitsLoss()

# Solo parámetros entrenables (LoRA + head)
params = [p for p in model_lora_cls.parameters() if p.requires_grad]
print("N params trainables:", sum(p.numel() for p in params))

optimizer = torch.optim.AdamW(params, lr=1e-4, weight_decay=1e-2)


N params trainables: 1991809


## 7.5 Entrenamiento LoRA (debug)

Se entrena pocas épocas solo para validar flujo y obtener un LoRA no-trivial.


In [1]:
# ================================
# Parte 7.5 — Train loop LoRA
# ================================
from sklearn.metrics import f1_score, accuracy_score
import numpy as np

EPOCHS_LORA = 3  # debug; luego subes (ej. 5–10)
model_lora_cls.train()

for epoch in range(EPOCHS_LORA):
    # Train
    model_lora_cls.train()
    train_losses = []

    for batch in train_loader:
        clip = batch["clip"].squeeze(0).to(DEVICE)   # [T,C,H,W]
        y = torch.tensor([batch["label"][0]], dtype=torch.float32).to(DEVICE)  # [1]

        optimizer.zero_grad()
        logits, _ = model_lora_cls(clip)
        loss = criterion(logits, y)
        loss.backward()
        optimizer.step()

        train_losses.append(loss.item())

    # Val
    model_lora_cls.eval()
    val_losses = []
    probs_all, y_all = [], []

    with torch.no_grad():
        for batch in val_loader:
            clip = batch["clip"].squeeze(0).to(DEVICE)
            y = float(batch["label"][0])
            logits, _ = model_lora_cls(clip)
            loss = criterion(logits, torch.tensor([y], dtype=torch.float32).to(DEVICE))
            val_losses.append(loss.item())

            prob = torch.sigmoid(logits).cpu().numpy()[0]
            probs_all.append(prob)
            y_all.append(int(y))

    preds_all = (np.array(probs_all) > 0.5).astype(int)
    acc = accuracy_score(y_all, preds_all)
    f1  = f1_score(y_all, preds_all, zero_division=0)

    print(
        f"Epoch {epoch+1}/{EPOCHS_LORA} | "
        f"train loss: {np.mean(train_losses):.4f} | "
        f"val loss: {np.mean(val_losses):.4f} | "
        f"val acc: {acc:.3f} | val f1: {f1:.3f}"
    )


NameError: name 'model_lora_cls' is not defined

## 7.6 Guardado del adaptador LoRA

Se guarda el adaptador para reutilizarlo sin reentrenar.


In [None]:
# ================================
# Parte 7.6 — Save LoRA adapter
# ================================
LORA_OUT = NOTEBOOK_DIR / "checkpoints" / "siglip_lora_debug"
LORA_OUT.mkdir(parents=True, exist_ok=True)

# Guardar solo el adaptador LoRA
encoder_lora.save_pretrained(LORA_OUT)

# Guardar head también (para referencia)
torch.save(model_lora_cls.head.state_dict(), LORA_OUT / "head.pt")

print("LoRA guardado en:", LORA_OUT)


## 7.7 Extracción de embeddings con LoRA entrenado (debug)

Se repite el procedimiento de extracción, pero usando `encoder_lora` entrenado.
Esto permite evaluar el codificador con LoRA de forma comparable.


In [None]:
# ================================
# Parte 7.7 — Extract embeddings (LoRA trained) debug
# ================================
import numpy as np
from tqdm import tqdm

FEATURE_LORA_DEBUG_ROOT = NOTEBOOK_DIR / "features" / "siglip_lora_debug"
(FEATURE_LORA_DEBUG_ROOT / "train_debug").mkdir(parents=True, exist_ok=True)
(FEATURE_LORA_DEBUG_ROOT / "val_debug").mkdir(parents=True, exist_ok=True)

def extract_embeddings_to_dir(df, split_name, split_frames_dirname, encoder_used, processor_used, out_root):
    rows = []
    dataset = FrameClipDataset(
        df, PROCESSED_DIR, split=split_frames_dirname,
        n_frames=N_FRAMES, transform=transform
    )
    out_dir = out_root / split_name
    out_dir.mkdir(parents=True, exist_ok=True)

    encoder_used.eval()

    for i in tqdm(range(len(dataset)), desc=f"Extract {split_name}"):
        s = dataset[i]
        video_id, label, clip = s["video_id"], s["label"], s["clip"]
        emb = encode_clip_siglip(encoder_used, processor_used, clip).squeeze(0).cpu().numpy()
        np.save(out_dir / f"{video_id}.npy", emb)
        rows.append({"video_id": video_id, "label": label, "feature_path": str(out_dir / f"{video_id}.npy")})

    pd.DataFrame(rows).to_csv(out_dir / "labels.csv", index=False)
    print(f"{split_name}: guardados {len(rows)} embeddings")
    return out_dir

train_lora_dir = extract_embeddings_to_dir(
    train_df_debug, split_name="train_debug", split_frames_dirname="train",
    encoder_used=encoder_lora, processor_used=processor,
    out_root=FEATURE_LORA_DEBUG_ROOT
)

val_lora_dir = extract_embeddings_to_dir(
    val_df_debug, split_name="val_debug", split_frames_dirname="val",
    encoder_used=encoder_lora, processor_used=processor,
    out_root=FEATURE_LORA_DEBUG_ROOT
)


## 7.8 Clasificación con MLP usando embeddings LoRA (debug)

Se repite el MLP, pero entrenando sobre embeddings del codificador adaptado.


In [None]:
# ================================
# Parte 7.8 — MLP sobre embeddings LoRA
# ================================
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn

# Cargar train embeddings LoRA debug
train_labels = pd.read_csv(train_lora_dir / "labels.csv")
X, y = [], []
for _, row in train_labels.iterrows():
    X.append(np.load(row["feature_path"]))
    y.append(int(row["label"]))
X = np.stack(X)
y = np.array(y)

# Split train/val (debug)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# MLP igual que antes
class MLPClassifier(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, 1)
        )
    def forward(self, x):
        return self.net(x).squeeze(1)

mlp = MLPClassifier(1152).to(DEVICE)
criterion = nn.BCEWithLogitsLoss()
opt = torch.optim.Adam(mlp.parameters(), lr=1e-3)

X_train_t = torch.tensor(X_train, dtype=torch.float32).to(DEVICE)
y_train_t = torch.tensor(y_train, dtype=torch.float32).to(DEVICE)
X_val_t   = torch.tensor(X_val, dtype=torch.float32).to(DEVICE)
y_val_t   = torch.tensor(y_val, dtype=torch.float32).to(DEVICE)

# Entrenar rápido
EPOCHS = 20
for epoch in range(EPOCHS):
    mlp.train()
    opt.zero_grad()
    logits = mlp(X_train_t)
    loss = criterion(logits, y_train_t)
    loss.backward()
    opt.step()

    if epoch % 5 == 0 or epoch == EPOCHS - 1:
        mlp.eval()
        with torch.no_grad():
            v_logits = mlp(X_val_t)
            v_loss = criterion(v_logits, y_val_t)
        print(f"Epoch {epoch:02d} | train loss {loss.item():.4f} | val loss {v_loss.item():.4f}")

# Métricas
mlp.eval()
with torch.no_grad():
    probs = torch.sigmoid(mlp(X_val_t)).cpu().numpy()
preds = (probs > 0.5).astype(int)
acc = accuracy_score(y_val, preds)
f1  = f1_score(y_val, preds, zero_division=0)
print("LoRA-embeddings MLP | Accuracy:", acc, "| F1:", f1)


# Parte 8 — Métricas SigLIP (debug): Baseline vs LoRA

En esta sección calculamos métricas estándar de clasificación para el caso SigLIP:
- Accuracy, F1, Precision, Recall
- ROC-AUC y PR-AUC (si hay ambas clases)
- Matriz de confusión
Y generamos un gráfico comparativo Baseline vs LoRA.


In [None]:
# ================================
# Parte 8.1 — Cargar embeddings SigLIP (debug)
# ================================
import numpy as np
import pandas as pd
from pathlib import Path

def load_debug_split(feature_dir):
    feature_dir = Path(feature_dir)
    labels = pd.read_csv(feature_dir / "labels.csv")
    X, y = [], []
    for _, row in labels.iterrows():
        X.append(np.load(row["feature_path"]))
        y.append(int(row["label"]))
    X = np.stack(X)  # [N, 1152]
    y = np.array(y)  # [N]
    return X, y

BASELINE_DIR = NOTEBOOK_DIR / "features" / "siglip_baseline_debug" / "train_debug"
LORA_DIR     = NOTEBOOK_DIR / "features" / "siglip_lora_debug"     / "train_debug"

Xb, yb = load_debug_split(BASELINE_DIR)
print("Baseline:", Xb.shape, yb.shape, "labels:", np.unique(yb, return_counts=True))

Xl, yl = load_debug_split(LORA_DIR)
print("LoRA:", Xl.shape, yl.shape, "labels:", np.unique(yl, return_counts=True))


In [None]:
# ================================
# Parte 8.2 — Métricas SigLIP (debug)
# ================================
import torch
from sklearn.metrics import (
    accuracy_score, f1_score, precision_score, recall_score,
    roc_auc_score, average_precision_score, confusion_matrix
)

def predict_probs(mlp_model, X_np):
    mlp_model.eval()
    with torch.no_grad():
        X_t = torch.tensor(X_np, dtype=torch.float32).to(DEVICE)
        probs = torch.sigmoid(mlp_model(X_t)).cpu().numpy()
    return probs

def compute_metrics(y_true, probs, thr=0.5):
    y_pred = (probs >= thr).astype(int)
    out = {
        "accuracy":  accuracy_score(y_true, y_pred),
        "f1":        f1_score(y_true, y_pred, zero_division=0),
        "precision": precision_score(y_true, y_pred, zero_division=0),
        "recall":    recall_score(y_true, y_pred, zero_division=0),
        "cm":        confusion_matrix(y_true, y_pred),
    }
    # AUCs solo si hay 2 clases presentes
    if len(np.unique(y_true)) == 2:
        out["roc_auc"] = roc_auc_score(y_true, probs)
        out["pr_auc"]  = average_precision_score(y_true, probs)
    else:
        out["roc_auc"] = np.nan
        out["pr_auc"]  = np.nan
    return out


In [None]:
# ================================
# Parte 8.3 — Resultados Baseline vs LoRA
# ================================
probs_b = predict_probs(model, Xb)   # model = MLP baseline (Parte 6)
m_b = compute_metrics(yb, probs_b, thr=0.5)

probs_l = predict_probs(mlp, Xl)     # mlp = MLP LoRA (Parte 7.8)
m_l = compute_metrics(yl, probs_l, thr=0.5)

print("=== Baseline (SigLIP) ===")
for k in ["accuracy","f1","precision","recall","roc_auc","pr_auc"]:
    print(k, ":", m_b[k])
print("Confusion matrix:\n", m_b["cm"])

print("\n=== LoRA (SigLIP) ===")
for k in ["accuracy","f1","precision","recall","roc_auc","pr_auc"]:
    print(k, ":", m_l[k])
print("Confusion matrix:\n", m_l["cm"])


In [None]:
# ================================
# Parte 8.4 — Gráfico comparativo
# ================================
import pandas as pd
import matplotlib.pyplot as plt

df = pd.DataFrame([
    {"variant":"baseline", "accuracy":m_b["accuracy"], "f1":m_b["f1"], "precision":m_b["precision"], "recall":m_b["recall"], "roc_auc":m_b["roc_auc"], "pr_auc":m_b["pr_auc"]},
    {"variant":"lora",     "accuracy":m_l["accuracy"], "f1":m_l["f1"], "precision":m_l["precision"], "recall":m_l["recall"], "roc_auc":m_l["roc_auc"], "pr_auc":m_l["pr_auc"]},
])

ax = df.set_index("variant")[["f1","roc_auc","pr_auc","accuracy","precision","recall"]].plot(kind="bar")
plt.xticks(rotation=0)
plt.title("SigLIP (debug) — Baseline vs LoRA")
plt.tight_layout()
plt.show()

df
