### Extracción de características a nivel de clip (UCF-Crime)

Este notebook utiliza como entrada el archivo `processed/index_clips.csv`

El archivo `index_clips.csv` contiene, para cada clip temporal:
- La ruta al video original.
- El rango temporal del clip (`start_frame`, `end_frame`).
- La partición correspondiente (`train`, `val`, `test`).
- La etiqueta binaria (normal vs anómalo) y la categoría asociada.
- Los parámetros de segmentación utilizados (longitud del clip y solapamiento).

A partir de este índice, se cargan los frames correspondientes a cada clip y se transforman en la
representación requerida por los modelos evaluados, sin redefinir ni modificar la composición del
conjunto experimental.


In [7]:
import pandas as pd
from pathlib import Path
import cv2
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader

# Ruta al índice de clips
INDEX_CLIPS_PATH = Path("processed/index_clips.csv")

# Verificar existencia
assert INDEX_CLIPS_PATH.exists(), f"No se encuentra el archivo: {INDEX_CLIPS_PATH}"

# Cargar CSV
df_clips = pd.read_csv(INDEX_CLIPS_PATH)

print("Archivo cargado correctamente")
print("Número total de clips:", len(df_clips))

display(df_clips.head())


Archivo cargado correctamente
Número total de clips: 145356


Unnamed: 0,split,y,category,path,clip_idx,start_frame,end_frame,clip_len,stride,fps,n_frames
0,train,0,Normal,/home/DIINF/dvaldes/tesis/UCF_Crime/Training-N...,0,0,32,32,16,30.0,2016
1,train,0,Normal,/home/DIINF/dvaldes/tesis/UCF_Crime/Training-N...,1,16,48,32,16,30.0,2016
2,train,0,Normal,/home/DIINF/dvaldes/tesis/UCF_Crime/Training-N...,2,32,64,32,16,30.0,2016
3,train,0,Normal,/home/DIINF/dvaldes/tesis/UCF_Crime/Training-N...,3,48,80,32,16,30.0,2016
4,train,0,Normal,/home/DIINF/dvaldes/tesis/UCF_Crime/Training-N...,4,64,96,32,16,30.0,2016


In [8]:
# Distribución por split
print("Clips por split:")
print(df_clips["split"].value_counts())

# Distribución por clase
print("\nClips por clase (y):")
print(df_clips["y"].value_counts())

# Verificar rangos temporales válidos
invalid_ranges = df_clips[df_clips["end_frame"] <= df_clips["start_frame"]]
print("\nClips con rangos inválidos:", len(invalid_ranges))

# Verificar paths únicos y existencia
missing_paths = df_clips[~df_clips["path"].apply(lambda p: Path(p).exists())]
print("Clips con path inexistente:", len(missing_paths))


Clips por split:
split
train    106527
val       19793
test      19036
Name: count, dtype: int64

Clips por clase (y):
y
0    73870
1    71486
Name: count, dtype: int64

Clips con rangos inválidos: 0
Clips con path inexistente: 0


In [9]:
# Verificar que un mismo video no aparezca en más de un split
video_split_counts = df_clips.groupby("path")["split"].nunique()
n_leak = int((video_split_counts > 1).sum())

print("Videos con clips en más de un split:", n_leak)


Videos con clips en más de un split: 0


In [11]:

# Parámetros del input
T = 8                
IMG_SIZE = 224       
BATCH_SIZE = 16
NUM_WORKERS = 8

def uniform_sample_indices(start_f: int, end_f: int, T: int):
    n = max(1, end_f - start_f)
    idx = np.linspace(0, n - 1, T).round().astype(int)
    return (start_f + idx).astype(int)

class ClipDataset(Dataset):
    def __init__(self, df, T=8, img_size=224):
        self.df = df.reset_index(drop=True)
        self.T = T
        self.img_size = img_size
        self.mean = np.array([0.485, 0.456, 0.406], dtype=np.float32)
        self.std  = np.array([0.229, 0.224, 0.225], dtype=np.float32)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, i):
        row = self.df.iloc[i]
        path = row["path"]
        start_f = int(row["start_frame"])
        end_f   = int(row["end_frame"])
        y = int(row["y"])

        cap = cv2.VideoCapture(path)
        if not cap.isOpened():
            raise RuntimeError(f"No pude abrir video: {path}")

        frame_ids = uniform_sample_indices(start_f, end_f, self.T)

        frames = []
        last_good = None
        for fid in frame_ids:
            cap.set(cv2.CAP_PROP_POS_FRAMES, int(fid))
            ok, frame = cap.read()

            if not ok:
                if last_good is None:
                    frame = np.zeros((self.img_size, self.img_size, 3), dtype=np.uint8)
                else:
                    frame = last_good
            else:
                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                frame = cv2.resize(frame, (self.img_size, self.img_size), interpolation=cv2.INTER_LINEAR)
                last_good = frame

            frames.append(frame)

        cap.release()

        # (T,H,W,C) -> float [0,1]
        arr = np.stack(frames).astype(np.float32) / 255.0
        arr = (arr - self.mean) / self.std

        # -> (C,T,H,W)
        arr = np.transpose(arr, (3, 0, 1, 2))
        clip = torch.from_numpy(arr)  # float32

        return clip, torch.tensor(y, dtype=torch.long)


In [12]:
df_train = df_clips[df_clips["split"]=="train"].copy()
df_val   = df_clips[df_clips["split"]=="val"].copy()
df_test  = df_clips[df_clips["split"]=="test"].copy()

train_ds = ClipDataset(df_train, T=T, img_size=IMG_SIZE)
val_ds   = ClipDataset(df_val,   T=T, img_size=IMG_SIZE)
test_ds = ClipDataset(df_test, T=T, img_size=IMG_SIZE)


train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True,
                          num_workers=NUM_WORKERS, pin_memory=True)
val_loader   = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False,
                          num_workers=NUM_WORKERS, pin_memory=True)

test_loader = DataLoader(
    test_ds,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=NUM_WORKERS,
    pin_memory=True
)


xb, yb = next(iter(train_loader))
print("xb shape:", xb.shape)  # esperado: (B, 3, 8, 224, 224)
print("yb shape:", yb.shape)




xb shape: torch.Size([16, 3, 8, 224, 224])
yb shape: torch.Size([16])


In [16]:
import torch
import torch.nn as nn
from torchvision.models.video import swin3d_t, Swin3D_T_Weights

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

WEIGHTS = Swin3D_T_Weights.DEFAULT
encoder = swin3d_t(weights=WEIGHTS).to(DEVICE).eval()

for p in encoder.parameters():
    p.requires_grad = False

# Clave: reemplazar la cabeza de clasificación para obtener embeddings
encoder.head = nn.Identity()

print("MODEL CLASS:", encoder.__class__)
print("MODEL MODULE:", encoder.__class__.__module__)
print("Head:", encoder.head)



MODEL CLASS: <class 'torchvision.models.video.swin_transformer.SwinTransformer3d'>
MODEL MODULE: torchvision.models.video.swin_transformer
Head: Identity()


In [17]:
xb, yb = next(iter(train_loader))
print("xb shape:", xb.shape)  # esperado: (B, C, T, H, W)

xb = xb.to(DEVICE)

with torch.no_grad():
    emb = encoder(xb)

print("emb shape:", emb.shape)  # (B, D)
D = int(emb.shape[1])
print("D:", D)


xb shape: torch.Size([16, 3, 8, 224, 224])
emb shape: torch.Size([16, 768])
D: 768


In [18]:
from tqdm import tqdm
import numpy as np

def extract_embeddings(loader, encoder, X_mm, y_mm, desc="split"):
    encoder.eval()
    offset = 0
    with torch.no_grad():
        for xb, yb in tqdm(loader, desc=desc):
            xb = xb.to(DEVICE)          # (B, C, T, H, W)
            emb = encoder(xb)           # (B, D) porque head=Identity

            emb_np = emb.detach().cpu().numpy().astype(X_mm.dtype, copy=False)
            y_np   = yb.detach().cpu().numpy().astype(y_mm.dtype, copy=False)

            bs = emb_np.shape[0]
            X_mm[offset:offset+bs] = emb_np
            y_mm[offset:offset+bs] = y_np
            offset += bs

    X_mm.flush(); y_mm.flush()
    print(desc, "done:", offset)
    return offset


In [19]:
import numpy as np
from pathlib import Path

PROCESSED = Path("processed")
PROCESSED.mkdir(parents=True, exist_ok=True)

n_train = len(train_loader.dataset)
n_val   = len(val_loader.dataset)
n_test  = len(test_loader.dataset)

def create_memmap(path: Path, shape, dtype):
    return np.memmap(str(path), mode="w+", dtype=dtype, shape=shape)

X_train = create_memmap(PROCESSED / "emb_swin3d_t_train.mmap", (n_train, D), "float16")
y_train = create_memmap(PROCESSED / "y_train.mmap",            (n_train,),    "int8")

X_val   = create_memmap(PROCESSED / "emb_swin3d_t_val.mmap",   (n_val, D),   "float16")
y_val   = create_memmap(PROCESSED / "y_val.mmap",              (n_val,),     "int8")

X_test  = create_memmap(PROCESSED / "emb_swin3d_t_test.mmap",  (n_test, D),  "float16")
y_test  = create_memmap(PROCESSED / "y_test.mmap",             (n_test,),    "int8")

print("Memmaps creados:")
print("Train:", X_train.shape, y_train.shape)
print("Val:  ", X_val.shape, y_val.shape)
print("Test: ", X_test.shape, y_test.shape)


Memmaps creados:
Train: (106527, 768) (106527,)
Val:   (19793, 768) (19793,)
Test:  (19036, 768) (19036,)


In [20]:
n1 = extract_embeddings(train_loader, encoder, X_train, y_train, desc="train")
n2 = extract_embeddings(val_loader,   encoder, X_val,   y_val,   desc="val")
n3 = extract_embeddings(test_loader,  encoder, X_test,  y_test,  desc="test")

print("Written train:", n1, "expected:", n_train)
print("Written val:  ", n2, "expected:", n_val)
print("Written test: ", n3, "expected:", n_test)

assert n1 == n_train and n2 == n_val and n3 == n_test, "Mismatch escrito vs tamaño de dataset."
print("OK: extracción completa.")


train: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6658/6658 [56:11<00:00,  1.97it/s]


train done: 106527


val: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1238/1238 [10:13<00:00,  2.02it/s]


val done: 19793


test: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1190/1190 [09:56<00:00,  2.00it/s]

test done: 19036
Written train: 106527 expected: 106527
Written val:   19793 expected: 19793
Written test:  19036 expected: 19036
OK: extracción completa.





In [23]:
import numpy as np

def emb_stats(X_mm, name):
    X = np.array(X_mm, copy=False)
    print(f"{name}: shape={X.shape} dtype={X.dtype} "
          f"min={float(X.min()):.4f} max={float(X.max()):.4f} "
          f"mean={float(X.mean()):.4f} std={float(X.std()):.4f}")

emb_stats(X_train, "X_train")
emb_stats(X_val,   "X_val")
emb_stats(X_test,  "X_test")

print("y_train dist:", dict(zip(*np.unique(y_train, return_counts=True))))
print("y_val dist:  ", dict(zip(*np.unique(y_val,   return_counts=True))))
print("y_test dist: ", dict(zip(*np.unique(y_test,  return_counts=True))))


X_train: shape=(106527, 768) dtype=float16 min=-2.8359 max=2.8438 mean=-0.0018 std=inf
X_val: shape=(19793, 768) dtype=float16 min=-2.5117 max=2.6406 mean=-0.0016 std=inf
X_test: shape=(19036, 768) dtype=float16 min=-2.4766 max=2.5742 mean=-0.0019 std=inf
y_train dist: {np.int8(0): np.int64(54009), np.int8(1): np.int64(52518)}
y_val dist:   {np.int8(0): np.int64(10720), np.int8(1): np.int64(9073)}
y_test dist:  {np.int8(0): np.int64(9141), np.int8(1): np.int64(9895)}


In [24]:
import json
from datetime import datetime

manifest = {
    "created_at": datetime.now().isoformat(),
    "encoder": "torchvision/swin3d_t",
    "weights": "Swin3D_T_Weights.DEFAULT",
    "T": int(T),
    "img_size": int(IMG_SIZE),
    "batch_size": int(BATCH_SIZE),
    "embedding_dim": int(D),
    "dtype": "float16",
    "files": {
        "X_train": "processed/emb_swin3d_t_train.mmap",
        "y_train": "processed/y_train.mmap",
        "X_val":   "processed/emb_swin3d_t_val.mmap",
        "y_val":   "processed/y_val.mmap",
        "X_test":  "processed/emb_swin3d_t_test.mmap",
        "y_test":  "processed/y_test.mmap",
    }
}

out_manifest = PROCESSED / "manifest_swin3d_t.json"
out_manifest.write_text(json.dumps(manifest, indent=2), encoding="utf-8")
print("Manifest guardado:", out_manifest)


Manifest guardado: processed/manifest_swin3d_t.json
