In [None]:
!pip install transformers decord opencv-python torch torchvision


Collecting decord
  Downloading decord-0.6.0-py3-none-manylinux2010_x86_64.whl.metadata (422 bytes)
Collecting opencv-python
  Downloading opencv_python-4.11.0.86-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (20 kB)
Downloading decord-0.6.0-py3-none-manylinux2010_x86_64.whl (13.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.6/13.6 MB[0m [31m129.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading opencv_python-4.11.0.86-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (63.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.0/63.0 MB[0m [31m20.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: opencv-python, decord
Successfully installed decord-0.6.0 opencv-python-4.11.0.86


In [None]:
from google.colab import drive
import os

# Attach Google Drive to the runtime so the dataset folder is reachable as a normal path.
drive.mount('/content/drive')

# The dataset is a plain folder on Drive (not an archive), so the "extracted"
# location is the folder itself. Edit the path if the folder lives elsewhere.
extract_path = dataset_path = "/content/drive/MyDrive/dataset"

# List the top-level entries as a quick check that the mount and the path are right.
folder_entries = os.listdir(extract_path)
print("✅ Folder mounted. Contents:", folder_entries)


Mounted at /content/drive
✅ Folder mounted. Contents: ['Assault', 'Explosion', 'Abuse', 'Arrest', 'Arson', 'normal', 'Fighting', 'Burglary']


In [None]:
import os
import torch
import numpy as np
import pandas as pd
from torchvision import transforms
from decord import VideoReader, cpu
from tqdm import tqdm

# --- Frame extraction -------------------------------------------------------
# For every video under `extract_path`, sample NUM_FRAMES evenly spaced frames,
# resize them to 224x224, stack them into one tensor and save it as
# <output_path>/<label>/<video name>.pt. The parent folder name is the label.
# A manifest of (path, label) rows is written to /content/video_labels.csv.

# Output folder for tensor files
output_path = "/content/frames"
os.makedirs(output_path, exist_ok=True)

NUM_FRAMES = 16                 # frames sampled per video; shorter videos are skipped
VIDEO_EXTENSIONS = (".mp4",)    # matched case-insensitively
OVERWRITE_EXISTING = False      # set True to re-decode videos that already have a .pt file

# Frame preprocessing transformation
transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
])

# Collect the work list first: this gives a single progress bar with a known
# total, and sorting makes the manifest order independent of filesystem order.
video_files = sorted(
    (root, file)
    for root, _dirs, files in os.walk(extract_path)
    for file in files
    if file.lower().endswith(VIDEO_EXTENSIONS)
)

video_data = []

for root, file in tqdm(video_files, desc="Extracting frames"):
    # Use folder name as the label
    label = os.path.basename(root)
    input_video_path = os.path.join(root, file)

    # Output path for this label
    category_output_path = os.path.join(output_path, label)
    os.makedirs(category_output_path, exist_ok=True)

    # Output .pt path, and the same path as stored in the manifest (relative to /content)
    video_name = os.path.splitext(file)[0]
    output_tensor_path = os.path.join(category_output_path, f"{video_name}.pt")
    relative_path = os.path.relpath(output_tensor_path, start="/content")

    # Cache hit: a previous run already produced this tensor, so only record it.
    if os.path.exists(output_tensor_path) and not OVERWRITE_EXISTING:
        video_data.append([relative_path, label])
        continue

    try:
        vr = VideoReader(input_video_path, ctx=cpu(0))
        total_frames = len(vr)

        if total_frames < NUM_FRAMES:
            print(f"⚠️ Skipping short video: {file} ({total_frames} frames)")
            continue

        # Evenly spaced frame indices covering the first through the last frame.
        indices = np.linspace(0, total_frames - 1, NUM_FRAMES).astype(int)
        frames = [transform(vr[i].asnumpy()) for i in indices]
        video_tensor = torch.stack(frames)

        # Write to a temp name and rename, so an interrupted run never leaves a
        # truncated .pt file that a later run would treat as a valid cache entry.
        tmp_tensor_path = output_tensor_path + ".tmp"
        torch.save(video_tensor, tmp_tensor_path)
        os.replace(tmp_tensor_path, output_tensor_path)

        video_data.append([relative_path, label])

    except Exception as e:
        # Best effort: one unreadable video must not abort the whole extraction.
        print(f"⚠️ Failed on {file}: {e}")

# Save label CSV
df = pd.DataFrame(video_data, columns=["path", "label"])
df.to_csv("/content/video_labels.csv", index=False)

print("✅ Frame extraction & tensor saving complete!")


0it [00:00, ?it/s]
0it [00:00, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s][A
  2%|▏         | 1/50 [00:02<02:23,  2.92s/it][A
  4%|▍         | 2/50 [00:04<01:45,  2.20s/it][A
  6%|▌         | 3/50 [00:07<01:54,  2.43s/it][A
  8%|▊         | 4/50 [00:10<01:56,  2.54s/it][A
 10%|█         | 5/50 [00:13<02:01,  2.70s/it][A
 12%|█▏        | 6/50 [00:16<02:03,  2.81s/it][A
 14%|█▍        | 7/50 [00:19<02:07,  2.96s/it][A
 16%|█▌        | 8/50 [00:23<02:14,  3.20s/it][A
 18%|█▊        | 9/50 [00:26<02:18,  3.39s/it][A
 20%|██        | 10/50 [00:31<02:27,  3.70s/it][A
 22%|██▏       | 11/50 [00:35<02:27,  3.79s/it][A
 24%|██▍       | 12/50 [00:39<02:26,  3.86s/it][A
 26%|██▌       | 13/50 [00:42<02:10,  3.53s/it][A
 28%|██▊       | 14/50 [00:43<01:49,  3.04s/it][A
 30%|███       | 15/50 [00:48<02:03,  3.54s/it][A
 32%|███▏      | 16/50 [00:53<02:14,  3.95s/it][A
 34%|███▍      | 17/50 [00:55<01:53,  3.45s/it][A
 36%|███▌      | 18/50 [00:59<01:52,  3.53s/it][A
 38%|███▊ 

✅ Frame extraction & tensor saving complete!





In [None]:
# Sanity-check the manifest: report the row count, then show the first rows.
# The preview is left as the cell's last expression so the notebook renders it
# as a table rather than as printed plain text.
print(f"Total rows in DataFrame: {len(df)}")
df.head()


Total rows in DataFrame: 400
                                path    label
0  frames/Assault/Assault035_x264.pt  Assault
1  frames/Assault/Assault039_x264.pt  Assault
2  frames/Assault/Assault038_x264.pt  Assault
3  frames/Assault/Assault052_x264.pt  Assault
4  frames/Assault/Assault046_x264.pt  Assault


In [None]:
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from sklearn.model_selection import train_test_split
from collections import Counter

class CrimeDataset(Dataset):
    """Dataset of saved video tensors described by a (path, label) manifest.

    Args:
        df: DataFrame with a ``path`` column (location of a tensor file saved
            with ``torch.save``) and a ``label`` column (class name).
        root_dir: Directory that holds the tensor files. Manifest paths that
            start with ``frames/`` have that leading prefix removed before
            being joined onto ``root_dir``.

    Attributes:
        df: Private copy of the manifest with ``label`` converted to a
            categorical column and a ``label_idx`` column added.
        label_map: Mapping from class name to integer index, in category order.
    """

    # Leading component carried by manifest paths that is dropped before the
    # path is joined onto root_dir.
    _MANIFEST_PREFIX = "frames/"

    def __init__(self, df, root_dir):
        self.df = df.copy()  # never mutate the caller's frame
        self.root_dir = root_dir
        self.df["label"] = self.df["label"].astype("category")
        self.label_map = {label: idx for idx, label in enumerate(self.df["label"].cat.categories)}
        self.df["label_idx"] = self.df["label"].map(self.label_map)

    def __len__(self):
        """Return the number of manifest rows."""
        return len(self.df)

    def _tensor_path(self, manifest_path):
        """Resolve a manifest path to the tensor file under ``root_dir``.

        Only a *leading* ``frames/`` is removed. A blanket ``str.replace``
        would also eat the substring inside names such as ``keyframes/``.
        """
        relative = manifest_path
        if relative.startswith(self._MANIFEST_PREFIX):
            relative = relative[len(self._MANIFEST_PREFIX):]
        return os.path.join(self.root_dir, relative)

    def __getitem__(self, idx):
        """Load and return ``(video_tensor, label_index)`` for row ``idx``.

        ``label_index`` is a 0-dim ``torch.long`` tensor.
        """
        row = self.df.iloc[idx]
        video_tensor = torch.load(self._tensor_path(row["path"]))

        label = int(row["label_idx"])
        return video_tensor, torch.tensor(label, dtype=torch.long)

# --- Dataset, stratified split and loaders -----------------------------------
# A fixed seed makes both the train/validation split and the weighted sampling
# order repeatable across Restart & Run All.
SPLIT_SEED = 42

df = pd.read_csv("/content/video_labels.csv")
dataset = CrimeDataset(df, output_path)

# Integer class code per manifest row, used for stratification and sample weights.
labels = df["label"].astype("category").cat.codes.tolist()
train_idx, val_idx = train_test_split(
    list(range(len(dataset))),
    test_size=0.2,
    stratify=labels,
    random_state=SPLIT_SEED,
)

train_dataset = torch.utils.data.Subset(dataset, train_idx)
val_dataset = torch.utils.data.Subset(dataset, val_idx)

# One weight per training sample, inversely proportional to its class frequency,
# so the sampler draws every class about equally often. The weights are ordered
# like train_idx, which is also the index space of train_dataset.
train_labels = [labels[i] for i in train_idx]
counts = Counter(train_labels)
weights = torch.tensor([len(train_labels) / counts[label] for label in train_labels], dtype=torch.float)
sampler = WeightedRandomSampler(
    weights,
    len(weights),
    replacement=True,
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

train_loader = DataLoader(train_dataset, batch_size=8, sampler=sampler)
val_loader = DataLoader(val_dataset, batch_size=8)


In [None]:
import torch.nn as nn
from transformers import TimesformerForVideoClassification, TimesformerConfig

# --- Model --------------------------------------------------------------------
# Use the GPU when the runtime provides one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_name = "facebook/timesformer-base-finetuned-k400"

# Class names in categorical order, i.e. the same order that produces the
# integer label codes used for the dataset and the split.
class_names = list(df["label"].astype("category").cat.categories)

config = TimesformerConfig.from_pretrained(model_name)
config.num_labels = len(class_names)
# Store the real class names so predictions map back to labels, not LABEL_<i>.
config.id2label = dict(enumerate(class_names))
config.label2id = {name: idx for idx, name in config.id2label.items()}

# With num_labels changed, ignore_mismatched_sizes=True makes from_pretrained
# create a fresh classification head of the right size (the load log reports
# classifier.weight / classifier.bias as newly initialized), so the head is
# not replaced again by hand.
model = TimesformerForVideoClassification.from_pretrained(model_name, config=config, ignore_mismatched_sizes=True)

# Assigning the result keeps the cell from printing the full module repr.
model = model.to(device)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/22.7k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/486M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/486M [00:00<?, ?B/s]

Some weights of TimesformerForVideoClassification were not initialized from the model checkpoint at facebook/timesformer-base-finetuned-k400 and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([400, 768]) in the checkpoint and torch.Size([8, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([400]) in the checkpoint and torch.Size([8]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


TimesformerForVideoClassification(
  (timesformer): TimesformerModel(
    (embeddings): TimesformerEmbeddings(
      (patch_embeddings): TimesformerPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (pos_drop): Dropout(p=0.0, inplace=False)
      (time_drop): Dropout(p=0.0, inplace=False)
    )
    (encoder): TimesformerEncoder(
      (layer): ModuleList(
        (0-11): 12 x TimesformerLayer(
          (drop_path): Identity()
          (attention): TimeSformerAttention(
            (attention): TimesformerSelfAttention(
              (qkv): Linear(in_features=768, out_features=2304, bias=True)
              (attn_drop): Dropout(p=0.0, inplace=False)
            )
            (output): TimesformerSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): TimesformerIntermediate(
            (dense

In [None]:
import torch.optim as optim

# --- Fine-tuning loop -----------------------------------------------------------
NUM_EPOCHS = 5
CLIP_FRAMES = 16      # clips with a different frame count are resampled to this length
LOG_EVERY = 10        # print the running batch loss every LOG_EVERY batches
MAX_GRAD_NORM = 1.0

criterion = nn.CrossEntropyLoss(label_smoothing=0.1)
optimizer = optim.AdamW(model.parameters(), lr=1e-5)

for epoch in range(NUM_EPOCHS):
    model.train()
    total_loss = 0.0

    # The batch targets are named `batch_labels` so they do not overwrite the
    # notebook-level `labels` list built for the stratified split.
    for batch_idx, (videos, batch_labels) in enumerate(train_loader):
        videos, batch_labels = videos.to(device), batch_labels.to(device)

        # Resample along dim 1 (the frame axis) to exactly CLIP_FRAMES frames.
        if videos.shape[1] != CLIP_FRAMES:
            frame_indices = torch.linspace(0, videos.shape[1] - 1, CLIP_FRAMES).long()
            videos = videos[:, frame_indices, :, :, :]

        # Standardize with the mean/std of the whole batch.
        # NOTE(review): this makes a clip's input depend on the other clips in
        # its batch; kept as-is so training and evaluation stay consistent.
        videos = (videos - videos.mean()) / (videos.std() + 1e-8)

        optimizer.zero_grad()
        outputs = model(pixel_values=videos)
        loss = criterion(outputs.logits, batch_labels)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), MAX_GRAD_NORM)
        optimizer.step()
        total_loss += loss.item()

        if batch_idx % LOG_EVERY == 0:
            print(f"Epoch {epoch+1}, Batch {batch_idx}, Loss: {loss.item():.4f}")

    print(f"📉 Epoch {epoch+1} avg loss: {total_loss / len(train_loader):.4f}")


In [None]:
def evaluate(model, dataloader, criterion):
    """Compute the mean batch loss and the accuracy of ``model`` on ``dataloader``.

    The model is switched to eval mode and run without gradient tracking.
    Batches are moved to the device of the model's parameters, so the function
    does not depend on a notebook-level ``device`` variable.

    Args:
        model: Module called as ``model(pixel_values=videos)`` whose output
            exposes ``.logits``.
        dataloader: Iterable of ``(videos, labels)`` batches. Dim 1 of
            ``videos`` is treated as the frame axis.
        criterion: Callable ``criterion(logits, labels)`` returning a scalar
            loss tensor.

    Returns:
        ``(mean_loss, accuracy)``: the average of the per-batch losses and the
        accuracy in percent (0-100).

    Raises:
        ValueError: If ``dataloader`` yields no samples.
    """
    clip_frames = 16

    # Run on whatever device the model lives on; parameter-free modules use the CPU.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()
    total_loss, correct, total, num_batches = 0.0, 0, 0, 0

    with torch.no_grad():
        for videos, labels in dataloader:
            videos, labels = videos.to(target_device), labels.to(target_device)

            # Resample along the frame axis to exactly `clip_frames` frames.
            if videos.shape[1] != clip_frames:
                frame_indices = torch.linspace(0, videos.shape[1] - 1, clip_frames).long()
                videos = videos[:, frame_indices, :, :, :]

            # Standardize with the mean/std of the whole batch.
            videos = (videos - videos.mean()) / (videos.std() + 1e-8)
            outputs = model(pixel_values=videos)
            loss = criterion(outputs.logits, labels)

            total_loss += loss.item()
            preds = outputs.logits.argmax(dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)
            num_batches += 1

    if total == 0:
        raise ValueError("evaluate() received no samples: the dataloader is empty.")

    accuracy = 100 * correct / total
    return total_loss / num_batches, accuracy

# Score the model on the held-out split and report both metrics.
val_metrics = evaluate(model, val_loader, criterion)
val_loss, val_acc = val_metrics
print(f"✅ Validation Loss: {val_loss:.4f}, Accuracy: {val_acc:.2f}%")
