# THUMOS14 Action Recognition with Temporal Localization using PyTorch

In [None]:
import os
import cv2
import glob
import torch
import random
import numpy as np
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as T
import matplotlib.pyplot as plt

from torch.utils.data import Dataset, DataLoader
from moviepy.editor import VideoFileClip
from collections import defaultdict


In [None]:
# Dataset locations on disk: raw videos, per-class annotation files, extracted frames.
VIDEO_DIR = "/mnt/data/thumos14_action_recognition/videos"
ANNOTATION_DIR = "/mnt/data/thumos14_action_recognition/annotations"
FRAME_DIR = "/mnt/data/thumos14_action_recognition/frames"

# Create any directory that does not exist yet so later cells can list/write them.
for _required_dir in (VIDEO_DIR, ANNOTATION_DIR, FRAME_DIR):
    os.makedirs(_required_dir, exist_ok=True)
del _required_dir  # keep the notebook namespace free of the loop variable

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Using device:", DEVICE)


In [None]:
def parse_annotations(annotation_dir):
    """Parse per-class annotation files into per-video segment lists.

    Every ``<class_name>.txt`` file directly inside *annotation_dir* is read.
    Each non-blank line must contain three whitespace-separated fields:
    ``video_id start end``.

    Files are visited in sorted name order so that the class indices are
    reproducible across runs and machines (``os.listdir`` makes no ordering
    guarantee). Entries that are not regular ``.txt`` files are ignored, and
    blank lines are skipped.

    Args:
        annotation_dir: Directory holding one ``.txt`` file per class.

    Returns:
        A tuple ``(annotations, class_map)`` where ``annotations`` is a
        ``defaultdict(list)`` mapping ``video_id`` to a list of
        ``{'start': float, 'end': float, 'label': class_name}`` dicts, and
        ``class_map`` maps each class name to a consecutive integer index.

    Raises:
        ValueError: If a non-blank line does not have exactly three fields or
            its start/end fields are not valid floats.
    """
    suffix = '.txt'
    annotations = defaultdict(list)
    class_map = {}

    file_names = sorted(
        name for name in os.listdir(annotation_dir)
        if name.endswith(suffix)
        and os.path.isfile(os.path.join(annotation_dir, name))
    )

    for idx, file_name in enumerate(file_names):
        # Strip only the trailing extension; a '.txt' elsewhere in the name
        # is part of the class name.
        class_name = file_name[:-len(suffix)]
        class_map[class_name] = idx
        path = os.path.join(annotation_dir, file_name)
        with open(path, 'r') as f:
            for line_no, line in enumerate(f, start=1):
                fields = line.split()
                if not fields:
                    continue  # blank line (e.g. trailing newline at EOF)
                if len(fields) != 3:
                    raise ValueError(
                        f"{path}:{line_no}: expected 'video_id start end', "
                        f"got {line.strip()!r}"
                    )
                video_id, start, end = fields
                annotations[video_id].append({
                    'start': float(start),
                    'end': float(end),
                    'label': class_name
                })
    return annotations, class_map

# Parse every annotation file once; `annotations` (video_id -> segments) and
# `class_map` (class name -> index) are reused by the dataset/model cells below.
annotations, class_map = parse_annotations(ANNOTATION_DIR)
print("Classes:", class_map)


In [None]:
def extract_frames_from_video(video_path, output_folder, fps=10):
    """Sample a video at a fixed rate and save the frames as JPEG files.

    Frame ``k`` is taken at time ``k / fps`` seconds and written to
    ``<output_folder>/<k:05d>.jpg``, for every ``k`` whose timestamp lies in
    ``[0, duration)``.

    The loop runs over integer frame indices instead of float timestamps:
    recovering the index as ``int(t * fps)`` from an accumulated float ``t``
    can truncate to the previous index for some rate/time combinations, which
    would overwrite one file and leave a gap in the numbering.

    Args:
        video_path: Path of the video file to decode.
        output_folder: Directory that receives the JPEG frames; created if
            missing.
        fps: Sampling rate in frames per second.
    """
    os.makedirs(output_folder, exist_ok=True)
    clip = VideoFileClip(video_path)
    try:
        num_frames = int(np.ceil(clip.duration * fps))
        for frame_idx in range(num_frames):
            frame = clip.get_frame(frame_idx / fps)
            frame_path = os.path.join(output_folder, f"{frame_idx:05d}.jpg")
            # The decoded frame is converted RGB -> BGR, the channel order
            # OpenCV expects when writing images.
            cv2.imwrite(frame_path, cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
    finally:
        # Release the reader even if decoding or writing fails part-way.
        clip.close()

# Only run this once: it extracts frames for every video in VIDEO_DIR into FRAME_DIR/<video_id>/.
# for video_file in os.listdir(VIDEO_DIR):
#     video_id = video_file.replace('.mp4', '')
#     extract_frames_from_video(os.path.join(VIDEO_DIR, video_file), os.path.join(FRAME_DIR, video_id))


In [None]:
class THUMOS14Dataset(Dataset):
    """Fixed-length frame clips cut from annotated action segments.

    For every annotated segment of every video that has a frame directory
    under ``frame_root``, non-overlapping windows of ``clip_len`` consecutive
    frame indices are enumerated. Each window is one sample, labelled with the
    class index of its segment. Segments shorter than ``clip_len`` frames
    contribute no samples.

    Args:
        annotations: Mapping ``video_id -> list`` of dicts with ``'start'``
            and ``'end'`` (seconds) and ``'label'`` (class name).
        class_map: Mapping from class name to integer class index.
        frame_root: Directory containing one sub-directory of ``%05d.jpg``
            frames per video. Frames are read from here in ``__getitem__``.
        transform: Optional callable applied to each RGB ``uint8`` frame
            array; when falsy, ``torchvision.transforms.ToTensor`` is used.
        clip_len: Number of frames per sample.
        fps: Frame rate used to convert annotation seconds to frame indices.
    """

    def __init__(self, annotations, class_map, frame_root, transform=None, clip_len=16, fps=10):
        self.samples = []
        self.class_map = class_map
        # Remember where frames live so loading uses the directory this
        # dataset was built from rather than a module-level global.
        self.frame_root = frame_root
        self.transform = transform
        self.clip_len = clip_len
        self.fps = fps
        for video_id, anns in annotations.items():
            frame_dir = os.path.join(frame_root, video_id)
            if not os.path.exists(frame_dir):
                continue  # frames were never extracted for this video
            for ann in anns:
                start_f = int(float(ann['start']) * fps)
                end_f = int(float(ann['end']) * fps)
                # Step by clip_len so windows inside a segment do not overlap.
                for i in range(start_f, end_f - clip_len + 1, clip_len):
                    self.samples.append({
                        'video_id': video_id,
                        'start': i,
                        'label': class_map[ann['label']]
                    })

    def __len__(self):
        """Return the number of clips."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Load one clip.

        Returns:
            A tuple ``(frames, label)``: the per-frame tensors stacked along a
            new leading dimension, and the class index as a tensor.
        """
        sample = self.samples[idx]
        start = sample['start']
        frame_dir = os.path.join(self.frame_root, sample['video_id'])
        # Build the fallback converter once per clip instead of once per frame.
        convert = self.transform if self.transform else T.ToTensor()

        frames = []
        for i in range(start, start + self.clip_len):
            img = cv2.imread(os.path.join(frame_dir, f"{i:05d}.jpg"))
            if img is None:
                # Missing/unreadable frame: substitute a black 240x320 image
                # so the clip keeps its length.
                img = np.zeros((240, 320, 3), dtype=np.uint8)
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
            frames.append(convert(img))

        return torch.stack(frames), torch.tensor(sample['label'])


In [None]:
class SimpleConvGRU(nn.Module):
    """Per-frame CNN features fed through a GRU, classified from the last state.

    Two stride-2 3x3 convolutions (3 -> 16 -> 32 channels) are applied to each
    frame independently, the flattened feature maps form the GRU input
    sequence, and a linear layer maps the final hidden state (size 128) to
    class logits.

    Args:
        num_classes: Number of output logits.
        input_size: ``(height, width)`` of the frames the model will receive.
            It determines the GRU input width; the default ``(240, 320)``
            yields ``32 * 60 * 80`` features per frame.
    """

    def __init__(self, num_classes, input_size=(240, 320)):
        super().__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(3, 16, kernel_size=3, stride=2, padding=1),
            nn.ReLU(),
            nn.Conv2d(16, 32, kernel_size=3, stride=2, padding=1),
            nn.ReLU()
        )
        self.rnn = nn.GRU(self._feature_dim(input_size), 128, batch_first=True)
        self.fc = nn.Linear(128, num_classes)

    @staticmethod
    def _feature_dim(input_size):
        """Return the flattened conv output size for ``(height, width)`` frames."""
        def after_conv(n):
            # Output length of one conv layer with kernel=3, stride=2, padding=1.
            return (n - 1) // 2 + 1

        height, width = input_size
        return 32 * after_conv(after_conv(height)) * after_conv(after_conv(width))

    def forward(self, x):
        """Compute class logits for a batch of clips.

        Args:
            x: Tensor of shape ``(batch, time, channels, height, width)``.

        Returns:
            Tensor of shape ``(batch, num_classes)``.

        Raises:
            ValueError: If ``x`` is not 5-dimensional.
        """
        if x.dim() != 5:
            raise ValueError(
                f"expected input of shape (batch, time, C, H, W), got {tuple(x.shape)}"
            )
        b, t, c, h, w = x.shape
        # reshape (not view) so non-contiguous inputs, e.g. after a transpose
        # or permute, are accepted too.
        x = x.reshape(b * t, c, h, w)
        x = self.conv(x)
        x = x.reshape(b, t, -1)
        _, h_n = self.rnn(x)
        return self.fc(h_n[-1])


In [None]:
def train(model, loader, criterion, optimizer, epochs=5):
    """Run a basic supervised training loop and print the mean loss per epoch.

    Batches are moved to the device that holds the model's parameters, so the
    function does not depend on any module-level device setting. The model is
    put in training mode and left in it.

    Args:
        model: The ``nn.Module`` to optimise.
        loader: Iterable yielding ``(inputs, targets)`` tensor pairs. It does
            not need to support ``len()``.
        criterion: Callable ``criterion(outputs, targets)`` returning the loss.
        optimizer: Optimizer over ``model``'s parameters.
        epochs: Number of passes over ``loader``.
    """
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else None

    model.train()
    for epoch in range(epochs):
        total_loss = 0.0
        num_batches = 0
        for X, y in loader:
            if device is not None:
                X, y = X.to(device), y.to(device)
            out = model(X)
            loss = criterion(out, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            num_batches += 1
        if num_batches == 0:
            # An empty loader would otherwise divide by zero below.
            print(f"Epoch {epoch+1} Loss: n/a (loader produced no batches)")
            continue
        print(f"Epoch {epoch+1} Loss: {total_loss / num_batches:.4f}")


In [None]:
# Per-frame preprocessing: ndarray -> PIL image -> 240x320 -> float tensor.
transform = T.Compose([T.ToPILImage(), T.Resize((240, 320)), T.ToTensor()])

# Clip dataset built from the parsed annotations and the extracted frames.
dataset = THUMOS14Dataset(
    annotations=annotations,
    class_map=class_map,
    frame_root=FRAME_DIR,
    transform=transform,
)
loader = DataLoader(dataset=dataset, batch_size=4, shuffle=True)

# One output logit per annotated class; the model lives on the selected device.
model = SimpleConvGRU(num_classes=len(class_map))
model = model.to(DEVICE)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

# train(model, loader, criterion, optimizer)


In [None]:
def predict(model, video_clip):
    """Return the predicted class index for a single clip.

    A batch dimension is added in front of ``video_clip`` and the clip is
    moved to the device that holds the model's parameters, so the function
    does not depend on any module-level device setting. The forward pass runs
    without gradient tracking.

    Note that the model is switched to evaluation mode and left in it.

    Args:
        model: Classifier returning logits of shape ``(batch, num_classes)``.
        video_clip: One un-batched input tensor for ``model``.

    Returns:
        The arg-max class index as a Python ``int``.
    """
    first_param = next(model.parameters(), None)
    model.eval()
    with torch.no_grad():
        batch = video_clip.unsqueeze(0)
        if first_param is not None:
            batch = batch.to(first_param.device)
        logits = model(batch)
        pred = torch.argmax(logits, dim=1)
    return pred.item()

# Example:
# X, y = dataset[0]
# pred = predict(model, X)
# print("GT:", y.item(), "Pred:", pred)
