In [1]:
import os
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn, optim

In [6]:
# Load the CharadesEgo training annotations (third-person-only split).
# The path is a raw string: it contains backslashes (e.g. "\M", "\T", "\R") that
# are not valid escape sequences and must be taken literally.
anno_df = pd.read_csv(r'E:\MSc Data Science\Third term\Research Project\general\charade\CharadesEgo\CharadesEgo_v1_train_only3rd.csv')

# Expand the 'actions' column -- a ';'-separated list of "class start end"
# triples -- into one record per action instance.
rows = []
for _, r in anno_df.iterrows():
  vid = r['charades_video']
  actions = r['actions']
  # Rows without a string in 'actions' (e.g. missing values) contribute no records.
  if isinstance(actions, str):
    for trip in actions.split(';'):
      parts = trip.split()
      if not parts:
        # Empty segment (e.g. produced by a trailing ';'): nothing to parse.
        continue
      cls, st, en = parts
      rows.append({'video': vid, 'class': cls, 'start': float(st), 'end': float(en)})

# Explicit columns keep the schema stable even when no action was parsed.
all_actions = pd.DataFrame(rows, columns=['video', 'class', 'start', 'end'])

all_actions.head()

Unnamed: 0,video,class,start,end
0,38T9C,c141,1.9,7.4
1,38T9C,c061,0.0,20.5
2,38T9C,c006,6.4,12.5
3,38T9C,c118,5.5,20.6
4,38T9C,c156,8.9,14.5


In [9]:
# Unique video IDs exactly as they appear in the annotation file
# (no suffix stripping is performed here).
anno_ids = set(anno_df["charades_video"])

# List all .npy files in the features folder and keep their base names.
# The path is a raw string: its backslashes are not valid escape sequences
# and must be taken literally.
feat_root = r"E:\MSc Data Science\Third term\Research Project\general\charade\Charades_v1_features_vgg_flow_stride4"
feat_ids = {
    os.path.splitext(fname)[0]
    for fname in os.listdir(feat_root)
    if fname.endswith(".npy")
}

print(len(feat_ids))
print(len(anno_ids))


# IDs present both in the annotations and in the feature folder, sorted.
common_ids = sorted(anno_ids & feat_ids)

print(f"Found {len(common_ids)} videos in both annotations and features:")

9848
2376
Found 2375 videos in both annotations and features:


In [10]:
# Class codes kept for this experiment, one per line.
filtered_actions = [
    "c049",
    "c050",
    "c051",
]

In [13]:
# Keep only the action instances whose class code is one of the selected classes.
keep_mask = all_actions['class'].isin(filtered_actions)
filtered = all_actions[keep_mask]

In [15]:
# Print the (rows, columns) shape of the filtered annotations, then display
# the first rows as the cell's output.
print(filtered.shape)
filtered.head()

(383, 4)


Unnamed: 0,video,class,start,end
248,H9U38,c051,0.0,25.1
468,ZGHLY,c051,0.3,23.38
469,ZGHLY,c049,8.8,14.8
881,E27NK,c051,0.0,11.9
911,O5D7S,c051,0.0,28.9


In [16]:
# Map every selected class code to its position in filtered_actions,
# giving contiguous integer labels starting at 0.
cls2idx = dict(zip(filtered_actions, range(len(filtered_actions))))
print(cls2idx)

{'c049': 0, 'c050': 1, 'c051': 2}


In [None]:
class CharadesExoFeatureTAL(Dataset):
    """Sliding-window clip dataset over pre-extracted per-video feature files.

    Each annotated action segment is converted from time to feature-frame
    indices and cut into fixed-length windows. Every window becomes one sample
    labelled with the integer index of the segment's class.

    Args:
        annots: DataFrame with columns "video", "class", "start" and "end".
            "start"/"end" are multiplied by ``fps``, so they are presumably
            times in seconds -- confirm against the annotation source.
        feat_root: Directory containing one ``<video>.npy`` feature file per video.
        clip_len: Window length, in feature frames.
        stride: Step between consecutive window starts, in feature frames.
        fps: Video frame rate used to convert times to video frames.
        feat_stride: Number of video frames per feature frame. The default of 4
            reproduces the value that used to be hard-coded.
        class_to_idx: Mapping from class code to integer label. When ``None``
            the notebook-level ``cls2idx`` mapping is used (previous behaviour).

    Raises:
        ValueError: If ``clip_len``, ``stride`` or ``feat_stride`` is not positive.
        KeyError: If a segment that yields windows has a class missing from the mapping.

    Notes:
        * Rows whose video ID is not a string, or whose feature file does not
          exist, are skipped silently.
        * A segment spanning fewer than ``clip_len`` feature frames yields no
          windows at all, so short actions do not appear in the dataset.
    """

    def __init__(self, annots, feat_root, clip_len=48, stride=12, fps=24,
                 feat_stride=4, class_to_idx=None):
        if clip_len <= 0 or stride <= 0 or feat_stride <= 0:
            raise ValueError("clip_len, stride and feat_stride must be positive")

        self.clip_len = clip_len
        self.stride = stride
        self.fps = fps
        self.feat_stride = feat_stride
        self.feat_root = feat_root
        self.samples = []

        columns = (annots["video"], annots["class"], annots["start"], annots["end"])
        for vid, cls, st, en in zip(*columns):
            # Missing IDs come through as non-string values (e.g. NaN).
            if not isinstance(vid, str):
                continue

            exo_feat_path = os.path.join(feat_root, vid + ".npy")
            if not os.path.isfile(exo_feat_path):
                continue  # skip if feature file missing

            # time -> video frame (x fps) -> feature frame (/ feat_stride)
            start_f = int(st * fps / feat_stride)
            end_f = int(en * fps / feat_stride)

            # Start indices of all windows that fit entirely inside the segment.
            window_starts = range(start_f, end_f - clip_len + 1, stride)
            if not window_starts:
                continue

            # Resolve the mapping only when a label is actually needed, so the
            # global fallback is never touched for annotations without windows.
            mapping = cls2idx if class_to_idx is None else class_to_idx
            label = mapping[cls]

            self.samples.extend(
                {"exo_path": exo_feat_path, "start_idx": f0, "label": label}
                for f0 in window_starts
            )

    def __len__(self):
        """Return the number of windows (samples)."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Load one window and return ``(clip, label)``.

        ``clip`` is a float32 tensor of shape ``(D, clip_len)``; ``label`` is the
        integer class index stored for the window.
        """
        s = self.samples[idx]
        # The whole feature file is loaded on every access; expected shape (T_total, D).
        feats = np.load(s["exo_path"])
        clip = feats[s["start_idx"] : s["start_idx"] + self.clip_len]

        # The annotated segment may run past the end of the feature file:
        # zero-pad the time axis up to clip_len.
        if clip.shape[0] < self.clip_len:
            pad_len = self.clip_len - clip.shape[0]
            pad = np.zeros((pad_len, clip.shape[1]), dtype=clip.dtype)
            clip = np.concatenate([clip, pad], axis=0)

        clip = torch.from_numpy(clip.T).float()  # (D, clip_len)
        return clip, s["label"]

In [147]:
class ExoOnlyFeatModel(nn.Module):
    """Linear classifier on temporally averaged clip features.

    Args:
        feat_dim: Size ``D`` of the feature dimension of the input.
        num_classes: Number of output logits.

    Input to ``forward`` is a tensor of shape ``(B, D, T)``; the output has
    shape ``(B, num_classes)``.
    """

    def __init__(self, feat_dim, num_classes):
        super().__init__()
        self.fc = nn.Linear(feat_dim, num_classes)

    @staticmethod
    def avg(x):
        """Average over the temporal axis: ``(B, D, T) -> (B, D)``.

        Defined as a method instead of a lambda stored on the instance: a
        lambda attribute cannot be pickled, which breaks saving the whole
        module and sending it to worker processes.
        """
        return x.mean(dim=2)

    def forward(self, x):
        """Return class logits for a batch ``x`` of shape ``(B, D, T)``."""
        x = self.avg(x)
        return self.fc(x)

In [148]:
# Count the entries in the feature directory; the path is relative, so it is
# resolved against the current working directory.
len(os.listdir('Charades_v1_features_vgg_flow_stride4'))

9849

In [149]:
# Run on the GPU when CUDA is usable, otherwise on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Windowed clip dataset over the filtered annotations, served in shuffled batches of 8.
ds = CharadesExoFeatureTAL(filtered, feat_root='Charades_v1_features_vgg_flow_stride4')
loader = DataLoader(ds, batch_size=8, shuffle=True)

# Peek at one batch: dimension 1 of the feature tensor is the model's input width.
exo_b, lbl_b = next(iter(loader))
feat_dim = exo_b.shape[1]

model = ExoOnlyFeatModel(feat_dim, num_classes=len(filtered_actions))
model = model.to(device)
opt = optim.Adam(model.parameters(), lr=1e-4)
crit = nn.CrossEntropyLoss()

# Five passes over the data; the mean per-batch loss is reported after each pass.
for epoch in range(5):
    model.train()
    total_loss = 0
    for exo, labels in loader:
        # exo: (B, D, T), labels: (B,)
        exo = exo.to(device)
        labels = labels.to(device)

        # Clear stale gradients, then forward -> loss -> backward -> update.
        opt.zero_grad()
        logits = model(exo)
        loss = crit(logits, labels)
        loss.backward()
        opt.step()

        total_loss += loss.item()
    print(f"Epoch {epoch+1} loss: {total_loss/len(loader):.4f}")

Epoch 1 loss: 1.0058
Epoch 2 loss: 0.8596
Epoch 3 loss: 0.7567
Epoch 4 loss: 0.6733
Epoch 5 loss: 0.6150
