# Features Tutorial

In [3]:
import random
random.seed(1234)

import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import time
import json
import os
import math
from typing import List

import torch
import h5py
import numpy as np
from torch.nn import functional as F
from tqdm.auto import tqdm
from sentence_transformers import SentenceTransformer
import torch.nn as nn
from torch.utils.data import DataLoader

NARRATION_JSON_PATH = "/datasets01/ego4d_track2/v1/annotations/narration.json"
FEATURE_DIR = "/checkpoint/miguelmartin/ego4d_track2_features/full_scale/omnivore_video_swinL"
FEATURES_PER_SECOND = 30 / 16
FEATURE_DIM = 1536

VIDEO_UIDS = [x.split(".pt")[0] for x in os.listdir(FEATURE_DIR) if "yaml" not in x]

random.shuffle(VIDEO_UIDS)
EXAMPLE_VIDEO_UID = VIDEO_UIDS[0]
VIDEO_UIDS_EXAMPLE_SET = set(VIDEO_UIDS)

# Step 1: Prepare Data

- Preprocess:
   1. Ego4D:
       1. *Video Features*: convert to HDF5 file
       2. *Narration Features*: extract & convert to HDF5 file
   2. Kinetics400: Extract features from Labels / Videos & save to HDF5
       - Labels converted to `"The person in this video is doing <label>"`
   3. Ego-Charades: Extract features from Labels / Videos & save to HDF5
       - Labels will be as-is
       
- HDF5 to store features
- Pickle file (`torch.save` / `torch.load`) to store keys as HDF5 is slow with respect to getting keys

## Features

In [26]:
FEATURE_HDF5_OUT_PATH = "features_ex.hdf5"
NARR_HDF5_OUT_PATH = "narrs_ex.hdf5"
NARR_META_OUT_PATH = "narrs_ex.pt"

In [5]:
with h5py.File(FEATURE_HDF5_OUT_PATH, "w") as out_f:
    for uid in tqdm(video_uids, desc="video_uid", leave=True):
        feature_path = os.path.join(FEATURE_DIR, f"{uid}.pt")
        fv = torch.load(feature_path)
        out_f.create_dataset(uid, data=fv.numpy())

video_uid:   0%|          | 0/9645 [00:00<?, ?it/s]

291.9125545024872

## Narrations

In [7]:
# NOTE: this is missing validation set removal

uid_subset = VIDEO_UIDS_EXAMPLE_SET
narration_json = json.load(open(NARRATION_JSON_PATH))
narrations = [
    (uid, data["narration_text"], data["timestamp_sec"], 1)
    for uid in tqdm(uid_subset)
    for data in narration_json[uid].
    get("narration_pass_1", 
        {"narrations": []})["narrations"]
]
narrations += [
    (uid, data["narration_text"], data["timestamp_sec"], 2)
    for uid in tqdm(uid_subset)
    for data in narration_json[uid].get("narration_pass_2", {"narrations": []})["narrations"]
]

narrations.sort(key=lambda x: (x[0], x[-1]))
len(narrations)

  0%|          | 0/9645 [00:00<?, ?it/s]

  0%|          | 0/9645 [00:00<?, ?it/s]

5025980

In [8]:
# only run
random.shuffle(narrations)
narrations = narrations[0:50000]

In [9]:
def sub_tagged_tokens(text: str) -> str:
    text = text.replace("#C", "Camera wearer")
    text = text.replace("#O", "Other person")
    text = text.replace("#unsure", "something")
    return text

In [10]:
def encode_narrations():
    model = SentenceTransformer("all-mpnet-base-v2")
    return model.encode([
        sub_tagged_tokens(txt)
        for _, txt, _, _ in narrations
    ])

t1 = time.time()
fvs = encode_narrations()
t2 = time.time()
t2 - t1

35.74345922470093

In [11]:
# create keys of narrations to store in HDF5 
key_to_narr_idx = {
    f"{uid}_{txt}_{ts:.3f}_{pazz}": (idx, (uid, txt, ts))
    for idx, (uid, txt, ts, pazz) in enumerate(narrations)
}
assert len(key_to_narr_idx) == len(narrations)
list(key_to_narr_idx.keys())[0]

'95e3cefa-e6a2-4c1e-831d-616a86ff177b_#C C adjusts the weighing platform of the scale with both hands._1704.945_1'

In [12]:
t1 = time.time()
with h5py.File(NARR_HDF5_EX_PATH, "w") as out_f:
    for key, (idx, _) in tqdm(key_to_narr_idx.items(), total=len(key_to_narr_idx)):
        fv = fvs[idx]
        out_f.create_dataset(key, data=fv)
t2 = time.time()

  0%|          | 0/50000 [00:00<?, ?it/s]

In [13]:
# save off the keys/metadata with torch.save (pickle)
narration_metadata = [
    {"key": key, "uid": uid, "txt": txt, "ts": ts}
    for key, (_, (uid, txt, ts)) in key_to_narr_idx.items()
]
torch.save(narration_metadata, NARR_META_PATH)
narration_metadata[0]

{'key': '95e3cefa-e6a2-4c1e-831d-616a86ff177b_#C C adjusts the weighing platform of the scale with both hands._1704.945_1',
 'uid': '95e3cefa-e6a2-4c1e-831d-616a86ff177b',
 'txt': '#C C adjusts the weighing platform of the scale with both hands.',
 'ts': 1704.9446}

# Preprocess in the same way Kinetics and Ego-Charades

Please refer to the code in `ego4d/research/clip/preprocess.py`

# Step 2: Datasets/Dataloaders

- For **classification tasks** (zero-shot) we can build a generic dataloader which accepts as input:
   - Feature HDF5 path
   - list of `[(key, label_dict)]`
- For **video (visual) / narration pairs**: we can build a specialized dataloader

## Utility To Get Start/End Index

First thing we'll need is to get the features ranging from `[t1, t2]`

- `features_per_sec == 30 / 16` (fps of canonical video divided by stride)
- `nf` is the number number of features for video

In [28]:
def get_start_end_idx(t1: float, t2: float, feature_per_sec: float, nf: int):
    assert t2 >= 0
    x1 = min(
        max(0, math.floor(t1 * feature_per_sec)),
        nf - 1,
    )
    x2 = min(
       math.floor(t2 * feature_per_sec),
       nf - 1,
    )
    assert x2 >= x1
    return x1, x2 + 1

# Visual / Language Pair Dataset

In [29]:
class Ego4DClipDset(torch.utils.data.Dataset):
    def __init__(self, offset_sec=2):
        super().__init__()
        
        self.features = h5py.File(FEATURE_HDF5_EX_PATH)
        self.narrs = h5py.File(NARR_HDF5_EX_PATH)
        self.metadata = torch.load(NARR_META_PATH)
        self.offset_sec = offset_sec
    
    def __len__(self):
        return len(self.metadata)
    
    def __getitem__(self, idx):
        meta = self.metadata[idx]
        narr_key = meta["key"]
        uid = meta["uid"]
        
        t = meta["ts"]
        t1 = t - self.offset_sec
        t2 = t + self.offset_sec
        
        vid_feat = self.features[uid]
        start_idx, end_idx = get_start_end_idx(
            t1, t2, FEATURES_PER_SECOND, len(vid_feat)
        )
        
        txt_feat = self.narrs[narr_key][0:]

        return {
            # Alternatively you could sample a constant number here
            "video": torch.tensor(vid_feat[start_idx:end_idx]).mean(0),
            "text": torch.tensor(txt_feat),
        }

In [30]:
dset = Ego4DClipDset(2)
dset[25]["video"].shape, len(dset)

(torch.Size([1536]), 50000)

# The Model

In [31]:
class ClipModel(nn.Module):
    def __init__(self, txt_in_f=768, vid_in_f=1536):
        super().__init__()
        self.visual_proj = nn.Sequential(
            nn.Linear(vid_in_f, FEATURE_DIM),
            nn.ReLU(True),
            nn.Linear(FEATURE_DIM, FEATURE_DIM)
        )
        self.text_proj = nn.Sequential(
            nn.Linear(txt_in_f, FEATURE_DIM),
            nn.ReLU(True),
            nn.Linear(FEATURE_DIM, FEATURE_DIM)
        )
        self.apply(self.init_weights)
        self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))
    
    def init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.xavier_uniform_(module.weight.data, gain=torch.nn.init.calculate_gain('relu'))
            module.bias.data.zero_()
    
    def forward(self, x):
        ve = self.visual_proj(x["video"])
        te = self.text_proj(x["text"])
        return ve, te, self.logit_scale

In [32]:
def compute_loss(vid2txt, txt2vid, device):
    N = v_f.shape[0]
    label = torch.eye(N, device=device)
    loss = (
        F.cross_entropy(vid2txt, label) +
        F.cross_entropy(txt2vid, label)
    ) / 2.0
    return loss

In [33]:
device = "cuda"
model = ClipModel().to(device)

In [34]:
dloader = DataLoader(dset, batch_size=128, num_workers=10, pin_memory=False)  # use workers > 1 for efficiency

In [35]:
optim = torch.optim.AdamW(
    model.parameters(),
    lr=0.0001,
    betas=(0.98, 0.9),
    eps=1e-6,
)

In [36]:
num_epochs = 15
model.train()
for i in range(num_epochs):
    n_ex = 0
    t1 = time.time()
    for batch in dloader:
        # xfer to devices
        batch = {x: y.to(device) for x, y in batch.items()}
        
        optim.zero_grad()
        
        v_f, t_f, logit_scale = model(batch)
        vid2txt = logit_scale * v_f @ t_f.T
        txt2vid = logit_scale * t_f @ v_f.T
        loss = compute_loss(vid2txt, txt2vid, device)
        
        loss.backward(); optim.step(); # scheduler.step()
        n_ex += batch["video"].shape[0]
        
        with torch.no_grad():
            model.logit_scale.clamp_(0, math.log(100))
    t2 = time.time()
    print(f"Epoch {i}, Loss: {loss}, Examples/s: {n_ex/(t2 - t1)}")

Epoch 0, Loss: 1.9789854288101196, Examples/s: 9339.04441825466
Epoch 1, Loss: 1.1801788806915283, Examples/s: 9667.672824670813
Epoch 2, Loss: 0.7611839771270752, Examples/s: 9693.215107174368
Epoch 3, Loss: 0.48869460821151733, Examples/s: 9558.827584694896
Epoch 4, Loss: 0.31157782673835754, Examples/s: 9649.203931657485
Epoch 5, Loss: 0.21405833959579468, Examples/s: 9936.787752282891
Epoch 6, Loss: 0.16565120220184326, Examples/s: 9673.318755371658
Epoch 7, Loss: 0.2100939154624939, Examples/s: 9830.772494114035
Epoch 8, Loss: 0.16152575612068176, Examples/s: 9538.249816628271
Epoch 9, Loss: 0.15146878361701965, Examples/s: 9364.432008657699
Epoch 10, Loss: 0.19459089636802673, Examples/s: 9769.538977798295
Epoch 11, Loss: 0.13877642154693604, Examples/s: 9718.524746821122
Epoch 12, Loss: 0.15733273327350616, Examples/s: 9673.09968040956
Epoch 13, Loss: 0.08673865348100662, Examples/s: 9640.256928013003
Epoch 14, Loss: 0.10245321691036224, Examples/s: 9506.879408286117


# Future Directions

- Other paper's directions
  - Hard-negative mining or include hard negatives in batch as done in VideoCLIP / EgoVLP
  - Heuristic for positive examples (text similarity, EgoVLP heuristic)
- Augment training with image dataset (due to omnivore)
- Self-attention for the model / feature transformation
- Extend to end-to-end training