# Features Tutorial

In [1]:
import random
random.seed(1234)

import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import time
import json
import os
import math
from typing import List

import torch
import h5py
import numpy as np
from torch.nn import functional as F
from tqdm.auto import tqdm
from sentence_transformers import SentenceTransformer
import torch.nn as nn
from torch.utils.data import DataLoader

NARRATION_JSON_PATH = "/datasets01/ego4d_track2/v1/annotations/narration.json"
NARR_OUT_DIR = "/tmp/narrs/"
NARR_META_PATH = os.path.join(NARR_OUT_DIR, "meta.pt")
FEATURE_DIR = "/checkpoint/miguelmartin/ego4d_track2_features/full_scale/omnivore_video_swinL"
FEATURES_PER_SECOND = 30 / 16
FEATURE_DIM = 1536

VIDEO_UIDS = [x.split(".pt")[0] for x in os.listdir(FEATURE_DIR) if "yaml" not in x]
random.shuffle(VIDEO_UIDS)

EXAMPLE_VIDEO_UID = VIDEO_UIDS[0]
VIDEO_UIDS_EXAMPLE_SET = set(VIDEO_UIDS[0:100])

os.makedirs(NARR_OUT_DIR, exist_ok=True)

# Step 1: Prepare Data

- Preprocess:
   1. Ego4D:
       1. *Video Features*: convert to HDF5 file
       2. *Narration Features*: extract & save to disk
   2. Kinetics400: Extract features from Labels / Videos & save to HDF5
       - Labels converted to `"The person in this video is doing <label>"`
   3. Ego-Charades: Extract features from Labels / Videos & save to HDF5
       - Labels will be as-is
       
- HDF5 to store features
- Pickle file (`torch.save` / `torch.load`) to store keys as HDF5 is slow with respect to getting keys

NOTE: we're not storing narration embeddings/features to HDF5 as with 5million potential narrations to use, you will require to distribute (across many processes or machines) the writes to disk as otherwise it will take a long time to save them all.

## Features

In [2]:
FEATURE_HDF5_OUT_PATH = "features_ex.hdf5"
NARR_HDF5_OUT_PATH = "narrs_ex.hdf5"
NARR_META_OUT_PATH = "narrs_ex.pt"

In [3]:
with h5py.File(FEATURE_HDF5_OUT_PATH, "w") as out_f:
    for uid in tqdm(VIDEO_UIDS_EXAMPLE_SET, desc="video_uid", leave=True):
        feature_path = os.path.join(FEATURE_DIR, f"{uid}.pt")
        fv = torch.load(feature_path)
        out_f.create_dataset(uid, data=fv.numpy())

video_uid:   0%|          | 0/100 [00:00<?, ?it/s]

## Narrations

In [4]:
# NOTE: this is missing validation set removal
uid_subset = VIDEO_UIDS_EXAMPLE_SET
narration_json = json.load(open(NARRATION_JSON_PATH))
narrations = [
    (uid, data["narration_text"], data["timestamp_sec"], 1)
    for uid in tqdm(uid_subset)
    for data in narration_json[uid].
    get("narration_pass_1", 
        {"narrations": []})["narrations"]
]
narrations += [
    (uid, data["narration_text"], data["timestamp_sec"], 2)
    for uid in tqdm(uid_subset)
    for data in narration_json[uid].get("narration_pass_2", {"narrations": []})["narrations"]
]

narrations.sort(key=lambda x: (x[0], x[-1]))
len(narrations)

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

50206

In [5]:
def sub_tagged_tokens(text: str) -> str:
    text = text.replace("#C", "Camera wearer")
    text = text.replace("#O", "Other person")
    text = text.replace("#unsure", "something")
    return text

In [6]:
def encode_narrations():
    model = SentenceTransformer("all-mpnet-base-v2")
    return model.encode([
        sub_tagged_tokens(txt)
        for _, txt, _, _ in narrations
    ])

t1 = time.time()
fvs = encode_narrations()
t2 = time.time()
t2 - t1

34.33861708641052

In [7]:
t1 = time.time()
for idx, x in tqdm(enumerate(fvs), total=len(fvs)):
    torch.save(x, os.path.join(NARR_OUT_DIR, f"{idx}.pt"))
t2 = time.time()

t2 - t1

  0%|          | 0/50206 [00:00<?, ?it/s]

8.999829292297363

In [8]:
# save off the keys/metadata with torch.save (pickle)
narration_metadata = [
    {"uid": uid, "txt": txt, "ts": ts, "idx": idx, "pass": pazz}
    for idx, (uid, txt, ts, pazz) in enumerate(narrations)
]
torch.save(narration_metadata, NARR_META_PATH)
narration_metadata[0]

{'uid': '057bf03d-b337-475c-82a2-79f0b5b6637f',
 'txt': '#C C opens a door',
 'ts': 1.2841985999999999,
 'idx': 0,
 'pass': 1}

# Preprocess in a similar way for Kinetics and Ego-Charades

Please refer to the code in:
- `ego4d/research/clep/run_preprocess.py`
- `ego4d/research/clep/preprocess/kinetics.py`
- `ego4d/research/clep/preprocess/ego_charade.py`

# Step 2: Datasets/Dataloaders

- For **classification tasks** (zero-shot) we can build a generic dataloader which accepts as input:
   - Feature HDF5 path
   - list of `[(key, label_dict)]`
- For **video (visual) / narration pairs**: we can build a specialized dataloader

## Utility To Get Start/End Index

First thing we'll need is to get the features ranging from `[t1, t2]`

- `features_per_sec == 30 / 16` (fps of canonical video divided by stride)
- `nf` is the number number of features for video

In [9]:
def get_start_end_idx(t1: float, t2: float, feature_per_sec: float, nf: int):
    assert t2 >= 0
    x1 = min(
        max(0, math.floor(t1 * feature_per_sec)),
        nf - 1,
    )
    x2 = min(
       math.floor(t2 * feature_per_sec),
       nf - 1,
    )
    assert x2 >= x1
    return x1, x2 + 1

# Visual / Language Pair Dataset

In [10]:
class Ego4DClipDset(torch.utils.data.Dataset):
    def __init__(self, offset_sec=2):
        super().__init__()
        
        self.features = h5py.File(FEATURE_HDF5_OUT_PATH)
        self.metadata = torch.load(NARR_META_PATH)
        self.offset_sec = offset_sec
    
    def __len__(self):
        return len(self.metadata)
    
    def __getitem__(self, idx):
        meta = self.metadata[idx]
        narr_key = meta["idx"]
        uid = meta["uid"]
        
        t = meta["ts"]
        t1 = t - self.offset_sec
        t2 = t + self.offset_sec
        
        vid_feat = self.features[uid]
        start_idx, end_idx = get_start_end_idx(
            t1, t2, FEATURES_PER_SECOND, len(vid_feat)
        )
        
        txt_feat = torch.load(os.path.join(NARR_OUT_DIR, f"{narr_key}.pt"))

        return {
            # Alternatively you could sample a constant number here
            "video": torch.tensor(vid_feat[start_idx:end_idx]).mean(0),
            "text": torch.tensor(txt_feat),
        }

In [11]:
dset = Ego4DClipDset(2)
dset[25]["video"].shape, len(dset)

(torch.Size([1536]), 50206)

# The Model

In [12]:
class ClipModel(nn.Module):
    def __init__(self, txt_in_f=768, vid_in_f=1536):
        super().__init__()
        self.visual_proj = nn.Sequential(
            nn.Linear(vid_in_f, FEATURE_DIM),
            nn.ReLU(True),
            nn.Linear(FEATURE_DIM, FEATURE_DIM)
        )
        self.text_proj = nn.Sequential(
            nn.Linear(txt_in_f, FEATURE_DIM),
            nn.ReLU(True),
            nn.Linear(FEATURE_DIM, FEATURE_DIM)
        )
        self.apply(self.init_weights)
        self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))
    
    def init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.xavier_uniform_(module.weight.data, gain=torch.nn.init.calculate_gain('relu'))
            module.bias.data.zero_()
    
    def forward(self, x):
        ve = self.visual_proj(x["video"])
        te = self.text_proj(x["text"])
        return ve, te, self.logit_scale

In [13]:
def compute_loss(vid2txt, txt2vid, device):
    N = v_f.shape[0]
    label = torch.eye(N, device=device)
    loss = (
        F.cross_entropy(vid2txt, label) +
        F.cross_entropy(txt2vid, label)
    ) / 2.0
    return loss

In [14]:
device = "cuda"
model = ClipModel().to(device)

In [15]:
dloader = DataLoader(dset, batch_size=128, num_workers=10, pin_memory=False)  # use workers > 1 for efficiency

In [16]:
optim = torch.optim.AdamW(
    model.parameters(),
    lr=0.0001,
    betas=(0.98, 0.9),
    eps=1e-6,
)

In [17]:
num_epochs = 15
model.train()
for i in range(num_epochs):
    n_ex = 0
    t1 = time.time()
    for batch in dloader:
        # xfer to devices
        batch = {x: y.to(device) for x, y in batch.items()}
        
        optim.zero_grad()
        
        v_f, t_f, logit_scale = model(batch)
        vid2txt = logit_scale * v_f @ t_f.T
        txt2vid = logit_scale * t_f @ v_f.T
        loss = compute_loss(vid2txt, txt2vid, device)
        
        loss.backward(); optim.step(); # scheduler.step()
        n_ex += batch["video"].shape[0]
        
        with torch.no_grad():
            model.logit_scale.clamp_(0, math.log(100))
    t2 = time.time()
    print(f"Epoch {i}, Loss: {loss}, Examples/s: {n_ex/(t2 - t1)}")

Epoch 0, Loss: 4.089771747589111, Examples/s: 9071.56106958311
Epoch 1, Loss: 3.6496753692626953, Examples/s: 11182.034646274062
Epoch 2, Loss: 3.5249929428100586, Examples/s: 11152.022603662079
Epoch 3, Loss: 3.4897658824920654, Examples/s: 11082.790540068292
Epoch 4, Loss: 3.4889583587646484, Examples/s: 11138.286295208198
Epoch 5, Loss: 3.4732935428619385, Examples/s: 11052.156029850274
Epoch 6, Loss: 3.3928422927856445, Examples/s: 11061.04516618785
Epoch 7, Loss: 3.3515167236328125, Examples/s: 11080.060260427097
Epoch 8, Loss: 3.309297561645508, Examples/s: 11108.393463083657
Epoch 9, Loss: 3.2793796062469482, Examples/s: 11134.949795987173
Epoch 10, Loss: 3.2377395629882812, Examples/s: 11111.494778770793
Epoch 11, Loss: 3.257960319519043, Examples/s: 11183.151065605582
Epoch 12, Loss: 3.3925580978393555, Examples/s: 11013.553845665432
Epoch 13, Loss: 3.3396925926208496, Examples/s: 11161.99962959284
Epoch 14, Loss: 3.4016289710998535, Examples/s: 11162.724453935993


# Future Directions

- Other paper's directions
  - Hard-negative mining or include hard negatives in batch as done in VideoCLIP / EgoVLP
  - Heuristic for positive examples (text similarity, EgoVLP heuristic)
- Augment training with image dataset (due to omnivore)
- Self-attention for the model / feature transformation
- Extend to end-to-end training