# Feature Visualization

This is an example to show how to use visual features in Ego4D. This notebook uses the action features from the SlowFast model. The SlowFast model gives us features from both the slow pathway and the fast pathway.

This notebook:
1. Loads features
    - This assumes your features have been downloaded and are locally available in some directory on your machine.
    a) Aggregates the features into a fixed window size
       - This can be set in the "customize variables" section
2. Runs TSNE
3. Visualizes them in plotly
4. Views the videos behind selected points

Please note that SlowFast action features are computed with a stride of 16 frames (one feature every ~0.5333s at 30 fps).

## Requirements
- plotly
- sklearn
- pytorch
- moviepy (optional)
   - moviepy is used to display video clips inside Jupyter
   
## Notes
- See: https://ego4d-data.org/docs/data/features/

In [None]:
import random
import math
import time
import os
import json
import torch
import numpy as np

import plotly.express as px

USING_SKLEARN = True
from sklearn import preprocessing
try:
    # https://pypi.org/project/tsne-torch/
    # https://github.com/CannyLab/tsne-cuda/blob/master/INSTALL.md
    from tsnecuda import TSNE
    USING_SKLEARN = False
    print("Using CannyLab's tsnecuda")
except:
    from sklearn.manifold import TSNE

from moviepy.editor import VideoFileClip

# Customize Variables

In [None]:
# Ego4D metadata JSON; its "videos" list is indexed by "video_uid" further down.
EGO4D_JSON_PATH = "/private/home/miguelmartin/ego4d/ego4d.json"
# Directory holding one "<video_uid>.pt" feature file per video.
FEATURE_DIR = "/datasets01/ego4d_track2/v1/slowfast8x8_r101_k400"
# Directory holding the source videos as "<video_uid>.mp4" (used when cutting clips).
VIDEOS_DIR = "/datasets01/ego4d_track2/v1/full_scale/"

NUM_VIDEOS_LIMIT = -1  # use -1 for no limit

# how many seconds of video are averaged into each plotted point
AGGREGATION_SEC = 600  # every 600s (10 min) of video

# Frame offset between the starts of consecutive feature vectors.
FEATURE_STRIDE = 16
# Number of frames each feature vector spans.
FEATURE_WINDOW_SIZE = 32
# Frame rate used to convert frame counts to seconds.
FPS = 30

# Fixed seed so the shuffle of feature files is reproducible.
random.seed(0)

In [None]:
def sec_to_window_size(sec, stride_frames, window_size_frames, fps):
    """Return how many consecutive feature vectors are needed to span `sec` seconds.

    The count follows from n features covering
    ``(n - 1) * stride_frames + window_size_frames`` frames.

    Args:
        sec: Duration to cover, in seconds. The value -1 is passed through
            unchanged.
        stride_frames: Frame offset between consecutive feature vectors.
        window_size_frames: Number of frames a single feature vector spans.
        fps: Frames per second of the video.

    Returns:
        -1 when `sec` is -1; otherwise the number of feature vectors, never
        less than 1.
    """
    if sec == -1:
        return -1

    num_frames = sec * fps - window_size_frames
    # A duration shorter than one feature window would otherwise produce 0 or
    # a negative count, which cannot be used as a slice length / range step.
    return max(1, math.ceil(num_frames / stride_frames + 1))

In [None]:
# Number of consecutive feature vectors averaged into one point. Despite the
# "_FRAMES" suffix this is a count of feature vectors, not of video frames.
AGGR_WINDOW_SIZE_FRAMES = sec_to_window_size(AGGREGATION_SEC, FEATURE_STRIDE, FEATURE_WINDOW_SIZE, FPS)
AGGR_WINDOW_SIZE_FRAMES

# 1 - Load Features

In [None]:
def is_in_region(t, t1, t2):
    """Return True when `t` lies inside the closed interval [t1, t2]."""
    return t1 <= t <= t2

def is_in_any_region(t, start_ends):
    """Return True when `t` falls inside at least one of the given intervals.

    Each element of `start_ends` is read through its "start_sec" and "end_sec"
    keys; both bounds are inclusive. An empty collection yields False.
    """
    return any(
        is_in_region(t, interval["start_sec"], interval["end_sec"])
        for interval in start_ends
    )

In [None]:
def frame_idx_to_time(start, end, uid):
    """Convert an inclusive range of feature indices to (start_sec, end_sec).

    The start is the first frame of feature `start`; the end is the last frame
    covered by feature `end` (its first frame plus FEATURE_WINDOW_SIZE). The
    end is capped at the video's "video_duration_sec" + "video_start_sec"
    taken from the metadata of `uid`.
    """
    first_frame = start * FEATURE_STRIDE
    last_frame = end * FEATURE_STRIDE + FEATURE_WINDOW_SIZE
    start_sec, end_sec = first_frame / FPS, last_frame / FPS

    video_meta = meta_for_features[uid]["video_metadata"]
    video_end_sec = video_meta["video_duration_sec"] + video_meta["video_start_sec"]
    # never report a time past the end of the video
    return start_sec, min(end_sec, video_end_sec)

In [None]:
# Every "<video_uid>.pt" file in FEATURE_DIR. The shuffle decides which videos
# survive when NUM_VIDEOS_LIMIT truncates the list later on.
feature_paths = [os.path.join(FEATURE_DIR, x) for x in os.listdir(FEATURE_DIR) if x.endswith(".pt")]
random.shuffle(feature_paths)

# File name without directory and ".pt" extension; os.path handles the
# platform's path separator instead of assuming "/".
feature_uids = [os.path.splitext(os.path.basename(path))[0] for path in feature_paths]
features_to_load = dict(zip(feature_uids, feature_paths))

# Close the metadata file as soon as it has been parsed.
with open(EGO4D_JSON_PATH) as metadata_file:
    metadata = json.load(metadata_file)
meta_per_uid = {v["video_uid"]: v for v in metadata["videos"]}
# Keep only metadata for videos that have a feature file. Membership is tested
# against the dict (O(1)) rather than the feature_uids list (O(n) per lookup).
meta_for_features = {k: v for k, v in meta_per_uid.items() if k in features_to_load}

def get_agg_features(feature_path, uid, window_size):
    """Load one video's features and average them over consecutive windows.

    The loaded object is sliced along its first dimension in steps of
    `window_size`, and each slice is reduced with ``mean(0)``.

    A window is dropped when:
      * it holds fewer than ``int(0.5 * window_size)`` entries, unless
        `window_size` is at least the total number of entries (so a video
        shorter than one window still yields a point), or
      * its start time or its end time lies inside one of the video's
        'redacted_intervals'.

    Returns:
        A list of ``(mean_feature, first_idx, last_idx)`` tuples, where the
        indices are inclusive positions in the loaded features and `last_idx`
        is capped at the final valid index.
    """
    feats = torch.load(feature_path)
    total = feats.shape[0]
    min_len = int(0.5 * window_size)
    covers_whole_video = window_size >= total

    aggregated = []
    for begin in range(0, total, window_size):
        chunk = feats[begin:begin + window_size]

        # remove outlier features: a short trailing remainder is not representative
        if chunk.shape[0] < min_len and not covers_whole_video:
            continue

        t_start, t_end = frame_idx_to_time(begin, begin + window_size - 1, uid)
        redacted = meta_for_features[uid]['redacted_intervals']
        # NOTE(review): only the two endpoints are tested, so a redacted interval
        # lying strictly inside the window is not detected - confirm this is intended.
        if is_in_any_region(t_start, redacted) or is_in_any_region(t_end, redacted):
            continue

        last = min(begin + window_size - 1, len(feats) - 1)
        aggregated.append((chunk.mean(0), begin, last))
    return aggregated

# Optionally restrict the run to the first NUM_VIDEOS_LIMIT (shuffled) videos.
if NUM_VIDEOS_LIMIT != -1:
    feature_uids = feature_uids[0:NUM_VIDEOS_LIMIT]

# One (uid, [windowed features...]) entry per video.
features = [
    (uid, get_agg_features(features_to_load[uid], uid, AGGR_WINDOW_SIZE_FRAMES))
    for uid in feature_uids
]

# Flatten to one entry per window, keeping (uid, first_idx, last_idx) next to each vector.
feature_with_identifiers = [
    (vector, (uid, first_idx, last_idx))
    for uid, windows in features
    for vector, first_idx, last_idx in windows
]
agg_features = torch.stack([vector for vector, _ in feature_with_identifiers])
video_indices = [identifier for _, identifier in feature_with_identifiers]
len(agg_features), len(features)

In [None]:
# Per-point lists, index-aligned with the rows of agg_features.
video_uids = [identifier[0] for identifier in video_indices]
start_end_times = [
    frame_idx_to_time(first_idx, last_idx, vid)
    for vid, first_idx, last_idx in video_indices
]
# each point is labelled with the uid of the video it came from
labels = [f"{vid}" for vid, _, _ in video_indices]

start_end_times[0:10]

# 2 - Run TSNE

In [None]:
# Extra keyword arguments forwarded to the TSNE constructor.
kwargs = {}
if USING_SKLEARN:
    # The size guard is about the sklearn backend, so it belongs on this branch,
    # and it counts the points actually handed to TSNE (rows of agg_features)
    # rather than the number of videos.
    assert len(agg_features) < 10000, "are you sure you want run SKLearn with this many features? (it's slow)"
else:
    # some decent parameters for the entire dataset (tsnecuda backend)
    kwargs = {
        "n_iter": 300000,
        "learning_rate": 1.5,
    }
kwargs

In [None]:
t1 = time.time()
X = agg_features

# rescale every feature dimension into [-1, 1] before embedding
scaler = preprocessing.MinMaxScaler(feature_range=(-1, 1))
X_norm = scaler.fit_transform(X)

# perplexity=300.0 is for the entire dataset (500.0 is the commented-out alternative)
tsne = TSNE(n_components=2, verbose=1, perplexity=300.0, **kwargs)
X_tsne = tsne.fit_transform(X_norm)
t2 = time.time()

In [None]:
# elapsed seconds between the t1 and t2 timestamps taken around scaling + TSNE
t2 - t1

# 3 - Visualize

In [None]:
xys = X_tsne.tolist()

# For every point: does its start / end timestamp fall inside a redacted interval?
start_redacted = [
    is_in_any_region(span[0], meta_for_features[vid]['redacted_intervals'])
    for vid, span in zip(video_uids, start_end_times)
]
end_redacted = [
    is_in_any_region(span[1], meta_for_features[vid]['redacted_intervals'])
    for vid, span in zip(video_uids, start_end_times)
]

# Column-oriented table handed to plotly: one entry per embedded point.
data_df = {
    "x": [point[0] for point in xys],
    "y": [point[1] for point in xys],
    "labels": labels,
    "feature_idx": list(range(len(xys))),
    "video_uid": list(video_uids),
    "start_s": [span[0] for span in start_end_times],
    "end_s": [span[1] for span in start_end_times],
    "scenarios": [meta_for_features[vid]["scenarios"] for vid in video_uids],
    # both endpoints redacted
    "is_redacted": [at_start and at_end for at_start, at_end in zip(start_redacted, end_redacted)],
    # at least one endpoint redacted
    "has_redacted": [at_start or at_end for at_start, at_end in zip(start_redacted, end_redacted)],
}

In [None]:
# Scatter the 2-D embedding, colored by point index; the hover box lists the
# columns needed to locate the underlying video span.
hover_columns = ["feature_idx", "video_uid", "start_s", "end_s", "is_redacted", "has_redacted", "scenarios"]
px.scatter(data_df, x="x", y="y", color="feature_idx", hover_data=hover_columns)

# 4 - View The Videos

In [None]:
# Pick two points by index (the "feature_idx" column shown when hovering in the
# plot) to inspect the video span each point was aggregated from.

feature_idx_1 = 11678
uid1 = video_uids[feature_idx_1]
vid1_start_end = start_end_times[feature_idx_1]

feature_idx_2 = 21936
uid2 = video_uids[feature_idx_2]
vid2_start_end = start_end_times[feature_idx_2]

(uid1, vid1_start_end), (uid2, vid2_start_end)

In [None]:
# scenarios and redacted intervals stored in the metadata of the first selected video
meta_for_features[uid1]['scenarios'], meta_for_features[uid1]['redacted_intervals']

In [None]:
# scenarios and redacted intervals stored in the metadata of the second selected video
meta_for_features[uid2]['scenarios'], meta_for_features[uid2]['redacted_intervals']

In [None]:
def create_clip(video_uid, start_end, clip_path, scale_size):
    # Cut one time span out of a source video with ffmpeg, run through the
    # IPython `!` shell escape (so this only works inside IPython/Jupyter).
    #
    # video_uid:  selects the input file "<VIDEOS_DIR>/<video_uid>.mp4"
    # start_end:  pair whose elements 0 and 1 are the start and end times
    # clip_path:  output file; `-y` overwrites it if it already exists
    # scale_size: output height given to the scale filter; the -1 width lets
    #             ffmpeg pick a width that keeps the aspect ratio
    path_to_video = f"{VIDEOS_DIR}/{video_uid}.mp4"
    t1 = start_end[0]
    t2 = start_end[1]
    dur = t2 - t1
    # fixed-point strings (6 decimals) for the ffmpeg command line
    ss_str=f"{t1:.6f}"
    dur_str=f"{dur:.6f}"
    
    # -ss placed before -i seeks the input to t1; -t limits the output to `dur`
    !ffmpeg -y -ss "$ss_str" -i "$path_to_video" -t "$dur_str" -vf "scale=-1:$scale_size" "$clip_path"

In [None]:
import shutil

clip_dir = "/tmp/clips"
# Start from an empty clip directory. Unlike `rm -r` / `mkdir`, this does not
# complain when the directory is missing on the first run and does not depend
# on unquoted shell expansion of the path.
shutil.rmtree(clip_dir, ignore_errors=True)
os.makedirs(clip_dir, exist_ok=True)

In [None]:
# output files for the two selected spans, written inside clip_dir
clip_1_path = f"{clip_dir}/clip1.mp4"
clip_2_path = f"{clip_dir}/clip2.mp4"

In [None]:
# cut both selected spans; 540 is the scale_size (output height) passed to ffmpeg
create_clip(uid1, vid1_start_end, clip_1_path, 540)
create_clip(uid2, vid2_start_end, clip_2_path, 540)

In [None]:
# VideoFileClip(clip_1_path).ipython_display(maxduration=1000000)

In [None]:
# VideoFileClip(clip_2_path).ipython_display(maxduration=1000000)

# Misc

In [None]:
import json

In [None]:
# Save off some dim reduced features
# Layout: "schema" names the fields of each entry; every other key is a
# video uid mapping to that video's list of entries.
data = {"schema": ["start_sec", "end_sec", "embedding_vector"]}

for idx, video_uid in enumerate(video_uids):
    span = start_end_times[idx]
    data.setdefault(video_uid, []).append(
        (span[0], span[1], X_tsne[idx].tolist())
    )

In [None]:
# preview the first three saved entries of the first video
data[video_uids[0]][0:3]

In [None]:
# Write through a context manager so the file is flushed and closed as soon as
# the dump finishes, instead of whenever the handle gets garbage-collected.
with open("/tmp/data.json", "w") as out_file:
    json.dump(data, out_file, indent=2)