# Unit-test (kinda)

- Unit test for SMCN non-decomposable search

Making sure operations do what we wanna do.

In case we need to speed things up, we may need to write our own CUDA kernel. The operation resembles an ROI-like operation. We would first have to confirm that the bottleneck is in this part.

In [None]:
import torch
from torch.nn.utils.rnn import pad_sequence

# Check that the sort -> pad -> reduce -> unsort pipeline reproduces the
# per-segment mean of the clip scores computed naively.
for _ in range(100):
    # Random number of clips (between 1 and 7) for each of the 4 segments.
    clips_per_segment = torch.randint(1, 8, (4,))
    clips_per_segment_list = clips_per_segment.tolist()
    list_clips_score = [torch.rand(n) for n in clips_per_segment_list]
    # Concatenate and split again so the scores go through a flat tensor.
    clip_score = torch.cat(list_clips_score)
    clip_score_ = clip_score.split(clips_per_segment_list)
    # Sort segments by decreasing length; sorting the permutation itself
    # gives the inverse permutation used to restore the original order.
    sorted_clips_per_segment, ind = clips_per_segment.sort(descending=True)
    _, original_ind = ind.sort(descending=False)
    clip_distance_padded = pad_sequence(
        [clip_score_[j] for j in ind.tolist()], batch_first=True)
    # pad_sequence pads with zeros by default, so the row sum is unaffected
    # and dividing by the true length yields the mean.
    sorted_segment_distance = (clip_distance_padded.sum(dim=1) /
                               sorted_clips_per_segment)
    segment_distance = sorted_segment_distance[original_ind]
    # Reference: mean of every segment computed independently.
    gt = torch.tensor([s.sum() / len(s) for s in list_clips_score])
    assert torch.nn.functional.mse_loss(segment_distance, gt) < 1e-9

- Debugging mix of heterogeneous data sources (video and images)

In [None]:
from didemo import DidemoSMCNHeterogeneous

filename = 'data/interim/didemo_yfcc100m/train_data.json'
h5_video = 'data/interim/didemo/resnet152/320x240_max.h5'
h5_img = 'data/interim/yfcc100m/resnet152/320x240_001.h5'
# A single 'rgb' cue backed by two feature files (video and image).
cues = {'rgb': {'file': [h5_video, h5_img]}}
blah = DidemoSMCNHeterogeneous(filename, cues, DEBUG=True)

# Fetch the first sample whose metadata marks it as coming from source 1.
# NOTE(review): source == 1 presumably denotes the image dataset -- confirm.
for i, v in enumerate(blah):
    if blah.metadata[i]['source'] == 1:
        aja = blah[i]
        break

In [None]:
!python loss.py

In [None]:
!python model.py

In [None]:
# This takes a couple of minutes
!python didemo.py

# Bash

Test training loop
```bash
for i in {001..015}; do python train.py --epochs 2 --gpu-id 1 &> $i".log"; done
```

# Sandbox

Always stacking, It's better than scrolling :)

Checking that at least two annotators in DiDeMo agree with tIOU=1.0

Requested by Bryan on Oct 22 to adapt DiDeMo metrics.

In [None]:
import random
import json
import numpy as np
from np_segments_ops import iou as segment_iou

filename = 'data/raw/train_data.json'
# Two annotations count as agreeing when their temporal IoU reaches this.
min_iou = 1.0
with open(filename, 'r') as fid:
    data = json.load(fid)

for moment in data:
    # Transform to seconds.
    # NOTE(review): presumably times are inclusive indices of 5-second
    # chunks, hence the scaling by 5 and the extra 5 on the end -- confirm.
    moment['times'] = np.array(moment['times'])
    moment['times'] *= 5
    moment['times'][:, 1] += 5

    # Compute IoU among annotators and discard pairs below the threshold.
    iou_matrix = segment_iou(moment['times'], moment['times'])
    iou_matrix[iou_matrix < min_iou] = 0
    # A row sum of at least 2 requires a second matching annotation, assuming
    # each annotation matches itself with IoU 1.
    assert np.any(iou_matrix.sum(axis=1) >= 2)

Reduce train/val set for testing

In [None]:
import json
import random

file1 = 'workers/skynet/data/processed/activitynet-captions/train.json'
file2 = 'workers/skynet/data/processed/activitynet-captions/train-reduced.json'
# size of dataset
K = 1280

with open(file1, 'r') as fr, open(file2, 'w') as fw:
    data = json.load(fr)
    # Every entry is copied untouched except 'moments', which is replaced
    # by K random draws (random.choices samples with replacement).
    copy = {
        k: (random.choices(v, k=K) if k == 'moments' else v)
        for k, v in data.items()
    }
    json.dump(copy, fw)

In [None]:
from torch.utils.data import Dataset, DataLoader

class Simple(Dataset):
    """Toy map-style dataset over the integers 0..4.

    Each item is a pair: the doubled value, and a dict holding the raw
    value under 'x' and the value plus five under 'y'.
    """

    def __init__(self):
        self.x = [*range(5)]

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        value = self.x[idx]
        extras = {'x': value, 'y': value + 5}
        return 2 * value, extras
    
# Instantiate the toy dataset and index it once as a smoke test.
data = Simple()
data[0]

# Wrap the dataset in a DataLoader using only default settings.
loader = DataLoader(data)

# Print every batch to inspect how the (int, dict) items come out.
for i in loader:
    print(i)

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math


class NetVLAD(nn.Module):
    """NetVLAD pooling: aggregate a set of descriptors into one vector.

    Args:
        cluster_size: number of clusters.
        feature_size: dimensionality of each input descriptor.
        add_batch_norm: if True, batch-normalize the assignment logits
            before the softmax.

    Attributes:
        out_dim: size of the pooled vector, cluster_size * feature_size.
    """

    def __init__(self, cluster_size, feature_size, add_batch_norm=True):
        super().__init__()
        self.feature_size = feature_size
        self.cluster_size = cluster_size
        self.add_batch_norm = add_batch_norm
        self.out_dim = cluster_size * feature_size
        init_scale = 1 / math.sqrt(feature_size)
        # Projection that produces the soft-assignment logits.
        self.clusters = nn.Parameter(
            init_scale * torch.randn(feature_size, cluster_size))
        # Per-cluster term subtracted from the aggregated descriptors.
        self.clusters2 = nn.Parameter(
            init_scale * torch.randn(1, feature_size, cluster_size))
        if add_batch_norm:
            self.batch_norm = nn.BatchNorm1d(cluster_size)

    def forward(self, x):
        """Pool x of shape (batch, samples, feature_size).

        Returns:
            Tensor of shape (batch, cluster_size * feature_size), L2
            normalized along the last dimension.
        """
        max_sample = x.shape[1]
        # reshape (not view) so non-contiguous inputs are accepted as well.
        x = x.reshape(-1, self.feature_size)
        assignment = torch.matmul(x, self.clusters)

        if self.add_batch_norm:
            assignment = self.batch_norm(assignment)

        assignment = F.softmax(assignment, dim=1)
        assignment = assignment.view(-1, max_sample, self.cluster_size)

        # Total assignment mass of each cluster, scaled by clusters2.
        a_sum = torch.sum(assignment, -2, keepdim=True)
        a = a_sum * self.clusters2
        assignment = assignment.transpose(1, 2)

        x = x.view(-1, max_sample, self.feature_size)
        vlad = torch.matmul(assignment, x)
        vlad = vlad.transpose(1, 2)
        vlad = vlad - a

        # L2 intra norm
        vlad = F.normalize(vlad)

        # flattening + L2 norm
        # The tensor may carry the transposed memory layout, which view()
        # rejects; reshape() copies only when it has to.
        vlad = vlad.reshape(-1, self.cluster_size * self.feature_size)
        vlad = F.normalize(vlad)

        return vlad

class NetRVLAD(nn.Module):
    """NetRVLAD pooling: NetVLAD-style aggregation without the subtraction
    of a per-cluster term.

    Args:
        cluster_size: number of clusters.
        feature_size: dimensionality of each input descriptor.
        add_batch_norm: if True, batch-normalize the assignment logits
            before the softmax.

    Attributes:
        out_dim: size of the pooled vector, cluster_size * feature_size.
    """

    def __init__(self, cluster_size, feature_size, add_batch_norm=True):
        super().__init__()
        self.feature_size = feature_size
        self.cluster_size = cluster_size
        self.add_batch_norm = add_batch_norm
        self.out_dim = cluster_size * feature_size
        # Projection that produces the soft-assignment logits.
        self.clusters = nn.Parameter(
            (1 / math.sqrt(feature_size))
            * torch.randn(feature_size, cluster_size))
        if self.add_batch_norm:
            self.batch_norm = nn.BatchNorm1d(cluster_size)

    def forward(self, x):
        """Pool x of shape (batch, samples, feature_size).

        Returns:
            Tensor of shape (batch, cluster_size * feature_size), L2
            normalized along the last dimension.
        """
        max_sample = x.shape[1]
        # reshape (not view) so non-contiguous inputs are accepted as well.
        x = x.reshape(-1, self.feature_size)
        assignment = torch.matmul(x, self.clusters)

        if self.add_batch_norm:
            assignment = self.batch_norm(assignment)

        assignment = F.softmax(assignment, dim=1)
        assignment = assignment.view(-1, max_sample, self.cluster_size)
        assignment = assignment.transpose(1, 2)

        x = x.view(-1, max_sample, self.feature_size)
        rvlad = torch.matmul(assignment, x)
        rvlad = rvlad.transpose(-1, 1)

        # L2 intra norm
        rvlad = F.normalize(rvlad)

        # flattening + L2 norm
        # The tensor may carry the transposed memory layout, which view()
        # rejects; reshape() copies only when it has to.
        rvlad = rvlad.reshape(-1, self.cluster_size * self.feature_size)
        rvlad = F.normalize(rvlad)

        return rvlad

- Used to check if parameters are changing

```python
for k, v in net.img_encoder.named_parameters(): print(k, v.sum())
for k, v in net.sentence_encoder.named_parameters(): print(k, v.sum())
```

- Checking that we can hash video-name with 8 integers

In [None]:
import hashlib
from didemo import DidemoSMCNRetrieval
from didemo import RetrievalMode
import numpy as np

RGB_FEAT_PATH = 'data/interim/didemo/resnet152/320x240_max.h5'
args = {
    'context': False,
    'loc': False,
    'cues': {'rgb': {'file': RGB_FEAT_PATH}},
}

for subset in ('val', 'test'):
    LIST_PATH = f'data/raw/{subset}_data_wwa.json'
    val_dataset = DidemoSMCNRetrieval(LIST_PATH, **args)
    val_dataset.mode = RetrievalMode.VIDEO_TO_DESCRIPTION
    video_ids = []
    for video_j_data in val_dataset:
        # First field of a sample indexes the per-video metadata table.
        video_id = val_dataset.metada_per_video[video_j_data[0]][0]
        digest = hashlib.sha256(video_id.encode('utf-8')).hexdigest()
        # Keep only the 8 least-significant decimal digits of the digest.
        video_ids.append(int(digest, 16) % 10**8)
    # Equal counts mean no two hashed names collided.
    print(len(video_ids), len(np.unique(video_ids)))

- Pick last step of LSTM

In [None]:
import torch
import torch.nn as nn

# Toy problem dimensions
batch_size, max_length = 4, 3
hidden_size, n_layers, input_dim = 2, 1, 1
batch_first = True

# Four sequences, zero-padded on the right up to max_length
vec_1 = torch.FloatTensor([[1, 2, 3]])
vec_2 = torch.FloatTensor([[1, 2, 0]])
vec_3 = torch.FloatTensor([[1, 0, 0]])
vec_4 = torch.FloatTensor([[2, 0, 0]])

# Stack them into one tensor of shape (batch_size, max_length, input_dim)
batch_in = torch.cat((vec_1, vec_2, vec_3, vec_4)).unsqueeze(-1)

# Number of valid (non-padding) steps of each sequence
lengths = torch.LongTensor([3, 2, 1, 1])

# Pack so the RNN skips the padded steps
packed_input = torch.nn.utils.rnn.pack_padded_sequence(
    batch_in, lengths, batch_first=batch_first)

rnn = nn.RNN(input_dim, hidden_size, n_layers, batch_first=batch_first)
packed_output, _ = rnn(packed_input)

# Back to a regular padded tensor
output, _ = torch.nn.utils.rnn.pad_packed_sequence(
    packed_output, batch_first=batch_first)
print("Unpacked, padded output: ")
print(output)
# For every sequence, pick the output at its last valid time step
last_step = output[range(batch_size), lengths - 1]
print(last_step)

- "empirical" evidence that sort indices return the correct array

Huge credits to @ModarTensai who came up with the idea of sorting indices faster than @escorciav could find the solution on Google.

In [None]:
import torch
# Empirically verify that sorting the permutation returned by a sort yields
# the indices that undo it.
for i in range(100000):
    a = torch.rand(100)
    b, ind = a.sort(descending=True)
    # Positions of 0, 1, 2, ... inside `ind`, i.e. the inverse permutation
    ind2 = ind.sort(descending=False)[1]
    assert torch.equal(b[ind2], a)

- Making sure my IOU does what it should

In [None]:
from np_segments_ops import iou

def random_segments(n):
    """Draw ``n`` random segments with endpoints in [0, 1).

    Returns a float32 array of shape (n, 2): column 0 is the start and
    column 1 the end of each segment, with start <= end.
    """
    endpoints = np.random.rand(n, 2).astype(np.float32)
    # Ordering each row puts the smaller endpoint first.
    return np.sort(endpoints, axis=1)

# Two independent sets of random segments with different sizes.
N, M = 50, 75
a = random_segments(N)
b = random_segments(M)
# Plain-Python copies consumed by the reference implementation.
a_list = a.tolist()
b_list = b.tolist()
# Buffer for the reference pairwise IoU: one row per segment of `a`,
# one column per segment of `b`.
gt = np.empty((N, M))
def python_iou(s_pred, s_gt):
    """Temporal IoU of two ``(start, end)`` segments.

    Adapted from the DiDeMo evaluation code. The denominator is the extent
    covered from the earliest start to the latest end of both segments.
    """
    latest_start = max(s_pred[0], s_gt[0])
    earliest_end = min(s_pred[1], s_gt[1])
    # Disjoint segments give a negative difference, clamped to zero.
    overlap = max(0, earliest_end - latest_start)
    extent = max(s_pred[1], s_gt[1]) - min(s_pred[0], s_gt[0])
    return overlap / extent
# Fill the reference matrix one row at a time with the pure-Python IoU,
# then compare it against the vectorized implementation.
for i, a_i in enumerate(a_list):
    gt[i, :] = [python_iou(a_i, b_j) for b_j in b_list]
np.testing.assert_array_almost_equal(iou(a, b), gt)

- Making sure evaluation is equivalent

In [None]:
import random
import torch
from np_segments_ops import torch_iou

def approach1(true_segments, pred_segments, k_iou):
    """Hit flags per (top_k, IoU threshold) pair from the best top-k IoU.

    Args:
        true_segments: reference segments passed to ``torch_iou``.
        pred_segments: predicted segments passed to ``torch_iou``.
        k_iou: sequence of ``(top_k, iou_threshold)`` pairs; the ``top_k``
            of the last pair decides whether the IoU rows get tiled.

    Returns:
        List with one boolean tensor per pair in ``k_iou``: whether the
        maximum IoU over the first ``top_k`` rows reaches the threshold.
        NOTE(review): assumes ``torch_iou`` returns one row per prediction
        and one column per reference segment -- confirm.
    """
    iou_matrix = torch_iou(pred_segments, true_segments)
    num_rows = iou_matrix.shape[0]
    largest_k = k_iou[-1][0]
    if num_rows < largest_k:
        # Fewer rows than the largest k: tile the rows
        iou_matrix = iou_matrix.repeat(round(largest_k / num_rows), 1)
    return [
        iou_matrix[:top_k, :].max(dim=0)[0] >= iou_threshold
        for top_k, iou_threshold in k_iou
    ]

def approach2(true_segments, pred_segments, iou_thresholds, topk):
    """Hit flags per (IoU threshold, k) pair from the rank of the first hit.

    A row of the IoU matrix counts as a hit when it reaches the threshold
    against at least two reference segments (one if only a single reference
    segment is given).

    Args:
        true_segments: reference segments passed to ``torch_iou``.
        pred_segments: predicted segments passed to ``torch_iou``.
        iou_thresholds: sequence of IoU thresholds.
        topk: tensor with the values of k.

    Returns:
        1-D tensor of length ``len(iou_thresholds) * len(topk)`` laid out
        threshold-major; an entry is 1 when the first hit ranks within k,
        otherwise 0.
    """
    required_votes = 1 if len(true_segments) == 1 else 2
    num_k = len(topk)
    iou_matrix = torch_iou(pred_segments, true_segments)
    # TODO: check type
    hit_k_iou = torch.empty(len(iou_thresholds) * num_k,
                            dtype=iou_matrix.dtype,
                            device=iou_matrix.device)
    for row, threshold in enumerate(iou_thresholds):
        votes = (iou_matrix >= threshold).sum(dim=1)
        hit_positions = (votes >= required_votes).nonzero()
        block = slice(row * num_k, (row + 1) * num_k)
        if len(hit_positions) > 0:
            # 0-indexed -> +1
            hit_k_iou[block] = topk >= (hit_positions[0] + 1)
        else:
            hit_k_iou[block] = 0
    return hit_k_iou

def random_segments(n):
    """Draw ``n`` random segments with endpoints in [0, 1).

    Returns a tensor of shape (n, 2): column 0 is the start and column 1
    the end of each segment, with start <= end.
    """
    endpoints = torch.rand(n, 2)
    # Ordering each row puts the smaller endpoint first.
    return endpoints.sort(dim=1)[0]

# Settings shared by both experiments
k_iou = [(1, 0.5), (5, 0.5), (1, 0.7), (5, 0.7)]
ious = [0.5, 0.7]
topk = torch.tensor([1, 5])

# Single reference segment: both approaches must produce the same hits
M = 1
for i in range(100):
    N = random.randint(3, 15)
    x1 = random_segments(M)
    x2 = random_segments(N)
    gt = torch.cat(approach1(x1, x2, k_iou)).float()
    pred = approach2(x1, x2, ious, topk).float()
    assert torch.nn.functional.mse_loss(pred, gt) < 1e-9

# Random number of reference segments: only checks that approach2 runs
for i in range(1000):
    M = random.randint(1, 10)
    N = random.randint(3, 15)
    x1 = random_segments(M)
    x2 = random_segments(N)
    pred = approach2(x1, x2, ious, topk).float()

- Get MD5SUM

In [None]:
import json
# Path templates: reference files and the newly generated ones.
gt_fmt = 'data/processed/didemo/{}.json'
file_fmt = 'data/interim/didemo/{}.json'
for i in ['train', 'test', 'val']:
    # Only edit metadata-related to tracking
    # if u over-write everything, of course they will match!
    new_file = file_fmt.format(i)
    # Load both the reference file and the new file for this subset.
    with open(gt_fmt.format(i), 'r') as fr, open(new_file, 'r') as fn:
        data_r = json.load(fr)
        data_n = json.load(fn)
        # Copy the 'date' and 'git_hash' fields from the reference so they
        # cannot be the cause of a checksum mismatch.
        data_n['date'] = data_r['date']
        data_n['git_hash'] = data_r['git_hash']
    # Overwrite the new file in place with the aligned fields.
    with open(new_file, 'w') as fn:
        json.dump(data_n, fn)
    # TODO: automate comparison
    # IPython shell escape: run md5sum on the rewritten file.
    !md5sum $new_file

- Transform warnings into errors, useful to debug zero division

  ```python
  import warnings
  warnings.filterwarnings("error")
  ```

## Crude ideas

1. Feature engineering

    - Images

        - resnet 152/101

        - facenet

    - Text

        - word2vec

        - openai language model

        - skipthought

    - audio

        - VGGs from google

        - AudioNet Vondrick and others

    - Video

        - I3D

        - NLN

1. LSTM with variable sequence length

    - query features
    
    - unroll until the end and predict sequence length
    
    - 0/1 representing time to match