# Unit-test (kinda)

In [None]:
from didemo import DidemoSMCNHeterogeneous

# Smoke test for DidemoSMCNHeterogeneous: build the dataset with two feature
# files registered under the 'rgb' cue, then fetch the first item whose
# metadata has source == 1.
filename = 'data/interim/didemo_yfcc100m/train_data.json'
h5_video = 'data/interim/didemo/resnet152/320x240_max.h5'
h5_img = 'data/interim/yfcc100m/resnet152/320x240_001.h5'
cues = {'rgb': {'file': [h5_video, h5_img]}}
blah = DidemoSMCNHeterogeneous(filename, cues, DEBUG=True)

# NOTE(review): source == 1 presumably marks samples backed by the second
# feature file (h5_img) -- confirm against didemo.py.
for i, v in enumerate(blah):
    if blah.metadata[i]['source'] == 1:
        # Keep the matching sample around for interactive inspection.
        aja = blah[i]
        break

In [None]:
!python loss.py

In [None]:
!python model.py

In [None]:
# This takes a couple of minutes
!python didemo.py

# Bash

Test training loop
```bash
# Launch 15 short training runs back to back; stdout and stderr of each run
# go to its own zero-padded log file (001.log ... 015.log).
for run_id in {001..015}; do
    python train.py --epochs 2 --gpu-id 1 &> "${run_id}.log"
done
```

# Sandbox

Always stacking; it's better than scrolling :)

In [None]:
from torch.utils.data import Dataset, DataLoader

class Simple(Dataset):
    """Toy dataset over the integers 0..4.

    Each item is a pair ``(2 * x, {'x': x, 'y': x + 5})``; handy for seeing
    how a ``DataLoader`` collates a tuple that contains a dict.
    """

    def __init__(self):
        # Backing store: [0, 1, 2, 3, 4].
        self.x = [*range(5)]

    def __len__(self):
        """Return the number of stored values."""
        return len(self.x)

    def __getitem__(self, idx):
        """Return ``(doubled value, {'x': value, 'y': value + 5})`` for ``idx``."""
        value = self.x[idx]
        doubled = value * 2
        extras = {'x': value, 'y': value + 5}
        return doubled, extras
    
# Build the toy dataset and index it once directly.
data = Simple()
data[0]

# Wrap it in a DataLoader with default settings and print every batch to see
# how the (value, dict) items come out after collation.
loader = DataLoader(data)

for batch in loader:
    print(batch)

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math


class NetVLAD(nn.Module):
    """NetVLAD pooling layer.

    Softly assigns every input descriptor to ``cluster_size`` learned
    clusters, accumulates per-cluster residuals (assignment-weighted sum of
    descriptors minus the assignment-weighted cluster center) and returns one
    L2-normalized vector per batch element.

    Args:
        cluster_size: number of clusters (K).
        feature_size: dimensionality of each input descriptor (D).
        add_batch_norm: if True, apply ``BatchNorm1d`` to the assignment
            logits before the softmax.

    Attributes:
        out_dim: size of the pooled vector, ``cluster_size * feature_size``.
    """

    def __init__(self, cluster_size, feature_size, add_batch_norm=True):
        super(NetVLAD, self).__init__()
        self.feature_size = feature_size
        self.cluster_size = cluster_size
        self.add_batch_norm = add_batch_norm
        self.out_dim = cluster_size * feature_size
        init_scale = 1 / math.sqrt(feature_size)
        # Projection producing the soft-assignment logits, shape (D, K).
        self.clusters = nn.Parameter(
            init_scale * torch.randn(feature_size, cluster_size))
        # Cluster centers subtracted from the aggregate, shape (1, D, K).
        self.clusters2 = nn.Parameter(
            init_scale * torch.randn(1, feature_size, cluster_size))
        if add_batch_norm:
            self.batch_norm = nn.BatchNorm1d(cluster_size)

    def forward(self, x):
        """Pool ``x`` of shape (batch, n_samples, feature_size).

        Returns:
            Tensor of shape (batch, cluster_size * feature_size) with unit
            L2 norm along the last dimension.
        """
        max_sample = x.shape[1]
        # reshape (not view) so non-contiguous inputs are accepted as well.
        x = x.reshape(-1, self.feature_size)            # (B*N, D)
        assignment = torch.matmul(x, self.clusters)     # (B*N, K)

        if self.add_batch_norm:
            assignment = self.batch_norm(assignment)

        assignment = F.softmax(assignment, dim=1)
        assignment = assignment.reshape(-1, max_sample, self.cluster_size)

        # Total assignment mass per cluster, times the cluster centers.
        a_sum = torch.sum(assignment, -2, keepdim=True)  # (B, 1, K)
        a = a_sum * self.clusters2                       # (B, D, K)
        assignment = assignment.transpose(1, 2)          # (B, K, N)

        x = x.reshape(-1, max_sample, self.feature_size)  # (B, N, D)
        vlad = torch.matmul(assignment, x)               # (B, K, D)
        vlad = vlad.transpose(1, 2)                      # (B, D, K)
        vlad = vlad - a

        # L2 intra norm: normalize each cluster's residual over D.
        vlad = F.normalize(vlad, dim=1)

        # flattening + L2 norm
        vlad = vlad.reshape(-1, self.cluster_size * self.feature_size)
        vlad = F.normalize(vlad, dim=1)

        return vlad

class NetRVLAD(nn.Module):
    """Pooling layer that aggregates soft-assigned descriptors per cluster.

    Every input descriptor is softly assigned to ``cluster_size`` learned
    clusters and the assignment-weighted descriptors are summed per cluster;
    no cluster center is subtracted. The result is L2-normalized per cluster,
    flattened and L2-normalized again.

    Args:
        cluster_size: number of clusters (K).
        feature_size: dimensionality of each input descriptor (D).
        add_batch_norm: if True, apply ``BatchNorm1d`` to the assignment
            logits before the softmax.

    Attributes:
        out_dim: size of the pooled vector, ``cluster_size * feature_size``.
    """

    def __init__(self, cluster_size, feature_size, add_batch_norm=True):
        super(NetRVLAD, self).__init__()
        self.feature_size = feature_size
        self.cluster_size = cluster_size
        self.add_batch_norm = add_batch_norm
        self.out_dim = cluster_size * feature_size
        # Projection producing the soft-assignment logits, shape (D, K).
        self.clusters = nn.Parameter(
            (1 / math.sqrt(feature_size)) * torch.randn(feature_size, cluster_size))
        if self.add_batch_norm:
            self.batch_norm = nn.BatchNorm1d(cluster_size)

    def forward(self, x):
        """Pool ``x`` of shape (batch, n_samples, feature_size).

        Returns:
            Tensor of shape (batch, cluster_size * feature_size) with unit
            L2 norm along the last dimension.
        """
        max_sample = x.shape[1]
        # reshape (not view) so non-contiguous inputs are accepted as well.
        x = x.reshape(-1, self.feature_size)            # (B*N, D)
        assignment = torch.matmul(x, self.clusters)     # (B*N, K)

        if self.add_batch_norm:
            assignment = self.batch_norm(assignment)

        assignment = F.softmax(assignment, dim=1)
        assignment = assignment.reshape(-1, max_sample, self.cluster_size)
        assignment = assignment.transpose(1, 2)          # (B, K, N)

        x = x.reshape(-1, max_sample, self.feature_size)  # (B, N, D)
        rvlad = torch.matmul(assignment, x)              # (B, K, D)
        rvlad = rvlad.transpose(-1, 1)                   # (B, D, K)

        # L2 intra norm: normalize each cluster's aggregate over D.
        rvlad = F.normalize(rvlad, dim=1)

        # flattening + L2 norm; the transposed tensor may be non-contiguous,
        # so reshape is required here rather than view.
        rvlad = rvlad.reshape(-1, self.cluster_size * self.feature_size)
        rvlad = F.normalize(rvlad, dim=1)

        return rvlad

- Used to check whether the parameters are changing

```python
for k, v in net.img_encoder.named_parameters(): print(k, v.sum())
for k, v in net.sentence_encoder.named_parameters(): print(k, v.sum())
```

- Checking that we can hash video names into 8-digit integers without collisions

In [None]:
import hashlib
from didemo import DidemoSMCNRetrieval
from didemo import RetrievalMode
import numpy as np

RGB_FEAT_PATH = 'data/interim/didemo/resnet152/320x240_max.h5'
args = {
    'context': False,
    'loc': False,
    'cues': {'rgb': {'file': RGB_FEAT_PATH}},
}


def _video_id_to_int(video_id):
    """Map a video-id string to an integer below 10**8 via its SHA-256 digest."""
    digest = hashlib.sha256(video_id.encode('utf-8')).hexdigest()
    return int(digest, 16) % 10**8


# For each subset, hash every video id and print how many ids were hashed next
# to how many distinct hashes came out; equal numbers mean no collisions.
for subset in ('val', 'test'):
    LIST_PATH = f'data/raw/{subset}_data_wwa.json'
    val_dataset = DidemoSMCNRetrieval(LIST_PATH, **args)
    val_dataset.mode = RetrievalMode.VIDEO_TO_DESCRIPTION
    # The first element of each dataset item is used as the video index.
    # NOTE(review): `metada_per_video` looks like a misspelling of
    # `metadata_per_video`; kept as-is since the attribute lives in didemo.py.
    video_ids = [
        _video_id_to_int(val_dataset.metada_per_video[item[0]][0])
        for item in val_dataset
    ]
    print(len(video_ids), len(np.unique(video_ids)))

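- Pick the last valid step of a recurrent net's output (demonstrated with `nn.RNN`; intended for the LSTM) on padded, variable-length sequences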

In [None]:
import torch
import torch.nn as nn

# Toy configuration: 4 sequences, padded to length 3, scalar inputs.
batch_size = 4
max_length = 3
hidden_size = 2
n_layers = 1
input_dim = 1
batch_first = True

# Zero-padded sequences, one row per example, ordered by decreasing length
# (pack_padded_sequence expects sorted lengths by default).
padded_rows = [
    [1, 2, 3],
    [1, 2, 0],
    [1, 0, 0],
    [2, 0, 0],
]

# Shape: (batch_size, max_length, input_dim)
batch_in = torch.FloatTensor(padded_rows).unsqueeze(-1)

# True (unpadded) length of each example in the batch.
lengths = torch.LongTensor([3, 2, 1, 1])

rnn_utils = torch.nn.utils.rnn

# Pack the padded batch so the RNN skips the padding steps.
packed_input = rnn_utils.pack_padded_sequence(
    batch_in, lengths, batch_first=batch_first)

# Recurrent layer configured for batch-first tensors.
rnn = nn.RNN(input_dim, hidden_size, n_layers, batch_first=batch_first)

# Forward pass on the packed batch; the final hidden state is discarded.
packed_output, _ = rnn(packed_input)

# Back to a padded tensor of shape (batch_size, max_length, hidden_size).
output, _ = rnn_utils.pad_packed_sequence(
    packed_output, batch_first=batch_first)
print("Unpacked, padded output: ")
print(output)

# For every example pick the output at its last real time step (length - 1).
last_step = output[torch.arange(batch_size), lengths - 1]
print(last_step)

- Turn warnings into errors; useful for debugging division by zero

  ```python
  import warnings
  warnings.filterwarnings("error")
  ```

## Crude ideas

1. Feature engineering

    - Images

        - resnet 152/101

        - facenet

    - Text

        - word2vec

        - openai language model

        - skipthought

    - audio

        - VGGs from google

        - AudioNet Vondrick and others

    - Video

        - I3D

        - NLN

1. LSTM with variable sequence length

    - query features

    - 0/1 representing time to match

1. Test the approach on another dataset

    - Charades or ActivityNet