# Training Collaborative Experts on MSR-VTT
This notebook shows how to download code that trains a modified Collaborative Experts model with BERT + CLS + NetVLAD on the MSR-VTT Dataset.


## Setup

*   Download Code and Dependencies
*   Import Modules
*   Download Language Model Weights
*   Download Datasets
*   Generate Encodings for Dataset Captions 



### Code Downloading and Dependency Downloading
*   Specify tensorflow version
*   Clone repository from Github
*   `cd` into the correct directory
*   Install the requirements




In [None]:
# Line magic (Colab-style) that selects TensorFlow 2.x; it runs before TensorFlow is imported below.
%tensorflow_version 2.x

In [None]:
# Fetch the project sources; the following cells run from inside this checkout.
!git clone https://github.com/googleinterns/via-content-understanding.git

In [None]:
# Work from the videoretrieval directory so the relative requirements.txt path
# (and presumably the project-local imports used later) resolve.
%cd via-content-understanding/videoretrieval/

In [None]:
# Install the dependencies listed in the repository's requirements.txt.
!pip install -r requirements.txt

In [None]:
# Upgrade tensorflow_addons, which supplies the mish activation imported later in this notebook.
!pip install --upgrade tensorflow_addons

### Importing Modules

In [None]:
import tensorflow as tf
import languagemodels
import train.encoder_datasets
import train.language_model
import experts
import datasets
import datasets.msrvtt.constants
import os
import models.components
import models.encoder
import helper.precomputed_features
from tensorflow_addons.activations import mish  
import tensorflow_addons as tfa
import metrics.loss

### Language Model Downloading

*   Download BERT



In [None]:
# Instantiate the project's BERT wrapper (presumably this is where the weights
# are downloaded, per the section heading — confirm in languagemodels).
bert_model = languagemodels.BERTModel()

### Dataset downloading


*   Download Datasets
*   Download Precomputed Features



In [None]:
# Download the MSR-VTT dataset through the project's dataset wrapper.
datasets.msrvtt_dataset.download_dataset()

Note: The system `curl` is more memory-efficient than the download function in our codebase, so `curl` is used here instead.

In [None]:
import subprocess

# Source URL and local destination of the precomputed-features archive.
url = datasets.msrvtt.constants.features_tar_url
path = datasets.msrvtt.constants.features_tar_path

# Invoke curl directly with an argument list instead of a shell command string:
# characters such as `&` or `?` in the URL are then passed through untouched.
# `--fail` makes curl exit non-zero on HTTP errors rather than saving the error
# page as the archive, and `check=True` turns any non-zero exit into an
# exception so a failed download stops the notebook here, not in a later cell.
subprocess.run(
    ["curl", "--fail", "--output", str(path), str(url)],
    check=True,
)

In [None]:
# Cache the precomputed expert features for MSR-VTT. Arguments: the dataset
# wrapper, the expert-to-features constant, and the path of the tar archive
# downloaded in the previous cell.
helper.precomputed_features.cache_features(
    datasets.msrvtt_dataset,
    datasets.msrvtt.constants.expert_to_features,
    datasets.msrvtt.constants.features_tar_path,)

### Encoding Generation

* Generate Encodings for MSR-VTT

In [None]:
# Generate and cache the caption encodings for MSR-VTT using the BERT model
# created earlier in this notebook.
train.language_model.generate_and_cache_encodings(
    bert_model, datasets.msrvtt_dataset)

## Training


*  Build Train Datasets
*  Initialize Models
*  Compile Encoders
*  Fit Model
* Test Model


### Datasets Generation

In [None]:
# Experts used throughout the notebook: the list is passed to the dataset
# builder and its length sizes both the text and the video encoder.
# NOTE(review): the order appears to matter — the per-expert NetVLAD lists given
# to the video encoder have one entry per element here; keep them in step.
experts_used = [
    experts.i3d,
    experts.r2p1d,
    experts.resnext,
    experts.senet,
    experts.speech_expert,
    experts.ocr_expert,
    experts.audio_expert,
    experts.densenet,
    experts.face_expert,
]

In [None]:
# Build the three datasets used for language-model fine-tuning; the call
# returns them in train / validation / test order as unpacked here.
train_ds, valid_ds, test_ds = (
    train.encoder_datasets.generate_language_model_fine_tuning_datasets(
        bert_model, datasets.msrvtt_dataset, experts_used))

### Model Initialization

In [None]:
class MishLayer(tf.keras.layers.Layer):
    """Keras layer wrapper around the ``mish`` activation function.

    The class itself (not an instance) is handed to the video encoder as its
    activation-layer factory, so it takes no constructor arguments.
    """

    def call(self, inputs):
        """Apply ``mish`` to ``inputs`` and return the result."""
        activated = mish(inputs)
        return activated

In [None]:
# Smoke test: apply mish to a one-element variable; the notebook displays the result.
mish(tf.Variable([1.0]))

In [None]:
# Build the text-side encoder. The first argument is len(experts_used), so it
# stays in step with the expert list.
# NOTE(review): 768 presumably matches the BERT hidden size, and 512 matches
# the encoded_expert_dimensionality given to the video encoder — confirm.
text_encoder = models.components.TextEncoder(
    len(experts_used),
    num_netvlad_clusters=28,
    ghost_clusters=1,
    language_model_dimensionality=768,
    encoded_expert_dimensionality=512,
    residual_cls_token=True,
)

In [None]:
# Build the video-side encoder over the same number of experts as the text encoder.
# NOTE(review): `experts_use_netvlad` and `experts_netvlad_shape` each have one
# entry per element of `experts_used` and appear to be positional. Only entries
# 4-6 (the speech, OCR and audio experts in that list) enable NetVLAD and carry
# a non-None shape. Keep all three lists in the same order when editing.
video_encoder = models.components.VideoEncoder(
    num_experts=len(experts_used),
    experts_use_netvlad=[False, False, False, False, True, True, True, False, False],
    experts_netvlad_shape=[None, None, None, None, 19, 43, 8, None, None],
    expert_aggregated_size=512,
    encoded_expert_dimensionality=512,
    g_mlp_layers=3,
    h_mlp_layers=0,
    make_activation_layer=MishLayer)

In [None]:
# Combine both encoders and the underlying BERT model into the trainable model.
# The meaning of the positional literals is not visible in this notebook; the
# inline notes are assumptions to confirm against the constructor signature.
encoder = models.encoder.EncoderForLanguageModelTuning(
    video_encoder,
    text_encoder,
    0.05,  # presumably the ranking-loss margin — TODO confirm
    [1, 5, 10, 50],  # presumably recall@k cut-offs (same values as reported in the Tests section)
    20,  # presumably captions per video (same value as captions_per_video in the Tests section)
    bert_model.model,
    64)  # presumably the language-model batch size — TODO confirm

### Encoder Compilation

In [None]:
def build_optimizer(lr=0.001):
    """Return an Adam optimizer driven by a stepwise exponential-decay schedule.

    Args:
        lr: Initial learning rate of the schedule.

    Returns:
        A ``tf.keras.optimizers.Adam`` instance whose learning rate starts at
        ``lr`` and is scaled by 0.95 every 1000 steps (``staircase=True``
        applies the decay in discrete steps rather than continuously).
    """
    schedule = tf.keras.optimizers.schedules.ExponentialDecay(
        lr, decay_steps=1000, decay_rate=0.95, staircase=True)
    return tf.keras.optimizers.Adam(learning_rate=schedule)

In [None]:
# Compile with an Adam optimizer starting at a learning rate of 5e-5 and the
# bidirectional max-margin ranking loss.
encoder.compile(build_optimizer(5e-5), metrics.loss.bidirectional_max_margin_ranking_loss)

In [None]:
# Training pipeline: shuffle individual examples with a 7000-element buffer,
# form fixed-size batches of 32, then prefetch so input preparation overlaps
# with the training step.
train_ds_prepared = (train_ds
  .shuffle(7000)
  .batch(32, drop_remainder=True)
  .prefetch(tf.data.experimental.AUTOTUNE))

# Validation pipeline: batch first, cache the finished batches so later passes
# reuse them, and prefetch last so the prefetch buffer holds ready-to-use
# batches. The elements produced are the same as with the previous ordering
# (prefetch -> batch -> cache); only the stage order changed.
# NOTE(review): 497 * 20 is presumably "validation videos x captions per video",
# i.e. the whole validation set as one batch — confirm.
valid_ds_prepared = (valid_ds
  .batch(497 * 20, drop_remainder=True)
  .cache()
  .prefetch(tf.data.experimental.AUTOTUNE))

In [None]:
# Mark all three sub-models as trainable, so the language model is updated
# together with the video and text encoders.
for component_name in ("language_model", "video_encoder", "text_encoder"):
    getattr(encoder, component_name).trainable = True

### Model fitting

In [None]:
# Train for 250 epochs on the batched training pipeline.
encoder.fit(
    train_ds_prepared,
    # NOTE: validation is currently disabled; uncomment the next line to also
    # evaluate on valid_ds_prepared during training.
    #validation_data=valid_ds_prepared,
    epochs=250,
)

### Tests

In [None]:
# Number of shards the test set is split into below, one per caption index.
captions_per_video = 20
# Batch size used when evaluating a shard; presumably larger than any shard so
# that a single batch holds the whole shard — confirm against the test-set size.
num_videos_upper_bound = 100000 

In [None]:
# `metrics.rankings` is used below but is not among the notebook's imports
# (only `metrics.loss` is); import it explicitly rather than relying on some
# other module having loaded it as a side effect.
import metrics.rankings

# Ranks collected across all caption shards of the test set.
ranks = []

for caption_index in range(captions_per_video):
    # Shard `caption_index` keeps every captions_per_video-th test example
    # starting at that offset (presumably one caption per video — confirm the
    # dataset ordering).
    caption_shard = test_ds.shard(captions_per_video, caption_index)
    # Only the first batch is evaluated; the batch size is an upper bound
    # intended to make that batch cover the entire shard.
    batch = next(iter(caption_shard.batch(num_videos_upper_bound)))

    video_embeddings, text_embeddings, mixture_weights = encoder.forward_pass(
        batch, training=False)

    # The last element of the batch is passed through to the similarity
    # computation (presumably the missing-experts mask — TODO confirm).
    similarity_matrix = metrics.loss.build_similarity_matrix(
        video_embeddings,
        text_embeddings,
        mixture_weights,
        batch[-1])
    rankings = metrics.rankings.compute_ranks(similarity_matrix)
    ranks.extend(rankings.numpy())

In [None]:
def recall_at_k(ranks, k):
    """Return the fraction of ``ranks`` that are less than or equal to ``k``.

    Args:
        ranks: Sized iterable of ranks.
        k: Inclusive rank cut-off.

    Returns:
        Number of ranks within the cut-off divided by ``len(ranks)``.
    """
    hits = sum(1 for rank in ranks if rank <= k)
    return hits / len(ranks)

In [None]:
import statistics

# True median of the collected ranks. Indexing the sorted list at len // 2
# returns the upper of the two middle elements when the number of ranks is
# even; statistics.median averages them instead.
median_rank = statistics.median(ranks)

In [None]:
# Arithmetic mean of all collected ranks.
mean_rank = sum(ranks)/len(ranks)

In [None]:
# Report the median of the collected ranks.
print(f"Median Rank: {median_rank}")

In [None]:
# Report the mean of the collected ranks.
print(f"Mean Rank: {mean_rank}")

In [None]:
# Report recall at each cut-off: the share of collected ranks that are <= k.
for k in (1, 5, 10, 50):
    recall = recall_at_k(ranks, k)
    print(f"R@{k}: {recall}")