<a href="https://colab.research.google.com/github/cosmo3769/s3d-mil-nce/blob/main/notebooks/s3d_mil_nce.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
print(f'Installing Weights and Biases')
!pip install -qq --upgrade wandb

Installing Weights and Biases
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.3/207.3 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m277.3/277.3 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.7/62.7 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import os
import cv2
import math
import wandb
import numpy as np
import tensorflow_hub as hub
import tensorflow.compat.v2 as tf

from IPython import display

In [3]:
# Authenticate with Weights & Biases; prompts for an API key if none is stored.
wandb.login()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [4]:
# Load the model once from TF-Hub.
# `hub_model` is passed to generate_embeddings() by a later cell, so the
# load happens a single time here.
hub_handle = 'https://tfhub.dev/deepmind/mil-nce/s3d/1'
hub_model = hub.load(hub_handle)

def generate_embeddings(model, input_frames, input_words):
  """Embed a batch of videos and a batch of text queries with `model`.

  Args:
    model: Loaded model exposing 'video' and 'text' entries in `signatures`.
    input_frames: Video frames normalized to [0, 1] and shaped
      Batch x T x H x W x 3. They are cast to float32 before inference.
    input_words: Text queries; converted to a constant tensor.

  Returns:
    A `(video_embedding, text_embedding)` tuple read from the outputs of the
    two signatures.
  """
  # Video branch.
  frames_tensor = tf.constant(tf.cast(input_frames, dtype=tf.float32))
  video_result = model.signatures['video'](frames_tensor)

  # Text branch.
  words_tensor = tf.constant(input_words)
  text_result = model.signatures['text'](words_tensor)

  video_embedding = video_result['video_embedding']
  text_embedding = text_result['text_embedding']
  return video_embedding, text_embedding

In [5]:
def crop_center_square(frame):
  """Return the largest centered square region of `frame`.

  The first two axes of `frame` are treated as (rows, columns); any further
  axes (e.g. color channels) are left untouched.
  """
  height, width = frame.shape[:2]
  side = min(height, width)
  top = height // 2 - side // 2
  left = width // 2 - side // 2
  return frame[top:top + side, left:left + side]

In [6]:
def load_video(video_url, max_frames=32, resize=(224, 224)):
  """Download a video and return a fixed-length clip of preprocessed frames.

  Each decoded frame is center-cropped to a square, resized to `resize`,
  converted from OpenCV's BGR channel order to RGB, and finally scaled from
  [0, 255] to [0, 1].

  Args:
    video_url: URL of the video/GIF. It is fetched with
      `tf.keras.utils.get_file`, using the last 128 characters of the URL's
      base name as the local file name.
    max_frames: Number of frames in the returned clip. Longer videos are
      truncated; shorter ones are padded by repeating frames.
    resize: Target size passed to `cv2.resize` for every frame.

  Returns:
    A float NumPy array with `max_frames` frames along axis 0 and values
    in [0, 1].

  Raises:
    ValueError: If no frame could be decoded from the downloaded file.
  """
  path = tf.keras.utils.get_file(os.path.basename(video_url)[-128:], video_url)
  cap = cv2.VideoCapture(path)
  frames = []
  try:
    while True:
      ret, frame = cap.read()
      if not ret:
        break
      frame = crop_center_square(frame)
      frame = cv2.resize(frame, resize)
      # Reverse the channel axis: BGR -> RGB.
      frame = frame[:, :, [2, 1, 0]]
      frames.append(frame)

      if len(frames) == max_frames:
        break
  finally:
    cap.release()

  # Fail with a clear message instead of the ZeroDivisionError that the
  # padding arithmetic below would otherwise raise for an unreadable file.
  if not frames:
    raise ValueError(f'Could not decode any frames from {video_url!r}')

  frames = np.array(frames)
  if len(frames) < max_frames:
    # Pad short clips: every frame is repeated `n_repeat` times back-to-back,
    # then the result is truncated to `max_frames` below.
    n_repeat = int(math.ceil(max_frames / float(len(frames))))
    frames = frames.repeat(n_repeat, axis=0)
  frames = frames[:max_frames]
  frames = frames / 255.0
  return frames

In [7]:
def display_video(urls):
    """Render the given image/GIF URLs side by side in a one-row HTML table.

    The header row is generated from the number of URLs ("Video 1",
    "Video 2", ...), so any number of videos is labelled correctly.

    Args:
        urls: Iterable of URLs; each is shown as an `<img>` 224 pixels tall.

    Returns:
        A `display.HTML` object wrapping the generated table markup.
    """
    urls = list(urls)
    header_cells = ''.join(
        '<th>Video {}</th>'.format(position)
        for position in range(1, len(urls) + 1))
    image_cells = ''.join(
        '<td><img src="{}" height="224"></td>'.format(url) for url in urls)
    html = '<table><tr>{}</tr><tr>{}</tr></table>'.format(
        header_cells, image_cells)
    return display.HTML(html)

In [8]:
def display_query_and_results_video(query, urls, scores):
  """Build the HTML for a text query and its videos ranked by score.

  Works for any number of videos: one "Rank #k" header cell and one image
  cell are emitted per score, ordered from highest to lowest score.

  Args:
    query: Text query shown in the heading.
    urls: Sequence of video URLs; `urls[i]` corresponds to `scores[i]`.
    scores: Array-like with one similarity score per URL.

  Returns:
    An HTML string containing the heading and the ranked results table.
  """
  scores = np.asarray(scores)
  # Negate so that argsort's ascending order yields highest score first.
  sorted_ix = np.argsort(-scores)

  header_cells = ''.join(
      '<th>Rank #{}, Score:{:.2f}</th>'.format(rank, scores[ix])
      for rank, ix in enumerate(sorted_ix, start=1))
  video_cells = ''.join(
      '<td><img src="{}" height="224"></td>'.format(urls[ix])
      for ix in sorted_ix)

  return (
      '<h2>Input query: <i>{}</i> </h2><div>'.format(query)
      + 'Results: <div>'
      + '<table><tr>' + header_cells + '</tr><tr>' + video_cells
      + '</tr></table>'
      # Close both wrappers so consecutive result blocks do not nest.
      + '</div></div>')

In [11]:
import wandb
import requests
import os

def download_video(url, filename, timeout=30):
    """Download `url` and save the response body to `filename`.

    Args:
        url: Address of the file to fetch.
        filename: Local path the downloaded bytes are written to.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        True if the server answered with status 200 and the file was
        written; False on any other status or on a network error.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as error:
        # Report network failures through the same boolean contract as
        # HTTP error statuses instead of raising.
        print(f"Failed to download video from URL: {url} ({error})")
        return False

    if response.status_code != 200:
        print(f"Failed to download video from URL: {url}")
        return False

    with open(filename, 'wb') as f:
        f.write(response.content)
    return True

def display_and_log_videos(urls):
    """Download each URL and log it to W&B as a video.

    The n-th URL (1-based) is saved as `video_<n>.gif` in the working
    directory, logged under the key `video_<n>`, and then deleted. URLs that
    fail to download are skipped. Despite the name, nothing is rendered in
    the notebook by this function.

    Args:
        urls: Iterable of video/GIF URLs.
    """
    for position, url in enumerate(urls, start=1):
        video_filename = f"video_{position}.gif"
        if not download_video(url, video_filename):
            continue
        try:
            # Log the video to wandb
            wandb.log({f"video_{position}": wandb.Video(video_filename)})
        finally:
            # Remove the local file even if logging raised.
            os.remove(video_filename)

# Sample GIFs hosted in the project repository. The raw.githubusercontent.com
# form is required: github.com/<user>/<repo>/blob/... URLs return the HTML
# file-viewer page rather than the GIF bytes.
video_1_url = 'https://raw.githubusercontent.com/cosmo3769/s3d-mil-nce/main/gif_dir/dance.gif'
video_2_url = 'https://raw.githubusercontent.com/cosmo3769/s3d-mil-nce/main/gif_dir/swim.gif'
video_3_url = 'https://raw.githubusercontent.com/cosmo3769/s3d-mil-nce/main/gif_dir/cycle.gif'

all_videos_urls = [video_1_url, video_2_url, video_3_url]

# Log the videos in their own W&B run; the run is always closed, even if a
# download or upload fails part-way through.
wandb.init(
    entity='cosmo3769',
    project='s3d-mil-nce',
)
try:
    display_and_log_videos(all_videos_urls)
finally:
    wandb.finish()

VBox(children=(Label(value='0.433 MB of 0.433 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

In [22]:
def _as_wandb_video_data(video):
    """Convert an in-memory clip to the array layout `wandb.Video` expects.

    `wandb.Video` takes NumPy data as (time, channel, height, width) uint8.
    Non-array inputs (e.g. file paths) are returned unchanged.
    """
    if not isinstance(video, np.ndarray):
        return video

    data = video
    if np.issubdtype(data.dtype, np.floating):
        # Floats whose maximum is <= 1 are taken to be normalized to [0, 1]
        # and are rescaled to [0, 255] before the uint8 cast.
        scale = 255.0 if data.size and data.max() <= 1.0 else 1.0
        data = np.clip(np.rint(data * scale), 0, 255).astype(np.uint8)

    # Heuristic: a trailing axis of size 1 or 3 is a channels-last layout
    # (T, H, W, C); move the channels next to the time axis.
    if data.ndim == 4 and data.shape[-1] in (1, 3) and data.shape[1] not in (1, 3):
        data = data.transpose(0, 3, 1, 2)
    return data


def log_videos_to_wandb(videos):
    """Log each clip in `videos` to W&B under `video_<n>` (1-based).

    Args:
        videos: Iterable of clips. NumPy clips in channels-last float form
            are converted to the uint8 (time, channel, height, width)
            layout first; anything else is handed to `wandb.Video` as is.
    """
    for position, video in enumerate(videos, start=1):
        wandb.log({f"video_{position}": wandb.Video(_as_wandb_video_data(video))})

In [35]:
# Source GIFs used for the retrieval demo.
video_1_url = 'https://i.pinimg.com/originals/96/d8/98/96d89818c091b500c3a41ffb05d858ff.gif'
video_2_url = 'https://mir-s3-cdn-cf.behance.net/project_modules/max_1200/43d68193783231.62642905a82e2.gif'
video_3_url = 'https://i.gifer.com/8tVa.gif'
all_videos_urls = [video_1_url, video_2_url, video_3_url]

# Decode every GIF into a fixed-length clip. The individual names are kept
# because a later cell inspects `video_1` directly.
all_videos = [load_video(url) for url in all_videos_urls]
video_1, video_2, video_3 = all_videos

# Text queries to score against the videos.
query_1_video, query_2_video, query_3_video = 'Dancing', 'Swimming', 'Cycling'
all_queries_video = [query_1_video, query_2_video, query_3_video]

display_video(all_videos_urls)

Video 1,Video 2,Video 3
,,


In [36]:
# Sanity check: one clip is (frames, height, width, channels).
video_1.shape

(32, 224, 224, 3)

In [45]:
# Record the source URLs in their own W&B run. The try/finally guarantees the
# run is closed even if logging raises.
wandb.init(
    entity='cosmo3769',
    project='s3d-mil-nce',
)
try:
    wandb.log({"video_urls": all_videos_urls})
finally:
    # Finish wandb run
    wandb.finish()

VBox(children=(Label(value='0.010 MB of 0.010 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

In [37]:
# Prepare video inputs: stack the clips along a new leading batch axis.
videos_np = np.stack(all_videos, axis=0)

# Prepare text input: one query string per array element.
words_np = np.array(all_queries_video)

# Generate the video and text embeddings.
video_embd, text_embd = generate_embeddings(hub_model, videos_np, words_np)

# Scores between video and text are computed by dot products:
# all_scores[i, j] is the score of query i against video j.
all_scores = np.dot(text_embd, tf.transpose(video_embd))

In [38]:
# Batched clips: (num_videos, frames, height, width, channels).
videos_np.shape

(3, 32, 224, 224, 3)

In [39]:
# The text queries as a NumPy string array.
words_np

array(['Dancing', 'Swimming', 'Cycling'], dtype='<U8')

In [40]:
# One entry per query.
words_np.shape

(3,)

In [41]:
# One embedding row per video.
video_embd.shape

TensorShape([3, 512])

In [42]:
# One embedding row per query; the width must match the video embeddings
# for the dot-product scoring to be defined.
text_embd.shape

TensorShape([3, 512])

In [43]:
# Score matrix: (num_queries, num_videos).
all_scores.shape

(3, 3)

In [44]:
# Rows are queries, columns are videos.
all_scores

array([[ 2.911647  ,  4.2406607 ,  0.09560519],
       [ 2.5165358 ,  1.4838493 , -1.885814  ],
       [ 1.3625085 ,  1.783996  ,  6.2053566 ]], dtype=float32)

In [None]:
# Render one ranked-results section per query, each followed by a line break.
result_sections = [
    display_query_and_results_video(query, all_videos_urls, all_scores[row, :])
    + '<br>'
    for row, query in enumerate(words_np)
]
html = ''.join(result_sections)
display.HTML(html)