In [None]:
# Fetch the ImageBind fork; a later cell installs the package from this checkout.
!git clone https://github.com/giorgiodemarchi/ImageBind.git

In [None]:
!git clone https://github.com/giorgiodemarchi/Visually-Indicated-Sounds.git  ## This was done in a external colab notebook

In [11]:
!pip install boto3

Collecting boto3
  Downloading boto3-1.34.81-py3-none-any.whl (139 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.3/139.3 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting botocore<1.35.0,>=1.34.81 (from boto3)
  Downloading botocore-1.34.81-py3-none-any.whl (12.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.1/12.1 MB[0m [31m53.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting jmespath<2.0.0,>=0.7.1 (from boto3)
  Downloading jmespath-1.0.1-py3-none-any.whl (20 kB)
Collecting s3transfer<0.11.0,>=0.10.0 (from boto3)
  Downloading s3transfer-0.10.1-py3-none-any.whl (82 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m82.2/82.2 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: jmespath, botocore, s3transfer, boto3
Successfully installed boto3-1.34.81 botocore-1.34.81 jmespath-1.0.1 s3transfer-0.10.1


In [None]:
!cd ImageBind; pip install .

In [1]:
from imagebind import data
import torch

from imagebind.models import imagebind_model
from imagebind.models.imagebind_model import ModalityType

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Example prompts and media files; the three lists are index-aligned (dog, car, bird).
text_list = [
    "A dog.",
    "A car",
    "A bird",
]
image_paths = [
    ".assets/dog_image.jpg",
    ".assets/car_image.jpg",
    ".assets/bird_image.jpg",
]
audio_paths = [
    ".assets/dog_audio.wav",
    ".assets/car_audio.wav",
    ".assets/bird_audio.wav",
]

# Use the first GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

In [3]:
# Instantiate the ImageBind "huge" model with pretrained=True and switch it to eval mode
# (presumably this loads or downloads a checkpoint, which can take a while — confirm).
model = imagebind_model.imagebind_huge(pretrained=True)
model.eval()

# Move the model to the selected device; as the cell's last expression this also displays the model.
model.to(device)

KeyboardInterrupt: 

In [38]:
from torchvision.transforms import Compose, Normalize
import torch.nn.functional as F


def transform_and_sample_video_tensor(
    video_tensor,
    device,
    clip_duration=2,
    clips_per_video=5,
    # Assume video_tensor is in (num_frames, x, y, 3) format
):
    """Split a video into equal-length clips and normalize every frame.

    The first ``clips_per_video * (num_frames // clips_per_video)`` frames are
    cut into ``clips_per_video`` consecutive, non-overlapping clips; trailing
    frames that do not fill a whole clip are dropped. Pixel values are divided
    by 255 and then normalized per channel with the hard-coded mean/std below.

    Args:
        video_tensor: Tensor of shape (num_frames, height, width, 3), i.e.
            channels-last frames. Values are expected to be on a 0-255 scale
            because they are divided by 255.
        device: Device the result is moved to.
        clip_duration: Currently unused; kept so existing callers keep working.
        clips_per_video: Number of clips to cut the video into.

    Returns:
        Tensor of shape (clips_per_video, frames_per_clip, 3, height, width)
        located on ``device``.

    Raises:
        ValueError: If ``clips_per_video`` is smaller than 1, if
            ``video_tensor`` does not have the expected layout, or if the video
            has fewer frames than ``clips_per_video`` (which would otherwise
            silently produce empty clips).
    """
    if clips_per_video < 1:
        raise ValueError(f"clips_per_video must be >= 1, got {clips_per_video}")
    if video_tensor.dim() != 4 or video_tensor.shape[-1] != 3:
        raise ValueError(
            "video_tensor must have shape (num_frames, height, width, 3), "
            f"got {tuple(video_tensor.shape)}"
        )

    total_frames = video_tensor.shape[0]
    frames_per_clip = total_frames // clips_per_video
    if frames_per_clip == 0:
        raise ValueError(
            f"video has {total_frames} frame(s), which is not enough for "
            f"{clips_per_video} clips"
        )

    # Keep only the frames that fill whole clips, scale them by 1/255 and move
    # the channel axis forward: (frames, H, W, C) -> (frames, C, H, W).
    used_frames = video_tensor[: clips_per_video * frames_per_clip]
    frames = (used_frames / 255).permute(0, 3, 1, 2)

    # Per-channel normalization, broadcast over the frame and spatial axes.
    mean = torch.tensor(
        [0.48145466, 0.4578275, 0.40821073], dtype=frames.dtype, device=frames.device
    ).view(3, 1, 1)
    std = torch.tensor(
        [0.26862954, 0.26130258, 0.27577711], dtype=frames.dtype, device=frames.device
    ).view(3, 1, 1)
    frames = (frames - mean) / std

    # Consecutive frames form a clip, so splitting the leading axis is enough.
    clips = frames.reshape(clips_per_video, frames_per_clip, *frames.shape[1:])
    return clips.contiguous().to(device)

def scale_shortest_side_to(video_tensor, target_size=224):
    """Resize every frame so that its shortest spatial side equals ``target_size``.

    The aspect ratio is preserved; the longer side is scaled by the same
    factor and rounded down.

    Args:
        video_tensor: Tensor of shape
            (num_clips, num_frames, channels, height, width).
        target_size: Length in pixels of the shortest side after resizing.

    Returns:
        Tensor of shape (num_clips, num_frames, channels, new_height, new_width)
        obtained with bilinear interpolation.
    """
    num_clips, num_frames, channels, height, width = video_tensor.shape

    # Exact integer arithmetic: scaling through a float factor and truncating
    # with int() can land one pixel short of target_size for some input sizes.
    shortest_side = min(height, width)
    new_height = int(height * target_size // shortest_side)
    new_width = int(width * target_size // shortest_side)

    # Frames are resized independently, so clips and frames can share one batch axis.
    frames = video_tensor.reshape(num_clips * num_frames, channels, height, width)
    resized = F.interpolate(frames, size=(new_height, new_width), mode='bilinear', align_corners=False)

    return resized.reshape(num_clips, num_frames, channels, new_height, new_width)

def scale_and_crop_to_target(video_tensor, target_size=224):
    """Resize frames so the shortest side is ``target_size``, then center-crop a square.

    Args:
        video_tensor: Tensor of shape
            (num_clips, num_frames, channels, height, width).
        target_size: Side length in pixels of the square output frames.

    Returns:
        Tensor of shape (num_clips, num_frames, target_size, target_size, channels).
        Note the channels-last layout, which differs from the input layout.
        The result is converted back to the dtype of ``video_tensor``.
    """
    num_clips, num_frames, channels, height, width = video_tensor.shape

    # Exact integer arithmetic: scaling through a float factor and truncating
    # with int() can land one pixel short of target_size, which would make the
    # crop below smaller than requested.
    shortest_side = min(height, width)
    new_height = int(height * target_size // shortest_side)
    new_width = int(width * target_size // shortest_side)

    # Top-left corner of the centered crop window.
    crop_top = max((new_height - target_size) // 2, 0)
    crop_left = max((new_width - target_size) // 2, 0)

    # Frames are resized independently, so clips and frames can share one batch
    # axis. The resize is done in float32 regardless of the input dtype.
    frames = video_tensor.reshape(num_clips * num_frames, channels, height, width).float()
    resized = F.interpolate(frames, size=(new_height, new_width), mode='bilinear', align_corners=False)

    cropped = resized[
        :,
        :,
        crop_top:crop_top + target_size,
        crop_left:crop_left + target_size,
    ]

    # (frames, C, H, W) -> (frames, H, W, C), then restore the clip axis.
    channels_last = cropped.permute(0, 2, 3, 1)
    scaled_and_cropped_video = channels_last.reshape(
        num_clips, num_frames, *channels_last.shape[1:]
    ).contiguous()

    # Hand back the same dtype the caller passed in.
    return scaled_and_cropped_video.to(dtype=video_tensor.dtype)

**Create Dataset Instance and process video**

In [23]:
from VisuallyIndicatedSounds.utils.datasets import StronglyLabelledDataset

# Build the dataset with its default arguments. Indexing it (see the cell that reads
# dataset[0]) yields a (video, audio, labels) triple.
dataset = StronglyLabelledDataset()

In [24]:
# NOTE(review): leftover debugging cell. It reads `video`, which is only assigned in a
# later cell, so it fails on Restart & Run All. For the channels-last video loaded
# there, axes 2 and 3 look like width and channels rather than height and width — confirm.
type(min(video.size(2), video.size(3)))

int

In [39]:
# Take the first sample: a video tensor, an audio tensor and a table of labelled segments.
video, audio, labels_df = dataset[0]

print(f"Video Shape: {video.shape}")
print(f"Audio Shape: {audio.shape}")

# Last expression of the cell, so the label table is rendered below the prints.
labels_df



Video Shape: torch.Size([235, 360, 480, 3])
Audio Shape: torch.Size([1, 428032])


Unnamed: 0,segment_id,start_time_seconds,end_time_seconds,label
0,--HXYSM3ydo_2000,0.0,4.877,/m/05zppz
1,--HXYSM3ydo_2000,0.0,9.703,/m/03m9d0z
2,--HXYSM3ydo_2000,0.0,9.703,/t/dd00066
3,--HXYSM3ydo_2000,7.138,9.703,/m/0912c9


In [40]:
## Working with the example video
## Required transformations (used in place of imagebind.data.load_and_transform_video)
# Split into clips, divide by 255 and normalize -> (num_clips, frames_per_clip, channels, height, width)
scaled_video = transform_and_sample_video_tensor(video, device)

print(f"Video Shape: {scaled_video.shape}")
# Resize the shortest side to 224 and center-crop to 224x224; the result is channels-last
scaled_video = scale_and_crop_to_target(scaled_video)

scaled_video.shape ## (num_clips, frames_per_clip, height, width, channels)

Video Shape: torch.Size([5, 47, 3, 360, 480])


torch.Size([5, 47, 224, 224, 3])

In [44]:
# scale_and_crop_to_target returns channels-last clips; move the channel axis
# back in front of the spatial axes: (clips, frames, H, W, C) -> (clips, frames, C, H, W).
# The guard keeps the cell idempotent: re-running it must not permute the axes again.
if scaled_video.shape[-1] == 3:
    scaled_video = scaled_video.permute(0, 1, 4, 2, 3)
scaled_video.shape

torch.Size([5, 47, 3, 224, 224])

In [45]:
# Only the vision modality is supplied: the preprocessed video clips.
inputs = {ModalityType.VISION: scaled_video}

In [46]:
# Forward pass only: no_grad() turns off gradient tracking, which is not needed here.
with torch.no_grad():
    outputs = model(inputs)

In [49]:
# Shape of the vision output; presumably one embedding row per clip — confirm.
outputs['vision'].shape

torch.Size([5, 1024])

In [50]:
# Keep the vision output under a shorter name for the cells that follow.
embeddings = outputs['vision']

In [51]:
# Display the embedding tensor.
embeddings

tensor([[ 0.0302,  0.0109, -0.0077,  ...,  0.0244, -0.0118, -0.0253],
        [ 0.0201,  0.0096, -0.0142,  ...,  0.0157, -0.0169, -0.0196],
        [ 0.0185,  0.0028, -0.0041,  ...,  0.0045,  0.0094, -0.0254],
        [ 0.0139, -0.0088, -0.0198,  ..., -0.0144,  0.0258, -0.0004],
        [ 0.0097, -0.0125, -0.0110,  ..., -0.0187,  0.0265,  0.0056]],
       device='cuda:0')