## Sound Search

In [125]:
from imagebind import data
import torch
import numpy as np
import pandas as pd
import cv2
from PIL import Image
import matplotlib.pyplot as plt
import math
import os

from imagebind.models import imagebind_model
from imagebind.models.imagebind_model import ModalityType
from imagebind.data import transform_and_sample_video_tensor, load_and_transform_audio_data_tensors
from imagebind import data

from moviepy.editor import ImageSequenceClip, VideoFileClip

from IPython.display import display, Audio

from scenedetect import detect, AdaptiveDetector

from pinecone import Pinecone

from utils.datasets import StronglyLabelledDataset

In [103]:
# Pinecone connection
pc = Pinecone(api_key=os.environ.get('PINECONE_API_KEY'))

index = pc.Index("audioset-adorno-cv")

In [29]:
# Ontology audioset dataset
ontology = pd.read_json('data/ontology.json')[['id','name', 'description']]
augmented = pd.read_csv('data/augmented_labels_cleaned.csv', index_col=0)

ontology = pd.merge(ontology, augmented.drop(columns=['description']), how='left', on='name')

ontology

Unnamed: 0,id,name,description,MajorityType,Object,Exclude
0,/m/0dgw9r,Human sounds,Sounds produced by the human body through the ...,SFX,human,0
1,/m/09l8g,Human voice,The human voice consists of sound made by a hu...,SFX,human,1
2,/m/09x0r,Speech,Speech is the vocalized form of human communic...,SFX,human,1
3,/m/05zppz,"Male speech, man speaking",Speech uttered by an adult male human.,SFX,human,1
4,/m/02zsn,"Female speech, woman speaking",Speech uttered by an adult female human.,SFX,human,1
...,...,...,...,...,...,...
627,/m/025l19,Recording,A sound that appears to come from a recording ...,SFX,recorder/player,0
628,/m/07hvw1,Field recording,A sound that appears to have been recorded in ...,AMB,,0
629,/m/0174nj,Gramophone record,A sound which appears to come from a gramophon...,SFX,gramophone,0
630,/m/01www,Compact disc,A sound which appears to come from a digital a...,SFX,compact disc,0


In [69]:
def play_video(video):
    mean = torch.tensor([0.485, 0.456, 0.406]).view(1, 3, 1, 1)
    std = torch.tensor([0.229, 0.224, 0.225]).view(1, 3, 1, 1)
    
    # Adjust the normalization of the video to match 0-255 scale
    video = video * std + mean  # Denormalize
    video = video.clamp(0, 1)  # Clamp to the range [0, 1]
    video = video.permute(0, 2, 3, 1)  # Change to [frames, height, width, channels]
    
    # Scale to 0-255 and convert to uint8
    video_tensor = (video * 255).numpy().astype(np.uint8)
    
    # Writing to a video file using OpenCV
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter('output_video.mp4', fourcc, video_fps, (224, 224))
    for i in range(video_tensor.shape[0]):
        out.write(video_tensor[i])
    out.release()

    # Load your video file
    clip = VideoFileClip('output_video.mp4')
    
    # Display the video
    display(clip.ipython_display(width=480, autoplay=1, loop=1, maxduration=120))

**Load Video**

In [3]:
dataset = StronglyLabelledDataset()

In [145]:
video, audio, labels_df, info = dataset[25110]

video_fps = info['video_fps']
audio_fps = info['audio_fps']

print(video.shape)
print(info)

torch.Size([250, 3, 224, 224])
{'video_fps': 25.0, 'audio_fps': 44100}


In [42]:
video = video.permute(0, 3, 1, 2)

In [146]:
play_video(video)

Moviepy - Building video __temp__.mp4.
Moviepy - Writing video __temp__.mp4



                                                                

Moviepy - Done !
Moviepy - video ready __temp__.mp4




**Scene Splitting**

In [147]:
video_local_path = "output_video.mp4"

scenes = detect(video_local_path, AdaptiveDetector())

In [148]:
scenes

[]

**Embed Video**

In [48]:
# This must be done by scene too.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

model = imagebind_model.imagebind_huge(pretrained=True)
model.eval()

model.to(device)

ImageBindModel(
  (modality_preprocessors): ModuleDict(
    (vision): RGBDTPreprocessor(
      (cls_token): tensor((1, 1, 1280), requires_grad=True)
      
      (rgbt_stem): PatchEmbedGeneric(
        (proj): Sequential(
          (0): PadIm2Video()
          (1): Conv3d(3, 1280, kernel_size=(2, 14, 14), stride=(2, 14, 14), bias=False)
        )
      )
      (pos_embedding_helper): SpatioTemporalPosEmbeddingHelper(
        (pos_embed): tensor((1, 257, 1280), requires_grad=True)
        
      )
    )
    (text): TextPreprocessor(
      (pos_embed): tensor((1, 77, 1024), requires_grad=True)
      (mask): tensor((77, 77), requires_grad=False)
      
      (token_embedding): Embedding(49408, 1024)
    )
    (audio): AudioPreprocessor(
      (cls_token): tensor((1, 1, 768), requires_grad=True)
      
      (rgbt_stem): PatchEmbedGeneric(
        (proj): Conv2d(1, 768, kernel_size=(16, 16), stride=(10, 10), bias=False)
        (norm_layer): LayerNorm((768,), eps=1e-05, elementwise_affine=

In [49]:
labels_df = pd.merge(labels_df, ontology, how='left', left_on='label', right_on='id')

In [151]:
video = video.permute(0, 2, 3, 1)

In [152]:
video.shape

torch.Size([250, 224, 224, 3])

In [153]:
scaled_video = transform_and_sample_video_tensor(video, device, clips_per_video=1)

print(f"Video Shape: {scaled_video.shape}")

inputs = {
    ModalityType.VISION: scaled_video,
}

with torch.no_grad():
    outputs = model(inputs)

print(f"Output Shape: {outputs['vision'].shape}")
video_embeddings = outputs['vision']

Video Shape: torch.Size([1, 250, 3, 224, 224])
Output Shape: torch.Size([1, 1024])


**Embed Text**

In [56]:
labels_list = ontology.description.unique()
# labels_list = ontology.name.unique()

inputs = {
    ModalityType.TEXT: data.load_and_transform_text(labels_list, device)
}

with torch.no_grad():
    output = model(inputs)

text_embeddings = output[ModalityType.TEXT]

In [57]:
text_embeddings.shape

torch.Size([632, 1024])

**Embed Audio**

In [34]:
# Already done in Pinecone

**Video-Text-Audio**

Video -> Text

In [156]:
vision_text_similarity = torch.softmax(video_embeddings @ text_embeddings.T, dim=-1)

ontology['similarity'] = vision_text_similarity.cpu().numpy()[0]
ontology = ontology[ontology['MajorityType'] == 'SFX']
ontology.sort_values('similarity', ascending=False).head(10)

Unnamed: 0,id,name,description,MajorityType,Object,Exclude,similarity
586,/m/07rcgpl,Hum,"A continuous, dull tone.",SFX,,0,0.045403
631,/m/04zc0,MP3,A sound which appears to come from a compresse...,SFX,digital file,0,0.025898
512,/m/07q7njn,"Chink, clink",A short light metallic sound.,SFX,metallic object,0,0.018617
626,/m/01b7fy,Headphones,A sound that appears to have been produced by ...,SFX,headphones,0,0.007721
154,/m/07qn4z3,Rattle,A rapid series of short loud sounds as of smal...,SFX,container,0,0.004409
546,/m/07pws3f,Bang,A brief and loud noise.,SFX,,0,0.003801
625,/m/0cfpc,Loudspeaker,A sound that appears to have been produced by ...,SFX,loudspeaker,0,0.003478
472,/t/dd00077,Mechanisms,Sounds that originate from human-created machi...,SFX,technology,0,0.003474
596,/m/07pt_g0,Pulse,"A sound that is modulated in short, regular bu...",SFX,,0,0.002643
457,/m/07pp8cl,Telephone bell ringing,The sound of a physical or synthesized electri...,SFX,telephone,0,0.001667


In [164]:
first_match, similarity = ontology.sort_values('similarity', ascending=False).iloc[2]['id'], ontology.sort_values('similarity', ascending=False).iloc[0]['similarity']

In [165]:
first_match

'/m/07q7njn'

Text -> Audio

In [166]:
def query_with_metadata_filter(query_vector, metadata_filter, top_k=5):
    results = index.query(
        vector=video_embeddings.cpu().tolist(),  ## NOTE: pinecone expects list for serialization
        top_k=3,
        include_values=False,  # This returns embeddings of result
        include_metadata=True,
        filter = metadata_filter
    )
    return results

metadata_filter = {"type": "SFX",
                   "label_id": first_match}

search_result = query_with_metadata_filter(video_embeddings.cpu().tolist(), metadata_filter)

In [167]:
search_result

{'matches': [{'id': '5m_mczN2Q9U_150000_3',
              'metadata': {'end_time': 2.272,
                           'label_id': '/m/07q7njn',
                           'mode': 'audio',
                           'start_time': 1.505,
                           'type': 'SFX'},
              'score': 3.2493248,
              'values': []},
             {'id': '8fKrqIhSrw4_60000_5',
              'metadata': {'end_time': 7.165,
                           'label_id': '/m/07q7njn',
                           'mode': 'audio',
                           'start_time': 6.693,
                           'type': 'SFX'},
              'score': 3.24879766,
              'values': []},
             {'id': '5m_mczN2Q9U_150000_9',
              'metadata': {'end_time': 7.201,
                           'label_id': '/m/07q7njn',
                           'mode': 'audio',
                           'start_time': 4.59,
                           'type': 'SFX'},
              'score': 2.93189335,
      

In [168]:
mtdf = pd.DataFrame(folder_names)

mtdf.columns = ['video_id']

index_n = mtdf[mtdf['video_id'] == search_result['matches'][0]['id'][:-2]].index[0]

In [169]:
_, audio, _, info = dataset[index_n]

In [170]:
audio_np = audio.numpy()

num_channels, num_samples = audio_np.shape
sample_rate = info['audio_fps']

Audio(audio_np, rate=sample_rate)

**Video-Audio**

In [181]:
def query_with_metadata_filter(query_vector, metadata_filter, top_k=5):
    results = index.query(
        vector=video_embeddings.cpu().tolist(),  ## NOTE: pinecone expects list for serialization
        top_k=5,
        include_values=False,  # This returns embeddings of result
        include_metadata=True,
        filter = metadata_filter
    )
    return results

metadata_filter = {"type": "SFX"}

search_result = query_with_metadata_filter(video_embeddings.cpu().tolist(), metadata_filter)

In [185]:
ids = []
sim = []
start_time = []
end_time = []
label = []
type = []

for match in search_result['matches']:
    ids.append(match['id'])
    sim.append(match['score'])
    start_time.append(match['metadata']['start_time'])
    end_time.append(match['metadata']['end_time'])
    label.append(match['metadata']['label_id'])
    type.append(match['metadata']['type'])

data_dict = {
    'id': ids,
    'similarity': sim, 
    'start_time': start_time,
    'end_time': end_time,
    'label': label,
    'type': type
}

In [190]:
matches_df = pd.DataFrame(data_dict)
matches_df = pd.merge(matches_df, ontology[['id', 'name', 'description']], how='left', left_on='label', right_on='id')
matches_df

Unnamed: 0,id_x,similarity,start_time,end_time,label,type,id_y,name,description
0,AdfTS1LDw2o_120000_13,6.373542,4.609,5.615,/m/05tny_,SFX,/m/05tny_,Bark,Principal communication sound produced by dogs...
1,E3D_z0aoUEg_30000_2,6.231184,4.492,8.219,/m/09ld4,SFX,/m/09ld4,Frog,"Sounds associated with the short-bodied, taill..."
2,AdfTS1LDw2o_120000_12,6.184546,3.687,4.483,/m/05tny_,SFX,/m/05tny_,Bark,Principal communication sound produced by dogs...
3,8gqsHGNsvNY_50000_3,6.007124,9.575,10.0,/m/07r_80w,SFX,/m/07r_80w,Hoot,The loud raucous cry of an owl.
4,AdfTS1LDw2o_120000_19,5.876167,6.746,7.723,/m/05tny_,SFX,/m/05tny_,Bark,Principal communication sound produced by dogs...


In [None]:
mtdf = pd.DataFrame(folder_names)
mtdf.columns = ['video_id']

In [196]:
match_n = 0

video_id = matches_df.iloc[match_n]['id_x'][:-3] # change
index_n = mtdf[mtdf['video_id'] == video_id].index[0]

In [197]:
_, audio, _, info = dataset[index_n]
audio_np = audio.numpy()

num_channels, num_samples = audio_np.shape
sample_rate = info['audio_fps']

Audio(audio_np, rate=sample_rate)