### Pinecone connection

In [1]:
!pip install pinecone-client

Collecting pinecone-client
  Downloading pinecone_client-3.2.2-py3-none-any.whl.metadata (16 kB)
Downloading pinecone_client-3.2.2-py3-none-any.whl (215 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m215.9/215.9 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: pinecone-client
Successfully installed pinecone-client-3.2.2
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [29]:
from imagebind import data
import torch
import pandas as pd

from imagebind.models import imagebind_model
from imagebind.models.imagebind_model import ModalityType

from imagebind.data import transform_and_sample_video_tensor

import numpy as np
from moviepy.editor import ImageSequenceClip

In [30]:
import torch
import torchaudio
from torchvision.transforms import Normalize
from pytorchvideo.data.clip_sampling import ConstantClipsPerVideoSampler

from imagebind.data import get_clip_timepoints, waveform2melspec

def load_and_transform_audio_data(
    audio_tensors,
    sample_rates,
    device,
    num_mel_bins=128,
    target_length=204,
    desired_sample_rate=16000,
    clip_duration=2,
    clips_per_video=3,
    mean=-4.268,
    std=9.138,
):
    """Convert in-memory waveforms into normalized mel-spectrogram clips.

    Every waveform is resampled to ``desired_sample_rate`` when its own rate
    differs, sliced at the time points returned by ``get_clip_timepoints``,
    converted clip by clip with ``waveform2melspec``, normalized with
    ``mean``/``std`` and moved to ``device``.

    Args:
        audio_tensors: Iterable of waveform tensors, or ``None``. Each waveform
            is sliced as ``waveform[:, start:end]`` and measured with
            ``waveform.size(1)``, so the samples must run along dim 1.
        sample_rates: Sample rate of each waveform, paired with
            ``audio_tensors`` by position.
        device: Device the normalized clips are moved to.
        num_mel_bins: Forwarded to ``waveform2melspec``.
        target_length: Forwarded to ``waveform2melspec``.
        desired_sample_rate: Rate every waveform is resampled to.
        clip_duration: Clip duration handed to the clip sampler.
        clips_per_video: Number of clips handed to the clip sampler.
        mean: Mean used by the normalization transform.
        std: Standard deviation used by the normalization transform.

    Returns:
        ``None`` when ``audio_tensors`` is ``None``. Otherwise one tensor that
        stacks, along dim 0, the per-waveform stacks of normalized clips.
    """
    if audio_tensors is None:
        return None

    clip_sampler = ConstantClipsPerVideoSampler(
        clip_duration=clip_duration, clips_per_video=clips_per_video
    )
    # Loop-invariant: a single transform serves every waveform and clip.
    normalize = Normalize(mean=mean, std=std)

    audio_outputs = []
    for waveform, sr in zip(audio_tensors, sample_rates):
        if desired_sample_rate != sr:
            waveform = torchaudio.functional.resample(
                waveform, orig_freq=sr, new_freq=desired_sample_rate
            )

        # The sampler works in seconds, so pass the waveform duration.
        all_clips_timepoints = get_clip_timepoints(
            clip_sampler, waveform.size(1) / desired_sample_rate
        )

        all_clips = []
        for clip_timepoints in all_clips_timepoints:
            # Convert the clip bounds from seconds to sample indexes.
            first_sample = int(clip_timepoints[0] * desired_sample_rate)
            last_sample = int(clip_timepoints[1] * desired_sample_rate)
            waveform_clip = waveform[:, first_sample:last_sample]
            all_clips.append(
                waveform2melspec(
                    waveform_clip, desired_sample_rate, num_mel_bins, target_length
                )
            )

        # Normalize first, then move each clip to the target device.
        all_clips = [normalize(ac).to(device) for ac in all_clips]
        audio_outputs.append(torch.stack(all_clips, dim=0))

    return torch.stack(audio_outputs, dim=0)

In [42]:
# Run on the first CUDA GPU when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

In [43]:
# Build the pretrained ImageBind "huge" model and put it in evaluation mode.
model = imagebind_model.imagebind_huge(pretrained=True).eval()

# Move the weights to the selected device; as the cell's last expression this
# also displays the module structure.
model.to(device)

Downloading imagebind weights to .checkpoints/imagebind_huge.pth ...


  0%|          | 0.00/4.47G [00:00<?, ?B/s]

ImageBindModel(
  (modality_preprocessors): ModuleDict(
    (vision): RGBDTPreprocessor(
      (cls_token): tensor((1, 1, 1280), requires_grad=True)
      
      (rgbt_stem): PatchEmbedGeneric(
        (proj): Sequential(
          (0): PadIm2Video()
          (1): Conv3d(3, 1280, kernel_size=(2, 14, 14), stride=(2, 14, 14), bias=False)
        )
      )
      (pos_embedding_helper): SpatioTemporalPosEmbeddingHelper(
        (pos_embed): tensor((1, 257, 1280), requires_grad=True)
        
      )
    )
    (text): TextPreprocessor(
      (pos_embed): tensor((1, 77, 1024), requires_grad=True)
      (mask): tensor((77, 77), requires_grad=False)
      
      (token_embedding): Embedding(49408, 1024)
    )
    (audio): AudioPreprocessor(
      (cls_token): tensor((1, 1, 768), requires_grad=True)
      
      (rgbt_stem): PatchEmbedGeneric(
        (proj): Conv2d(1, 768, kernel_size=(16, 16), stride=(10, 10), bias=False)
        (norm_layer): LayerNorm((768,), eps=1e-05, elementwise_affine=

**Initialize Index**

In [77]:
import os

from pinecone import Pinecone

# The API key is read from the environment so it never lands in the notebook.
pc = Pinecone(api_key=os.environ.get('PINECONE_API_KEY'))

# Handle to the index that the upload and query cells below talk to.
index = pc.Index("audioset-adorno")

In [78]:
index

<pinecone.data.index.Index at 0x7f9e501f5b70>

**Upload data**

In [7]:
from utils.datasets import StronglyLabelledDataset

In [8]:
# Project-local dataset (utils.datasets), built with its default arguments.
# Later cells index it to get a (video, audio, labels, info) tuple per item.
dataset = StronglyLabelledDataset()

In [51]:
def split_audio_in_segments(audio_tensor, audio_fps, labels_df):
    """Cut one audio track into the labelled segments listed in ``labels_df``.

    Args:
        audio_tensor: Audio samples, sliced as ``audio_tensor[:, start:end]``,
            so time must run along the second axis.
        audio_fps: Samples per second; converts the times in seconds into
            sample indexes.
        labels_df: DataFrame with one row per segment and the columns
            ``segment_id``, ``start_time_seconds``, ``end_time_seconds``,
            ``label``, ``MajorityType`` and ``Object``.

    Returns:
        A list with one dict per row, in row order, holding the keys
        ``video_id``, ``start_time``, ``end_time``, ``audio_tensor``,
        ``label_id``, ``type`` and ``object``.
    """
    segments = []
    for _, row in labels_df.iterrows():
        # Seconds -> sample indexes; int() truncates towards zero.
        start_frame = int(row['start_time_seconds'] * audio_fps)
        end_frame = int(row['end_time_seconds'] * audio_fps)

        segments.append({
            'video_id': row['segment_id'],
            'start_time': row['start_time_seconds'],
            'end_time': row['end_time_seconds'],
            'audio_tensor': audio_tensor[:, start_frame:end_frame],
            'label_id': row['label'],
            'type': row['MajorityType'],
            'object': row['Object'],
        })
    return segments

In [19]:
# Item 10 is the clip uploaded to the index below. It unpacks into the video,
# its audio, the label rows and an info mapping (read later via info['audio_fps']).
video, audio, labels_df, info = dataset[10]



In [50]:
onto = pd.read_csv('data/augmented_labels.csv', index_col=0)
onto2 = pd.read_json('data/ontology.json')
# Attach each label's ontology id and description by matching on its name.
onto = pd.merge(onto, onto2[['id', 'description', 'name']], on='name', how='left')

# Drop any ontology columns left over from a previous run of this cell before
# merging. Without this, re-running the cell merges the already-enriched frame
# and pandas suffixes the clashing columns with _x/_y, which breaks the
# row['MajorityType'] / row['Object'] lookups further down.
labels_df = pd.merge(
    labels_df.drop(columns=onto.columns, errors='ignore'),
    onto,
    how='left',
    left_on='label',
    right_on='id',
)
labels_df

Unnamed: 0,segment_id,start_time_seconds,end_time_seconds,label,name,MajorityType,Object,id,description
0,--ekDLDTUXA_30000,0.0,10.0,/m/04rlf,Music,AMB,"{'name': 'Music', 'type': 'AMB'}",/m/04rlf,Music is an art form and cultural activity who...
1,--ekDLDTUXA_30000,5.486,6.452,/m/07p6fty,Shout,SFX,Individual,/m/07p6fty,Talk in a loud voice to deliberately command a...
2,--ekDLDTUXA_30000,7.249,10.0,/t/dd00003,Male singing,SFX,person,/t/dd00003,Singing produced by an adult human male.


In [20]:
labels_df

Unnamed: 0,segment_id,start_time_seconds,end_time_seconds,label
0,--ekDLDTUXA_30000,0.0,10.0,/m/04rlf
1,--ekDLDTUXA_30000,5.486,6.452,/m/07p6fty
2,--ekDLDTUXA_30000,7.249,10.0,/t/dd00003


In [52]:
# One dict per label row, each carrying its slice of the audio.
# labels_df must already contain the MajorityType and Object columns here.
segments = split_audio_in_segments(audio, info['audio_fps'], labels_df)

In [53]:
segments

[{'video_id': '--ekDLDTUXA_30000',
  'start_time': 0.0,
  'end_time': 10.0,
  'audio_tensor': tensor([[ 0.0000,  0.0000,  0.0000,  ..., -0.1150, -0.1732, -0.2013],
          [ 0.0000,  0.0000,  0.0000,  ..., -0.1204, -0.0944, -0.0642]]),
  'label_id': '/m/04rlf',
  'type': 'AMB',
  'object': "{'name': 'Music', 'type': 'AMB'}"},
 {'video_id': '--ekDLDTUXA_30000',
  'start_time': 5.486,
  'end_time': 6.452,
  'audio_tensor': tensor([[-0.0639, -0.0614, -0.0637,  ...,  0.0459,  0.0495,  0.0330],
          [ 0.0285,  0.0022, -0.0202,  ...,  0.0739,  0.0414, -0.0147]]),
  'label_id': '/m/07p6fty',
  'type': 'SFX',
  'object': 'Individual'},
 {'video_id': '--ekDLDTUXA_30000',
  'start_time': 7.249,
  'end_time': 10.0,
  'audio_tensor': tensor([[ 0.1071,  0.0460,  0.0252,  ..., -0.1150, -0.1732, -0.2013],
          [ 0.0405, -0.0101, -0.0284,  ..., -0.1204, -0.0944, -0.0642]]),
  'label_id': '/t/dd00003',
  'type': 'SFX',
  'object': 'person'}]

In [36]:
# Pull out the waveform of every segment and pair each with the clip's sample rate.
segment_tensors = [segment['audio_tensor'] for segment in segments]
fps_list = [info['audio_fps'] for _ in segment_tensors]

In [37]:
# Preprocess on the device chosen earlier rather than a hard-coded 'cuda',
# so this cell also runs on machines without a GPU.
transformed_segments = load_and_transform_audio_data(
    segment_tensors, fps_list, device=device
)



In [44]:
# Embed the preprocessed audio clips; no gradients are needed for inference.
inputs = {ModalityType.AUDIO: transformed_segments.to(device)}
with torch.no_grad():
    outputs = model(inputs)

# One embedding row per segment.
audio_embeddings = outputs['audio']

In [46]:
audio_embeddings.shape

torch.Size([3, 1024])

In [71]:
# Build one (id, vector, metadata) tuple per segment, ready for upserting.
datapoints = []
for i, (embedding, audio_metadata) in enumerate(zip(audio_embeddings, segments)):
    # The position suffix tells apart segments that share the same video_id.
    segment_id = audio_metadata['video_id'] + "_" + str(i)

    metadata = {key: audio_metadata[key] for key in ['start_time', 'end_time', 'label_id', 'type', 'object']}
    metadata['mode'] = 'audio'

    # Move the embedding off the GPU before handing it to the client.
    datapoints.append((segment_id, embedding.cpu().numpy(), metadata))

In [72]:
datapoints

[('--ekDLDTUXA_30000_0',
  array([ 0.8216046 , -0.4695502 , -1.2003841 , ...,  0.5313113 ,
          0.8068339 , -0.84560025], dtype=float32),
  {'start_time': 0.0,
   'end_time': 10.0,
   'label_id': '/m/04rlf',
   'type': 'AMB',
   'object': "{'name': 'Music', 'type': 'AMB'}",
   'mode': 'audio'}),
 ('--ekDLDTUXA_30000_1',
  array([ 0.6621524 , -1.0037177 , -1.460051  , ...,  0.79632586,
          0.6254753 , -0.72505474], dtype=float32),
  {'start_time': 5.486,
   'end_time': 6.452,
   'label_id': '/m/07p6fty',
   'type': 'SFX',
   'object': 'Individual',
   'mode': 'audio'}),
 ('--ekDLDTUXA_30000_2',
  array([ 0.62519026, -0.32169655, -1.2164917 , ...,  0.5417554 ,
          0.7649085 , -0.83140916], dtype=float32),
  {'start_time': 7.249,
   'end_time': 10.0,
   'label_id': '/t/dd00003',
   'type': 'SFX',
   'object': 'person',
   'mode': 'audio'})]

In [79]:
def upload_data_with_metadata(data, batch_size=100):
    """Upsert ``(id, vector, metadata)`` triples into the module-level ``index``.

    The triples are sent in batches instead of one request per vector.

    Args:
        data: Iterable of ``(item_id, vector, meta)`` triples.
        batch_size: Maximum number of vectors per upsert request.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Materialize first so generators work and every item is checked to be a triple.
    vectors = [(item_id, vector, meta) for item_id, vector, meta in data]
    for start in range(0, len(vectors), batch_size):
        index.upsert(vectors=vectors[start:start + batch_size])
        
# Writes to the remote index; upserting the same ids again overwrites them.
upload_data_with_metadata(datapoints)

**Query data**

In [57]:
# A different item (20) serves as the query clip; only new_video is used below.
new_video, new_audio, new_label_df, new_info = dataset[20]




In [64]:
# TODO: fix permute/dimension issue at read, so the axes need no reordering here
scaled_video = transform_and_sample_video_tensor(
    new_video.permute(0, 2, 3, 1),
    device,
    clips_per_video=1,
)
print(f"Video Shape: {scaled_video.shape}")

# Embed the sampled clip with the vision branch; no gradients needed.
inputs = {ModalityType.VISION: scaled_video}
with torch.no_grad():
    outputs = model(inputs)

video_embeddings = outputs['vision']
print(f"Output Shape: {outputs['vision'].shape}")

Video Shape: torch.Size([1, 300, 3, 224, 224])
Output Shape: torch.Size([1, 1024])


In [74]:
video_embeddings

tensor([[-0.0046,  0.0298,  0.0124,  ...,  0.0102, -0.0088, -0.0248]],
       device='cuda:0')

In [75]:
segments

[{'video_id': '--ekDLDTUXA_30000',
  'start_time': 0.0,
  'end_time': 10.0,
  'audio_tensor': tensor([[ 0.0000,  0.0000,  0.0000,  ..., -0.1150, -0.1732, -0.2013],
          [ 0.0000,  0.0000,  0.0000,  ..., -0.1204, -0.0944, -0.0642]]),
  'label_id': '/m/04rlf',
  'type': 'AMB',
  'object': "{'name': 'Music', 'type': 'AMB'}"},
 {'video_id': '--ekDLDTUXA_30000',
  'start_time': 5.486,
  'end_time': 6.452,
  'audio_tensor': tensor([[-0.0639, -0.0614, -0.0637,  ...,  0.0459,  0.0495,  0.0330],
          [ 0.0285,  0.0022, -0.0202,  ...,  0.0739,  0.0414, -0.0147]]),
  'label_id': '/m/07p6fty',
  'type': 'SFX',
  'object': 'Individual'},
 {'video_id': '--ekDLDTUXA_30000',
  'start_time': 7.249,
  'end_time': 10.0,
  'audio_tensor': tensor([[ 0.1071,  0.0460,  0.0252,  ..., -0.1150, -0.1732, -0.2013],
          [ 0.0405, -0.0101, -0.0284,  ..., -0.1204, -0.0944, -0.0642]]),
  'label_id': '/t/dd00003',
  'type': 'SFX',
  'object': 'person'}]

In [85]:
# Cross-modal lookup: search the stored audio embeddings with the video embedding.
# NOTE(review): video_embeddings has shape [1, 1024] (see the output above), so
# .tolist() yields a nested list; Pinecone documents `vector` as a flat list of
# floats. The recorded output shows this call worked — confirm, and consider
# passing video_embeddings[0] instead.
index.query(
    vector=video_embeddings.cpu().tolist(),  # Pinecone needs a plain list for serialization, not a tensor
    top_k=3,
    include_values=False,  # set to True to also return the stored embedding of each match
    include_metadata=True
)

{'matches': [{'id': '--ekDLDTUXA_30000_1',
              'metadata': {'end_time': 6.452,
                           'label_id': '/m/07p6fty',
                           'mode': 'audio',
                           'object': 'Individual',
                           'start_time': 5.486,
                           'type': 'SFX'},
              'score': 0.24307856,
              'values': []},
             {'id': '--ekDLDTUXA_30000_0',
              'metadata': {'end_time': 10.0,
                           'label_id': '/m/04rlf',
                           'mode': 'audio',
                           'object': "{'name': 'Music', 'type': 'AMB'}",
                           'start_time': 0.0,
                           'type': 'AMB'},
              'score': -1.10898578,
              'values': []},
             {'id': '--ekDLDTUXA_30000_2',
              'metadata': {'end_time': 10.0,
                           'label_id': '/t/dd00003',
                           'mode': 'audio',
             

In [89]:
def query_with_metadata_filter(query_vector, metadata_filter, top_k=5):
    """Query the module-level ``index``, restricted by a metadata filter.

    Args:
        query_vector: Query embedding, forwarded as ``vector``. Pass a plain
            list rather than a tensor; Pinecone expects a list for serialization.
        metadata_filter: Metadata filter forwarded as ``filter``,
            e.g. ``{"type": "SFX"}``.
        top_k: Maximum number of matches to request.

    Returns:
        The response of ``index.query``, including each match's metadata but
        not its stored vector values.
    """
    return index.query(
        vector=query_vector,
        top_k=top_k,
        include_values=False,  # stored embeddings are not needed in the result
        include_metadata=True,
        filter=metadata_filter,
    )

# Keep only the entries whose "type" metadata equals "SFX".
metadata_filter = {"type": "SFX"}

filtered_matches = query_with_metadata_filter(
    video_embeddings.cpu().tolist(),
    metadata_filter,
)
print(filtered_matches)

{'matches': [{'id': '--ekDLDTUXA_30000_1',
              'metadata': {'end_time': 6.452,
                           'label_id': '/m/07p6fty',
                           'mode': 'audio',
                           'object': 'Individual',
                           'start_time': 5.486,
                           'type': 'SFX'},
              'score': 0.24307856,
              'values': []},
             {'id': '--ekDLDTUXA_30000_2',
              'metadata': {'end_time': 10.0,
                           'label_id': '/t/dd00003',
                           'mode': 'audio',
                           'object': 'person',
                           'start_time': 7.249,
                           'type': 'SFX'},
              'score': -2.12069511,
              'values': []}],
 'namespace': '',
 'usage': {'read_units': 6}}
