### Set the values in the cell below, then run the remaining cells in order

In [1]:
# CSV file mapping class ids to class names (its header row is skipped when read)
labels_csv_path = "datasets/kinetics_400_labels.csv"
# Video whose audio track is extracted and classified
video_path = "datasets/UCF-101/CuttingInKitchen/v_CuttingInKitchen_g01_c01.avi"

### Import libraries 

In [2]:
from msclap import CLAP
import torch.nn.functional as F
import numpy as np
import torch
import os
from moviepy.editor import VideoFileClip

### Extract the audio track from the video

In [3]:
# Extract the video's audio track into a .wav file stored under `audio_root`,
# mirroring the video's location relative to `dataset_root`.
# After this cell, `audio_path` is either the path of the written .wav file
# or None when the video has no audio track.
dataset_root = 'datasets'
audio_root = 'datasets/audios'

video = VideoFileClip(video_path)
try:
    if video.audio is None:
        # Define the name explicitly so a stale value from an earlier run
        # can never be picked up by the cells below.
        audio_path = None
        print(f"This video {video_path} has no audio!")
    else:
        # Drop the extension whatever its length (.avi, .mpeg, ...) instead of
        # slicing off a fixed number of characters.
        video_stem = os.path.splitext(video_path)[0]
        if video_stem.startswith(dataset_root + '/'):
            # Keep the sub-directory layout of the dataset under audio_root.
            relative_stem = video_stem[len(dataset_root) + 1:]
        else:
            # Video lives outside the dataset root: keep only its file name.
            relative_stem = os.path.basename(video_stem)
        audio_path = f"{audio_root}/{relative_stem}.wav"

        audio_dir = os.path.dirname(audio_path)
        print(audio_dir)
        os.makedirs(audio_dir, exist_ok=True)

        # The return value is not kept: the audio is written to disk only.
        video.audio.write_audiofile(audio_path, verbose=False)
        print(f"Audio saved to {audio_path}")
finally:
    # Release the reader resources held by the clip. NOTE(review): `video` is
    # closed after this cell — confirm no later cell still reads from it.
    video.close()

datasets/audios/UCF-101/CuttingInKitchen
MoviePy - Writing audio in datasets/audios/UCF-101/CuttingInKitchen/v_CuttingInKitchen_g01_c01.wav


                                                        

MoviePy - Done.
Audio saved to datasets/audios/UCF-101/CuttingInKitchen/v_CuttingInKitchen_g01_c01.wav




### Read class Labels

In [4]:
import csv

# Build the two lookup tables between class ids and class names.
# Both keys and values stay strings, exactly as they are read from the CSV.
label2id = {}
id2label = {}
# newline='' lets the csv module do its own newline handling, as its
# documentation recommends for files passed to csv.reader.
with open(labels_csv_path, mode='r', newline='') as file:
    csv_reader = csv.reader(file)
    # Skip the header row; the default keeps an empty file from raising StopIteration.
    next(csv_reader, None)

    for row in csv_reader:
        if not row:
            # Blank line (e.g. trailing newline at the end of the file).
            continue
        class_id, class_name = row
        label2id[class_name] = class_id
        id2label[class_id] = class_name

# Class names in file order (dicts preserve insertion order).
class_labels = list(label2id.keys())

print(f"{len(class_labels)} Unique classes: {class_labels}.")

400 Unique classes: ['abseiling', 'air drumming', 'answering questions', 'applauding', 'applying cream', 'archery', 'arm wrestling', 'arranging flowers', 'assembling computer', 'auctioning', 'baby waking up', 'baking cookies', 'balloon blowing', 'bandaging', 'barbequing', 'bartending', 'beatboxing', 'bee keeping', 'belly dancing', 'bench pressing', 'bending back', 'bending metal', 'biking through snow', 'blasting sand', 'blowing glass', 'blowing leaves', 'blowing nose', 'blowing out candles', 'bobsledding', 'bookbinding', 'bouncing on trampoline', 'bowling', 'braiding hair', 'breading or breadcrumbing', 'breakdancing', 'brush painting', 'brushing hair', 'brushing teeth', 'building cabinet', 'building shed', 'bungee jumping', 'busking', 'canoeing or kayaking', 'capoeira', 'carrying baby', 'cartwheeling', 'carving pumpkin', 'catching fish', 'catching or throwing baseball', 'catching or throwing frisbee', 'catching or throwing softball', 'celebrating', 'changing oil', 'changing wheel', 'c

### CLAP inference with given audio

In [5]:
# Zero-shot audio classification: embed one text prompt per class label and
# the extracted audio with CLAP, then rank the classes by similarity.

# Number of best-scoring classes to report.
top_k = 5

with torch.no_grad():
    # Load model (Choose between versions '2022' or '2023')
    # The model weight will be downloaded automatically if `model_fp` is not specified
    clap_model = CLAP(version = '2023', use_cuda=False)

    # Extract text embeddings, one prompt per class label
    prompts = [f"This is a sound of {c}" for c in class_labels]
    text_embeddings = clap_model.get_text_embeddings(prompts)

    # Extract audio embeddings
    audio_embeddings = clap_model.get_audio_embeddings([audio_path])

    # Compute similarity between audio and text embeddings
    similarities = clap_model.compute_similarity(audio_embeddings, text_embeddings)

# Normalise the scores over dim=1 and take row 0, the single audio file passed in.
# NOTE(review): presumably `similarities` is (n_audio, n_classes) — confirm against msclap.
similarity = F.softmax(similarities, dim=1)
# topk raises when k exceeds the number of entries, so clamp k for small label sets.
k = min(top_k, similarity.shape[1])
values, indices = similarity[0].topk(k)
pred_index = indices[0]

# Convert to plain Python values once, for list indexing and string formatting.
top_labels = [class_labels[i] for i in indices.tolist()]
top_scores = values.tolist()
# Right-align on the longest reported label, never narrower than 16 columns,
# so labels longer than 16 characters no longer break the alignment.
label_width = max([16] + [len(label) for label in top_labels])

# Print the results
print("Ground Truth: {}".format(video_path.split('/')[-2]))
print("Top predictions:\n")
for label, score in zip(top_labels, top_scores):
    print(f"{label:>{label_width}s}: {100 * score:.2f}%")

Ground Truth: CuttingInKitchen
Top predictions:

cutting pineapple: 89.85%
cutting watermelon: 6.34%
    making sushi: 1.64%
breading or breadcrumbing: 0.67%
arranging flowers: 0.33%
