### Set the corresponding values in the cell below, then run the remaining cells in order from top to bottom.

In [7]:
import pathlib
# Root folder of the dataset (videos are searched for below this folder)
dataset_root_path = pathlib.Path("datasets")
# CSV file holding the class labels (one id/name pair per row)
labels_csv_path = "datasets/kinetics_400_labels.csv"

### Import libraries 

In [8]:
from msclap import CLAP
import torch.nn.functional as F
import numpy as np
import torch
import os
from moviepy.editor import VideoFileClip

### Get Audio from video

In [15]:
# Extract the audio track of a single video and write it as a .wav file under
# <dataset_root_path>/audios, mirroring the video's sub-folder layout.
#
# NOTE(review): `video_path` is not defined by any earlier cell of this notebook.
# It must be set beforehand to the path (string) of a video located below
# `dataset_root_path`; otherwise `relative_to` raises ValueError.
video_file = pathlib.Path(video_path)
# Same location as the video, re-rooted below "audios" and with a .wav suffix
# (replaces the former hard-coded slice `video_path[8:-3]`).
audio_file = (dataset_root_path / 'audios' / video_file.relative_to(dataset_root_path)).with_suffix('.wav')
audio_path = audio_file.as_posix()

video = VideoFileClip(video_path)
try:
    if video.audio is None:
        print(f"This video {video_path} has no audio!")
    else:
        audio_dir = audio_file.parent.as_posix()
        print(audio_dir)
        # exist_ok avoids the check-then-create race of a separate exists() test
        os.makedirs(audio_dir, exist_ok=True)
        # write_audiofile's return value was previously stored but never used
        video.audio.write_audiofile(audio_path, verbose=False)
        print(f"Audio saved to {audio_path}")
finally:
    # Always release the clip's reader resources, even if writing fails
    video.close()

datasets/audios/Kinetics400/test/crying
MoviePy - Writing audio in datasets/audios/Kinetics400/test/crying/_AUtYTaYMrE_000000_000010.wav


                                                        

MoviePy - Done.
Audio saved to datasets/audios/Kinetics400/test/crying/_AUtYTaYMrE_000000_000010.wav




### Read class Labels

In [9]:
import csv

# Build the lookup tables between class names and class ids from the labels CSV.
# Ids are stored exactly as they appear in the file (strings); no int conversion.
label2id = {}
id2label = {}
# newline='' lets the csv module handle line endings itself, as its docs require
with open(labels_csv_path, mode='r', newline='') as file:
    csv_reader = csv.reader(file)
    # Skip the first row (treated as the header); tolerate a completely empty file
    next(csv_reader, None)

    for row in csv_reader:
        if not row:
            # csv.reader yields [] for blank lines (e.g. a trailing newline)
            continue
        # Each remaining row must have exactly two columns: id, name
        class_id, class_name = row
        label2id[class_name] = class_id
        id2label[class_id] = class_name

# Class names in file order (dicts preserve insertion order)
class_labels = list(label2id.keys())

print(f"{len(class_labels)} Unique classes: {class_labels}.")

400 Unique classes: ['abseiling', 'air drumming', 'answering questions', 'applauding', 'applying cream', 'archery', 'arm wrestling', 'arranging flowers', 'assembling computer', 'auctioning', 'baby waking up', 'baking cookies', 'balloon blowing', 'bandaging', 'barbequing', 'bartending', 'beatboxing', 'bee keeping', 'belly dancing', 'bench pressing', 'bending back', 'bending metal', 'biking through snow', 'blasting sand', 'blowing glass', 'blowing leaves', 'blowing nose', 'blowing out candles', 'bobsledding', 'bookbinding', 'bouncing on trampoline', 'bowling', 'braiding hair', 'breading or breadcrumbing', 'breakdancing', 'brush painting', 'brushing hair', 'brushing teeth', 'building cabinet', 'building shed', 'bungee jumping', 'busking', 'canoeing or kayaking', 'capoeira', 'carrying baby', 'cartwheeling', 'carving pumpkin', 'catching fish', 'catching or throwing baseball', 'catching or throwing frisbee', 'catching or throwing softball', 'celebrating', 'changing oil', 'changing wheel', 'c

### CLAP inference with given audio

In [18]:
from tqdm import tqdm

# Zero-shot audio classification of the training videos with CLAP:
#   1. extract each video's audio track to <dataset_root_path>/audios/...wav
#   2. embed the audio and compare it against one text prompt per class label
#   3. count how often the top-1 predicted label equals the video's folder name
train_paths = list(dataset_root_path.glob("train/*/*.mp4"))

# Load model (Choose between versions '2022' or '2023')
# The model weight will be downloaded automatically if `model_fp` is not specified
clap_model = CLAP(version = '2023', use_cuda=True)

# The text prompts do not depend on the video, so embed them once up front
# instead of re-encoding every class label on each loop iteration.
with torch.no_grad():
    text_embeddings = clap_model.get_text_embeddings([f"This is a sound of {c}" for c in class_labels])

# Never ask topk for more entries than there are classes
top_k = min(5, len(class_labels))

correct = 0
total = 0
skipped = 0  # videos without an audio track; excluded from the accuracy
for video_file in tqdm(train_paths):
    video_path = video_file.as_posix()
    # Same location as the video, re-rooted below "audios" and with a .wav suffix
    # (replaces the former hard-coded slice `video_path[8:-3]`).
    audio_file = (dataset_root_path / 'audios' / video_file.relative_to(dataset_root_path)).with_suffix('.wav')
    audio_path = audio_file.as_posix()

    video = VideoFileClip(video_path)
    try:
        if video.audio is None:
            # Previously execution fell through and classified the audio file of
            # the *previous* video (or raised NameError on the first iteration).
            print(f"This video {video_path} has no audio!")
            skipped += 1
            continue
        audio_dir = audio_file.parent
        print(audio_dir.as_posix())
        audio_dir.mkdir(parents=True, exist_ok=True)
        video.audio.write_audiofile(audio_path, verbose=False)
        print(f"Audio saved to {audio_path}")
    finally:
        # Release the clip's reader resources on every path, including `continue`
        video.close()

    # The ground-truth label is the name of the folder containing the video
    label = video_file.parent.name

    with torch.no_grad():
        # Extract audio embeddings
        audio_embeddings = clap_model.get_audio_embeddings([audio_path])

        # Compute similarity between audio and text embeddings
        similarities = clap_model.compute_similarity(audio_embeddings, text_embeddings)

        similarity = F.softmax(similarities, dim=1)

    values, indices = similarity[0].topk(top_k)
    # Convert to plain Python numbers before indexing the label list
    top_values = values.tolist()
    top_indices = indices.tolist()
    pred_index = top_indices[0]

    # Print the results
    print("Ground Truth: {}".format(label))
    print("Top predictions:\n")
    for value, index in zip(top_values, top_indices):
        print(f"{class_labels[index]:>16s}: {100 * value:.2f}%")

    if label == class_labels[pred_index]:
        correct += 1
    total += 1

print(correct, '/', total)
if skipped:
    print(f"Skipped {skipped} video(s) without an audio track.")

  0%|          | 0/50 [00:00<?, ?it/s]

datasets/audios/train/crying
MoviePy - Writing audio in datasets/audios/train/crying/9q8mf6GUiiE_000009_000019.wav


  0%|          | 0/50 [00:00<?, ?it/s]

MoviePy - Done.
Audio saved to datasets/audios/train/crying/9q8mf6GUiiE_000009_000019.wav


  2%|▏         | 1/50 [00:04<03:43,  4.56s/it]

Ground Truth: crying
Top predictions:

    making sushi: 13.32%
blowing out candles: 10.22%
     fixing hair: 9.50%
         hugging: 8.49%
    washing feet: 7.91%
datasets/audios/train/crying
MoviePy - Writing audio in datasets/audios/train/crying/9s4bfiyc9Yc_000195_000205.wav


  2%|▏         | 1/50 [00:04<03:43,  4.56s/it]

MoviePy - Done.
Audio saved to datasets/audios/train/crying/9s4bfiyc9Yc_000195_000205.wav


  4%|▍         | 2/50 [00:09<03:37,  4.52s/it]

Ground Truth: crying
Top predictions:

          crying: 50.74%
   carrying baby: 11.85%
   crawling baby: 7.73%
   cutting nails: 6.13%
getting a haircut: 5.81%
datasets/audios/train/crying
MoviePy - Writing audio in datasets/audios/train/crying/a4xHx59uFgk_000003_000013.wav


  4%|▍         | 2/50 [00:09<03:37,  4.52s/it]

MoviePy - Done.
Audio saved to datasets/audios/train/crying/a4xHx59uFgk_000003_000013.wav


  6%|▌         | 3/50 [00:13<03:30,  4.48s/it]

Ground Truth: crying
Top predictions:

   crawling baby: 37.16%
getting a haircut: 9.51%
         hugging: 6.85%
   carrying baby: 6.54%
using remote controller (not gaming): 3.93%
datasets/audios/train/crying
MoviePy - Writing audio in datasets/audios/train/crying/a50yimv7Lqg_000026_000036.wav


  6%|▌         | 3/50 [00:13<03:30,  4.48s/it]

MoviePy - Done.
Audio saved to datasets/audios/train/crying/a50yimv7Lqg_000026_000036.wav


  8%|▊         | 4/50 [00:18<03:25,  4.46s/it]

Ground Truth: crying
Top predictions:

sticking tongue out: 31.35%
   crawling baby: 21.29%
    shaking head: 11.67%
  baby waking up: 7.49%
petting animal (not cat): 6.38%
datasets/audios/train/crying
MoviePy - Writing audio in datasets/audios/train/crying/Afhkn2h-wLE_000000_000010.wav


  8%|▊         | 4/50 [00:18<03:25,  4.46s/it]

MoviePy - Done.
Audio saved to datasets/audios/train/crying/Afhkn2h-wLE_000000_000010.wav


 10%|█         | 5/50 [00:22<03:21,  4.48s/it]

Ground Truth: crying
Top predictions:

   carrying baby: 38.99%
getting a haircut: 20.26%
          crying: 18.77%
   crawling baby: 4.46%
  baby waking up: 4.09%
datasets/audios/train/crying
MoviePy - Writing audio in datasets/audios/train/crying/aMAJ-DRt6bU_000116_000126.wav


 10%|█         | 5/50 [00:22<03:21,  4.48s/it]

MoviePy - Done.
Audio saved to datasets/audios/train/crying/aMAJ-DRt6bU_000116_000126.wav


 12%|█▏        | 6/50 [00:26<03:13,  4.40s/it]

Ground Truth: crying
Top predictions:

  baby waking up: 37.50%
sticking tongue out: 19.98%
   crawling baby: 16.55%
    shaking head: 11.57%
        sneezing: 3.69%
datasets/audios/train/crying
MoviePy - Writing audio in datasets/audios/train/crying/a_yBdlVHrK8_000000_000010.wav


 12%|█▏        | 6/50 [00:26<03:13,  4.40s/it]

MoviePy - Done.
Audio saved to datasets/audios/train/crying/a_yBdlVHrK8_000000_000010.wav


 14%|█▍        | 7/50 [00:31<03:07,  4.36s/it]

Ground Truth: crying
Top predictions:

          crying: 81.99%
   carrying baby: 5.94%
   crawling baby: 5.17%
  baby waking up: 1.99%
getting a haircut: 1.72%
datasets/audios/train/crying
MoviePy - Writing audio in datasets/audios/train/crying/fKrcKrs-o_w_000000_000010.wav


 14%|█▍        | 7/50 [00:31<03:07,  4.36s/it]

MoviePy - Done.
Audio saved to datasets/audios/train/crying/fKrcKrs-o_w_000000_000010.wav


 16%|█▌        | 8/50 [00:35<03:02,  4.33s/it]

Ground Truth: crying
Top predictions:

          crying: 24.87%
   crawling baby: 23.40%
  baby waking up: 22.60%
   carrying baby: 11.79%
sticking tongue out: 4.87%
datasets/audios/train/crying
MoviePy - Writing audio in datasets/audios/train/crying/fLzGu07zA-I_000008_000018.wav


 16%|█▌        | 8/50 [00:35<03:02,  4.33s/it]

MoviePy - Done.
Audio saved to datasets/audios/train/crying/fLzGu07zA-I_000008_000018.wav


 18%|█▊        | 9/50 [00:39<02:56,  4.31s/it]

Ground Truth: crying
Top predictions:

          crying: 88.08%
getting a haircut: 6.80%
   carrying baby: 1.35%
   crawling baby: 0.97%
   cutting nails: 0.51%
datasets/audios/train/crying
MoviePy - Writing audio in datasets/audios/train/crying/FM06rkKxFms_000010_000020.wav


 18%|█▊        | 9/50 [00:39<02:56,  4.31s/it]

MoviePy - Done.
Audio saved to datasets/audios/train/crying/FM06rkKxFms_000010_000020.wav


 20%|██        | 10/50 [00:43<02:51,  4.30s/it]

Ground Truth: crying
Top predictions:

          crying: 58.74%
getting a haircut: 12.85%
   carrying baby: 11.21%
   cutting nails: 4.55%
     fixing hair: 2.92%
datasets/audios/train/crying
MoviePy - Writing audio in datasets/audios/train/crying/G6QENvmNkRE_000000_000010.wav


 20%|██        | 10/50 [00:43<02:51,  4.30s/it]

MoviePy - Done.
Audio saved to datasets/audios/train/crying/G6QENvmNkRE_000000_000010.wav


 22%|██▏       | 11/50 [00:48<02:47,  4.29s/it]

Ground Truth: crying
Top predictions:

   crawling baby: 34.72%
  baby waking up: 14.39%
    shaking head: 13.98%
sticking tongue out: 10.59%
   carrying baby: 5.67%
datasets/audios/train/crying
MoviePy - Writing audio in datasets/audios/train/crying/Ga9lFu4gurw_000015_000025.wav


 22%|██▏       | 11/50 [00:48<02:47,  4.29s/it]

MoviePy - Done.
Audio saved to datasets/audios/train/crying/Ga9lFu4gurw_000015_000025.wav


 24%|██▍       | 12/50 [00:52<02:42,  4.27s/it]

Ground Truth: crying
Top predictions:

          crying: 75.58%
   carrying baby: 8.30%
   crawling baby: 7.86%
  baby waking up: 2.48%
getting a haircut: 2.31%
datasets/audios/train/crying
MoviePy - Writing audio in datasets/audios/train/crying/GnIgOUPiMqE_000023_000033.wav


 24%|██▍       | 12/50 [00:52<02:42,  4.27s/it]

MoviePy - Done.
Audio saved to datasets/audios/train/crying/GnIgOUPiMqE_000023_000033.wav


 26%|██▌       | 13/50 [00:56<02:37,  4.27s/it]

Ground Truth: crying
Top predictions:

          crying: 57.69%
getting a haircut: 28.46%
   carrying baby: 2.53%
      dying hair: 2.42%
     fixing hair: 2.34%
datasets/audios/train/crying
MoviePy - Writing audio in datasets/audios/train/crying/gpaANcmFUOI_000003_000013.wav


 26%|██▌       | 13/50 [00:56<02:37,  4.27s/it]

MoviePy - Done.
Audio saved to datasets/audios/train/crying/gpaANcmFUOI_000003_000013.wav


 28%|██▊       | 14/50 [01:00<02:33,  4.26s/it]

Ground Truth: crying
Top predictions:

          crying: 77.82%
  baby waking up: 5.05%
   carrying baby: 3.70%
   crawling baby: 3.24%
using remote controller (not gaming): 2.39%
datasets/audios/train/crying
MoviePy - Writing audio in datasets/audios/train/crying/G_-Hx6u40nE_000032_000042.wav


 28%|██▊       | 14/50 [01:00<02:33,  4.26s/it]

MoviePy - Done.
Audio saved to datasets/audios/train/crying/G_-Hx6u40nE_000032_000042.wav


 30%|███       | 15/50 [01:05<02:29,  4.26s/it]

Ground Truth: crying
Top predictions:

          crying: 48.00%
getting a haircut: 30.27%
   carrying baby: 8.02%
   crawling baby: 4.98%
   cutting nails: 1.34%
datasets/audios/train/crying
MoviePy - Writing audio in datasets/audios/train/crying/H85OllZBK70_000013_000023.wav


 30%|███       | 15/50 [01:05<02:29,  4.26s/it]

MoviePy - Done.
Audio saved to datasets/audios/train/crying/H85OllZBK70_000013_000023.wav


 32%|███▏      | 16/50 [01:09<02:24,  4.26s/it]

Ground Truth: crying
Top predictions:

          crying: 53.11%
getting a haircut: 16.14%
   crawling baby: 12.29%
   carrying baby: 10.24%
   cutting nails: 1.50%
datasets/audios/train/crying
MoviePy - Writing audio in datasets/audios/train/crying/Ha3vwLcjMAs_000157_000167.wav


 32%|███▏      | 16/50 [01:09<02:24,  4.26s/it]

MoviePy - Done.
Audio saved to datasets/audios/train/crying/Ha3vwLcjMAs_000157_000167.wav


 34%|███▍      | 17/50 [01:13<02:20,  4.26s/it]

Ground Truth: crying
Top predictions:

getting a haircut: 20.17%
   crawling baby: 11.22%
     fixing hair: 7.90%
          crying: 7.82%
    washing hair: 7.04%
datasets/audios/train/crying
MoviePy - Writing audio in datasets/audios/train/crying/l5UWczmUVeY_000019_000029.wav


 34%|███▍      | 17/50 [01:13<02:20,  4.26s/it]

MoviePy - Done.
Audio saved to datasets/audios/train/crying/l5UWczmUVeY_000019_000029.wav


 36%|███▌      | 18/50 [01:17<02:16,  4.25s/it]

Ground Truth: crying
Top predictions:

          crying: 57.25%
   crawling baby: 11.41%
   carrying baby: 5.85%
getting a haircut: 4.15%
  baby waking up: 2.97%
datasets/audios/train/crying
MoviePy - Writing audio in datasets/audios/train/crying/oH_BDY4bWp4_000000_000010.wav


 36%|███▌      | 18/50 [01:18<02:16,  4.25s/it]

MoviePy - Done.
Audio saved to datasets/audios/train/crying/oH_BDY4bWp4_000000_000010.wav


 38%|███▊      | 19/50 [01:22<02:12,  4.26s/it]

Ground Truth: crying
Top predictions:

          crying: 61.99%
getting a haircut: 21.87%
   carrying baby: 4.40%
   cutting nails: 2.81%
     fixing hair: 2.25%
datasets/audios/train/crying
MoviePy - Writing audio in datasets/audios/train/crying/Oi7DvmU-tF0_000000_000010.wav


 38%|███▊      | 19/50 [01:22<02:12,  4.26s/it]

MoviePy - Done.
Audio saved to datasets/audios/train/crying/Oi7DvmU-tF0_000000_000010.wav


 40%|████      | 20/50 [01:26<02:08,  4.27s/it]

Ground Truth: crying
Top predictions:

          crying: 79.84%
getting a haircut: 10.75%
   carrying baby: 2.71%
     fixing hair: 1.88%
      dying hair: 1.54%
datasets/audios/train/crying
MoviePy - Writing audio in datasets/audios/train/crying/oNiWBhu1-JQ_000000_000010.wav


 40%|████      | 20/50 [01:26<02:08,  4.27s/it]

MoviePy - Done.
Audio saved to datasets/audios/train/crying/oNiWBhu1-JQ_000000_000010.wav


 42%|████▏     | 21/50 [01:30<02:03,  4.26s/it]

Ground Truth: crying
Top predictions:

          crying: 46.79%
   crawling baby: 20.75%
   carrying baby: 17.47%
  baby waking up: 7.22%
   cutting nails: 2.64%
datasets/audios/train/crying
MoviePy - Writing audio in datasets/audios/train/crying/Pa4kyZAUTXQ_000017_000027.wav


 42%|████▏     | 21/50 [01:30<02:03,  4.26s/it]

MoviePy - Done.
Audio saved to datasets/audios/train/crying/Pa4kyZAUTXQ_000017_000027.wav


 44%|████▍     | 22/50 [01:34<01:59,  4.25s/it]

Ground Truth: crying
Top predictions:

          crying: 45.02%
  baby waking up: 22.26%
   crawling baby: 10.17%
petting animal (not cat): 6.12%
   carrying baby: 4.80%
datasets/audios/train/crying
MoviePy - Writing audio in datasets/audios/train/crying/pg4HbzuxCIE_000022_000032.wav


 44%|████▍     | 22/50 [01:35<01:59,  4.25s/it]

MoviePy - Done.
Audio saved to datasets/audios/train/crying/pg4HbzuxCIE_000022_000032.wav


 46%|████▌     | 23/50 [01:39<01:54,  4.25s/it]

Ground Truth: crying
Top predictions:

          crying: 60.09%
   crawling baby: 15.07%
   carrying baby: 3.93%
     fixing hair: 2.63%
  baby waking up: 2.52%
datasets/audios/train/crying
MoviePy - Writing audio in datasets/audios/train/crying/pHgqoFKTBDc_000004_000014.wav


 46%|████▌     | 23/50 [01:39<01:54,  4.25s/it]

MoviePy - Done.
Audio saved to datasets/audios/train/crying/pHgqoFKTBDc_000004_000014.wav


 48%|████▊     | 24/50 [01:43<01:50,  4.25s/it]

Ground Truth: crying
Top predictions:

          crying: 57.86%
getting a haircut: 10.88%
  baby waking up: 10.48%
   carrying baby: 9.33%
   crawling baby: 4.17%
datasets/audios/train/crying
MoviePy - Writing audio in datasets/audios/train/crying/pZxdg1Stvb8_000000_000010.wav


 48%|████▊     | 24/50 [01:43<01:50,  4.25s/it]

MoviePy - Done.
Audio saved to datasets/audios/train/crying/pZxdg1Stvb8_000000_000010.wav


 50%|█████     | 25/50 [01:47<01:46,  4.25s/it]

Ground Truth: crying
Top predictions:

          crying: 56.20%
   cutting nails: 9.22%
getting a haircut: 8.92%
   carrying baby: 5.93%
     fixing hair: 2.56%
datasets/audios/train/crying
MoviePy - Writing audio in datasets/audios/train/crying/QApH290EHcU_000035_000045.wav


 50%|█████     | 25/50 [01:47<01:46,  4.25s/it]

MoviePy - Done.
Audio saved to datasets/audios/train/crying/QApH290EHcU_000035_000045.wav


 52%|█████▏    | 26/50 [01:51<01:41,  4.25s/it]

Ground Truth: crying
Top predictions:

petting animal (not cat): 17.25%
  baby waking up: 16.48%
          crying: 14.29%
   carrying baby: 11.32%
   crawling baby: 9.67%
datasets/audios/train/crying
MoviePy - Writing audio in datasets/audios/train/crying/qFOg87dNSh4_000001_000011.wav


 52%|█████▏    | 26/50 [01:52<01:41,  4.25s/it]

MoviePy - Done.
Audio saved to datasets/audios/train/crying/qFOg87dNSh4_000001_000011.wav


 54%|█████▍    | 27/50 [01:56<01:37,  4.25s/it]

Ground Truth: crying
Top predictions:

          crying: 89.00%
   crawling baby: 2.09%
   carrying baby: 1.69%
   cutting nails: 1.35%
  baby waking up: 1.33%
datasets/audios/train/crying
MoviePy - Writing audio in datasets/audios/train/crying/quf-iuFOT4s_000000_000010.wav


 54%|█████▍    | 27/50 [01:56<01:37,  4.25s/it]

MoviePy - Done.
Audio saved to datasets/audios/train/crying/quf-iuFOT4s_000000_000010.wav


 56%|█████▌    | 28/50 [02:00<01:33,  4.26s/it]

Ground Truth: crying
Top predictions:

   crawling baby: 58.30%
sticking tongue out: 6.55%
    shaking head: 6.18%
    pumping fist: 4.82%
using remote controller (not gaming): 2.67%
datasets/audios/train/crying
MoviePy - Writing audio in datasets/audios/train/crying/rCwNyAjkGPQ_000031_000041.wav


 56%|█████▌    | 28/50 [02:00<01:33,  4.26s/it]

MoviePy - Done.
Audio saved to datasets/audios/train/crying/rCwNyAjkGPQ_000031_000041.wav


 58%|█████▊    | 29/50 [02:03<01:20,  3.83s/it]

Ground Truth: crying
Top predictions:

         hugging: 24.79%
riding mechanical bull: 14.88%
getting a haircut: 10.85%
 waiting in line: 9.02%
    making sushi: 7.12%
datasets/audios/train/crying
MoviePy - Writing audio in datasets/audios/train/crying/sjZsedv-jG8_000008_000018.wav


 58%|█████▊    | 29/50 [02:03<01:20,  3.83s/it]

MoviePy - Done.
Audio saved to datasets/audios/train/crying/sjZsedv-jG8_000008_000018.wav


 60%|██████    | 30/50 [02:05<01:09,  3.46s/it]

Ground Truth: crying
Top predictions:

          crying: 28.84%
        tickling: 11.73%
     fixing hair: 9.88%
   carrying baby: 7.15%
      dying hair: 6.39%
datasets/audios/train/crying
MoviePy - Writing audio in datasets/audios/train/crying/tIm0yA_u5Qc_000000_000010.wav


 60%|██████    | 30/50 [02:05<01:09,  3.46s/it]

MoviePy - Done.
Audio saved to datasets/audios/train/crying/tIm0yA_u5Qc_000000_000010.wav


 62%|██████▏   | 31/50 [02:08<01:00,  3.20s/it]

Ground Truth: crying
Top predictions:

        tickling: 44.00%
        laughing: 12.58%
   carrying baby: 9.45%
   cutting nails: 6.58%
petting animal (not cat): 4.67%
datasets/audios/train/crying
MoviePy - Writing audio in datasets/audios/train/crying/tlY2Cy-VH_g_000005_000015.wav


 62%|██████▏   | 31/50 [02:08<01:00,  3.20s/it]

MoviePy - Done.
Audio saved to datasets/audios/train/crying/tlY2Cy-VH_g_000005_000015.wav


 64%|██████▍   | 32/50 [02:11<00:54,  3.01s/it]

Ground Truth: crying
Top predictions:

          crying: 45.94%
getting a haircut: 41.81%
    washing hair: 1.67%
   carrying baby: 1.48%
     fixing hair: 1.30%
datasets/audios/train/crying
MoviePy - Writing audio in datasets/audios/train/crying/TNl9NjfWZ1o_000084_000094.wav


 64%|██████▍   | 32/50 [02:11<00:54,  3.01s/it]

MoviePy - Done.
Audio saved to datasets/audios/train/crying/TNl9NjfWZ1o_000084_000094.wav


 66%|██████▌   | 33/50 [02:13<00:49,  2.89s/it]

Ground Truth: crying
Top predictions:

          crying: 37.19%
getting a haircut: 21.86%
   carrying baby: 10.32%
   cutting nails: 9.04%
     fixing hair: 4.80%
datasets/audios/train/crying
MoviePy - Writing audio in datasets/audios/train/crying/u-oc-Eln_yc_000003_000013.wav


 66%|██████▌   | 33/50 [02:13<00:49,  2.89s/it]

MoviePy - Done.
Audio saved to datasets/audios/train/crying/u-oc-Eln_yc_000003_000013.wav


 68%|██████▊   | 34/50 [02:16<00:44,  2.80s/it]

Ground Truth: crying
Top predictions:

getting a haircut: 46.50%
          crying: 38.75%
   carrying baby: 3.98%
   cutting nails: 1.48%
   crawling baby: 1.35%
datasets/audios/train/crying
MoviePy - Writing audio in datasets/audios/train/crying/Ux2iImyAiL4_000001_000011.wav


 68%|██████▊   | 34/50 [02:16<00:44,  2.80s/it]

MoviePy - Done.
Audio saved to datasets/audios/train/crying/Ux2iImyAiL4_000001_000011.wav


 70%|███████   | 35/50 [02:18<00:40,  2.73s/it]

Ground Truth: crying
Top predictions:

   crawling baby: 33.83%
  baby waking up: 26.00%
   carrying baby: 7.82%
          crying: 6.47%
sticking tongue out: 6.30%
datasets/audios/train/crying
MoviePy - Writing audio in datasets/audios/train/crying/VcHysqkJPSE_000002_000012.wav


 70%|███████   | 35/50 [02:18<00:40,  2.73s/it]

MoviePy - Done.
Audio saved to datasets/audios/train/crying/VcHysqkJPSE_000002_000012.wav


 72%|███████▏  | 36/50 [02:21<00:37,  2.69s/it]

Ground Truth: crying
Top predictions:

getting a haircut: 20.30%
         hugging: 8.55%
    making sushi: 8.53%
dancing macarena: 7.89%
   carrying baby: 7.56%
datasets/audios/train/crying
MoviePy - Writing audio in datasets/audios/train/crying/X3G7b1JNAfs_000000_000010.wav


 72%|███████▏  | 36/50 [02:21<00:37,  2.69s/it]

MoviePy - Done.
Audio saved to datasets/audios/train/crying/X3G7b1JNAfs_000000_000010.wav


 74%|███████▍  | 37/50 [02:24<00:34,  2.67s/it]

Ground Truth: crying
Top predictions:

sticking tongue out: 9.71%
        laughing: 9.24%
   cutting nails: 8.73%
          crying: 8.33%
   carrying baby: 7.56%
datasets/audios/train/crying
MoviePy - Writing audio in datasets/audios/train/crying/XBNgVxsrqo4_000010_000020.wav


 74%|███████▍  | 37/50 [02:24<00:34,  2.67s/it]

MoviePy - Done.
Audio saved to datasets/audios/train/crying/XBNgVxsrqo4_000010_000020.wav


 76%|███████▌  | 38/50 [02:26<00:31,  2.64s/it]

Ground Truth: crying
Top predictions:

getting a haircut: 69.07%
   carrying baby: 7.73%
          crying: 7.33%
         hugging: 2.53%
petting animal (not cat): 2.13%
datasets/audios/train/crying
MoviePy - Writing audio in datasets/audios/train/crying/xDLo1xt8P6U_000014_000024.wav


 76%|███████▌  | 38/50 [02:26<00:31,  2.64s/it]

MoviePy - Done.
Audio saved to datasets/audios/train/crying/xDLo1xt8P6U_000014_000024.wav


 78%|███████▊  | 39/50 [02:29<00:28,  2.63s/it]

Ground Truth: crying
Top predictions:

          crying: 45.63%
getting a haircut: 29.49%
   crawling baby: 6.37%
   carrying baby: 5.18%
   cutting nails: 2.88%
datasets/audios/train/crying


 78%|███████▊  | 39/50 [02:29<00:28,  2.63s/it]

MoviePy - Writing audio in datasets/audios/train/crying/yAgm_XnqBec_000010_000020.wav


 78%|███████▊  | 39/50 [02:29<00:28,  2.63s/it]

MoviePy - Done.
Audio saved to datasets/audios/train/crying/yAgm_XnqBec_000010_000020.wav


 80%|████████  | 40/50 [02:31<00:26,  2.62s/it]

Ground Truth: crying
Top predictions:

          crying: 63.42%
getting a haircut: 21.32%
   cutting nails: 2.93%
   carrying baby: 2.50%
     fixing hair: 1.78%
datasets/audios/train/crying
MoviePy - Writing audio in datasets/audios/train/crying/yDD0alN95O8_000015_000025.wav


 80%|████████  | 40/50 [02:31<00:26,  2.62s/it]

MoviePy - Done.
Audio saved to datasets/audios/train/crying/yDD0alN95O8_000015_000025.wav


 82%|████████▏ | 41/50 [02:34<00:23,  2.61s/it]

Ground Truth: crying
Top predictions:

getting a haircut: 29.25%
         hugging: 16.96%
   carrying baby: 12.23%
          crying: 10.67%
   holding snake: 10.36%
datasets/audios/train/crying
MoviePy - Writing audio in datasets/audios/train/crying/yDKz5An0qd0_000024_000034.wav


 82%|████████▏ | 41/50 [02:34<00:23,  2.61s/it]

MoviePy - Done.
Audio saved to datasets/audios/train/crying/yDKz5An0qd0_000024_000034.wav


 84%|████████▍ | 42/50 [02:37<00:20,  2.60s/it]

Ground Truth: crying
Top predictions:

sticking tongue out: 24.99%
   crawling baby: 22.42%
    shaking head: 9.26%
getting a haircut: 5.31%
          crying: 5.20%
datasets/audios/train/crying
MoviePy - Writing audio in datasets/audios/train/crying/zQWdo_Er87M_000008_000018.wav


 84%|████████▍ | 42/50 [02:37<00:20,  2.60s/it]

MoviePy - Done.
Audio saved to datasets/audios/train/crying/zQWdo_Er87M_000008_000018.wav


 86%|████████▌ | 43/50 [02:39<00:18,  2.60s/it]

Ground Truth: crying
Top predictions:

          crying: 61.70%
getting a haircut: 10.70%
   holding snake: 7.55%
  bungee jumping: 4.37%
    riding camel: 2.31%
datasets/audios/train/crying
MoviePy - Writing audio in datasets/audios/train/crying/ZrFW1iPMnig_000060_000070.wav


 86%|████████▌ | 43/50 [02:39<00:18,  2.60s/it]

MoviePy - Done.
Audio saved to datasets/audios/train/crying/ZrFW1iPMnig_000060_000070.wav


 88%|████████▊ | 44/50 [02:42<00:15,  2.60s/it]

Ground Truth: crying
Top predictions:

          crying: 75.10%
getting a haircut: 6.94%
   crawling baby: 3.94%
  baby waking up: 3.80%
   carrying baby: 3.08%
datasets/audios/train/crying
MoviePy - Writing audio in datasets/audios/train/crying/zwOBqeFTgiU_000012_000022.wav


 88%|████████▊ | 44/50 [02:42<00:15,  2.60s/it]

MoviePy - Done.
Audio saved to datasets/audios/train/crying/zwOBqeFTgiU_000012_000022.wav


 90%|█████████ | 45/50 [02:44<00:12,  2.60s/it]

Ground Truth: crying
Top predictions:

          crying: 48.54%
getting a haircut: 26.80%
   carrying baby: 8.24%
   cutting nails: 3.68%
   crawling baby: 3.34%
datasets/audios/train/crying
MoviePy - Writing audio in datasets/audios/train/crying/_ceBK5pQTrs_000033_000043.wav


 90%|█████████ | 45/50 [02:44<00:12,  2.60s/it]

MoviePy - Done.
Audio saved to datasets/audios/train/crying/_ceBK5pQTrs_000033_000043.wav


 92%|█████████▏| 46/50 [02:47<00:10,  2.59s/it]

Ground Truth: crying
Top predictions:

          crying: 66.06%
getting a haircut: 18.92%
   carrying baby: 4.55%
   crawling baby: 3.82%
  baby waking up: 1.10%
datasets/audios/train/crying
MoviePy - Writing audio in datasets/audios/train/crying/_fVz9qyplBc_000078_000088.wav


 92%|█████████▏| 46/50 [02:47<00:10,  2.59s/it]

MoviePy - Done.
Audio saved to datasets/audios/train/crying/_fVz9qyplBc_000078_000088.wav


 94%|█████████▍| 47/50 [02:50<00:07,  2.60s/it]

Ground Truth: crying
Top predictions:

          crying: 80.92%
getting a haircut: 8.52%
   carrying baby: 3.35%
   cutting nails: 1.93%
   crawling baby: 0.83%
datasets/audios/train/crying
MoviePy - Writing audio in datasets/audios/train/crying/_k0fnTiKEMc_000077_000087.wav


 94%|█████████▍| 47/50 [02:50<00:07,  2.60s/it]

MoviePy - Done.
Audio saved to datasets/audios/train/crying/_k0fnTiKEMc_000077_000087.wav


 96%|█████████▌| 48/50 [02:52<00:05,  2.60s/it]

Ground Truth: crying
Top predictions:

          crying: 66.62%
getting a haircut: 11.42%
     fixing hair: 3.21%
   carrying baby: 2.94%
   cutting nails: 2.76%
datasets/audios/train/crying
MoviePy - Writing audio in datasets/audios/train/crying/_uPh9i-xaaE_000094_000104.wav


 96%|█████████▌| 48/50 [02:52<00:05,  2.60s/it]

MoviePy - Done.
Audio saved to datasets/audios/train/crying/_uPh9i-xaaE_000094_000104.wav


 98%|█████████▊| 49/50 [02:55<00:02,  2.60s/it]

Ground Truth: crying
Top predictions:

          crying: 67.28%
   carrying baby: 14.02%
   crawling baby: 5.54%
  baby waking up: 5.13%
getting a haircut: 3.95%
datasets/audios/train/crying
MoviePy - Writing audio in datasets/audios/train/crying/_wRQiJdk2Rw_000004_000014.wav


 98%|█████████▊| 49/50 [02:55<00:02,  2.60s/it]

MoviePy - Done.
Audio saved to datasets/audios/train/crying/_wRQiJdk2Rw_000004_000014.wav


100%|██████████| 50/50 [02:57<00:00,  3.55s/it]

Ground Truth: crying
Top predictions:

getting a haircut: 27.44%
dancing macarena: 19.39%
    making sushi: 10.16%
     fixing hair: 6.28%
riding mechanical bull: 3.81%
31 / 50



