In [10]:
# Silence non-error log messages from the transformers library so that
# routine warnings do not clutter the notebook output.
from transformers.utils import logging
logging.set_verbosity_error()

### Prepare the audio dataset

In [11]:
from datasets import load_dataset, load_from_disk

# "ashraq/esc50" is a collection of different sounds, each 5 seconds long.
# The split expression keeps only the first 10 rows of the train split.
dataset = load_dataset("ashraq/esc50", split="train[0:10]")

Repo card metadata block was not found. Setting CardData to empty.


In [12]:
# Pick the record at index 8 as the audio sample used in the cells below.
audio_sample = dataset[8]

# Print the whole record: its metadata fields plus the nested "audio" entry
# (path, sample array, and sampling rate).
print(audio_sample)

{'filename': '1-103298-A-9.wav', 'fold': 1, 'target': 9, 'category': 'crow', 'esc10': False, 'src_file': 103298, 'take': 'A', 'audio': {'path': None, 'array': array([-0.44656372, -0.47012329, -0.4803772 , ..., -0.06384277,
       -0.05136108, -0.04431152]), 'sampling_rate': 44100}}


In [13]:
# Let's give the audio sample a listen.
from IPython.display import Audio as IPythonAudio

# IPython's Audio display takes the audio array (the audio data) and the
# sampling rate (the number of samples per second).
IPythonAudio(
    audio_sample["audio"]["array"],
    rate=audio_sample["audio"]["sampling_rate"],
)

### Classifying the audio

In [14]:
from transformers import pipeline

In [15]:
# Build a zero-shot audio classification pipeline from the
# "laion/clap-htsat-unfused" checkpoint.
zero_shot_classifier = pipeline(
    "zero-shot-audio-classification",
    model="laion/clap-htsat-unfused",
)

### Sampling Rate for Transformer Models
- How long does 1 second of high-resolution audio (192,000 Hz) appear to be to the Whisper model (which is trained to expect audio sampled at 16,000 Hz)?

In [16]:
# How long a clip appears to a model that assumes a lower sampling rate:
# (real duration * source sampling rate) / sampling rate the model expects.
duration_s = 1                    # real length of the clip, in seconds
source_sampling_rate = 192_000    # high-resolution audio, in Hz
model_sampling_rate = 16_000      # rate the Whisper model expects, in Hz

perceived_duration_s = (duration_s * source_sampling_rate) / model_sampling_rate
perceived_duration_s

12.0

* 1 second of high-resolution audio appears to the model as if it were 12 seconds of audio.

* This means that the Whisper model, which is accustomed to audio sampled at a standard rate of 16,000 samples per second, receives 192,000 samples for every second of high-resolution audio. Because the model interprets every 16,000 samples as one second, those 192,000 samples look like 192,000 / 16,000 = 12 seconds of audio. In other words, each second of the high-resolution audio is perceived as if it were stretched out over a duration of 12 seconds.

* What would be the perceived duration of a 5-second high-resolution (192,000 Hz) audio sample when processed by the Whisper model, which was trained on audio sampled at 16,000 Hz, given this discrepancy in sampling rates?

In [17]:
# How long a clip appears to a model that assumes a lower sampling rate:
# (real duration * source sampling rate) / sampling rate the model expects.
duration_s = 5                    # real length of the clip, in seconds
source_sampling_rate = 192_000    # high-resolution audio, in Hz
model_sampling_rate = 16_000      # rate the Whisper model expects, in Hz

perceived_duration_s = (duration_s * source_sampling_rate) / model_sampling_rate
perceived_duration_s

60.0

* 5 seconds of high-resolution audio appears to the model as if it were 60 seconds of audio.

In [18]:
# Show the sampling rate stored on the classifier's feature extractor.
print(zero_shot_classifier.feature_extractor.sampling_rate)

48000


* This model was trained on audio whose sampling rate was 48,000 Hz.

In [19]:
# Let's check the sampling rate of our audio sample, so it can be compared
# with the sampling rate of the model's feature extractor.
audio_sample['audio']["sampling_rate"]

44100

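* The difference between our dataset's sampling rate (44,100 Hz) and the model's sampling rate (48,000 Hz) is not nearly as large as in the Whisper example. The two rates still do not match, though, so we resample the dataset to the model's rate in the following cells.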

In [20]:
# Duration of the audio sample in seconds:
# number of samples in the array / sampling rate (samples per second).
len(audio_sample['audio']["array"]) / audio_sample['audio']["sampling_rate"]

5.0

* The audio duration is 5 seconds 

In [21]:
# Resample the dataset's audio column to the sampling rate the model expects.
from datasets import Audio

# cast_column casts a column to the specified feature type. Casting "audio"
# to an Audio feature with a target sampling_rate makes records come back
# with their audio at that rate.
# The target is read from the classifier's feature extractor instead of being
# hard-coded, so it stays in step with whichever model is loaded.
dataset = dataset.cast_column(
    "audio",
    Audio(sampling_rate=zero_shot_classifier.feature_extractor.sampling_rate),
)

In [22]:
# Check whether the cast worked: fetch the record at index 8 again and print
# it so its 'sampling_rate' entry can be inspected.
audio_sample = dataset[8]
print(audio_sample)

{'filename': '1-103298-A-9.wav', 'fold': 1, 'target': 9, 'category': 'crow', 'esc10': False, 'src_file': 103298, 'take': 'A', 'audio': {'path': None, 'array': array([-0.42142388, -0.47985518, -0.45885402, ..., -0.05264271,
       -0.04051008,  0.        ]), 'sampling_rate': 48000}}


* This confirms that the audio is now at a sampling rate of 48,000 Hz, which is what we need in order to work with the model.

In [26]:
# Candidate descriptions the zero-shot classifier will choose between.
candidate_labels = [
    "sound of a dog barking",
    "sound of a cat meowing",
]

In [27]:
# Score the sample's raw audio array against each entry in candidate_labels.
# NOTE(review): only the array is passed, not its sampling rate, so this
# presumably relies on the audio already being at the rate the pipeline
# expects — confirm.
zero_shot_classifier(audio_sample['audio']['array'],
                     candidate_labels=candidate_labels)

[{'score': 0.8030040860176086, 'label': 'sound of a cat meowing'},
 {'score': 0.19699585437774658, 'label': 'sound of a dog barking'}]

In [28]:
# Candidate descriptions the zero-shot classifier will choose between.
candidate_labels = [
    "sound of a crow crowing",
    "sound of a car engine",
    "sound of a person walking",
]

In [29]:
# Score the sample's raw audio array against the current candidate_labels.
# NOTE(review): only the array is passed, not its sampling rate, so this
# presumably relies on the audio already being at the rate the pipeline
# expects — confirm.
zero_shot_classifier(audio_sample['audio']['array'],
                     candidate_labels=candidate_labels)

[{'score': 0.9995779395103455, 'label': 'sound of a crow crowing'},
 {'score': 0.00021903881861362606, 'label': 'sound of a car engine'},
 {'score': 0.00020299474999774247, 'label': 'sound of a person walking'}]

In [30]:
# Switch to the record at index 0 of the dataset and display it.
audio_sample = dataset[0]
audio_sample

{'filename': '1-100032-A-0.wav',
 'fold': 1,
 'target': 0,
 'category': 'dog',
 'esc10': True,
 'src_file': 100032,
 'take': 'A',
 'audio': {'path': None,
  'array': array([0., 0., 0., ..., 0., 0., 0.]),
  'sampling_rate': 48000}}