# Calculating the mean and standard deviation of a dataset located at a provided path

In [2]:
#line to render the plots under the code cell that created it
%matplotlib inline
import sys  # Python system library needed to load custom functions
import numpy as np  # for performing calculations on numerical arrays
import os     # for changing the directory

from datasets import load_dataset, Audio  # required tools to create, load and process our audio dataset
from transformers import ASTFeatureExtractor, ASTForAudioClassification, TrainingArguments, Trainer  # required classes to perform the model training
from preprocessing import calculate_stats, preprocess_audio_arrays  # functions to calculate dataset statistics and preprocess the dataset with ASTFeatureExtractor
sys.path.append('../src')  # add the source directory to the PYTHONPATH. This allows to import local functions and modules.
os.chdir('../..') # changing our directory to root

In [3]:
import numpy as np
from transformers import ASTFeatureExtractor
from typing import Dict, Any, List

In [4]:
# Sanity check: show the current working directory, since the relative data paths
# used further down are resolved against it.
print(os.getcwd())

/root/data


In [5]:
# Directory of the training data handed to calculate_mean_and_std();
# the path is relative, so it is resolved against the current working directory.
train_path = 'data/data_chunks_10sec_small/train_chunked/'

In [6]:
def calculate_stats(examples: Dict[str, Dict[str, Any]], 
                    audio_field: str, 
                    array_field: str,
                    feature_extractor: ASTFeatureExtractor) -> Dict[str, List[float]]:
    """
    Compute the per-example mean and standard deviation of the extracted features for one batch.

    Each audio array is first centred by subtracting its own mean. The centred arrays are then
    passed to the feature extractor together with the extractor's own ``sampling_rate``, and the
    mean and standard deviation are taken over every entry of the result's ``'input_values'``.

    Args:
        examples: A batch indexed by field name. ``examples[audio_field]`` must be an iterable of
            mappings, each of which holds an audio array under ``array_field``.
        audio_field (str): The name of the field in ``examples`` that contains the audio entries.
        array_field (str): The name of the key inside each audio entry that contains the audio array.
        feature_extractor (ASTFeatureExtractor): The Hugging Face feature extractor to apply.
            Presumably it yields filter-bank spectrograms (the original code named them
            ``fbanks``) -- confirm against the extractor's documentation.

    Returns:
        Dict[str, List[float]]: A dictionary with the keys ``'mean'`` and ``'std'``. Each value is a
        list with one entry per example in the batch, in the same order as the input.
    """
    # Remove the DC offset of every waveform before feature extraction.
    centered_waveforms = []
    for audio_entry in examples[audio_field]:
        waveform = audio_entry[array_field]
        centered_waveforms.append(waveform - waveform.mean())

    extracted = feature_extractor(centered_waveforms,
                                  sampling_rate=feature_extractor.sampling_rate)

    # One scalar per example for each statistic.
    per_example_mean = []
    per_example_std = []
    for features in extracted['input_values']:
        per_example_mean.append(np.mean(features))
        per_example_std.append(np.std(features))

    return {'mean': per_example_mean, 'std': per_example_std}

In [7]:
def calculate_mean_and_std(train_path, 
                           model_sampling_rate=22050, 
                           batch_size=100, 
                           num_mel_bins=128, 
                           max_length=1024, 
                           time_stride=10, 
                           frequency_stride=10,
                           patch_size=16):
    """
    Load an audio-folder dataset and return the average per-example feature mean and std.

    Steps:
        1. Load the "audiofolder" dataset from ``train_path``, take its 'train' split and
           shuffle it with a fixed seed (42).
        2. Cast the "audio" column to ``Audio(sampling_rate=model_sampling_rate)``.
        3. Build an ``ASTFeatureExtractor`` from the
           "MIT/ast-finetuned-audioset-10-10-0.4593" checkpoint with normalisation disabled.
        4. Map ``calculate_stats`` over the dataset in batches of ``batch_size``.
        5. Average the resulting 'mean' and 'std' columns.

    Args:
        train_path: Directory passed to ``load_dataset`` as ``data_dir``.
        model_sampling_rate: Sampling rate the "audio" column is cast to.
        batch_size: Number of examples per batch during the map step.
        num_mel_bins, max_length, time_stride, frequency_stride, patch_size: Forwarded
            unchanged to ``ASTFeatureExtractor.from_pretrained``. Whether the extractor
            actually uses each of them is not visible here -- TODO confirm.

    Returns:
        A tuple ``(dataset_mean, dataset_std)``. Both are plain averages over the per-example
        values, so ``dataset_std`` is the mean of the per-example standard deviations rather
        than a standard deviation pooled over the whole dataset.
    """
    loaded = load_dataset("audiofolder", data_dir=train_path)
    train_split = loaded.get('train').shuffle(seed=42)
    train_split = train_split.cast_column("audio", Audio(sampling_rate=model_sampling_rate))

    # NOTE(review): model_sampling_rate is not forwarded to the extractor, so the extractor
    # keeps whatever sampling rate the checkpoint configuration defines -- confirm that this
    # agrees with the rate the audio column is cast to above.
    extractor = ASTFeatureExtractor.from_pretrained(
        "MIT/ast-finetuned-audioset-10-10-0.4593",
        do_normalize=False,  # raw (un-normalised) features are needed to measure the statistics
        num_mel_bins=num_mel_bins,
        max_length=max_length,
        time_stride=time_stride,
        frequency_stride=frequency_stride,
        patch_size=patch_size,
    )

    def _batch_stats(batch):
        # Bind the fixed column names and the extractor for use with Dataset.map.
        return calculate_stats(batch,
                               audio_field='audio',
                               array_field='array',
                               feature_extractor=extractor)

    with_stats = train_split.map(_batch_stats, batched=True, batch_size=batch_size)

    mean_of_means = np.mean(with_stats['mean'])
    mean_of_stds = np.mean(with_stats['std'])
    return mean_of_means, mean_of_stds

In [8]:
# Compute the statistics for the directory currently stored in train_path,
# with the audio column cast to 44100 Hz.
dataset_mean, dataset_std = calculate_mean_and_std(
    train_path=train_path,
    model_sampling_rate=44100,
    batch_size=100,
    num_mel_bins=128,
    max_length=1024,
    time_stride=10,
    frequency_stride=10,
    patch_size=16,
)

Resolving data files:   0%|          | 0/2823 [00:00<?, ?it/s]

Downloading and preparing dataset audiofolder/default to /root/.cache/huggingface/datasets/audiofolder/default-e5287c4342aaea44/0.0.0/6cbdd16f8688354c63b4e2a36e1585d05de285023ee6443ffd71c4182055c0fc...


Downloading data files:   0%|          | 0/2823 [00:00<?, ?it/s]

Downloading data files: 0it [00:00, ?it/s]

Extracting data files: 0it [00:00, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset audiofolder downloaded and prepared to /root/.cache/huggingface/datasets/audiofolder/default-e5287c4342aaea44/0.0.0/6cbdd16f8688354c63b4e2a36e1585d05de285023ee6443ffd71c4182055c0fc. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/2822 [00:00<?, ? examples/s]

In [9]:
# Show the computed statistics: mean first, then standard deviation.
print(dataset_mean, dataset_std)

-9.35247846373339 4.1925171296027095


In [8]:
# Rebind train_path to a second training directory (presumably the 5-second chunks,
# judging by the directory name); any statistics computed after this cell use the new path.
train_path = 'data/data_chunks_5sec/train_chunked/'

In [None]:
# Compute the statistics for the directory currently stored in train_path, this time with
# 256 mel bins and a maximum length of 512; the audio column is again cast to 44100 Hz.
# Note that this overwrites any previously computed dataset_mean / dataset_std.
dataset_mean, dataset_std = calculate_mean_and_std(
    train_path=train_path,
    model_sampling_rate=44100,
    batch_size=100,
    num_mel_bins=256,
    max_length=512,
    time_stride=10,
    frequency_stride=10,
    patch_size=16,
)

Resolving data files:   0%|          | 0/11911 [00:00<?, ?it/s]

Downloading and preparing dataset audiofolder/default to /root/.cache/huggingface/datasets/audiofolder/default-39f225b47cdb55fd/0.0.0/6cbdd16f8688354c63b4e2a36e1585d05de285023ee6443ffd71c4182055c0fc...


Downloading data files:   0%|          | 0/11911 [00:00<?, ?it/s]

Downloading data files: 0it [00:00, ?it/s]

Extracting data files: 0it [00:00, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset audiofolder downloaded and prepared to /root/.cache/huggingface/datasets/audiofolder/default-39f225b47cdb55fd/0.0.0/6cbdd16f8688354c63b4e2a36e1585d05de285023ee6443ffd71c4182055c0fc. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

Downloading (…)rocessor_config.json:   0%|          | 0.00/297 [00:00<?, ?B/s]

Map:   0%|          | 0/11910 [00:00<?, ? examples/s]

In [None]:
# Show the computed statistics: mean first, then standard deviation.
print(dataset_mean, dataset_std)

-10.372330144330496 4.223032533401806
