### Installing necessary packages

In [None]:
#!pip install datasets
#!pip install huggingface_hub

### Loading packages

In [1]:
import os
import pandas as pd
from datasets import load_dataset, load_metric
from evaluate import load
from data_engineering import create_distribution_dict, creating_random_split_df
from audio_mixer import mixer
from tqdm.notebook import trange, tqdm
import torch
from pydub import AudioSegment



### Downloading whole bigos v2 polish ASR dataset. WARNING, dataset contains ~80 GB od data

In [2]:
# if the dataset is already downloaded it just makes dictionary out downloaded data (approx. 10 seconds)
data = load_dataset("amu-cai/pl-asr-bigos-v2",'all', 'all')

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Running script


Loading dataset shards:   0%|          | 0/55 [00:00<?, ?it/s]

#### Creating dictionary showing database distribution inside whole database, getting dataset names iterating by dict (faster version - 110 sec)

In [6]:
#Iterating by Dataset dictionary to get datasets names

datasets_list = []
for i in trange(len(data['train'])):
    datasets_list.append(data['train'][i]['dataset'])

df_datasets_distribution = pd.DataFrame()
df_datasets_distribution['datasets'] = datasets_list

#creating dictionary showing database distribution inside whole database
dict_dst = create_distribution_dict(df_datasets_distribution['datasets'])
dict_dst

{'fair-mls-20': 0.305,
 'mozilla-common_voice_15-23': 0.233,
 'mailabs-corpus_librivox-19': 0.144,
 'pjatk-clarin_studio-15': 0.134,
 'pwr-maleset-unk': 0.046,
 'pjatk-clarin_mobile-15': 0.035,
 'google-fleurs-22': 0.035,
 'pwr-viu-unk': 0.026,
 'pwr-azon_read-20': 0.022,
 'pwr-shortwords-unk': 0.009,
 'polyai-minds14-21': 0.006,
 'pwr-azon_spont-20': 0.004}

### Creating dataframe from the training set with 2500 randomly chosen examples


In [24]:
df_model_testing = creating_random_split_df(data['train'], 2500)

### Getting the distribution of each dataset in choosen test-set

In [25]:
create_distribution_dict(df_model_testing['dataset'])

{'fair-mls-20': 0.29,
 'mozilla-common_voice_15-23': 0.247,
 'mailabs-corpus_librivox-19': 0.145,
 'pjatk-clarin_studio-15': 0.138,
 'pwr-maleset-unk': 0.046,
 'google-fleurs-22': 0.042,
 'pjatk-clarin_mobile-15': 0.036,
 'pwr-viu-unk': 0.026,
 'pwr-azon_read-20': 0.015,
 'pwr-shortwords-unk': 0.008,
 'polyai-minds14-21': 0.004,
 'pwr-azon_spont-20': 0.004}

We can compare to see distribution of sets from whole dataset and from our choosen batch

In [8]:
df_model_testing

Unnamed: 0,audioname,split,dataset,speaker_id,ref_orig,audio,samplingrate_orig,sampling_rate,audiopath_bigos,audiopath_local
0,pjatk-clarin_mobile-15-train-0029-00026,train,pjatk-clarin_mobile-15,29,malinda musiała stanowczo oznajmić że nie potr...,{'path': 'pjatk-clarin_mobile-15-train-0029-00...,16000,16000,pjatk-clarin_mobile-15-train-0029-00026.wav,C:\Users\Eryk\.cache\huggingface\datasets\down...
1,fair-mls-20-train-0024-00346,train,fair-mls-20,24,szczęśliwej podróży do widzenia wołał lord puc...,"{'path': 'fair-mls-20-train-0024-00346.wav', '...",16000,16000,fair-mls-20-train-0024-00346.wav,C:\Users\Eryk\.cache\huggingface\datasets\down...
2,mailabs-corpus_librivox-19-train-1002-00097,train,mailabs-corpus_librivox-19,1002,Wreszcie machnął ręką i poszedł dalej może myś...,{'path': 'mailabs-corpus_librivox-19-train-100...,16000,16000,mailabs-corpus_librivox-19-train-1002-00097.wav,C:\Users\Eryk\.cache\huggingface\datasets\down...
3,mailabs-corpus_librivox-19-train-2019-00258,train,mailabs-corpus_librivox-19,2019,Co do mnie przekonany jestem że uśpiono mnie z...,{'path': 'mailabs-corpus_librivox-19-train-201...,16000,16000,mailabs-corpus_librivox-19-train-2019-00258.wav,C:\Users\Eryk\.cache\huggingface\datasets\down...
4,google-fleurs-22-train-0001-01189,train,google-fleurs-22,1,imprezy te standardowo trwają od trzech do sze...,{'path': 'google-fleurs-22-train-0001-01189.wa...,16000,16000,google-fleurs-22-train-0001-01189.wav,C:\Users\Eryk\.cache\huggingface\datasets\down...
...,...,...,...,...,...,...,...,...,...,...
2495,mozilla-common_voice_15-23-train-2851-01283,train,mozilla-common_voice_15-23,2851,Przede wszystkim musimy trwać przy instrumenta...,{'path': 'mozilla-common_voice_15-23-train-285...,16000,16000,mozilla-common_voice_15-23-train-2851-01283.wav,C:\Users\Eryk\.cache\huggingface\datasets\down...
2496,mozilla-common_voice_15-23-train-2849-00111,train,mozilla-common_voice_15-23,2849,Tak rozstali się.,{'path': 'mozilla-common_voice_15-23-train-284...,16000,16000,mozilla-common_voice_15-23-train-2849-00111.wav,C:\Users\Eryk\.cache\huggingface\datasets\down...
2497,fair-mls-20-train-0024-05850,train,fair-mls-20,24,nie bój się pan oni go nie wypuszczą od siebie...,"{'path': 'fair-mls-20-train-0024-05850.wav', '...",16000,16000,fair-mls-20-train-0024-05850.wav,C:\Users\Eryk\.cache\huggingface\datasets\down...
2498,pjatk-clarin_studio-15-train-0539-00019,train,pjatk-clarin_studio-15,539,my mówimy że najlepiej zadba o siebie o...,{'path': 'pjatk-clarin_studio-15-train-0539-00...,16000,16000,pjatk-clarin_studio-15-train-0539-00019.wav,C:\Users\Eryk\.cache\huggingface\datasets\down...


### Saving audio dataframe to parquet

In [26]:
df_model_testing=df_model_testing[['audioname','dataset','ref_orig','sampling_rate','audiopath_bigos','audiopath_local']]
#line commented not to overwrite accidentally
#df_model_testing.to_parquet('./data/parquets/testing_batch.parquet.gzip', compression = 'gzip')

## Noise 

### UrbanSound dataset

In [7]:
df_urban_sounds = pd.read_csv('./data/UrbanSound8K/metadata/UrbanSound8K.csv')
df_urban_sounds

Unnamed: 0,slice_file_name,fsID,start,end,salience,fold,classID,class
0,100032-3-0-0.wav,100032,0.000000,0.317551,1,5,3,dog_bark
1,100263-2-0-117.wav,100263,58.500000,62.500000,1,5,2,children_playing
2,100263-2-0-121.wav,100263,60.500000,64.500000,1,5,2,children_playing
3,100263-2-0-126.wav,100263,63.000000,67.000000,1,5,2,children_playing
4,100263-2-0-137.wav,100263,68.500000,72.500000,1,5,2,children_playing
...,...,...,...,...,...,...,...,...
8727,99812-1-2-0.wav,99812,159.522205,163.522205,2,7,1,car_horn
8728,99812-1-3-0.wav,99812,181.142431,183.284976,2,7,1,car_horn
8729,99812-1-4-0.wav,99812,242.691902,246.197885,2,7,1,car_horn
8730,99812-1-5-0.wav,99812,253.209850,255.741948,2,7,1,car_horn


In [8]:
#deleting classes that are not necessary, and leaving: dog_bark, air_conditioner, jackhammer, drilling
df_urban_sounds = df_urban_sounds[~df_urban_sounds['classID'].isin([1, 2, 5, 6, 8, 9])]

In [9]:
print('Each class distributution: ',create_distribution_dict(df_urban_sounds['class'],False))
print('Length of the result UrbanNoises dataframe: ',len(df_urban_sounds['class']))

Each class distributution:  {'dog_bark': 1000, 'air_conditioner': 1000, 'jackhammer': 1000, 'drilling': 1000}
Length of the result UrbanNoises dataframe:  4000


In [11]:
df_urban_sounds_2000 = creating_random_split_df(df_urban_sounds, 2000)
df_urban_sounds_2000

Unnamed: 0,slice_file_name,fsID,start,end,salience,fold,classID,class
0,79089-0-0-106.wav,79089,62.664375,66.664375,2,9,0,air_conditioner
1,167750-4-1-0.wav,167750,14.330286,18.330286,1,10,4,drilling
2,165039-7-5-0.wav,165039,64.924130,68.924130,1,3,7,jackhammer
3,57320-0-0-22.wav,57320,11.000000,15.000000,2,1,0,air_conditioner
4,146709-0-0-20.wav,146709,10.000000,14.000000,1,4,0,air_conditioner
...,...,...,...,...,...,...,...,...
1995,194754-3-0-1.wav,194754,0.812357,4.812357,1,7,3,dog_bark
1996,30206-7-0-1.wav,30206,0.500000,4.500000,1,6,7,jackhammer
1997,46669-4-0-54.wav,46669,27.000000,31.000000,1,1,4,drilling
1998,24364-4-0-0.wav,24364,0.633371,4.633371,1,6,4,drilling


##### Creating path to each file in dataframe

In [12]:
# Function to create the file path
def create_file_path(row, folder_path):
    folder_number = row['fold']
    file_name = row['slice_file_name']
    file_path = os.path.join(folder_path, f'fold{folder_number}', file_name)
    return file_path

base_path = '.\\data\\UrbanSound8K\\audio\\'

# Apply the function to create the new column
df_urban_sounds_2000['audio_path'] = df_urban_sounds_2000.apply(create_file_path, axis=1, folder_path=base_path)

In [13]:
df_urban_sounds_2000

Unnamed: 0,slice_file_name,fsID,start,end,salience,fold,classID,class,audio_path
0,79089-0-0-106.wav,79089,62.664375,66.664375,2,9,0,air_conditioner,.\data\UrbanSound8K\audio\fold9\79089-0-0-106.wav
1,167750-4-1-0.wav,167750,14.330286,18.330286,1,10,4,drilling,.\data\UrbanSound8K\audio\fold10\167750-4-1-0.wav
2,165039-7-5-0.wav,165039,64.924130,68.924130,1,3,7,jackhammer,.\data\UrbanSound8K\audio\fold3\165039-7-5-0.wav
3,57320-0-0-22.wav,57320,11.000000,15.000000,2,1,0,air_conditioner,.\data\UrbanSound8K\audio\fold1\57320-0-0-22.wav
4,146709-0-0-20.wav,146709,10.000000,14.000000,1,4,0,air_conditioner,.\data\UrbanSound8K\audio\fold4\146709-0-0-20.wav
...,...,...,...,...,...,...,...,...,...
1995,194754-3-0-1.wav,194754,0.812357,4.812357,1,7,3,dog_bark,.\data\UrbanSound8K\audio\fold7\194754-3-0-1.wav
1996,30206-7-0-1.wav,30206,0.500000,4.500000,1,6,7,jackhammer,.\data\UrbanSound8K\audio\fold6\30206-7-0-1.wav
1997,46669-4-0-54.wav,46669,27.000000,31.000000,1,1,4,drilling,.\data\UrbanSound8K\audio\fold1\46669-4-0-54.wav
1998,24364-4-0-0.wav,24364,0.633371,4.633371,1,6,4,drilling,.\data\UrbanSound8K\audio\fold6\24364-4-0-0.wav


### Creating dataframe from Vechicle Interior Sound folder

In [35]:
visc_folder_path = '.\\data\\VISC Dataset SON\\'

file_paths = []
class_ids = []


# Traverse the directory
for filename in os.listdir(visc_folder_path):
    # Join the folder path with the filename to get the full file path
    file_path = os.path.join(visc_folder_path, filename)
    
    # Extract the class ID from the file name
    class_id = int(filename.split()[0])
    
    # Append the values to the lists
    file_paths.append(file_path)
    class_ids.append(class_id)

# Create a DataFrame
df = pd.DataFrame({'file_path': file_paths, 'class_id': class_ids})



#### Ading each noise class name to the dataframe as a column

In [16]:
visc_noises_dataframe = creating_random_split_df(df,500)

visc_noises_dict = {1 : 'bus_interior',
                    2 : 'minibus_interior',
                    3 : 'pickup_interior',
                    4 : 'sports_car_interior',
                    5 : 'jeep_interior',
                    6 : 'truck_interior',
                    7 : 'crossover_interior',
                    8 : 'other_car_interior'}
visc_noises_dataframe['class'] = visc_noises_dataframe['class_id'].map(visc_noises_dict)

In [20]:
visc_noises_dataframe

Unnamed: 0,file_path,class_id,class
0,.\data\VISC Dataset SON\5 (496).wav,5,jeep_interior
1,.\data\VISC Dataset SON\7 (522).wav,7,crossover_interior
2,.\data\VISC Dataset SON\1 (766).wav,1,bus_interior
3,.\data\VISC Dataset SON\8 (624).wav,8,other_car_interior
4,.\data\VISC Dataset SON\1 (471).wav,1,bus_interior
...,...,...,...
495,.\data\VISC Dataset SON\4 (36).wav,4,sports_car_interior
496,.\data\VISC Dataset SON\1 (251).wav,1,bus_interior
497,.\data\VISC Dataset SON\1 (692).wav,1,bus_interior
498,.\data\VISC Dataset SON\2 (492).wav,2,minibus_interior


### Creating one dataframe with noises

In [22]:
visc_noises_df = visc_noises_dataframe[['file_path','class']]
urban_noises_df = df_urban_sounds_2000[['audio_path','class']]
urban_noises_df['file_path'] = urban_noises_df['audio_path']
urban_noises_df_2 = urban_noises_df[['file_path','class']]
noise_df = pd.concat([urban_noises_df_2,visc_noises_df],ignore_index=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  urban_noises_df['file_path'] = urban_noises_df['audio_path']


### Saving noise dataframe to parquet

In [29]:
noise_df

Unnamed: 0,file_path,class
0,.\data\UrbanSound8K\audio\fold9\79089-0-0-106.wav,air_conditioner
1,.\data\UrbanSound8K\audio\fold10\167750-4-1-0.wav,drilling
2,.\data\UrbanSound8K\audio\fold3\165039-7-5-0.wav,jackhammer
3,.\data\UrbanSound8K\audio\fold1\57320-0-0-22.wav,air_conditioner
4,.\data\UrbanSound8K\audio\fold4\146709-0-0-20.wav,air_conditioner
...,...,...
2495,.\data\VISC Dataset SON\4 (36).wav,sports_car_interior
2496,.\data\VISC Dataset SON\1 (251).wav,bus_interior
2497,.\data\VISC Dataset SON\1 (692).wav,bus_interior
2498,.\data\VISC Dataset SON\2 (492).wav,minibus_interior


In [27]:
#noise_df.to_parquet('./data/parquets/noise_df.parquet.gzip', compression = 'gzip')

## Creating one dataframe with everything combined

In [2]:
df_audio= pd.read_parquet('./data/parquets/testing_batch.parquet.gzip') 
df_noises = pd.read_parquet('./data/parquets/noise_df.parquet.gzip')

In [3]:
df_audio['noise_path'] = df_noises['file_path']
df_audio['noise_class'] = df_noises['class']
df_audio

Unnamed: 0,audioname,dataset,ref_orig,sampling_rate,audiopath_bigos,audiopath_local,noise_path,noise_class
0,fair-mls-20-train-0009-04739,fair-mls-20,tam nocne włóczęgi wołano z dachów jeżeli nie ...,16000,fair-mls-20-train-0009-04739.wav,C:\Users\Eryk\.cache\huggingface\datasets\down...,.\data\UrbanSound8K\audio\fold9\79089-0-0-106.wav,air_conditioner
1,pjatk-clarin_studio-15-train-0488-00003,pjatk-clarin_studio-15,w pracy studenci chcieliby przede wszystk...,16000,pjatk-clarin_studio-15-train-0488-00003.wav,C:\Users\Eryk\.cache\huggingface\datasets\down...,.\data\UrbanSound8K\audio\fold10\167750-4-1-0.wav,drilling
2,fair-mls-20-train-0009-05501,fair-mls-20,co to znaczy sam siebie zapytywał faraon czy g...,16000,fair-mls-20-train-0009-05501.wav,C:\Users\Eryk\.cache\huggingface\datasets\down...,.\data\UrbanSound8K\audio\fold3\165039-7-5-0.wav,jackhammer
3,fair-mls-20-train-0021-01519,fair-mls-20,tylko na piaszczystem wybrzeżu lub na łąkach b...,16000,fair-mls-20-train-0021-01519.wav,C:\Users\Eryk\.cache\huggingface\datasets\down...,.\data\UrbanSound8K\audio\fold1\57320-0-0-22.wav,air_conditioner
4,pjatk-clarin_studio-15-train-0335-00001,pjatk-clarin_studio-15,zaokrągla uziemienie księdzu liźnięcie rol...,16000,pjatk-clarin_studio-15-train-0335-00001.wav,C:\Users\Eryk\.cache\huggingface\datasets\down...,.\data\UrbanSound8K\audio\fold4\146709-0-0-20.wav,air_conditioner
...,...,...,...,...,...,...,...,...
2495,fair-mls-20-train-0009-06517,fair-mls-20,kazał zrobić spis wszystkich mężczyzn w państw...,16000,fair-mls-20-train-0009-06517.wav,C:\Users\Eryk\.cache\huggingface\datasets\down...,.\data\VISC Dataset SON\4 (36).wav,sports_car_interior
2496,mozilla-common_voice_15-23-train-2851-00218,mozilla-common_voice_15-23,"W odniesieniu do Lizbony, uczyniliśmy także po...",16000,mozilla-common_voice_15-23-train-2851-00218.wav,C:\Users\Eryk\.cache\huggingface\datasets\down...,.\data\VISC Dataset SON\1 (251).wav,bus_interior
2497,mozilla-common_voice_15-23-train-2856-01361,mozilla-common_voice_15-23,"Jej budżet to budżet, który wspiera inwestycje",16000,mozilla-common_voice_15-23-train-2856-01361.wav,C:\Users\Eryk\.cache\huggingface\datasets\down...,.\data\VISC Dataset SON\1 (692).wav,bus_interior
2498,fair-mls-20-train-0009-03165,fair-mls-20,upłynęło już kilka godzin po zachodzie słońca ...,16000,fair-mls-20-train-0009-03165.wav,C:\Users\Eryk\.cache\huggingface\datasets\down...,.\data\VISC Dataset SON\2 (492).wav,minibus_interior


## Noises 

#### Normalising noise loudness

In [4]:
def normalize_loudness(file_paths, target_loudness=-15.0, output_folder="normalized", normalization_type="mean"):
    normalized_file_paths = []  # List to store paths of normalized files

    # Create the output folder if it doesn't exist
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    # Load audio files and calculate loudness
    audio_segments = [AudioSegment.from_file(file_path) for file_path in file_paths]
    loudness_levels = [segment.dBFS for segment in audio_segments]

    # Calculate the normalization factor based on mean or median loudness
    if normalization_type == "mean":
        normalization_factor = target_loudness - sum(loudness_levels) / len(loudness_levels)
    elif normalization_type == "median":
        sorted_loudness = sorted(loudness_levels)
        middle_index = len(sorted_loudness) // 2
        normalization_factor = target_loudness - sorted_loudness[middle_index]
    else:
        raise ValueError("Invalid normalization type. Use 'mean' or 'median'.")

    # Normalize each audio file
    normalized_segments = [segment + normalization_factor for segment in audio_segments]

    # Export normalized audio files to the output folder with the same names
    for i, file_path in enumerate(file_paths):
        file_name = os.path.basename(file_path)  # Extract file name from path
        output_path = os.path.join(output_folder, file_name)
        normalized_segments[i].export(output_path, format="wav")
        normalized_file_paths.append(output_path)  # Append output path to the list
    
    print("Normalization completed. Normalized files saved in:", output_folder)

    return normalized_file_paths  # Return the list of normalized file paths

In [6]:
#laptop
folder_path = './data/example_SNR_audio/'
file_paths = []
for filename in os.listdir(folder_path):
    if filename.endswith(".wav"):  # Adjust this condition based on your file format
        audio_file = os.path.join(folder_path, filename)
        file_paths.append(audio_file)


output_folder = "./data/mixed_recordings/normalised_recordings/minuss_15db"
normalize_loudness(file_paths, output_folder=output_folder,normalization_type="mean",target_loudness=-15.0)

Normalization completed. Normalized files saved in: ./data/mixed_recordings/normalised_recordings/minuss_15db


In [12]:
file_paths = df_audio['noise_path'].to_list()
output_folder = "./data/mixed_recordings/normalised_recordings/minuss_15db"
normalize_loudness(file_paths, output_folder=output_folder,normalization_type="mean",target_loudness=-15.0)

Normalization completed. Normalized files saved in: ./data/mixed_recordings/normalised_recordings/plus_15db


In [16]:
# Example usage:
file_paths = df_audio['noise_path'].to_list()
loudnesses = [-15, -20, -25, -30, -35, -40]
for loudness in loudnesses:
    output_folder = "./data/mixed_recordings/normalised_recordings/minus_" + str(loudness) +  "dBmed/"
    normalize_loudness(file_paths, output_folder=output_folder,normalization_type="median",target_loudness=-loudness)

Normalization completed. Normalized files saved in: ./data/mixed_recordings/normalised_recordings/minus_-15dBmed/
Normalization completed. Normalized files saved in: ./data/mixed_recordings/normalised_recordings/minus_-20dBmed/
Normalization completed. Normalized files saved in: ./data/mixed_recordings/normalised_recordings/minus_-25dBmed/
Normalization completed. Normalized files saved in: ./data/mixed_recordings/normalised_recordings/minus_-30dBmed/
Normalization completed. Normalized files saved in: ./data/mixed_recordings/normalised_recordings/minus_-35dBmed/
Normalization completed. Normalized files saved in: ./data/mixed_recordings/normalised_recordings/minus_-40dBmed/


#### Checking loudness of the files to check if function works

In [10]:
import soundfile as sf
import pyloudnorm as pyln
def loudness_function(file_path):
    data,rate = sf.read(file_path)
    meter = pyln.Meter(rate)
    loudness = meter.integrated_loudness(data)
    return loudness


In [15]:
# Example usage:

for file in file_paths[0:6]:
        mean_loudness = calculate_mean_loudness(file)
        print(f"File: {file}, Mean Loudness: {mean_loudness} dB")

File: .\data\UrbanSound8K\audio\fold9\79089-0-0-106.wav, Mean Loudness: 0.10479561239480972 dB
File: .\data\UrbanSound8K\audio\fold10\167750-4-1-0.wav, Mean Loudness: 0.044091030955314636 dB
File: .\data\UrbanSound8K\audio\fold3\165039-7-5-0.wav, Mean Loudness: 0.0692070722579956 dB
File: .\data\UrbanSound8K\audio\fold1\57320-0-0-22.wav, Mean Loudness: 0.0559232197701931 dB
File: .\data\UrbanSound8K\audio\fold4\146709-0-0-20.wav, Mean Loudness: 0.048664987087249756 dB
File: .\data\UrbanSound8K\audio\fold1\180937-7-1-1.wav, Mean Loudness: 0.09412972629070282 dB


In [11]:
import librosa

def calculate_loudness(audio_file):
    # Load the audio file
    y, sr = librosa.load(audio_file)

    # Calculate the loudness using perceptual weighting (using ITU-R BS.1770)
    loudness = librosa.feature.rms(y=y)

    # Calculate the mean loudness
    mean_loudness = loudness.mean()

    # Calculate the peak loudness
    peak_loudness = loudness.max()

    return mean_loudness, peak_loudness

# Example usage:
folder_path = './data/example_SNR_audio/'
for filename in os.listdir(folder_path):
    if filename.endswith(".wav"):  # Adjust this condition based on your file format
        audio_file = os.path.join(folder_path, filename)
        mean_loudness,peak_loudness = calculate_loudness(audio_file)
        print(f"File: {filename}, Mean Loudness: {mean_loudness} dB, Peak Loudness: {peak_loudness}")
        loudnes = loudness_function(audio_file)
        print("\n Loudness:",  loudnes)

File: 0.001_SNR_audio_file.wav, Mean Loudness: 0.04850560426712036 dB, Peak Loudness: 0.11297627538442612

 Loudness: -28.5842880026989
File: 0.01_SNR_audio_file.wav, Mean Loudness: 0.04847461357712746 dB, Peak Loudness: 0.11296653002500534

 Loudness: -28.586210046899303
File: 0.1_SNR_audio_file.wav, Mean Loudness: 0.04816778004169464 dB, Peak Loudness: 0.11286386847496033

 Loudness: -28.60509540219031
File: 10_SNR_audio_file.wav, Mean Loudness: 0.030357489362359047 dB, Peak Loudness: 0.10818291455507278

 Loudness: -28.474650109399008
File: 120_SNR_audio_file.wav, Mean Loudness: 0.025161296129226685 dB, Peak Loudness: 0.10743813216686249

 Loudness: -28.418271985405184
File: 1_SNR_audio_file.wav, Mean Loudness: 0.04530452936887741 dB, Peak Loudness: 0.1119380071759224

 Loudness: -28.77752098863526
File: 30_SNR_audio_file.wav, Mean Loudness: 0.025377176702022552 dB, Peak Loudness: 0.10746891051530838

 Loudness: -28.485211453144498
File: 55_SNR_audio_file.wav, Mean Loudness: 0.02516

In [7]:
folder_path = "./data/mixed_recordings/normalised_recordings/minuss_15db"
for filename in os.listdir(folder_path):
    if filename.endswith(".wav"):  # Adjust this condition based on your file format
        audio_file = os.path.join(folder_path, filename)
        mean_loudness,peak_loudness = calculate_loudness(audio_file)
        print(f"File: {filename}, Mean Loudness: {mean_loudness} dB, Peak Loudness: {peak_loudness}")

File: 0.001_SNR_audio_file.wav, Mean Loudness: 0.20626680552959442 dB, Peak Loudness: 0.4655102789402008
File: 0.01_SNR_audio_file.wav, Mean Loudness: 0.20613504946231842 dB, Peak Loudness: 0.4654773473739624
File: 0.1_SNR_audio_file.wav, Mean Loudness: 0.2048298865556717 dB, Peak Loudness: 0.4651143550872803
File: 10_SNR_audio_file.wav, Mean Loudness: 0.1289665549993515 dB, Peak Loudness: 0.4489797055721283
File: 120_SNR_audio_file.wav, Mean Loudness: 0.10676495730876923 dB, Peak Loudness: 0.44644665718078613
File: 1_SNR_audio_file.wav, Mean Loudness: 0.19264642894268036 dB, Peak Loudness: 0.46185845136642456
File: 30_SNR_audio_file.wav, Mean Loudness: 0.10768909752368927 dB, Peak Loudness: 0.4465368688106537
File: 55_SNR_audio_file.wav, Mean Loudness: 0.10677050799131393 dB, Peak Loudness: 0.4464489817619324
File: 88_SNR_audio_file.wav, Mean Loudness: 0.1067650243639946 dB, Peak Loudness: 0.44644641876220703


#### Creating folders with mixed data

In [39]:
snr_values = [-3, -5, -10, -15]

In [40]:

# Loop through each SNR value
for snr in snr_values:
    # Create a folder for the current SNR value
    folder_path = f'./data/mixed_recordings/SNR_{snr}'
    os.makedirs(folder_path, exist_ok=True)

    # Loop through the dataframe and mix files for the current SNR value
    for index, row in full_df.iterrows():
        signal_path = row['audiopath_local']
        noise_path = row['noise_path']
        audio_name = row['audioname'] + '.wav'
        save_path = os.path.join(folder_path, audio_name)  # Change the naming convention if needed

        # Call your mixer function here
        mixer(signal_path, noise_path, snr, save_path)

### Model bark whisper v3 Large


In [41]:
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

In [42]:
torch.cuda.is_available()

True

In [43]:
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

# Specify the CUDA device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_id = "openai/whisper-large-v3"
torch_dtype = torch.float32  # You can adjust the dtype if needed

# Load model and move it to CUDA
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
)
model.to(device)

# Load processor
processor = AutoProcessor.from_pretrained(model_id)

# Create the pipeline with CUDA support
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    chunk_length_s=30,
    batch_size=16,
    return_timestamps=True,
    torch_dtype=torch_dtype,
    device=device,
)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [49]:
#whisper_results = []
#for i in range(len(df_whisper)):
#    sample = df_whisper['audiopath_local'][i]
#    result = pipe(sample)
#    whisper_results.append(result['text'])

#df_whisper['whisper_pred'] = whisper_results
results = []

for i in trange(len(snr_df)):
    sample = snr_dataframe['SNR_-3'][i]
    result = pipe(sample, generate_kwargs={"language": "polish"})
    results.append(result['text'])

  0%|          | 0/5000 [00:00<?, ?it/s]

  attn_output = torch.nn.functional.scaled_dot_product_attention(
Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
Whisper did not predict 

In [50]:
full_df['whisper_SNR_-3'] = results

In [10]:
wer = load("wer")
wer_score = wer.compute(predictions=full_df['whisper_SNR_50'], references=full_df['ref_orig'])
print(wer_score * 100)

133.71195991245705


In [19]:
full_df.to_parquet('full_testing_df.parquet.gzip', compression = 'gzip')
#snr_df.to_parquet('snr_dataframe.parquet.gzip', compression = 'gzip')

In [9]:
full_df = pd.read_parquet('full_testing_df.parquet.gzip') 
snr_df = pd.read_parquet('snr_dataframe.parquet.gzip') 

In [54]:
full_df

Unnamed: 0,audioname,dataset,ref_orig,sampling_rate,audiopath_local,audiopath_project,noise_path,noise_class,whisper_no_noise,whisper_SNR_100,whisper_SNR_10,whisper_SNR_1,audiofile_mixed,whisper_SNR_-3
0,mozilla-common_voice_15-23-train-2856-01818,mozilla-common_voice_15-23,"Jest także trzecia sprawa, która w czasie tej ...",16000,C:\Users\Eryk\.cache\huggingface\datasets\down...,./data/testing_batch/clear/mozilla-common_voic...,.\data\UrbanSound8K\audio\fold2\156893-7-9-0.wav,jackhammer,"Jest także trzecia sprawa, która w czasie tej...",rycerze moi współbracia boleśni bardzośmy mał...,"Rycerze moi, współbracia boleśni, Bardzośmy m...","Rycerze moi, współbracia boleśni, Bardzośmy m...",fair-mls-20-train-0009-00038,"Rycerze moi, współbracia boleśni, Bardzośmy m..."
1,pjatk-clarin_studio-15-train-0457-00001,pjatk-clarin_studio-15,dżuma wziernik przemianę księdzu krzywdzen...,16000,C:\Users\Eryk\.cache\huggingface\datasets\down...,./data/testing_batch/clear/pjatk-clarin_studio...,.\data\UrbanSound8K\audio\fold1\40722-8-0-4.wav,siren,"Dżuma, wziernik, przemianę, księdzu, krzywdze...",kantor choć guza dostał wpośród czoła gdy pos...,"Kantor, choć guza dostał w pośród czoła, gdy ...","Kantor, choć guza dostał w pośród czoła, gdy ...",fair-mls-20-train-0009-00044,"Kantor, choć guza dostał w pośród czoła, gdy ..."
2,pjatk-clarin_mobile-15-train-0083-00007,pjatk-clarin_mobile-15,w piątek po południu była przesłuchiwana przez...,16000,C:\Users\Eryk\.cache\huggingface\datasets\down...,./data/testing_batch/clear/pjatk-clarin_mobile...,.\data\UrbanSound8K\audio\fold8\125678-7-0-4.wav,jackhammer,w piątek po południu była przesłuchiwana prze...,przeszłe przypadki gdy dobrze pamięta zmyśla ...,"Przeszłe przypadki, gdy dobrze pamięta Zmyśla...","Przeszły przypadki, gdy dobrze pamięta, Myśla...",fair-mls-20-train-0009-00067,"Przeszło przypadki, gdy dobrze pamięta, myśli..."
3,pwr-maleset-unk-train-0001-03097,pwr-maleset-unk,jeśli chcesz zostanę w domu,16000,C:\Users\Eryk\.cache\huggingface\datasets\down...,./data/testing_batch/clear/pwr-maleset-unk-tra...,.\data\UrbanSound8K\audio\fold9\105029-7-2-16.wav,jackhammer,Jeśli chcesz zostanę w domu.,nie przykrzy własnym hołdownikom ani swemu mo...,nie przykrzy własnym hołdownikom ani swemu mo...,nie przykrzy własnym hołdownikom ani swemu mo...,fair-mls-20-train-0009-00079,"Nie przykrzy własnym hołdownikom, Ani swemu m..."
4,mozilla-common_voice_15-23-train-2862-00017,mozilla-common_voice_15-23,Tekst nie opiera się na żadnych podstawach nau...,16000,C:\Users\Eryk\.cache\huggingface\datasets\down...,./data/testing_batch/clear/mozilla-common_voic...,.\data\UrbanSound8K\audio\fold10\99192-4-0-54.wav,drilling,Tekst nie opiera się na żadnych podstawach na...,pięknież to przecie patrzeć na świat z góry w...,"Pięknież to przecie patrzeć na świat z góry, ...","Pięknież to przecie patrzyć na świat z góry, ...",fair-mls-20-train-0009-00080,"Piękny, że to przecie patrzyć na świat z góry..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,mailabs-corpus_librivox-19-train-2023-00011,mailabs-corpus_librivox-19,Nareszcie zniecierpliwiony kazał zamurować okn...,16000,C:\Users\Eryk\.cache\huggingface\datasets\down...,./data/testing_batch/clear/mailabs-corpus_libr...,.\data\VISC Dataset SON\7 (249).wav,crossover_interior,Nareszcie zniecierpliwiony kazał zamurować ok...,Duże litery.,Duże litery.,W tłu że litery.,pwr-viu-unk-train-0001-04231,Służę literę.
4996,pjatk-clarin_studio-15-train-0289-00016,pjatk-clarin_studio-15,dostała za ten reportaż nagrodę pulicera ...,16000,C:\Users\Eryk\.cache\huggingface\datasets\down...,./data/testing_batch/clear/pjatk-clarin_studio...,.\data\VISC Dataset SON\2 (172).wav,minibus_interior,"Dostała zatem reportaż nagrodę Pulitzera, ale...",Duże litery.,Duże litery.,Duże litery.,pwr-viu-unk-train-0001-04241,Duże litery.
4997,mozilla-common_voice_15-23-train-2846-00448,mozilla-common_voice_15-23,Dotyczy ona zasadniczo dwóch kwestii,16000,C:\Users\Eryk\.cache\huggingface\datasets\down...,./data/testing_batch/clear/mozilla-common_voic...,.\data\VISC Dataset SON\5 (164).wav,jeep_interior,Dotyczy ona zasadniczo dwóch kwestii.,Małe litery.,Małe litery.,mała litarz,pwr-viu-unk-train-0001-04251,Mała literka.
4998,pjatk-clarin_mobile-15-train-0035-00018,pjatk-clarin_mobile-15,każdy starał się odlecieć najbliższym samolote...,16000,C:\Users\Eryk\.cache\huggingface\datasets\down...,./data/testing_batch/clear/pjatk-clarin_mobile...,.\data\VISC Dataset SON\6 (638).wav,truck_interior,Każdy starał się odjechać najbliższym samolot...,Małe litele.,Mała Litera,Mało mi wcale.,pwr-viu-unk-train-0001-04261,Mało mi tego.


In [10]:
snr_df

Unnamed: 0,SNR_0.1,SNR_0.5,SNR_1,SNR_10,SNR_100,SNR_25,SNR_5,SNR_50
0,.\data\mixed_recordings\SNR_0.1\fair-mls-20-tr...,.\data\mixed_recordings\SNR_0.5\fair-mls-20-tr...,.\data\mixed_recordings\SNR_1\fair-mls-20-trai...,.\data\mixed_recordings\SNR_10\fair-mls-20-tra...,.\data\mixed_recordings\SNR_100\fair-mls-20-tr...,.\data\mixed_recordings\SNR_25\fair-mls-20-tra...,.\data\mixed_recordings\SNR_5\fair-mls-20-trai...,.\data\mixed_recordings\SNR_50\fair-mls-20-tra...
1,.\data\mixed_recordings\SNR_0.1\fair-mls-20-tr...,.\data\mixed_recordings\SNR_0.5\fair-mls-20-tr...,.\data\mixed_recordings\SNR_1\fair-mls-20-trai...,.\data\mixed_recordings\SNR_10\fair-mls-20-tra...,.\data\mixed_recordings\SNR_100\fair-mls-20-tr...,.\data\mixed_recordings\SNR_25\fair-mls-20-tra...,.\data\mixed_recordings\SNR_5\fair-mls-20-trai...,.\data\mixed_recordings\SNR_50\fair-mls-20-tra...
2,.\data\mixed_recordings\SNR_0.1\fair-mls-20-tr...,.\data\mixed_recordings\SNR_0.5\fair-mls-20-tr...,.\data\mixed_recordings\SNR_1\fair-mls-20-trai...,.\data\mixed_recordings\SNR_10\fair-mls-20-tra...,.\data\mixed_recordings\SNR_100\fair-mls-20-tr...,.\data\mixed_recordings\SNR_25\fair-mls-20-tra...,.\data\mixed_recordings\SNR_5\fair-mls-20-trai...,.\data\mixed_recordings\SNR_50\fair-mls-20-tra...
3,.\data\mixed_recordings\SNR_0.1\fair-mls-20-tr...,.\data\mixed_recordings\SNR_0.5\fair-mls-20-tr...,.\data\mixed_recordings\SNR_1\fair-mls-20-trai...,.\data\mixed_recordings\SNR_10\fair-mls-20-tra...,.\data\mixed_recordings\SNR_100\fair-mls-20-tr...,.\data\mixed_recordings\SNR_25\fair-mls-20-tra...,.\data\mixed_recordings\SNR_5\fair-mls-20-trai...,.\data\mixed_recordings\SNR_50\fair-mls-20-tra...
4,.\data\mixed_recordings\SNR_0.1\fair-mls-20-tr...,.\data\mixed_recordings\SNR_0.5\fair-mls-20-tr...,.\data\mixed_recordings\SNR_1\fair-mls-20-trai...,.\data\mixed_recordings\SNR_10\fair-mls-20-tra...,.\data\mixed_recordings\SNR_100\fair-mls-20-tr...,.\data\mixed_recordings\SNR_25\fair-mls-20-tra...,.\data\mixed_recordings\SNR_5\fair-mls-20-trai...,.\data\mixed_recordings\SNR_50\fair-mls-20-tra...
...,...,...,...,...,...,...,...,...
4995,.\data\mixed_recordings\SNR_0.1\pwr-viu-unk-tr...,.\data\mixed_recordings\SNR_0.5\pwr-viu-unk-tr...,.\data\mixed_recordings\SNR_1\pwr-viu-unk-trai...,.\data\mixed_recordings\SNR_10\pwr-viu-unk-tra...,.\data\mixed_recordings\SNR_100\pwr-viu-unk-tr...,.\data\mixed_recordings\SNR_25\pwr-viu-unk-tra...,.\data\mixed_recordings\SNR_5\pwr-viu-unk-trai...,.\data\mixed_recordings\SNR_50\pwr-viu-unk-tra...
4996,.\data\mixed_recordings\SNR_0.1\pwr-viu-unk-tr...,.\data\mixed_recordings\SNR_0.5\pwr-viu-unk-tr...,.\data\mixed_recordings\SNR_1\pwr-viu-unk-trai...,.\data\mixed_recordings\SNR_10\pwr-viu-unk-tra...,.\data\mixed_recordings\SNR_100\pwr-viu-unk-tr...,.\data\mixed_recordings\SNR_25\pwr-viu-unk-tra...,.\data\mixed_recordings\SNR_5\pwr-viu-unk-trai...,.\data\mixed_recordings\SNR_50\pwr-viu-unk-tra...
4997,.\data\mixed_recordings\SNR_0.1\pwr-viu-unk-tr...,.\data\mixed_recordings\SNR_0.5\pwr-viu-unk-tr...,.\data\mixed_recordings\SNR_1\pwr-viu-unk-trai...,.\data\mixed_recordings\SNR_10\pwr-viu-unk-tra...,.\data\mixed_recordings\SNR_100\pwr-viu-unk-tr...,.\data\mixed_recordings\SNR_25\pwr-viu-unk-tra...,.\data\mixed_recordings\SNR_5\pwr-viu-unk-trai...,.\data\mixed_recordings\SNR_50\pwr-viu-unk-tra...
4998,.\data\mixed_recordings\SNR_0.1\pwr-viu-unk-tr...,.\data\mixed_recordings\SNR_0.5\pwr-viu-unk-tr...,.\data\mixed_recordings\SNR_1\pwr-viu-unk-trai...,.\data\mixed_recordings\SNR_10\pwr-viu-unk-tra...,.\data\mixed_recordings\SNR_100\pwr-viu-unk-tr...,.\data\mixed_recordings\SNR_25\pwr-viu-unk-tra...,.\data\mixed_recordings\SNR_5\pwr-viu-unk-trai...,.\data\mixed_recordings\SNR_50\pwr-viu-unk-tra...


In [52]:
df_audioname = full_df[['audioname', 'ref_orig']]

# Create a new dataframe for 'audiofile_mixed' and 'SNR_100'
df_audiofile = full_df[['audiofile_mixed', 'whisper_SNR_-3']]

# Merge the two dataframes on 'audioname' and 'audiofile_mixed'
result_df = pd.merge(df_audioname, df_audiofile, how='outer', left_on='audioname', right_on='audiofile_mixed')

# Drop the redundant column and rename columns
#result_df = result_df.drop(columns=['audiofile_mixed']).rename(columns={'ref_orig': 'ref_orig', 'SNR_100': 'aligned_SNR_100'})

# Display the result_df
result_df

Unnamed: 0,audioname,ref_orig,audiofile_mixed,whisper_SNR_-3
0,fair-mls-20-train-0009-00038,rycerze moi współ bracia boleśni bardzośmy mał...,fair-mls-20-train-0009-00038,"Rycerze moi, współbracia boleśni, Bardzośmy m..."
1,fair-mls-20-train-0009-00044,kantor choć guza dostał wpośród czoła gdy post...,fair-mls-20-train-0009-00044,"Kantor, choć guza dostał w pośród czoła, gdy ..."
2,fair-mls-20-train-0009-00067,przeszłe przypadki gdy dobrze pamięta zmyśla g...,fair-mls-20-train-0009-00067,"Przeszło przypadki, gdy dobrze pamięta, myśli..."
3,fair-mls-20-train-0009-00079,nie przykrzy własnym hołdownikom ani swemu mon...,fair-mls-20-train-0009-00079,"Nie przykrzy własnym hołdownikom, Ani swemu m..."
4,fair-mls-20-train-0009-00080,nieskończenie pięknieżto przecie patrzać na św...,fair-mls-20-train-0009-00080,"Piękny, że to przecie patrzyć na świat z góry..."
...,...,...,...,...
4995,pwr-viu-unk-train-0001-04231,duże litery,pwr-viu-unk-train-0001-04231,Służę literę.
4996,pwr-viu-unk-train-0001-04241,duże litery,pwr-viu-unk-train-0001-04241,Duże litery.
4997,pwr-viu-unk-train-0001-04251,małe litery,pwr-viu-unk-train-0001-04251,Mała literka.
4998,pwr-viu-unk-train-0001-04261,małe litery,pwr-viu-unk-train-0001-04261,Mało mi tego.


In [53]:
wer = load_metric('wer')
print("WER: {:2f}".format(100 * wer.compute(predictions=result_df["whisper_SNR_-3"], references=result_df["ref_orig"])))

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


WER: 47.315779


In [14]:
file_paths = snr_df['SNR_10']

# Use the os.path.basename function to extract only the file name from each path
snr_df['audiofile'] = file_paths.apply(lambda x: os.path.basename(x))
snr_df['audiofile'] = snr_df['audiofile'].str.replace('.wav','')

# Now df['file_name'] contains only the file names
print(snr_df['audiofile'])

full_df['audiofile_mixed'] = snr_df['audiofile']

0       fair-mls-20-train-0009-00038
1       fair-mls-20-train-0009-00044
2       fair-mls-20-train-0009-00067
3       fair-mls-20-train-0009-00079
4       fair-mls-20-train-0009-00080
                    ...             
4995    pwr-viu-unk-train-0001-04231
4996    pwr-viu-unk-train-0001-04241
4997    pwr-viu-unk-train-0001-04251
4998    pwr-viu-unk-train-0001-04261
4999    pwr-viu-unk-train-0001-04267
Name: audiofile, Length: 5000, dtype: object



### Model alexcleu/wav2vec2-large-xlsr-polish 

In [6]:
import torch
import torchaudio
from datasets import load_dataset, load_metric
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import re

In [46]:
def create_dataframe_from_folders(main_folder_path):
    data = {}

    for folder_name in os.listdir(main_folder_path):
        folder_path = os.path.join(main_folder_path, folder_name)

        if os.path.isdir(folder_path):
            file_paths = [os.path.join(folder_path, file) for file in os.listdir(folder_path) if os.path.isfile(os.path.join(folder_path, file))]
            data[folder_name] = file_paths

    df = pd.DataFrame(data)
    return df

In [47]:
mixed_recordings_path = '.\\data\\mixed_recordings\\'
snr_dataframe = create_dataframe_from_folders(mixed_recordings_path)

In [48]:
snr_dataframe

Unnamed: 0,SNR_-10,SNR_-15,SNR_-3,SNR_-5,SNR_0.1,SNR_0.5,SNR_1,SNR_10,SNR_100,SNR_25,SNR_5,SNR_50
0,.\data\mixed_recordings\SNR_-10\fair-mls-20-tr...,.\data\mixed_recordings\SNR_-15\fair-mls-20-tr...,.\data\mixed_recordings\SNR_-3\fair-mls-20-tra...,.\data\mixed_recordings\SNR_-5\fair-mls-20-tra...,.\data\mixed_recordings\SNR_0.1\fair-mls-20-tr...,.\data\mixed_recordings\SNR_0.5\fair-mls-20-tr...,.\data\mixed_recordings\SNR_1\fair-mls-20-trai...,.\data\mixed_recordings\SNR_10\fair-mls-20-tra...,.\data\mixed_recordings\SNR_100\fair-mls-20-tr...,.\data\mixed_recordings\SNR_25\fair-mls-20-tra...,.\data\mixed_recordings\SNR_5\fair-mls-20-trai...,.\data\mixed_recordings\SNR_50\fair-mls-20-tra...
1,.\data\mixed_recordings\SNR_-10\fair-mls-20-tr...,.\data\mixed_recordings\SNR_-15\fair-mls-20-tr...,.\data\mixed_recordings\SNR_-3\fair-mls-20-tra...,.\data\mixed_recordings\SNR_-5\fair-mls-20-tra...,.\data\mixed_recordings\SNR_0.1\fair-mls-20-tr...,.\data\mixed_recordings\SNR_0.5\fair-mls-20-tr...,.\data\mixed_recordings\SNR_1\fair-mls-20-trai...,.\data\mixed_recordings\SNR_10\fair-mls-20-tra...,.\data\mixed_recordings\SNR_100\fair-mls-20-tr...,.\data\mixed_recordings\SNR_25\fair-mls-20-tra...,.\data\mixed_recordings\SNR_5\fair-mls-20-trai...,.\data\mixed_recordings\SNR_50\fair-mls-20-tra...
2,.\data\mixed_recordings\SNR_-10\fair-mls-20-tr...,.\data\mixed_recordings\SNR_-15\fair-mls-20-tr...,.\data\mixed_recordings\SNR_-3\fair-mls-20-tra...,.\data\mixed_recordings\SNR_-5\fair-mls-20-tra...,.\data\mixed_recordings\SNR_0.1\fair-mls-20-tr...,.\data\mixed_recordings\SNR_0.5\fair-mls-20-tr...,.\data\mixed_recordings\SNR_1\fair-mls-20-trai...,.\data\mixed_recordings\SNR_10\fair-mls-20-tra...,.\data\mixed_recordings\SNR_100\fair-mls-20-tr...,.\data\mixed_recordings\SNR_25\fair-mls-20-tra...,.\data\mixed_recordings\SNR_5\fair-mls-20-trai...,.\data\mixed_recordings\SNR_50\fair-mls-20-tra...
3,.\data\mixed_recordings\SNR_-10\fair-mls-20-tr...,.\data\mixed_recordings\SNR_-15\fair-mls-20-tr...,.\data\mixed_recordings\SNR_-3\fair-mls-20-tra...,.\data\mixed_recordings\SNR_-5\fair-mls-20-tra...,.\data\mixed_recordings\SNR_0.1\fair-mls-20-tr...,.\data\mixed_recordings\SNR_0.5\fair-mls-20-tr...,.\data\mixed_recordings\SNR_1\fair-mls-20-trai...,.\data\mixed_recordings\SNR_10\fair-mls-20-tra...,.\data\mixed_recordings\SNR_100\fair-mls-20-tr...,.\data\mixed_recordings\SNR_25\fair-mls-20-tra...,.\data\mixed_recordings\SNR_5\fair-mls-20-trai...,.\data\mixed_recordings\SNR_50\fair-mls-20-tra...
4,.\data\mixed_recordings\SNR_-10\fair-mls-20-tr...,.\data\mixed_recordings\SNR_-15\fair-mls-20-tr...,.\data\mixed_recordings\SNR_-3\fair-mls-20-tra...,.\data\mixed_recordings\SNR_-5\fair-mls-20-tra...,.\data\mixed_recordings\SNR_0.1\fair-mls-20-tr...,.\data\mixed_recordings\SNR_0.5\fair-mls-20-tr...,.\data\mixed_recordings\SNR_1\fair-mls-20-trai...,.\data\mixed_recordings\SNR_10\fair-mls-20-tra...,.\data\mixed_recordings\SNR_100\fair-mls-20-tr...,.\data\mixed_recordings\SNR_25\fair-mls-20-tra...,.\data\mixed_recordings\SNR_5\fair-mls-20-trai...,.\data\mixed_recordings\SNR_50\fair-mls-20-tra...
...,...,...,...,...,...,...,...,...,...,...,...,...
4995,.\data\mixed_recordings\SNR_-10\pwr-viu-unk-tr...,.\data\mixed_recordings\SNR_-15\pwr-viu-unk-tr...,.\data\mixed_recordings\SNR_-3\pwr-viu-unk-tra...,.\data\mixed_recordings\SNR_-5\pwr-viu-unk-tra...,.\data\mixed_recordings\SNR_0.1\pwr-viu-unk-tr...,.\data\mixed_recordings\SNR_0.5\pwr-viu-unk-tr...,.\data\mixed_recordings\SNR_1\pwr-viu-unk-trai...,.\data\mixed_recordings\SNR_10\pwr-viu-unk-tra...,.\data\mixed_recordings\SNR_100\pwr-viu-unk-tr...,.\data\mixed_recordings\SNR_25\pwr-viu-unk-tra...,.\data\mixed_recordings\SNR_5\pwr-viu-unk-trai...,.\data\mixed_recordings\SNR_50\pwr-viu-unk-tra...
4996,.\data\mixed_recordings\SNR_-10\pwr-viu-unk-tr...,.\data\mixed_recordings\SNR_-15\pwr-viu-unk-tr...,.\data\mixed_recordings\SNR_-3\pwr-viu-unk-tra...,.\data\mixed_recordings\SNR_-5\pwr-viu-unk-tra...,.\data\mixed_recordings\SNR_0.1\pwr-viu-unk-tr...,.\data\mixed_recordings\SNR_0.5\pwr-viu-unk-tr...,.\data\mixed_recordings\SNR_1\pwr-viu-unk-trai...,.\data\mixed_recordings\SNR_10\pwr-viu-unk-tra...,.\data\mixed_recordings\SNR_100\pwr-viu-unk-tr...,.\data\mixed_recordings\SNR_25\pwr-viu-unk-tra...,.\data\mixed_recordings\SNR_5\pwr-viu-unk-trai...,.\data\mixed_recordings\SNR_50\pwr-viu-unk-tra...
4997,.\data\mixed_recordings\SNR_-10\pwr-viu-unk-tr...,.\data\mixed_recordings\SNR_-15\pwr-viu-unk-tr...,.\data\mixed_recordings\SNR_-3\pwr-viu-unk-tra...,.\data\mixed_recordings\SNR_-5\pwr-viu-unk-tra...,.\data\mixed_recordings\SNR_0.1\pwr-viu-unk-tr...,.\data\mixed_recordings\SNR_0.5\pwr-viu-unk-tr...,.\data\mixed_recordings\SNR_1\pwr-viu-unk-trai...,.\data\mixed_recordings\SNR_10\pwr-viu-unk-tra...,.\data\mixed_recordings\SNR_100\pwr-viu-unk-tr...,.\data\mixed_recordings\SNR_25\pwr-viu-unk-tra...,.\data\mixed_recordings\SNR_5\pwr-viu-unk-trai...,.\data\mixed_recordings\SNR_50\pwr-viu-unk-tra...
4998,.\data\mixed_recordings\SNR_-10\pwr-viu-unk-tr...,.\data\mixed_recordings\SNR_-15\pwr-viu-unk-tr...,.\data\mixed_recordings\SNR_-3\pwr-viu-unk-tra...,.\data\mixed_recordings\SNR_-5\pwr-viu-unk-tra...,.\data\mixed_recordings\SNR_0.1\pwr-viu-unk-tr...,.\data\mixed_recordings\SNR_0.5\pwr-viu-unk-tr...,.\data\mixed_recordings\SNR_1\pwr-viu-unk-trai...,.\data\mixed_recordings\SNR_10\pwr-viu-unk-tra...,.\data\mixed_recordings\SNR_100\pwr-viu-unk-tr...,.\data\mixed_recordings\SNR_25\pwr-viu-unk-tra...,.\data\mixed_recordings\SNR_5\pwr-viu-unk-trai...,.\data\mixed_recordings\SNR_50\pwr-viu-unk-tra...


In [18]:
full_df

Unnamed: 0,audioname,dataset,ref_orig,sampling_rate,audiopath_local,audiopath_project,noise_path,noise_class
0,mozilla-common_voice_15-23-train-2856-01818,mozilla-common_voice_15-23,"Jest także trzecia sprawa, która w czasie tej ...",16000,C:\Users\Eryk\.cache\huggingface\datasets\down...,./data/testing_batch/clear/mozilla-common_voic...,.\data\UrbanSound8K\audio\fold2\156893-7-9-0.wav,jackhammer
1,pjatk-clarin_studio-15-train-0457-00001,pjatk-clarin_studio-15,dżuma wziernik przemianę księdzu krzywdzen...,16000,C:\Users\Eryk\.cache\huggingface\datasets\down...,./data/testing_batch/clear/pjatk-clarin_studio...,.\data\UrbanSound8K\audio\fold1\40722-8-0-4.wav,siren
2,pjatk-clarin_mobile-15-train-0083-00007,pjatk-clarin_mobile-15,w piątek po południu była przesłuchiwana przez...,16000,C:\Users\Eryk\.cache\huggingface\datasets\down...,./data/testing_batch/clear/pjatk-clarin_mobile...,.\data\UrbanSound8K\audio\fold8\125678-7-0-4.wav,jackhammer
3,pwr-maleset-unk-train-0001-03097,pwr-maleset-unk,jeśli chcesz zostanę w domu,16000,C:\Users\Eryk\.cache\huggingface\datasets\down...,./data/testing_batch/clear/pwr-maleset-unk-tra...,.\data\UrbanSound8K\audio\fold9\105029-7-2-16.wav,jackhammer
4,mozilla-common_voice_15-23-train-2862-00017,mozilla-common_voice_15-23,Tekst nie opiera się na żadnych podstawach nau...,16000,C:\Users\Eryk\.cache\huggingface\datasets\down...,./data/testing_batch/clear/mozilla-common_voic...,.\data\UrbanSound8K\audio\fold10\99192-4-0-54.wav,drilling
...,...,...,...,...,...,...,...,...
4995,mailabs-corpus_librivox-19-train-2023-00011,mailabs-corpus_librivox-19,Nareszcie zniecierpliwiony kazał zamurować okn...,16000,C:\Users\Eryk\.cache\huggingface\datasets\down...,./data/testing_batch/clear/mailabs-corpus_libr...,.\data\VISC Dataset SON\7 (249).wav,crossover_interior
4996,pjatk-clarin_studio-15-train-0289-00016,pjatk-clarin_studio-15,dostała za ten reportaż nagrodę pulicera ...,16000,C:\Users\Eryk\.cache\huggingface\datasets\down...,./data/testing_batch/clear/pjatk-clarin_studio...,.\data\VISC Dataset SON\2 (172).wav,minibus_interior
4997,mozilla-common_voice_15-23-train-2846-00448,mozilla-common_voice_15-23,Dotyczy ona zasadniczo dwóch kwestii,16000,C:\Users\Eryk\.cache\huggingface\datasets\down...,./data/testing_batch/clear/mozilla-common_voic...,.\data\VISC Dataset SON\5 (164).wav,jeep_interior
4998,pjatk-clarin_mobile-15-train-0035-00018,pjatk-clarin_mobile-15,każdy starał się odlecieć najbliższym samolote...,16000,C:\Users\Eryk\.cache\huggingface\datasets\down...,./data/testing_batch/clear/pjatk-clarin_mobile...,.\data\VISC Dataset SON\6 (638).wav,truck_interior


In [8]:
snr_dataframe['sentence'] = full_df['ref_orig']

In [9]:
print(torch.cuda.is_available())

True


In [12]:
processor = Wav2Vec2Processor.from_pretrained("alexcleu/wav2vec2-large-xlsr-polish")
model = Wav2Vec2ForCTC.from_pretrained("alexcleu/wav2vec2-large-xlsr-polish")
resampler = torchaudio.transforms.Resample(48_000, 16_000)

def speech_file_to_array_fn(batch):
    batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower()
  
    speech_array, sampling_rate = torchaudio.load(batch["SNR_100"])
  
    batch["speech_SNR_100"] = resampler(speech_array).squeeze().numpy()
  
    return batch

wer = load_metric("wer")
processor = Wav2Vec2Processor.from_pretrained("alexcleu/wav2vec2-large-xlsr-polish")
model = Wav2Vec2ForCTC.from_pretrained("alexcleu/wav2vec2-large-xlsr-polish")
model.to("cuda")
chars_to_ignore_regex = '[\\\\\\\\\\\\\\\\,\\\\\\\\\\\\\\\\?\\\\\\\\\\\\\\\\.\\\\\\\\\\\\\\\\!\\\\\\\\\\\\\\\\-\\\\\\\\\\\\\\\\;\\\\\\\\\\\\\\\\:\\\\\\\\\\\\\\\\"\\\\\\\\\\\\\\\\“]'
resampler = torchaudio.transforms.Resample(48_000, 16_000)

# Assuming snr_dataframe is a DataFrame with columns like "sentence" and "SNR_100"
snr_dataframe = snr_dataframe.map(speech_file_to_array_fn)

def evaluate(batch):
    inputs = processor(batch["speech_SNR_100"], sampling_rate=16_000, return_tensors="pt", padding=True)
    
    with torch.no_grad():
        logits = model(inputs.input_values.to("cuda"), attention_mask=inputs.attention_mask.to("cuda")).logits
    
    pred_ids = torch.argmax(logits, dim=-1)
    
    batch["pred_strings"] = processor.batch_decode(pred_ids)
    
    return batch

# Assuming snr_dataframe is a DataFrame with columns like "sentence" and "SNR_100"
result = snr_dataframe.map(evaluate, batched=True, batch_size=8)
print("WER: {:2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["sentence"])))


Some weights of the model checkpoint at alexcleu/wav2vec2-large-xlsr-polish were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at alexcleu/wav2vec2-large-xlsr-polish and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably T

TypeError: string indices must be integers, not 'str'

### Model jonatasgrosman/wav2vec2-large-xlsr-53-polish

In [4]:
import torch
import librosa
from datasets import load_dataset
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

In [15]:

LANG_ID = "pl"
MODEL_ID = "jonatasgrosman/wav2vec2-large-xlsr-53-polish"
SAMPLES = 5


processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)
model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)

def speech_file_to_array_fn(batch):
    speech_array, sampling_rate = librosa.load(batch["SNR100"], sr=16_000)
    batch["speech"] = speech_array
    batch["sentence"] = batch["ref_orig"].upper()
    return batch

df_test_wav = pd.DataFrame()
df_test_wav = speech_file_to_array_fn(df_whisper[0:5])
inputs = processor(df_test_wav['speech'], sampling_rate=16_000, return_tensors="pt", padding=True)


with torch.no_grad():
    logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits

predicted_ids = torch.argmax(logits, dim=-1)
predicted_sentences = processor.batch_decode(predicted_ids)

for i, predicted_sentence in enumerate(predicted_sentences):
    print("-" * 100)
    print("Reference:", test_dataset[i]["sentence"])
    print("Prediction:", predicted_sentence)

Some weights of the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-polish were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-polish and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You

TypeError: Invalid file: 0    C:\Users\Eryk\.cache\huggingface\datasets\down...
1    C:\Users\Eryk\.cache\huggingface\datasets\down...
2    C:\Users\Eryk\.cache\huggingface\datasets\down...
3    C:\Users\Eryk\.cache\huggingface\datasets\down...
4    C:\Users\Eryk\.cache\huggingface\datasets\down...
Name: audiopath_local, dtype: object