### Installing necessary packages

In [None]:
#!pip install datasets
#!pip install huggingface_hub

### Loading packages

In [1]:
import os
import pandas as pd
from datasets import load_dataset, load_metric
from evaluate import load
from src.data_engineering import create_distribution_dict, creating_random_split_df
from src.audio_mixer import mixer
from tqdm.notebook import trange
import torch
from pydub import AudioSegment

### Downloading whole bigos v2 polish ASR dataset. WARNING, dataset contains ~80 GB od data

In [2]:
# if the dataset is already downloaded it just makes dictionary out downloaded data (approx. 10 seconds)
data = load_dataset("amu-cai/pl-asr-bigos-v2",'all', 'all')

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Running script


Loading dataset shards:   0%|          | 0/55 [00:00<?, ?it/s]

#### Creating dictionary showing database distribution inside whole database, getting dataset names iterating by dict (faster version - 110 sec)

In [6]:
#Iterating by Dataset dictionary to get datasets names

datasets_list = []
for i in trange(len(data['train'])):
    datasets_list.append(data['train'][i]['dataset'])

df_datasets_distribution = pd.DataFrame()
df_datasets_distribution['datasets'] = datasets_list

#creating dictionary showing database distribution inside whole database
dict_dst = create_distribution_dict(df_datasets_distribution['datasets'])
dict_dst

{'fair-mls-20': 0.305,
 'mozilla-common_voice_15-23': 0.233,
 'mailabs-corpus_librivox-19': 0.144,
 'pjatk-clarin_studio-15': 0.134,
 'pwr-maleset-unk': 0.046,
 'pjatk-clarin_mobile-15': 0.035,
 'google-fleurs-22': 0.035,
 'pwr-viu-unk': 0.026,
 'pwr-azon_read-20': 0.022,
 'pwr-shortwords-unk': 0.009,
 'polyai-minds14-21': 0.006,
 'pwr-azon_spont-20': 0.004}

### Creating dataframe from the training set with 2500 randomly chosen examples


In [24]:
df_model_testing = creating_random_split_df(data['train'], 2500)

### Getting the distribution of each dataset in choosen test-set

In [25]:
create_distribution_dict(df_model_testing['dataset'])

{'fair-mls-20': 0.29,
 'mozilla-common_voice_15-23': 0.247,
 'mailabs-corpus_librivox-19': 0.145,
 'pjatk-clarin_studio-15': 0.138,
 'pwr-maleset-unk': 0.046,
 'google-fleurs-22': 0.042,
 'pjatk-clarin_mobile-15': 0.036,
 'pwr-viu-unk': 0.026,
 'pwr-azon_read-20': 0.015,
 'pwr-shortwords-unk': 0.008,
 'polyai-minds14-21': 0.004,
 'pwr-azon_spont-20': 0.004}

We can compare to see distribution of sets from whole dataset and from our choosen batch

In [8]:
df_model_testing

Unnamed: 0,audioname,split,dataset,speaker_id,ref_orig,audio,samplingrate_orig,sampling_rate,audiopath_bigos,audiopath_local
0,pjatk-clarin_mobile-15-train-0029-00026,train,pjatk-clarin_mobile-15,29,malinda musiała stanowczo oznajmić że nie potr...,{'path': 'pjatk-clarin_mobile-15-train-0029-00...,16000,16000,pjatk-clarin_mobile-15-train-0029-00026.wav,C:\Users\Eryk\.cache\huggingface\datasets\down...
1,fair-mls-20-train-0024-00346,train,fair-mls-20,24,szczęśliwej podróży do widzenia wołał lord puc...,"{'path': 'fair-mls-20-train-0024-00346.wav', '...",16000,16000,fair-mls-20-train-0024-00346.wav,C:\Users\Eryk\.cache\huggingface\datasets\down...
2,mailabs-corpus_librivox-19-train-1002-00097,train,mailabs-corpus_librivox-19,1002,Wreszcie machnął ręką i poszedł dalej może myś...,{'path': 'mailabs-corpus_librivox-19-train-100...,16000,16000,mailabs-corpus_librivox-19-train-1002-00097.wav,C:\Users\Eryk\.cache\huggingface\datasets\down...
3,mailabs-corpus_librivox-19-train-2019-00258,train,mailabs-corpus_librivox-19,2019,Co do mnie przekonany jestem że uśpiono mnie z...,{'path': 'mailabs-corpus_librivox-19-train-201...,16000,16000,mailabs-corpus_librivox-19-train-2019-00258.wav,C:\Users\Eryk\.cache\huggingface\datasets\down...
4,google-fleurs-22-train-0001-01189,train,google-fleurs-22,1,imprezy te standardowo trwają od trzech do sze...,{'path': 'google-fleurs-22-train-0001-01189.wa...,16000,16000,google-fleurs-22-train-0001-01189.wav,C:\Users\Eryk\.cache\huggingface\datasets\down...
...,...,...,...,...,...,...,...,...,...,...
2495,mozilla-common_voice_15-23-train-2851-01283,train,mozilla-common_voice_15-23,2851,Przede wszystkim musimy trwać przy instrumenta...,{'path': 'mozilla-common_voice_15-23-train-285...,16000,16000,mozilla-common_voice_15-23-train-2851-01283.wav,C:\Users\Eryk\.cache\huggingface\datasets\down...
2496,mozilla-common_voice_15-23-train-2849-00111,train,mozilla-common_voice_15-23,2849,Tak rozstali się.,{'path': 'mozilla-common_voice_15-23-train-284...,16000,16000,mozilla-common_voice_15-23-train-2849-00111.wav,C:\Users\Eryk\.cache\huggingface\datasets\down...
2497,fair-mls-20-train-0024-05850,train,fair-mls-20,24,nie bój się pan oni go nie wypuszczą od siebie...,"{'path': 'fair-mls-20-train-0024-05850.wav', '...",16000,16000,fair-mls-20-train-0024-05850.wav,C:\Users\Eryk\.cache\huggingface\datasets\down...
2498,pjatk-clarin_studio-15-train-0539-00019,train,pjatk-clarin_studio-15,539,my mówimy że najlepiej zadba o siebie o...,{'path': 'pjatk-clarin_studio-15-train-0539-00...,16000,16000,pjatk-clarin_studio-15-train-0539-00019.wav,C:\Users\Eryk\.cache\huggingface\datasets\down...


### Saving audio dataframe to parquet

In [26]:
df_model_testing=df_model_testing[['audioname','dataset','ref_orig','sampling_rate','audiopath_bigos','audiopath_local']]
#line commented not to overwrite accidentally
#df_model_testing.to_parquet('./data/parquets/testing_batch.parquet.gzip', compression = 'gzip')

## Noise 

### UrbanSound dataset

In [7]:
df_urban_sounds = pd.read_csv('./data/UrbanSound8K/metadata/UrbanSound8K.csv')
df_urban_sounds

Unnamed: 0,slice_file_name,fsID,start,end,salience,fold,classID,class
0,100032-3-0-0.wav,100032,0.000000,0.317551,1,5,3,dog_bark
1,100263-2-0-117.wav,100263,58.500000,62.500000,1,5,2,children_playing
2,100263-2-0-121.wav,100263,60.500000,64.500000,1,5,2,children_playing
3,100263-2-0-126.wav,100263,63.000000,67.000000,1,5,2,children_playing
4,100263-2-0-137.wav,100263,68.500000,72.500000,1,5,2,children_playing
...,...,...,...,...,...,...,...,...
8727,99812-1-2-0.wav,99812,159.522205,163.522205,2,7,1,car_horn
8728,99812-1-3-0.wav,99812,181.142431,183.284976,2,7,1,car_horn
8729,99812-1-4-0.wav,99812,242.691902,246.197885,2,7,1,car_horn
8730,99812-1-5-0.wav,99812,253.209850,255.741948,2,7,1,car_horn


In [8]:
#deleting classes that are not necessary, and leaving: dog_bark, air_conditioner, jackhammer, drilling
df_urban_sounds = df_urban_sounds[~df_urban_sounds['classID'].isin([1, 2, 5, 6, 8, 9])]

In [9]:
print('Each class distributution: ',create_distribution_dict(df_urban_sounds['class'],False))
print('Length of the result UrbanNoises dataframe: ',len(df_urban_sounds['class']))

Each class distributution:  {'dog_bark': 1000, 'air_conditioner': 1000, 'jackhammer': 1000, 'drilling': 1000}
Length of the result UrbanNoises dataframe:  4000


In [11]:
df_urban_sounds_2000 = creating_random_split_df(df_urban_sounds, 2000)
df_urban_sounds_2000

Unnamed: 0,slice_file_name,fsID,start,end,salience,fold,classID,class
0,79089-0-0-106.wav,79089,62.664375,66.664375,2,9,0,air_conditioner
1,167750-4-1-0.wav,167750,14.330286,18.330286,1,10,4,drilling
2,165039-7-5-0.wav,165039,64.924130,68.924130,1,3,7,jackhammer
3,57320-0-0-22.wav,57320,11.000000,15.000000,2,1,0,air_conditioner
4,146709-0-0-20.wav,146709,10.000000,14.000000,1,4,0,air_conditioner
...,...,...,...,...,...,...,...,...
1995,194754-3-0-1.wav,194754,0.812357,4.812357,1,7,3,dog_bark
1996,30206-7-0-1.wav,30206,0.500000,4.500000,1,6,7,jackhammer
1997,46669-4-0-54.wav,46669,27.000000,31.000000,1,1,4,drilling
1998,24364-4-0-0.wav,24364,0.633371,4.633371,1,6,4,drilling


##### Creating path to each file in dataframe

In [12]:
# Function to create the file path
def create_file_path(row, folder_path):
    folder_number = row['fold']
    file_name = row['slice_file_name']
    file_path = os.path.join(folder_path, f'fold{folder_number}', file_name)
    return file_path

base_path = '.\\data\\UrbanSound8K\\audio\\'

# Apply the function to create the new column
df_urban_sounds_2000['audio_path'] = df_urban_sounds_2000.apply(create_file_path, axis=1, folder_path=base_path)

In [13]:
df_urban_sounds_2000

Unnamed: 0,slice_file_name,fsID,start,end,salience,fold,classID,class,audio_path
0,79089-0-0-106.wav,79089,62.664375,66.664375,2,9,0,air_conditioner,.\data\UrbanSound8K\audio\fold9\79089-0-0-106.wav
1,167750-4-1-0.wav,167750,14.330286,18.330286,1,10,4,drilling,.\data\UrbanSound8K\audio\fold10\167750-4-1-0.wav
2,165039-7-5-0.wav,165039,64.924130,68.924130,1,3,7,jackhammer,.\data\UrbanSound8K\audio\fold3\165039-7-5-0.wav
3,57320-0-0-22.wav,57320,11.000000,15.000000,2,1,0,air_conditioner,.\data\UrbanSound8K\audio\fold1\57320-0-0-22.wav
4,146709-0-0-20.wav,146709,10.000000,14.000000,1,4,0,air_conditioner,.\data\UrbanSound8K\audio\fold4\146709-0-0-20.wav
...,...,...,...,...,...,...,...,...,...
1995,194754-3-0-1.wav,194754,0.812357,4.812357,1,7,3,dog_bark,.\data\UrbanSound8K\audio\fold7\194754-3-0-1.wav
1996,30206-7-0-1.wav,30206,0.500000,4.500000,1,6,7,jackhammer,.\data\UrbanSound8K\audio\fold6\30206-7-0-1.wav
1997,46669-4-0-54.wav,46669,27.000000,31.000000,1,1,4,drilling,.\data\UrbanSound8K\audio\fold1\46669-4-0-54.wav
1998,24364-4-0-0.wav,24364,0.633371,4.633371,1,6,4,drilling,.\data\UrbanSound8K\audio\fold6\24364-4-0-0.wav


### Creating dataframe from Vechicle Interior Sound folder

In [8]:
visc_folder_path = '.\\data\\VISC Dataset SON\\'

file_paths = []
class_ids = []
file_names = []


# Traverse the directory
for filename in os.listdir(visc_folder_path):
    # Join the folder path with the filename to get the full file path
    file_path = os.path.join(visc_folder_path, filename)
    
    # Extract the class ID from the file name
    class_id = int(filename.split()[0])
    
    # Append the values to the lists
    file_names.append(filename)
    file_paths.append(file_path)
    class_ids.append(class_id)

# Create a DataFrame
df = pd.DataFrame({'file_names': file_names, 'file_path': file_paths, 'class_id': class_ids})

In [9]:
df.to_parquet('./data/parquets/VISC_noises.parquet.gzip', compression= 'gzip')

#### Ading each noise class name to the dataframe as a column

In [3]:
visc_noises_dataframe = creating_random_split_df(df,500)

visc_noises_dict = {1 : 'bus_interior',
                    2 : 'minibus_interior',
                    3 : 'pickup_interior',
                    4 : 'sports_car_interior',
                    5 : 'jeep_interior',
                    6 : 'truck_interior',
                    7 : 'crossover_interior',
                    8 : 'other_car_interior'}
visc_noises_dataframe['class'] = visc_noises_dataframe['class_id'].map(visc_noises_dict)

In [4]:
visc_noises_dataframe

Unnamed: 0,file_path,class_id,class
0,.\data\VISC Dataset SON\6 (174).wav,6,truck_interior
1,.\data\VISC Dataset SON\6 (863).wav,6,truck_interior
2,.\data\VISC Dataset SON\6 (417).wav,6,truck_interior
3,.\data\VISC Dataset SON\5 (280).wav,5,jeep_interior
4,.\data\VISC Dataset SON\8 (242).wav,8,other_car_interior
...,...,...,...
495,.\data\VISC Dataset SON\4 (9).wav,4,sports_car_interior
496,.\data\VISC Dataset SON\4 (96).wav,4,sports_car_interior
497,.\data\VISC Dataset SON\7 (734).wav,7,crossover_interior
498,.\data\VISC Dataset SON\6 (473).wav,6,truck_interior


### Creating one dataframe with noises

In [22]:
visc_noises_df = visc_noises_dataframe[['file_path','class']]
urban_noises_df = df_urban_sounds_2000[['audio_path','class']]
urban_noises_df['file_path'] = urban_noises_df['audio_path']
urban_noises_df_2 = urban_noises_df[['file_path','class']]
noise_df = pd.concat([urban_noises_df_2,visc_noises_df],ignore_index=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  urban_noises_df['file_path'] = urban_noises_df['audio_path']


### Saving noise dataframe to parquet

In [29]:
noise_df

Unnamed: 0,file_path,class
0,.\data\UrbanSound8K\audio\fold9\79089-0-0-106.wav,air_conditioner
1,.\data\UrbanSound8K\audio\fold10\167750-4-1-0.wav,drilling
2,.\data\UrbanSound8K\audio\fold3\165039-7-5-0.wav,jackhammer
3,.\data\UrbanSound8K\audio\fold1\57320-0-0-22.wav,air_conditioner
4,.\data\UrbanSound8K\audio\fold4\146709-0-0-20.wav,air_conditioner
...,...,...
2495,.\data\VISC Dataset SON\4 (36).wav,sports_car_interior
2496,.\data\VISC Dataset SON\1 (251).wav,bus_interior
2497,.\data\VISC Dataset SON\1 (692).wav,bus_interior
2498,.\data\VISC Dataset SON\2 (492).wav,minibus_interior


In [27]:
#noise_df.to_parquet('./data/parquets/noise_df.parquet.gzip', compression = 'gzip')

## Creating one dataframe with everything combined

In [2]:
df_audio= pd.read_parquet('./data/parquets/testing_batch.parquet.gzip') 
df_noises = pd.read_parquet('./data/parquets/noise_df.parquet.gzip')

In [3]:
df_audio['noise_path'] = df_noises['file_path']
df_audio['noise_class'] = df_noises['class']
df_audio

Unnamed: 0,audioname,dataset,ref_orig,sampling_rate,audiopath_bigos,audiopath_local,noise_path,noise_class
0,fair-mls-20-train-0009-04739,fair-mls-20,tam nocne włóczęgi wołano z dachów jeżeli nie ...,16000,fair-mls-20-train-0009-04739.wav,C:\Users\Eryk\.cache\huggingface\datasets\down...,.\data\UrbanSound8K\audio\fold9\79089-0-0-106.wav,air_conditioner
1,pjatk-clarin_studio-15-train-0488-00003,pjatk-clarin_studio-15,w pracy studenci chcieliby przede wszystk...,16000,pjatk-clarin_studio-15-train-0488-00003.wav,C:\Users\Eryk\.cache\huggingface\datasets\down...,.\data\UrbanSound8K\audio\fold10\167750-4-1-0.wav,drilling
2,fair-mls-20-train-0009-05501,fair-mls-20,co to znaczy sam siebie zapytywał faraon czy g...,16000,fair-mls-20-train-0009-05501.wav,C:\Users\Eryk\.cache\huggingface\datasets\down...,.\data\UrbanSound8K\audio\fold3\165039-7-5-0.wav,jackhammer
3,fair-mls-20-train-0021-01519,fair-mls-20,tylko na piaszczystem wybrzeżu lub na łąkach b...,16000,fair-mls-20-train-0021-01519.wav,C:\Users\Eryk\.cache\huggingface\datasets\down...,.\data\UrbanSound8K\audio\fold1\57320-0-0-22.wav,air_conditioner
4,pjatk-clarin_studio-15-train-0335-00001,pjatk-clarin_studio-15,zaokrągla uziemienie księdzu liźnięcie rol...,16000,pjatk-clarin_studio-15-train-0335-00001.wav,C:\Users\Eryk\.cache\huggingface\datasets\down...,.\data\UrbanSound8K\audio\fold4\146709-0-0-20.wav,air_conditioner
...,...,...,...,...,...,...,...,...
2495,fair-mls-20-train-0009-06517,fair-mls-20,kazał zrobić spis wszystkich mężczyzn w państw...,16000,fair-mls-20-train-0009-06517.wav,C:\Users\Eryk\.cache\huggingface\datasets\down...,.\data\VISC Dataset SON\4 (36).wav,sports_car_interior
2496,mozilla-common_voice_15-23-train-2851-00218,mozilla-common_voice_15-23,"W odniesieniu do Lizbony, uczyniliśmy także po...",16000,mozilla-common_voice_15-23-train-2851-00218.wav,C:\Users\Eryk\.cache\huggingface\datasets\down...,.\data\VISC Dataset SON\1 (251).wav,bus_interior
2497,mozilla-common_voice_15-23-train-2856-01361,mozilla-common_voice_15-23,"Jej budżet to budżet, który wspiera inwestycje",16000,mozilla-common_voice_15-23-train-2856-01361.wav,C:\Users\Eryk\.cache\huggingface\datasets\down...,.\data\VISC Dataset SON\1 (692).wav,bus_interior
2498,fair-mls-20-train-0009-03165,fair-mls-20,upłynęło już kilka godzin po zachodzie słońca ...,16000,fair-mls-20-train-0009-03165.wav,C:\Users\Eryk\.cache\huggingface\datasets\down...,.\data\VISC Dataset SON\2 (492).wav,minibus_interior


In [20]:
df_audio.loc[75]

audioname                    pjatk-clarin_studio-15-train-0292-00018
dataset                                       pjatk-clarin_studio-15
ref_orig           bo  faktycznie  co  da  polsce  upublicznienie...
sampling_rate                                                  16000
audiopath_bigos          pjatk-clarin_studio-15-train-0292-00018.wav
audiopath_local    C:\Users\Eryk\.cache\huggingface\datasets\down...
noise_path           .\data\UrbanSound8K\audio\fold3\69598-4-3-0.wav
noise_class                                                 drilling
Name: 75, dtype: object

## Noises 

#### Normalising noise loudness

In [23]:
import os
import soundfile as sf
import pyloudnorm as pyln

def audio_normalizer(input_path, output_folder_path,output_file_name, loudness_value):

    # Load audio
    data, rate = sf.read(input_path)

    # Measure the loudness
    meter = pyln.Meter(rate, block_size =0.100)
    loudness = meter.integrated_loudness(data)

    # Loudness normalize audio to -12 dB LUFS
    loudness_normalized_audio = pyln.normalize.loudness(data, loudness, loudness_value)

    # Specify the output file paths
    loudness_normalized_output_path = os.path.join(output_folder_path, output_file_name)

    # Write normalized audio to files
    sf.write(loudness_normalized_output_path, loudness_normalized_audio, rate)


### Normalizing audio to - 20dB

In [13]:

files_paths = df_audio['audiopath_local'].to_list()
file_names = df_audio['audiopath_bigos'].to_list()
output_folder_path = "./data/mixed_recordings/normalised_recordings/audio_minus_20dB/"
for i in trange(len(files_paths)):
    audio_normalizer(files_paths[i], output_folder_path,file_names[i], -20.0)


  0%|          | 0/2500 [00:00<?, ?it/s]



### Normalizing noises to -20dB

In [24]:
files_paths = df_audio['noise_path'].to_list()
file_names = df_audio['audiopath_bigos'].to_list()
output_folder_path = "./data/mixed_recordings/normalised_recordings/noise_minus_20dB/"
for i in trange(len(files_paths)):
    audio_normalizer(files_paths[i], output_folder_path,file_names[i], -20.0)

  0%|          | 0/2500 [00:00<?, ?it/s]



### Adding normalised sounds paths to the dataframe

In [25]:
df_audio['normalised_audio_path'] = "./data/mixed_recordings/normalised_recordings/audio_minus_20dB/" + df_audio['audiopath_bigos']
df_audio['normalised_noise_path'] = "./data/mixed_recordings/normalised_recordings/noise_minus_20dB/" + df_audio['audiopath_bigos']

In [30]:
df_audio.loc[341]['normalised_noise_path']

'./data/mixed_recordings/normalised_recordings/noise_minus_20dB/google-fleurs-22-train-0001-02295.wav'

#### Checking loudness of the files to check if function works

In [14]:
import soundfile as sf
import pyloudnorm as pyln
def loudness_function(file_path):
    data,rate = sf.read(file_path)
    meter = pyln.Meter(rate)
    loudness = meter.integrated_loudness(data)
    return loudness


In [15]:

# Example usage:
folder_path = './data/mixed_recordings/normalised_recordings/minus_20dB/'
for filename in os.listdir(folder_path)[0:15]:
    if filename.endswith(".wav"):  # Adjust this condition based on your file format
        audio_file = os.path.join(folder_path, filename)
        mean_loudness,peak_loudness = calculate_loudness(audio_file)
        print(f"File: {filename}, Mean Loudness: {mean_loudness} dB, Peak Loudness: {peak_loudness}")
        loudnes = loudness_function(audio_file)
        print("\n Loudness:",  loudnes)

File: fair-mls-20-train-0009-00003.wav, Mean Loudness: 0.085548035800457 dB, Peak Loudness: 0.2265169769525528

 Loudness: -20.00000112362228
File: fair-mls-20-train-0009-00015.wav, Mean Loudness: 0.0797126367688179 dB, Peak Loudness: 0.2544344961643219

 Loudness: -19.999999688897617
File: fair-mls-20-train-0009-00021.wav, Mean Loudness: 0.05714057385921478 dB, Peak Loudness: 0.2113061249256134

 Loudness: -19.99999932826547
File: fair-mls-20-train-0009-00119.wav, Mean Loudness: 0.08406061679124832 dB, Peak Loudness: 0.20962868630886078

 Loudness: -20.00000084081724
File: fair-mls-20-train-0009-00149.wav, Mean Loudness: 0.08546153455972672 dB, Peak Loudness: 0.31954923272132874

 Loudness: -20.000000841516083
File: fair-mls-20-train-0009-00172.wav, Mean Loudness: 0.05835843086242676 dB, Peak Loudness: 0.20835955440998077

 Loudness: -19.999999791265463
File: fair-mls-20-train-0009-00189.wav, Mean Loudness: 0.08292762190103531 dB, Peak Loudness: 0.24532683193683624

 Loudness: -20.000

#### Creating folders with mixed data

In [33]:
snr_values = [100, 50, 25, 10, 5, 0.1, -1, -3, -10]

In [34]:

# Loop through each SNR value
for snr in snr_values:
    # Create a folder for the current SNR value
    folder_path = f'./data/mixed_recordings/SNR_{snr}'
    os.makedirs(folder_path, exist_ok=True)
    # Loop through the dataframe and mix files for the current SNR value
    snr_paths = []
    for index, row in df_audio.iterrows():
        signal_path = row['normalised_audio_path']
        noise_path = row['normalised_noise_path']
        audio_name = row['audiopath_bigos']
        save_path = os.path.join(folder_path, audio_name)  # Change the naming convention if needed
        snr_paths.append(save_path)

        # Call your mixer function here
        mixer(signal_path, noise_path, snr, save_path)
    
    column_name = f"audio_SNR_{snr}_path"
    df_audio[column_name] = snr_paths

In [36]:
#df_audio.to_parquet('./data/parquets/SNR_testing_dataset.gzip', compression = 'gzip')

### Model bark whisper v3 Large


In [53]:
wer = load_metric('wer')
print("WER: {:2f}".format(100 * wer.compute(predictions=result_df["whisper_SNR_-3"], references=result_df["ref_orig"])))

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


WER: 47.315779


In [14]:
file_paths = snr_df['SNR_10']

# Use the os.path.basename function to extract only the file name from each path
snr_df['audiofile'] = file_paths.apply(lambda x: os.path.basename(x))
snr_df['audiofile'] = snr_df['audiofile'].str.replace('.wav','')

# Now df['file_name'] contains only the file names
print(snr_df['audiofile'])

full_df['audiofile_mixed'] = snr_df['audiofile']

0       fair-mls-20-train-0009-00038
1       fair-mls-20-train-0009-00044
2       fair-mls-20-train-0009-00067
3       fair-mls-20-train-0009-00079
4       fair-mls-20-train-0009-00080
                    ...             
4995    pwr-viu-unk-train-0001-04231
4996    pwr-viu-unk-train-0001-04241
4997    pwr-viu-unk-train-0001-04251
4998    pwr-viu-unk-train-0001-04261
4999    pwr-viu-unk-train-0001-04267
Name: audiofile, Length: 5000, dtype: object



### Model alexcleu/wav2vec2-large-xlsr-polish 

In [6]:
import torch
import torchaudio
from datasets import load_dataset, load_metric
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import re

In [46]:
def create_dataframe_from_folders(main_folder_path):
    data = {}

    for folder_name in os.listdir(main_folder_path):
        folder_path = os.path.join(main_folder_path, folder_name)

        if os.path.isdir(folder_path):
            file_paths = [os.path.join(folder_path, file) for file in os.listdir(folder_path) if os.path.isfile(os.path.join(folder_path, file))]
            data[folder_name] = file_paths

    df = pd.DataFrame(data)
    return df

In [47]:
mixed_recordings_path = '.\\data\\mixed_recordings\\'
snr_dataframe = create_dataframe_from_folders(mixed_recordings_path)

In [8]:
snr_dataframe['sentence'] = full_df['ref_orig']

In [9]:
print(torch.cuda.is_available())

True


In [12]:
processor = Wav2Vec2Processor.from_pretrained("alexcleu/wav2vec2-large-xlsr-polish")
model = Wav2Vec2ForCTC.from_pretrained("alexcleu/wav2vec2-large-xlsr-polish")
resampler = torchaudio.transforms.Resample(48_000, 16_000)

def speech_file_to_array_fn(batch):
    batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower()
  
    speech_array, sampling_rate = torchaudio.load(batch["SNR_100"])
  
    batch["speech_SNR_100"] = resampler(speech_array).squeeze().numpy()
  
    return batch

wer = load_metric("wer")
processor = Wav2Vec2Processor.from_pretrained("alexcleu/wav2vec2-large-xlsr-polish")
model = Wav2Vec2ForCTC.from_pretrained("alexcleu/wav2vec2-large-xlsr-polish")
model.to("cuda")
chars_to_ignore_regex = '[\\\\\\\\\\\\\\\\,\\\\\\\\\\\\\\\\?\\\\\\\\\\\\\\\\.\\\\\\\\\\\\\\\\!\\\\\\\\\\\\\\\\-\\\\\\\\\\\\\\\\;\\\\\\\\\\\\\\\\:\\\\\\\\\\\\\\\\"\\\\\\\\\\\\\\\\“]'
resampler = torchaudio.transforms.Resample(48_000, 16_000)

# Assuming snr_dataframe is a DataFrame with columns like "sentence" and "SNR_100"
snr_dataframe = snr_dataframe.map(speech_file_to_array_fn)

def evaluate(batch):
    inputs = processor(batch["speech_SNR_100"], sampling_rate=16_000, return_tensors="pt", padding=True)
    
    with torch.no_grad():
        logits = model(inputs.input_values.to("cuda"), attention_mask=inputs.attention_mask.to("cuda")).logits
    
    pred_ids = torch.argmax(logits, dim=-1)
    
    batch["pred_strings"] = processor.batch_decode(pred_ids)
    
    return batch

# Assuming snr_dataframe is a DataFrame with columns like "sentence" and "SNR_100"
result = snr_dataframe.map(evaluate, batched=True, batch_size=8)
print("WER: {:2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["sentence"])))


Some weights of the model checkpoint at alexcleu/wav2vec2-large-xlsr-polish were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at alexcleu/wav2vec2-large-xlsr-polish and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably T

TypeError: string indices must be integers, not 'str'

### Model jonatasgrosman/wav2vec2-large-xlsr-53-polish

In [4]:
import torch
import librosa
from datasets import load_dataset
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

In [15]:

LANG_ID = "pl"
MODEL_ID = "jonatasgrosman/wav2vec2-large-xlsr-53-polish"
SAMPLES = 5


processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)
model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)

def speech_file_to_array_fn(batch):
    speech_array, sampling_rate = librosa.load(batch["SNR100"], sr=16_000)
    batch["speech"] = speech_array
    batch["sentence"] = batch["ref_orig"].upper()
    return batch

df_test_wav = pd.DataFrame()
df_test_wav = speech_file_to_array_fn(df_whisper[0:5])
inputs = processor(df_test_wav['speech'], sampling_rate=16_000, return_tensors="pt", padding=True)


with torch.no_grad():
    logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits

predicted_ids = torch.argmax(logits, dim=-1)
predicted_sentences = processor.batch_decode(predicted_ids)

for i, predicted_sentence in enumerate(predicted_sentences):
    print("-" * 100)
    print("Reference:", test_dataset[i]["sentence"])
    print("Prediction:", predicted_sentence)

Some weights of the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-polish were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-polish and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You

TypeError: Invalid file: 0    C:\Users\Eryk\.cache\huggingface\datasets\down...
1    C:\Users\Eryk\.cache\huggingface\datasets\down...
2    C:\Users\Eryk\.cache\huggingface\datasets\down...
3    C:\Users\Eryk\.cache\huggingface\datasets\down...
4    C:\Users\Eryk\.cache\huggingface\datasets\down...
Name: audiopath_local, dtype: object