In [None]:
#!pip install datasets
#!pip install huggingface_hub

In [10]:
import pandas as pd
from datasets import load_dataset
from evaluate import load
import os
from data_engineering import *
from audio_mixer import mixer
from tqdm.notebook import trange, tqdm

### Downloading whole bigos v2 polish ASR dataset

In [22]:
data = load_dataset("amu-cai/pl-asr-bigos-v2",'all', 'all')


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Loading dataset shards:   0%|          | 0/55 [00:00<?, ?it/s]

#### Creating dataframe from the training set with 5000 randomly chosen examples


In [24]:
df_model_testing = creating_random_split_df(data['train'], 500)

ValueError: Input must be a DataFrame or a dictionary.

#### Moving audio files to the folder in project

In [None]:
target_folder_path = './data/testing_batch/clear/'
copy_files(df_model_testing, 'audiopath_local', target_folder_path)
df_model_testing['audiopath_project'] = target_folder_path + df_model_testing['audiopath_bigos']

### Saving dataframe to parquet

In [37]:
df_testing = df_model_testing[['audioname','dataset','ref_orig','sampling_rate','audiopath_local','audiopath_project']]
df_testing.to_parquet('testing_batch_df.parquet.gzip', compression = 'gzip')

In [6]:
df_urban_sounds = pd.read_csv('./data/UrbanSound8K/metadata/UrbanSound8K.csv')
df_urban_sounds = df_urban_sounds[~df_urban_sounds['classID'].isin([2, 5, 9])]
df_urban_shuffled = creating_random_split_df(df_urban_sounds,3500)

In [7]:
# Function to create the file path
def create_file_path(row, folder_path):
    folder_number = row['fold']
    file_name = row['slice_file_name']
    file_path = os.path.join(folder_path, f'fold{folder_number}', file_name)
    return file_path

base_path = '.\\data\\UrbanSound8K\\audio\\'

# Apply the function to create the new column
df_urban_shuffled['audio_path'] = df_urban_shuffled.apply(create_file_path, axis=1, folder_path=base_path)


In [8]:
visc_folder_path = '.\\data\\VISC Dataset SON\\'


file_paths = []
class_ids = []


# Traverse the directory
for filename in os.listdir(visc_folder_path):
    # Join the folder path with the filename to get the full file path
    file_path = os.path.join(visc_folder_path, filename)
    
    # Extract the class ID from the file name
    class_id = int(filename.split()[0])
    
    # Append the values to the lists
    file_paths.append(file_path)
    class_ids.append(class_id)

# Create a DataFrame
df = pd.DataFrame({'file_path': file_paths, 'class_id': class_ids})



In [9]:
visc_noises_dataframe = creating_random_split_df(df,1500)

visc_noises_dict = {1 : 'bus_interior',
                    2 : 'minibus_interior',
                    3 : 'pickup_interior',
                    4 : 'sports_car_interior',
                    5 : 'jeep_interior',
                    6 : 'truck_interior',
                    7 : 'crossover_interior',
                    8 : 'other_car_interior'}
visc_noises_dataframe['class'] = visc_noises_dataframe['class_id'].map(visc_noises_dict)

In [12]:
noise_combined_df = pd.merge(visc_noises_dataframe[['file_path', 'class']],df_urban_shuffled[['audio_path', 'class']], left_index=True, right_index=True, how='outer')

noise_combined_df.rename(columns={'file_path': 'noise_path'}, inplace=True)

### Loading work dataframe from parquet

In [3]:
df_testing_models = pd.read_parquet('testing_batch_df.parquet.gzip') 
df_testing_models

Unnamed: 0,audioname,dataset,ref_orig,sampling_rate,audiopath_local,audiopath_project
0,mozilla-common_voice_15-23-train-2856-01818,mozilla-common_voice_15-23,"Jest także trzecia sprawa, która w czasie tej ...",16000,C:\Users\Eryk\.cache\huggingface\datasets\down...,./data/testing_batch/clear/mozilla-common_voic...
1,pjatk-clarin_studio-15-train-0457-00001,pjatk-clarin_studio-15,dżuma wziernik przemianę księdzu krzywdzen...,16000,C:\Users\Eryk\.cache\huggingface\datasets\down...,./data/testing_batch/clear/pjatk-clarin_studio...
2,pjatk-clarin_mobile-15-train-0083-00007,pjatk-clarin_mobile-15,w piątek po południu była przesłuchiwana przez...,16000,C:\Users\Eryk\.cache\huggingface\datasets\down...,./data/testing_batch/clear/pjatk-clarin_mobile...
3,pwr-maleset-unk-train-0001-03097,pwr-maleset-unk,jeśli chcesz zostanę w domu,16000,C:\Users\Eryk\.cache\huggingface\datasets\down...,./data/testing_batch/clear/pwr-maleset-unk-tra...
4,mozilla-common_voice_15-23-train-2862-00017,mozilla-common_voice_15-23,Tekst nie opiera się na żadnych podstawach nau...,16000,C:\Users\Eryk\.cache\huggingface\datasets\down...,./data/testing_batch/clear/mozilla-common_voic...
...,...,...,...,...,...,...
4995,mailabs-corpus_librivox-19-train-2023-00011,mailabs-corpus_librivox-19,Nareszcie zniecierpliwiony kazał zamurować okn...,16000,C:\Users\Eryk\.cache\huggingface\datasets\down...,./data/testing_batch/clear/mailabs-corpus_libr...
4996,pjatk-clarin_studio-15-train-0289-00016,pjatk-clarin_studio-15,dostała za ten reportaż nagrodę pulicera ...,16000,C:\Users\Eryk\.cache\huggingface\datasets\down...,./data/testing_batch/clear/pjatk-clarin_studio...
4997,mozilla-common_voice_15-23-train-2846-00448,mozilla-common_voice_15-23,Dotyczy ona zasadniczo dwóch kwestii,16000,C:\Users\Eryk\.cache\huggingface\datasets\down...,./data/testing_batch/clear/mozilla-common_voic...
4998,pjatk-clarin_mobile-15-train-0035-00018,pjatk-clarin_mobile-15,każdy starał się odlecieć najbliższym samolote...,16000,C:\Users\Eryk\.cache\huggingface\datasets\down...,./data/testing_batch/clear/pjatk-clarin_mobile...


## Noises 

#### Shuffling noises from Urban Sounds and VISC Dataset to create one noises dataset

In [14]:
visc_noises_df = visc_noises_dataframe[['file_path','class']]
urban_noises_df = df_urban_shuffled[['audio_path','class']]
urban_noises_df['file_path'] = urban_noises_df['audio_path']
urban_noises_df_2 = urban_noises_df[['file_path','class']]

In [17]:
noise_df = pd.concat([urban_noises_df_2,visc_noises_df],ignore_index=True)
df_testing_models[['noise_path','noise_class']] = noise_df[['file_path','class']]
df_testing_models.to_parquet('full_dataframe_with_noises.parquet.gzip', compression = 'gzip')


## Reading full dataframe from parquet

In [2]:
full_df = pd.read_parquet('full_dataframe_with_noises.parquet.gzip') 

#### Creating folders with mixed data

In [4]:
snr_values = [100,50, 25, 10, 5, 1, 0.5, 0.1]

In [5]:

# Loop through each SNR value
for snr in snr_values:
    # Create a folder for the current SNR value
    folder_path = f'./data/mixed_recordings/SNR_{snr}'
    os.makedirs(folder_path, exist_ok=True)

    # Loop through the dataframe and mix files for the current SNR value
    for index, row in full_df.iterrows():
        signal_path = row['audiopath_local']
        noise_path = row['noise_path']
        audio_name = row['audioname'] + '.wav'
        save_path = os.path.join(folder_path, audio_name)  # Change the naming convention if needed

        # Call your mixer function here
        mixer(signal_path, noise_path, snr, save_path)

### Model bark whisper v3 Large


In [3]:
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

In [6]:
torch.cuda.is_available()

True

In [14]:
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

# Specify the CUDA device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_id = "openai/whisper-large-v3"
torch_dtype = torch.float32  # You can adjust the dtype if needed

# Load model and move it to CUDA
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
)
model.to(device)

# Load processor
processor = AutoProcessor.from_pretrained(model_id)

# Create the pipeline with CUDA support
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    chunk_length_s=30,
    batch_size=16,
    return_timestamps=True,
    torch_dtype=torch_dtype,
    device=device,
)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [21]:
#whisper_results = []
#for i in range(len(df_whisper)):
#    sample = df_whisper['audiopath_local'][i]
#    result = pipe(sample)
#    whisper_results.append(result['text'])

#df_whisper['whisper_pred'] = whisper_results
results = []

for i in trange(len(snr_dataframe)):
    sample = snr_dataframe['SNR_10'][i]
    result = pipe(sample, generate_kwargs={"language": "polish"})
    results.append(result['text'])

  0%|          | 0/5000 [00:00<?, ?it/s]

Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.


In [19]:
full_df['whisper_SNR_100'] = results

In [20]:
wer = load("wer")
wer_score = wer.compute(predictions=full_df['whisper_SNR_100'], references=full_df['ref_orig'])
print(wer_score * 100)

133.1217381831866


In [17]:
snr_dataframe

Unnamed: 0,SNR_0.1,SNR_0.5,SNR_1,SNR_10,SNR_100,SNR_25,SNR_5,SNR_50
0,.\data\mixed_recordings\SNR_0.1\fair-mls-20-tr...,.\data\mixed_recordings\SNR_0.5\fair-mls-20-tr...,.\data\mixed_recordings\SNR_1\fair-mls-20-trai...,.\data\mixed_recordings\SNR_10\fair-mls-20-tra...,.\data\mixed_recordings\SNR_100\fair-mls-20-tr...,.\data\mixed_recordings\SNR_25\fair-mls-20-tra...,.\data\mixed_recordings\SNR_5\fair-mls-20-trai...,.\data\mixed_recordings\SNR_50\fair-mls-20-tra...
1,.\data\mixed_recordings\SNR_0.1\fair-mls-20-tr...,.\data\mixed_recordings\SNR_0.5\fair-mls-20-tr...,.\data\mixed_recordings\SNR_1\fair-mls-20-trai...,.\data\mixed_recordings\SNR_10\fair-mls-20-tra...,.\data\mixed_recordings\SNR_100\fair-mls-20-tr...,.\data\mixed_recordings\SNR_25\fair-mls-20-tra...,.\data\mixed_recordings\SNR_5\fair-mls-20-trai...,.\data\mixed_recordings\SNR_50\fair-mls-20-tra...
2,.\data\mixed_recordings\SNR_0.1\fair-mls-20-tr...,.\data\mixed_recordings\SNR_0.5\fair-mls-20-tr...,.\data\mixed_recordings\SNR_1\fair-mls-20-trai...,.\data\mixed_recordings\SNR_10\fair-mls-20-tra...,.\data\mixed_recordings\SNR_100\fair-mls-20-tr...,.\data\mixed_recordings\SNR_25\fair-mls-20-tra...,.\data\mixed_recordings\SNR_5\fair-mls-20-trai...,.\data\mixed_recordings\SNR_50\fair-mls-20-tra...
3,.\data\mixed_recordings\SNR_0.1\fair-mls-20-tr...,.\data\mixed_recordings\SNR_0.5\fair-mls-20-tr...,.\data\mixed_recordings\SNR_1\fair-mls-20-trai...,.\data\mixed_recordings\SNR_10\fair-mls-20-tra...,.\data\mixed_recordings\SNR_100\fair-mls-20-tr...,.\data\mixed_recordings\SNR_25\fair-mls-20-tra...,.\data\mixed_recordings\SNR_5\fair-mls-20-trai...,.\data\mixed_recordings\SNR_50\fair-mls-20-tra...
4,.\data\mixed_recordings\SNR_0.1\fair-mls-20-tr...,.\data\mixed_recordings\SNR_0.5\fair-mls-20-tr...,.\data\mixed_recordings\SNR_1\fair-mls-20-trai...,.\data\mixed_recordings\SNR_10\fair-mls-20-tra...,.\data\mixed_recordings\SNR_100\fair-mls-20-tr...,.\data\mixed_recordings\SNR_25\fair-mls-20-tra...,.\data\mixed_recordings\SNR_5\fair-mls-20-trai...,.\data\mixed_recordings\SNR_50\fair-mls-20-tra...
...,...,...,...,...,...,...,...,...
4995,.\data\mixed_recordings\SNR_0.1\pwr-viu-unk-tr...,.\data\mixed_recordings\SNR_0.5\pwr-viu-unk-tr...,.\data\mixed_recordings\SNR_1\pwr-viu-unk-trai...,.\data\mixed_recordings\SNR_10\pwr-viu-unk-tra...,.\data\mixed_recordings\SNR_100\pwr-viu-unk-tr...,.\data\mixed_recordings\SNR_25\pwr-viu-unk-tra...,.\data\mixed_recordings\SNR_5\pwr-viu-unk-trai...,.\data\mixed_recordings\SNR_50\pwr-viu-unk-tra...
4996,.\data\mixed_recordings\SNR_0.1\pwr-viu-unk-tr...,.\data\mixed_recordings\SNR_0.5\pwr-viu-unk-tr...,.\data\mixed_recordings\SNR_1\pwr-viu-unk-trai...,.\data\mixed_recordings\SNR_10\pwr-viu-unk-tra...,.\data\mixed_recordings\SNR_100\pwr-viu-unk-tr...,.\data\mixed_recordings\SNR_25\pwr-viu-unk-tra...,.\data\mixed_recordings\SNR_5\pwr-viu-unk-trai...,.\data\mixed_recordings\SNR_50\pwr-viu-unk-tra...
4997,.\data\mixed_recordings\SNR_0.1\pwr-viu-unk-tr...,.\data\mixed_recordings\SNR_0.5\pwr-viu-unk-tr...,.\data\mixed_recordings\SNR_1\pwr-viu-unk-trai...,.\data\mixed_recordings\SNR_10\pwr-viu-unk-tra...,.\data\mixed_recordings\SNR_100\pwr-viu-unk-tr...,.\data\mixed_recordings\SNR_25\pwr-viu-unk-tra...,.\data\mixed_recordings\SNR_5\pwr-viu-unk-trai...,.\data\mixed_recordings\SNR_50\pwr-viu-unk-tra...
4998,.\data\mixed_recordings\SNR_0.1\pwr-viu-unk-tr...,.\data\mixed_recordings\SNR_0.5\pwr-viu-unk-tr...,.\data\mixed_recordings\SNR_1\pwr-viu-unk-trai...,.\data\mixed_recordings\SNR_10\pwr-viu-unk-tra...,.\data\mixed_recordings\SNR_100\pwr-viu-unk-tr...,.\data\mixed_recordings\SNR_25\pwr-viu-unk-tra...,.\data\mixed_recordings\SNR_5\pwr-viu-unk-trai...,.\data\mixed_recordings\SNR_50\pwr-viu-unk-tra...



### Model alexcleu/wav2vec2-large-xlsr-polish 

In [6]:
import torch
import torchaudio
from datasets import load_dataset, load_metric
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import re

In [8]:
def create_dataframe_from_folders(main_folder_path):
    data = {}

    for folder_name in os.listdir(main_folder_path):
        folder_path = os.path.join(main_folder_path, folder_name)

        if os.path.isdir(folder_path):
            file_paths = [os.path.join(folder_path, file) for file in os.listdir(folder_path) if os.path.isfile(os.path.join(folder_path, file))]
            data[folder_name] = file_paths

    df = pd.DataFrame(data)
    return df

In [9]:
mixed_recordings_path = '.\\data\\mixed_recordings\\'
snr_dataframe = create_dataframe_from_folders(mixed_recordings_path)

In [18]:
full_df

Unnamed: 0,audioname,dataset,ref_orig,sampling_rate,audiopath_local,audiopath_project,noise_path,noise_class
0,mozilla-common_voice_15-23-train-2856-01818,mozilla-common_voice_15-23,"Jest także trzecia sprawa, która w czasie tej ...",16000,C:\Users\Eryk\.cache\huggingface\datasets\down...,./data/testing_batch/clear/mozilla-common_voic...,.\data\UrbanSound8K\audio\fold2\156893-7-9-0.wav,jackhammer
1,pjatk-clarin_studio-15-train-0457-00001,pjatk-clarin_studio-15,dżuma wziernik przemianę księdzu krzywdzen...,16000,C:\Users\Eryk\.cache\huggingface\datasets\down...,./data/testing_batch/clear/pjatk-clarin_studio...,.\data\UrbanSound8K\audio\fold1\40722-8-0-4.wav,siren
2,pjatk-clarin_mobile-15-train-0083-00007,pjatk-clarin_mobile-15,w piątek po południu była przesłuchiwana przez...,16000,C:\Users\Eryk\.cache\huggingface\datasets\down...,./data/testing_batch/clear/pjatk-clarin_mobile...,.\data\UrbanSound8K\audio\fold8\125678-7-0-4.wav,jackhammer
3,pwr-maleset-unk-train-0001-03097,pwr-maleset-unk,jeśli chcesz zostanę w domu,16000,C:\Users\Eryk\.cache\huggingface\datasets\down...,./data/testing_batch/clear/pwr-maleset-unk-tra...,.\data\UrbanSound8K\audio\fold9\105029-7-2-16.wav,jackhammer
4,mozilla-common_voice_15-23-train-2862-00017,mozilla-common_voice_15-23,Tekst nie opiera się na żadnych podstawach nau...,16000,C:\Users\Eryk\.cache\huggingface\datasets\down...,./data/testing_batch/clear/mozilla-common_voic...,.\data\UrbanSound8K\audio\fold10\99192-4-0-54.wav,drilling
...,...,...,...,...,...,...,...,...
4995,mailabs-corpus_librivox-19-train-2023-00011,mailabs-corpus_librivox-19,Nareszcie zniecierpliwiony kazał zamurować okn...,16000,C:\Users\Eryk\.cache\huggingface\datasets\down...,./data/testing_batch/clear/mailabs-corpus_libr...,.\data\VISC Dataset SON\7 (249).wav,crossover_interior
4996,pjatk-clarin_studio-15-train-0289-00016,pjatk-clarin_studio-15,dostała za ten reportaż nagrodę pulicera ...,16000,C:\Users\Eryk\.cache\huggingface\datasets\down...,./data/testing_batch/clear/pjatk-clarin_studio...,.\data\VISC Dataset SON\2 (172).wav,minibus_interior
4997,mozilla-common_voice_15-23-train-2846-00448,mozilla-common_voice_15-23,Dotyczy ona zasadniczo dwóch kwestii,16000,C:\Users\Eryk\.cache\huggingface\datasets\down...,./data/testing_batch/clear/mozilla-common_voic...,.\data\VISC Dataset SON\5 (164).wav,jeep_interior
4998,pjatk-clarin_mobile-15-train-0035-00018,pjatk-clarin_mobile-15,każdy starał się odlecieć najbliższym samolote...,16000,C:\Users\Eryk\.cache\huggingface\datasets\down...,./data/testing_batch/clear/pjatk-clarin_mobile...,.\data\VISC Dataset SON\6 (638).wav,truck_interior


In [8]:
snr_dataframe['sentence'] = full_df['ref_orig']

In [9]:
print(torch.cuda.is_available())

True


In [12]:
processor = Wav2Vec2Processor.from_pretrained("alexcleu/wav2vec2-large-xlsr-polish")
model = Wav2Vec2ForCTC.from_pretrained("alexcleu/wav2vec2-large-xlsr-polish")
resampler = torchaudio.transforms.Resample(48_000, 16_000)

def speech_file_to_array_fn(batch):
    batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower()
  
    speech_array, sampling_rate = torchaudio.load(batch["SNR_100"])
  
    batch["speech_SNR_100"] = resampler(speech_array).squeeze().numpy()
  
    return batch

wer = load_metric("wer")
processor = Wav2Vec2Processor.from_pretrained("alexcleu/wav2vec2-large-xlsr-polish")
model = Wav2Vec2ForCTC.from_pretrained("alexcleu/wav2vec2-large-xlsr-polish")
model.to("cuda")
chars_to_ignore_regex = '[\\\\\\\\\\\\\\\\,\\\\\\\\\\\\\\\\?\\\\\\\\\\\\\\\\.\\\\\\\\\\\\\\\\!\\\\\\\\\\\\\\\\-\\\\\\\\\\\\\\\\;\\\\\\\\\\\\\\\\:\\\\\\\\\\\\\\\\"\\\\\\\\\\\\\\\\“]'
resampler = torchaudio.transforms.Resample(48_000, 16_000)

# Assuming snr_dataframe is a DataFrame with columns like "sentence" and "SNR_100"
snr_dataframe = snr_dataframe.map(speech_file_to_array_fn)

def evaluate(batch):
    inputs = processor(batch["speech_SNR_100"], sampling_rate=16_000, return_tensors="pt", padding=True)
    
    with torch.no_grad():
        logits = model(inputs.input_values.to("cuda"), attention_mask=inputs.attention_mask.to("cuda")).logits
    
    pred_ids = torch.argmax(logits, dim=-1)
    
    batch["pred_strings"] = processor.batch_decode(pred_ids)
    
    return batch

# Assuming snr_dataframe is a DataFrame with columns like "sentence" and "SNR_100"
result = snr_dataframe.map(evaluate, batched=True, batch_size=8)
print("WER: {:2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["sentence"])))


Some weights of the model checkpoint at alexcleu/wav2vec2-large-xlsr-polish were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at alexcleu/wav2vec2-large-xlsr-polish and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably T

TypeError: string indices must be integers, not 'str'

### Model jonatasgrosman/wav2vec2-large-xlsr-53-polish

In [4]:
import torch
import librosa
from datasets import load_dataset
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

In [15]:

LANG_ID = "pl"
MODEL_ID = "jonatasgrosman/wav2vec2-large-xlsr-53-polish"
SAMPLES = 5


processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)
model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)

def speech_file_to_array_fn(batch):
    speech_array, sampling_rate = librosa.load(batch["SNR100"], sr=16_000)
    batch["speech"] = speech_array
    batch["sentence"] = batch["ref_orig"].upper()
    return batch

df_test_wav = pd.DataFrame()
df_test_wav = speech_file_to_array_fn(df_whisper[0:5])
inputs = processor(df_test_wav['speech'], sampling_rate=16_000, return_tensors="pt", padding=True)


with torch.no_grad():
    logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits

predicted_ids = torch.argmax(logits, dim=-1)
predicted_sentences = processor.batch_decode(predicted_ids)

for i, predicted_sentence in enumerate(predicted_sentences):
    print("-" * 100)
    print("Reference:", test_dataset[i]["sentence"])
    print("Prediction:", predicted_sentence)

Some weights of the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-polish were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-polish and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You

TypeError: Invalid file: 0    C:\Users\Eryk\.cache\huggingface\datasets\down...
1    C:\Users\Eryk\.cache\huggingface\datasets\down...
2    C:\Users\Eryk\.cache\huggingface\datasets\down...
3    C:\Users\Eryk\.cache\huggingface\datasets\down...
4    C:\Users\Eryk\.cache\huggingface\datasets\down...
Name: audiopath_local, dtype: object