In [26]:
import os
import pandas as pd
from tqdm.notebook import trange, tqdm
import torch
from pydub import AudioSegment
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
import re

### Reading dataframe with recordings paths

In [2]:
df_audio = pd.read_parquet('./data/parquets/SNR_testing_dataset.gzip')

In [4]:
print(df_audio.columns.to_list())

['audioname', 'dataset', 'ref_orig', 'sampling_rate', 'audiopath_bigos', 'audiopath_local', 'noise_path', 'noise_class', 'normalised_audio_path', 'normalised_noise_path', 'audio_SNR_100_path', 'audio_SNR_50_path', 'audio_SNR_25_path', 'audio_SNR_10_path', 'audio_SNR_5_path', 'audio_SNR_0.1_path', 'audio_SNR_-1_path', 'audio_SNR_-3_path', 'audio_SNR_-10_path']


In [5]:
torch.cuda.is_available()

True

### Enabling model Whisper v3 large

In [6]:
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

# Specify the CUDA device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_id = "openai/whisper-large-v3"
torch_dtype = torch.float32  # You can adjust the dtype if needed

# Load model and move it to CUDA
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
)
model.to(device)

# Load processor
processor = AutoProcessor.from_pretrained(model_id)

# Create the pipeline with CUDA support
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    chunk_length_s=30,
    batch_size=16,
    return_timestamps=True,
    torch_dtype=torch_dtype,
    device=device,
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


### Testing all values

In [7]:
#whisper_results = []
#for i in range(len(df_whisper)):
#    sample = df_whisper['audiopath_local'][i]
#    result = pipe(sample)
#    whisper_results.append(result['text'])

#df_whisper['whisper_pred'] = whisper_results
snr_list = ['audio_SNR_100_path', 'audio_SNR_50_path', 'audio_SNR_25_path', 'audio_SNR_10_path', 'audio_SNR_5_path', 'audio_SNR_0.1_path', 'audio_SNR_-1_path', 'audio_SNR_-3_path', 'audio_SNR_-10_path']
for snr in snr_list:
    audio_paths = df_audio[snr].to_list()
    results = []
    for i in trange(len(audio_paths)):
        sample = audio_paths[i]
        result = pipe(sample, generate_kwargs={"language": "polish"})
        results.append(result['text'])
    col_name = f"WER_{snr}"
    df_audio[col_name] = results 
    

  0%|          | 0/2500 [00:00<?, ?it/s]

  attn_output = torch.nn.functional.scaled_dot_product_attention(
Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.


  0%|          | 0/2500 [00:00<?, ?it/s]

Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.


  0%|          | 0/2500 [00:00<?, ?it/s]

Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.


  0%|          | 0/2500 [00:00<?, ?it/s]

Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.


  0%|          | 0/2500 [00:00<?, ?it/s]

Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.


  0%|          | 0/2500 [00:00<?, ?it/s]

Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.


  0%|          | 0/2500 [00:00<?, ?it/s]

Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
Whisper did not predict an ending timestamp, which can happen if audio is cut off in the m

  0%|          | 0/2500 [00:00<?, ?it/s]

Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
Whisper did not predict an ending timestamp, which can happen if audio is cut off in the m

  0%|          | 0/2500 [00:00<?, ?it/s]

Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
Whisper did not predict an ending timestamp, which can happen if audio is cut off in the m

In [8]:
#df_audio.to_parquet('./data/parquets/Whisper_SNR_WER.gzip', compression = 'gzip')

### Model jonatasgrosman/wav2vec2-large-xlsr-53-polish

In [2]:
df_audio = pd.read_parquet('./data/parquets/SNR_testing_dataset.gzip')

In [3]:
df_audio.columns.to_list()

['audioname',
 'dataset',
 'ref_orig',
 'sampling_rate',
 'audiopath_bigos',
 'audiopath_local',
 'noise_path',
 'noise_class',
 'normalised_audio_path',
 'normalised_noise_path',
 'audio_SNR_100_path',
 'audio_SNR_50_path',
 'audio_SNR_25_path',
 'audio_SNR_10_path',
 'audio_SNR_5_path',
 'audio_SNR_0.1_path',
 'audio_SNR_-1_path',
 'audio_SNR_-3_path',
 'audio_SNR_-10_path']

In [4]:
import torch
import librosa
from datasets import load_dataset, Dataset
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import numpy as np

In [5]:
print(torch.cuda.is_available())

True


In [6]:
MODEL_ID = "jonatasgrosman/wav2vec2-large-xlsr-53-polish"

In [7]:
processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)
model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)

Some weights of the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-polish were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-polish and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You

In [31]:
ds = Dataset.from_pandas(df_audio)

In [11]:
print(ds)

Dataset({
    features: ['audioname', 'dataset', 'ref_orig', 'sampling_rate', 'audiopath_bigos', 'audiopath_local', 'noise_path', 'noise_class', 'normalised_audio_path', 'normalised_noise_path', 'audio_SNR_100_path', 'audio_SNR_50_path', 'audio_SNR_25_path', 'audio_SNR_10_path', 'audio_SNR_5_path', 'audio_SNR_0.1_path', 'audio_SNR_-1_path', 'audio_SNR_-3_path', 'audio_SNR_-10_path'],
    num_rows: 2500
})


In [32]:
def speech_file_to_array_fn(batch,input_column):
    speech_array, sampling_rate = librosa.load(batch[input_column], sr=16_000)
    batch["speech"] = speech_array
    return batch

In [33]:
df_wav2wec = pd.DataFrame()
df_wav2wec[['audio_name','sentences']] = df_audio[['audiopath_bigos','ref_orig']]
df_wav2wec

Unnamed: 0,audio_name,sentences
0,fair-mls-20-train-0009-04739.wav,tam nocne włóczęgi wołano z dachów jeżeli nie ...
1,pjatk-clarin_studio-15-train-0488-00003.wav,w pracy studenci chcieliby przede wszystk...
2,fair-mls-20-train-0009-05501.wav,co to znaczy sam siebie zapytywał faraon czy g...
3,fair-mls-20-train-0021-01519.wav,tylko na piaszczystem wybrzeżu lub na łąkach b...
4,pjatk-clarin_studio-15-train-0335-00001.wav,zaokrągla uziemienie księdzu liźnięcie rol...
...,...,...
2495,fair-mls-20-train-0009-06517.wav,kazał zrobić spis wszystkich mężczyzn w państw...
2496,mozilla-common_voice_15-23-train-2851-00218.wav,"W odniesieniu do Lizbony, uczyniliśmy także po..."
2497,mozilla-common_voice_15-23-train-2856-01361.wav,"Jej budżet to budżet, który wspiera inwestycje"
2498,fair-mls-20-train-0009-03165.wav,upłynęło już kilka godzin po zachodzie słońca ...


In [34]:
snr_paths = ['audio_SNR_100_path', 'audio_SNR_50_path', 'audio_SNR_25_path', 'audio_SNR_10_path', 'audio_SNR_5_path', 'audio_SNR_0.1_path', 'audio_SNR_-1_path', 'audio_SNR_-3_path', 'audio_SNR_-10_path']
for snr in snr_paths:
    test_dataset = ds.map(speech_file_to_array_fn,snr_paths)
    predictions = []
    batch = 50
    for i in range(batch):
        inputs = processor(test_dataset["speech"][(i)*50:(i+1)*50], sampling_rate=16_000, return_tensors="pt", padding=True)
        with torch.no_grad():
            logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits

        predicted_ids = torch.argmax(logits, dim=-1)
        predicted_sentences = processor.batch_decode(predicted_ids)
        predictions.append(predicted_sentences)

    
    prefix = re.search(r'SNR_\d+', snr).group()
    col_name = f"Wav2wec_{prefix}"
    df_wav2wec[col_name] = predictions



Map:   0%|          | 0/2500 [00:00<?, ? examples/s]

KeyError: 0

In [None]:
test_dataset = ds.map(speech_file_to_array_fn)
inputs = processor(test_dataset["speech"][0:30], sampling_rate=16_000, return_tensors="pt", padding=True)

In [17]:
with torch.no_grad():
    logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits

predicted_ids = torch.argmax(logits, dim=-1)
predicted_sentences = processor.batch_decode(predicted_ids)