In [1]:
import os
import pandas as pd
from tqdm.notebook import trange, tqdm
import torch
from pydub import AudioSegment
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline



### Reading dataframe with recordings paths

In [2]:
df_audio = pd.read_parquet('./data/parquets/SNR_testing_dataset.gzip')

In [4]:
print(df_audio.columns.to_list())

['audioname', 'dataset', 'ref_orig', 'sampling_rate', 'audiopath_bigos', 'audiopath_local', 'noise_path', 'noise_class', 'normalised_audio_path', 'normalised_noise_path', 'audio_SNR_100_path', 'audio_SNR_50_path', 'audio_SNR_25_path', 'audio_SNR_10_path', 'audio_SNR_5_path', 'audio_SNR_0.1_path', 'audio_SNR_-1_path', 'audio_SNR_-3_path', 'audio_SNR_-10_path']


In [5]:
torch.cuda.is_available()

True

### Enabling model Whisper v3 large

In [6]:
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

# Specify the CUDA device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_id = "openai/whisper-large-v3"
torch_dtype = torch.float32  # You can adjust the dtype if needed

# Load model and move it to CUDA
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
)
model.to(device)

# Load processor
processor = AutoProcessor.from_pretrained(model_id)

# Create the pipeline with CUDA support
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    chunk_length_s=30,
    batch_size=16,
    return_timestamps=True,
    torch_dtype=torch_dtype,
    device=device,
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


### Testing all values

In [7]:
#whisper_results = []
#for i in range(len(df_whisper)):
#    sample = df_whisper['audiopath_local'][i]
#    result = pipe(sample)
#    whisper_results.append(result['text'])

#df_whisper['whisper_pred'] = whisper_results
snr_list = ['audio_SNR_100_path', 'audio_SNR_50_path', 'audio_SNR_25_path', 'audio_SNR_10_path', 'audio_SNR_5_path', 'audio_SNR_0.1_path', 'audio_SNR_-1_path', 'audio_SNR_-3_path', 'audio_SNR_-10_path']
for snr in snr_list:
    audio_paths = df_audio[snr].to_list()
    results = []
    for i in trange(len(audio_paths)):
        sample = audio_paths[i]
        result = pipe(sample, generate_kwargs={"language": "polish"})
        results.append(result['text'])
    col_name = f"WER_{snr}"
    df_audio[col_name] = results 
    

  0%|          | 0/2500 [00:00<?, ?it/s]

  attn_output = torch.nn.functional.scaled_dot_product_attention(
Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.


  0%|          | 0/2500 [00:00<?, ?it/s]

Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.


  0%|          | 0/2500 [00:00<?, ?it/s]

Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.


  0%|          | 0/2500 [00:00<?, ?it/s]

Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.


  0%|          | 0/2500 [00:00<?, ?it/s]

Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.


  0%|          | 0/2500 [00:00<?, ?it/s]

Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.


  0%|          | 0/2500 [00:00<?, ?it/s]

Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
Whisper did not predict an ending timestamp, which can happen if audio is cut off in the m

  0%|          | 0/2500 [00:00<?, ?it/s]

Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
Whisper did not predict an ending timestamp, which can happen if audio is cut off in the m

  0%|          | 0/2500 [00:00<?, ?it/s]

Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
Whisper did not predict an ending timestamp, which can happen if audio is cut off in the m

In [8]:
#df_audio.to_parquet('./data/parquets/Whisper_SNR_WER.gzip', compression = 'gzip')

### Model jonatasgrosman/wav2vec2-large-xlsr-53-polish

In [2]:
df_audio = pd.read_parquet('./data/parquets/SNR_testing_dataset.gzip')

In [8]:
df_audio.columns.to_list()

['audioname',
 'dataset',
 'ref_orig',
 'sampling_rate',
 'audiopath_bigos',
 'audiopath_local',
 'noise_path',
 'noise_class',
 'normalised_audio_path',
 'normalised_noise_path',
 'audio_SNR_100_path',
 'audio_SNR_50_path',
 'audio_SNR_25_path',
 'audio_SNR_10_path',
 'audio_SNR_5_path',
 'audio_SNR_0.1_path',
 'audio_SNR_-1_path',
 'audio_SNR_-3_path',
 'audio_SNR_-10_path']

In [29]:
import torch
import librosa
from datasets import load_dataset, Dataset
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import numpy as np

In [None]:
print(torch.cuda.is_available())

In [5]:

MODEL_ID = "jonatasgrosman/wav2vec2-large-xlsr-53-polish"

In [27]:
processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)
model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)

Some weights of the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-polish were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-polish and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You

In [None]:
def speech_file_to_array_fn(batch):
    speech_array, sampling_rate = librosa.load(batch["path"], sr=16_000)
    batch["speech"] = speech_array
    batch["sentence"] = batch["sentence"].upper()
    return batch

In [11]:
test_df = pd.DataFrame({'file_path': ['./data/example_SNR_audio/88_SNR_audio_file.wav','./data/example_SNR_audio/55_SNR_audio_file.wav','./data/example_SNR_audio/30_SNR_audio_file.wav'],
                        'transcription' : ['ale jaja panie ferdku', 'random string', 'losowy input' ]})

In [31]:
ds = Dataset.from_pandas(test_df)

In [22]:
def speech_file_to_array_fn(batch,input_column,output_column):
    sound_arrays = []
    for path in batch[input_column]:
        speech_array, sampling_rate = librosa.load(path, sr=16_000)
        sound_arrays.append(speech_array)
        
    batch[output_column] = sound_arrays
    
    return batch

test_df = speech_file_to_array_fn(test_df, 'file_path', 'audio_arrays')

In [24]:
input_arrays = torch.tensor(test_df['audio_arrays'].to_list())
inputs = processor(input_arrays, sampling_rate=16_000, return_tensors="pt", padding=True)

  input_arrays = torch.tensor(test_df['audio_arrays'].to_list())


In [25]:
with torch.no_grad():
    logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits

predicted_ids = torch.argmax(logits, dim=-1)
predicted_sentences = processor.batch_decode(predicted_ids)

print(predicted_sentences)
'''for i, predicted_sentence in enumerate(predicted_sentences):
    print("-" * 100)
    print("Reference:", test_dataset[i]["sentence"])
    print("Prediction:", predicted_sentence)'''

RuntimeError: Expected 2D (unbatched) or 3D (batched) input to conv1d, but got input of size: [1, 1, 3, 131328]

In [15]:
test_df

Unnamed: 0,file_path,transcription,audio_arrays
0,./data/example_SNR_audio/88_SNR_audio_file.wav,ale jaja panie ferdku,"[6.299524e-07, 1.4356192e-06, -6.4476626e-07, ..."
1,./data/example_SNR_audio/55_SNR_audio_file.wav,random string,"[1.1672091e-06, 1.748238e-06, -1.2229866e-06, ..."
2,./data/example_SNR_audio/30_SNR_audio_file.wav,losowy input,"[6.2228675e-05, 8.997516e-05, 9.032915e-05, 7...."


In [None]:
print(torch.cuda.is_available())