In [None]:
import os
import pandas as pd
from datasets import load_dataset
from src.data_engineering import create_distribution_dict, creating_random_split_df, audio_normalizer
from src.audio_mixer import mixer
from tqdm.notebook import trange, tqdm
import soundfile as sf
import pyloudnorm as pyln
import torch
from pydub import AudioSegment
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
import re


# Preparing data

### Loading already prepared audio and noise data

In [None]:
# Build the testing frame by passing the 'train' split of `data` to
# creating_random_split_df (imported from src.data_engineering).
# NOTE(review): `data` is not defined anywhere in the visible notebook, so this
# cell raises NameError on a fresh kernel -- the dataset-loading cell appears to
# be missing. The meaning of the second argument (20) is not shown here;
# presumably a sample count or split size -- confirm against the helper.
df_testing = creating_random_split_df(data['train'], 20)

### Creating random splits

### Normalizing audio and noise to -20dB

In [None]:
def _normalize_files(paths, names, output_dir, target_level=-20.0):
    """Run audio_normalizer over every input file, writing into `output_dir`.

    Args:
        paths: Sequence of input file paths.
        names: Sequence of output file names, index-aligned with `paths`.
        output_dir: Destination folder; created if it does not exist.
        target_level: Value forwarded as the last argument of audio_normalizer
            (presumably the target loudness in dB -- confirm against the helper).
    """
    os.makedirs(output_dir, exist_ok=True)
    for i in trange(len(paths)):
        audio_normalizer(paths[i], output_dir, names[i], target_level)


################### AUDIO ####################

# NOTE(review): `files_paths` and `file_names` are not defined in the visible
# notebook; the two commented-out lines below look like their intended source.
#files_paths = df_audio['audiopath_local'].to_list()
#file_names = df_audio['audiopath_bigos'].to_list()

output_audio_folder_path = "./data/demo/normalized_audio/"

# Previously this wrote to the undefined name `output_folder_path`.
_normalize_files(files_paths, file_names, output_audio_folder_path)


#################### NOISE ###################

output_noise_folder_path = "./data/demo/normalized_noise/"

# NOTE(review): the noise section reuses the same `files_paths` / `file_names`
# as the audio section (its commented-out lines also point at the audio
# columns), so unless those lists are rebound to the noise files first, this
# normalizes the speech files a second time. Confirm the noise path source.
#files_paths = df_audio['audiopath_local'].to_list()
#file_names = df_audio['audiopath_bigos'].to_list()

_normalize_files(files_paths, file_names, output_noise_folder_path)

### Mixing audio

In [None]:
# SNR levels handed to mixer() -- one mixed copy of every clip is produced per value.
snr_values = [100, 50, 25, 10, 5, -1, -5]

In [None]:

# For every SNR level: mix each (speech, noise) pair, save the result under a
# per-SNR folder, and record the output paths in a new df_audio column.
for snr in snr_values:
    # Destination folder for this SNR level
    folder_path = f'./data/demo/normalized_audio/SNR_{snr}'
    os.makedirs(folder_path, exist_ok=True)

    snr_paths = []
    # Walk the three needed columns in lockstep instead of materializing full rows
    for signal_path, noise_path, audio_name in zip(
        df_audio['normalized_audio_path'],
        df_audio['normalized_noise_path'],
        df_audio['audiopath_bigos'],
    ):
        # The mixed file keeps the original file name inside the SNR folder
        save_path = os.path.join(folder_path, audio_name)
        snr_paths.append(save_path)

        mixer(signal_path, noise_path, snr, save_path)

    # One path column per SNR level, index-aligned with df_audio's rows
    column_name = f"audio_SNR_{snr}_path"
    df_audio[column_name] = snr_paths

# Testing 

In [1]:
# Check whether CUDA is available to PyTorch (displays True/False as the cell output).
# Requires the import cell at the top of the notebook to have been run first.
torch.cuda.is_available()

NameError: name 'torch' is not defined

### Testing Whisper v3 Large

In [None]:

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_id = "openai/whisper-large-v3"
# Half precision only makes sense on the GPU; on the CPU fallback float16 is
# poorly supported, so use full precision there instead of forcing float16.
torch_dtype = torch.float16 if device.type == "cuda" else torch.float32

# Load the model weights and move them to the selected device
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True)

model.to(device)

# Load the processor (provides the tokenizer and the feature extractor used below)
processor = AutoProcessor.from_pretrained(model_id)

# Build the speech-recognition pipeline on the selected device
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    chunk_length_s=30,
    batch_size=16,
    return_timestamps=True,
    torch_dtype=torch_dtype,
    device=device,
)

In [None]:
# Columns of df_audio that hold the paths of the mixed audio files to transcribe.
snr_list = ['audio_SNR_100_path', 'audio_SNR_50_path', 'audio_SNR_25_path', 'audio_SNR_10_path', 'audio_SNR_5_path', 'audio_SNR_0.1_path', 'audio_SNR_-1_path', 'audio_SNR_-3_path', 'audio_SNR_-10_path']

# Fail fast: without this check a missing column only surfaces as a KeyError
# after every earlier column has already been transcribed.
# NOTE(review): this list does not match `snr_values` used by the mixing cell
# (0.1, -3 and -10 are not mixed there; -5 is not transcribed here) -- confirm
# which set of SNR levels is intended.
missing_columns = [col for col in snr_list if col not in df_audio.columns]
if missing_columns:
    raise KeyError(f"df_audio is missing audio path column(s): {missing_columns}")

for snr in snr_list:
    audio_paths = df_audio[snr].to_list()
    # Transcribe each file with the Whisper pipeline, forcing Polish decoding
    results = [
        pipe(sample, generate_kwargs={"language": "polish"})['text']
        for sample in tqdm(audio_paths, desc=snr)
    ]
    # NOTE(review): despite the "WER_" prefix this column stores the raw
    # transcriptions, not a word error rate; name kept because later cells may use it.
    col_name = f"WER_{snr}"
    df_audio[col_name] = results

### Model jonatasgrosman/wav2vec2-large-xlsr-53-polish

In [None]:
import torch
import librosa
from datasets import load_dataset, Dataset
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

In [None]:
# Use the GPU when PyTorch can see one; otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
MODEL_ID = "jonatasgrosman/wav2vec2-large-xlsr-53-polish"

# Note: these assignments rebind `processor` and `model`, which the Whisper
# setup cell earlier in the notebook also assigns.
processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)

# Load the CTC model, then place it on the selected device
model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)
model = model.to(device)

In [None]:
# Convert df_audio into a Hugging Face Dataset so its rows can be processed
# with ds.map in the wav2vec2 inference cell.
ds = Dataset.from_pandas(df_audio)

In [None]:
def speech_file_to_array_fn(batch: dict, column_name: str, sampling_rate: int = 16_000) -> dict:
    """Load the audio file referenced by ``batch[column_name]`` into ``batch["speech"]``.

    Args:
        batch: Mapping that holds an audio file path under ``column_name``.
            It is modified in place.
        column_name: Key of ``batch`` whose value is the audio file path.
        sampling_rate: Sample rate passed to ``librosa.load`` as ``sr``;
            defaults to 16 kHz, the rate used by the rest of this notebook.

    Returns:
        The same ``batch`` object, with the loaded waveform stored under "speech".
    """
    # librosa.load returns (waveform, sample_rate); the rate is the one requested
    # above, so only the waveform is kept.
    speech_array, _ = librosa.load(batch[column_name], sr=sampling_rate)
    batch["speech"] = speech_array
    return batch

In [None]:
import torch
import math
import re

# Number of clips sent through the model per forward pass
batch_size = 10

# Columns of the dataset that hold the paths of the mixed audio files to transcribe.
snr_paths = ['audio_SNR_100_path', 'audio_SNR_50_path', 'audio_SNR_25_path', 'audio_SNR_10_path', 'audio_SNR_5_path', 'audio_SNR_0.1_path', 'audio_SNR_-1_path', 'audio_SNR_-3_path', 'audio_SNR_-10_path']

# Fail fast: without this check a missing column only surfaces as a KeyError
# after every earlier column has already been processed.
missing_columns = [col for col in snr_paths if col not in ds.column_names]
if missing_columns:
    raise KeyError(f"ds is missing audio path column(s): {missing_columns}")


def _transcribe_batch(speech_batch):
    """Return the greedy CTC transcriptions for a list of 16 kHz waveforms.

    Uses the notebook-level `processor`, `model` and `device`. Intermediate
    tensors are local to this function, so they are released when it returns.
    """
    inputs = processor(speech_batch, sampling_rate=16_000, return_tensors="pt", padding=True)
    inputs = inputs.to(device)

    # Inference only: no gradients are needed (and none are ever computed, which
    # is why the former optimizer / gradient-accumulation code was a no-op).
    with torch.no_grad():
        logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits

    predicted_ids = torch.argmax(logits, dim=-1)
    return processor.batch_decode(predicted_ids)


for snr in snr_paths:
    # Decode the audio files of this SNR level into a "speech" column
    test_dataset = ds.map(speech_file_to_array_fn, fn_kwargs={"column_name": snr})
    predictions = []

    for batch_start in trange(0, len(test_dataset), batch_size, desc=snr):
        # Slice rows first, then take the column: indexing the column first
        # would materialize every waveform of the dataset for each batch.
        speech_batch = test_dataset[batch_start:batch_start + batch_size]["speech"]
        predictions.extend(_transcribe_batch(speech_batch))

    # Return cached GPU memory before moving on to the next SNR level
    torch.cuda.empty_cache()

    # NOTE(review): `df_wav2wec` is not defined anywhere in the visible notebook;
    # it must exist with the same row order as df_audio before this cell runs --
    # confirm where it is created.
    col_name = f"Wav2wec_{snr}"
    df_wav2wec[col_name] = predictions

# Model evaluation