In [20]:
import torch
import torchaudio
import torch.nn as nn
import torch.optim as optim
from transformers import AutoTokenizer, AutoFeatureExtractor, AutoModelForCTC
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

In [29]:
MODEL_ID = "jonatasgrosman/wav2vec2-large-xlsr-53-german"
SAMPLE_RATE = 16000

In [30]:
#model = AutoModelForCTC.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
feature_extractor = AutoFeatureExtractor.from_pretrained(MODEL_ID)

In [31]:
import librosa
import numpy as np

def make_preds(raw_waveform, sample_rate):
    wf = raw_waveform
    sr = sample_rate
    
    if sr != 16000:
        wf = librosa.resample(wf, sr, 16000)
        
    #wf = librosa.to_mono(wf)
    #wf /= np.max(np.abs(wf))
    
    input_values = feature_extractor(wf, return_tensors="pt").input_values
    logits = model(input_values).logits[0]
    pred_ids = torch.argmax(logits, axis=-1)
    
    outputs = tokenizer.decode(pred_ids, output_word_offsets=True)
    
    return outputs[0]

In [34]:
import helper_functions as hf

fn = 'SampleAudio/achtgesichterambiwasse_0001.wav'
speech_wf, speech_sr = hf.get_speech_sample(fn, resample=SAMPLE_RATE)

rir = 'RIR_Samples/h013_Hospital_ExaminationRoom_19txts.wav'
rir_wf, rir_sr = hf.get_sample(rir, resample=SAMPLE_RATE)


In [35]:
#clean_audio_preds = make_preds(fn)
hf.print_stats(speech_wf, speech_sr)
hf.print_stats(rir_wf, rir_sr)

Sample Rate: 16000
Dtype: torch.float32
 - Max:      0.346
 - Min:     -0.300
 - Mean:    -0.000
 - Std Dev:  0.038

tensor([[-3.0994e-05,  5.2696e-04, -1.3202e-04,  ...,  3.7897e-04,
          3.9393e-04,  1.7238e-04]])

Sample Rate: 16000
Dtype: torch.float32
 - Max:      0.816
 - Min:     -0.403
 - Mean:     0.000
 - Std Dev:  0.018

tensor([[ 2.8014e-06, -5.5337e-04, -6.6245e-04,  ...,  1.7881e-07,
         -1.1921e-07,  1.1921e-07]])



In [36]:
# In order to conlvolve the speech samples and the RIR
speech_ = nn.functional.pad(speech_wf, (rir_wf.shape[1] - 1, 0))
convolved_ = nn.functional.conv1d(speech_[None, ...], rir_wf[None, ...])[0]

In [37]:
print(make_preds(speech_wf.squeeze(), SAMPLE_RATE))
print(make_preds(convolved_.squeeze(), SAMPLE_RATE))

It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.
It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


um zu den göttlichen schönheiten der vergänglichkeit gezählt zu werdenihr hals war biegsam wie eine reihefeder
um zu den göttlichen schönheiten der vergänglichkeit gezählt zu werden ihr hals war biegsam wie eine reihefed


In [41]:
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

In [42]:
class ContrastiveAudioDataset(Dataset):
    
    def __init__(self, csv_file, transform=None):
        self.dataframe = pd.read_csv(csv_file)
        self.transform = transform
        
    def __len__(self):
        return len(self.dataframe)
    
    def __getitem__(self,idx):
        # Load the audio file and label from the dataframe
        audio_path = self.dataframe.iloc[idx]['file']
        label = self.dataframe.iloc[idx]['orgText']
        audio, sr = hf.get_speech_sample(audio_path, sr = 16000)
        
        # Apply the transform function to get the noisy version of the file
        if self.transform:
            noisy_audio = self.transform(audio)
        else:
            noisy_audio = audio
            
        # Convert the audio and label to PyTorch Tensor
        label_tensor = torch.tensor(label).long()
        
        return audio, noisy_audio, label_tensor
    

In [46]:
# Define the function that returns the noisy version of the audio data
import random
import os

rir_dir = 'RIR_Samples/'
def convolve_rir(audio):
    random_rir_file = random.choice(os.listdir(rir_dir))
    rir, sr = hf.get_sample(random_rir_file, resample=SAMPLE_RATE)
    
    audio_ = nn.functional.pad(audio, (rir.shape[1] - 1, 0))
    convolved_ = nn.functional.conv1d(audio_[None, ...], rir[None, ...])[0]
    
    return convolved_
    

In [47]:
# Set the csv path
csv_path = '/home/lski-029/Downloads/AudioFiles/archive/outcsv.csv'

# Create instances of the custom dataset class for training and testing sets
train_dataset = ContrastiveAudioDataset(csv_path, transform=convolve_rir)
test_dataset = ContrastiveAudioDataset(csv_path)

In [52]:
# Define batch size and number of epochs
batch_size = 32
epochs = 10

# Define the training data loader
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle = True)

{'map': functools.partial(<function Dataset.register_datapipe_as_function.<locals>.class_function at 0x7f2fd2ba3430>, <class 'torch.utils.data.datapipes.map.callable.MapperMapDataPipe'>, False),
 'concat': functools.partial(<function Dataset.register_datapipe_as_function.<locals>.class_function at 0x7f2fd2ba3790>, <class 'torch.utils.data.datapipes.map.combining.ConcaterMapDataPipe'>, False)}