In [1]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
import librosa
from scipy.signal import fftconvolve
import whisperx
import os
import torchaudio
import h5py
import random
from sklearn.model_selection import train_test_split

# Sample rate (Hz) passed as `sr=` to every librosa.load call in this notebook.
SAMPLE_RATE = 16000
# Root directory that the relative paths handed to extract_embeddings are joined onto.
libri_dir = '../../LibriVox_Kaggle/'
# Directory holding the room-impulse-response (RIR) recordings used to add echo.
rir_dir = '../audioData/Office_RIR'
# Every entry of rir_dir, unfiltered and in whatever order os.listdir returns them.
rir_files_list = os.listdir(rir_dir)

  from .autonotebook import tqdm as notebook_tqdm
  torchaudio.set_audio_backend("soundfile")


In [3]:
# Settings handed to whisperx.load_model below: run on the GPU with
# half-precision (float16) computation.
device = "cuda"
compute_type="float16"

# Loaded once at notebook scope; encode_features() reads this global.
whisper_model = whisperx.load_model("large-v2", device=device, compute_type=compute_type)

No language specified, language will be first be detected for each audio file (increases inference time).


Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.4.0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../../../../home/dpandya/.cache/torch/whisperx-vad-segmentation.bin`


Model was trained with pyannote.audio 0.0.1, yours is 3.1.1. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.4.1+cu121. Bad things might happen unless you revert torch to 1.x.


In [4]:
def add_echo(aud_fname, rir_fname):
    """Return the recording at ``aud_fname`` convolved with the impulse response at ``rir_fname``.

    Both files are read through ``librosa.load`` at ``SAMPLE_RATE``; the output
    of ``fftconvolve`` on the two waveforms is returned as-is (no trimming or
    re-scaling happens here).
    """
    speech, _ = librosa.load(aud_fname, sr=SAMPLE_RATE)
    impulse_response, _ = librosa.load(rir_fname, sr=SAMPLE_RATE)
    return fftconvolve(speech, impulse_response)

def encode_features(aud):
    """Run ``aud`` through the global ``whisper_model`` and return the encoder output as a tensor.

    The waveform is wrapped in a dict under the ``'inputs'`` key, passed to
    ``whisper_model.preprocess``, and the ``'inputs'`` entry of the result is
    fed to ``whisper_model.model.encode``.
    """
    features = whisper_model.preprocess({'inputs': aud})['inputs']
    return torch.as_tensor(whisper_model.model.encode(features))


In [5]:
# Metadata table; its `file` column is what extract_embeddings is later given.
df = pd.read_csv('LibriVox_Kaggle_org.csv')
# A fixed random_state keeps the split reproducible. No test_size is passed, so
# scikit-learn's default proportions apply.
train_df, test_df = train_test_split(df, random_state=42)

In [6]:
def _embedding_to_numpy(embedding):
    """Return ``embedding`` as a host-side NumPy array that h5py can store."""
    if isinstance(embedding, torch.Tensor):
        # h5py converts its input through NumPy, which cannot read memory that
        # lives on an accelerator, so bring the tensor to the CPU first.
        return embedding.detach().cpu().numpy()
    return np.asarray(embedding)


def extract_embeddings(audio_files, rir_files, hdf5_file_path):
    """Encode each recording with and without echo and persist both embeddings.

    For every entry of ``audio_files`` the waveform is loaded once, encoded
    with ``encode_features``, convolved with one randomly chosen impulse
    response from ``rir_files`` and encoded again.

    Parameters
    ----------
    audio_files : iterable of str
        Paths relative to ``libri_dir``. Each path is also used as the HDF5
        group name, so it must be unique within the iterable.
    rir_files : sequence of str
        File names relative to ``rir_dir``; one is drawn per recording with
        ``random.choice``.
    hdf5_file_path : str
        Destination file. It is opened in ``'w'`` mode.

    Returns
    -------
    pandas.DataFrame
        One row per recording with the columns ``file``, ``rir_file``,
        ``embedding_original`` and ``embedding_with_echo``. The frame is empty
        (but has these columns) when ``audio_files`` is empty.
    """
    columns = ['file', 'rir_file', 'embedding_original', 'embedding_with_echo']
    emb_data = []

    for audio_file in audio_files:

        print(f'xx--Extracting features for {audio_file}--xx')
        # Extract embeddings from original audio
        audio_file_ = os.path.join(libri_dir, audio_file)
        audio, _ = librosa.load(audio_file_, sr=SAMPLE_RATE)
        original_embedding = encode_features(audio)

        # Extract embeddings from echoed audio. The waveform loaded above is
        # reused instead of decoding and resampling the same file a second time.
        rir_file = random.choice(rir_files)
        rir_file_ = os.path.join(rir_dir, rir_file)
        rir, _ = librosa.load(rir_file_, sr=SAMPLE_RATE)
        augmented = fftconvolve(audio, rir)
        echo_embedding = encode_features(augmented)

        emb_data.append({'file': audio_file,
                         'rir_file': rir_file,
                         'embedding_original': original_embedding,
                         'embedding_with_echo': echo_embedding})

    # Build the frame in one step; concatenating one-row frames is slow and
    # raises when there is nothing to concatenate.
    emb_df = pd.DataFrame(emb_data, columns=columns)

    # Save the embeddings to an HDF5 file
    with h5py.File(hdf5_file_path, 'w') as hf:
        for record in emb_data:
            group = hf.create_group(record['file'])
            group.create_dataset('embedding_original',
                                 data=_embedding_to_numpy(record['embedding_original']))
            group.create_dataset('embedding_with_echo',
                                 data=_embedding_to_numpy(record['embedding_with_echo']))

            group.attrs['rir_file'] = record['rir_file']
    print(f'Embeddings saved to {hdf5_file_path}')
    return emb_df

In [7]:
# Encodes every training file (clean and echoed) and writes the result to
# 'embeddings.h5'. extract_embeddings opens that file in 'w' mode.
df_embeddings = extract_embeddings(train_df['file'], rir_files_list, 'embeddings.h5')

xx--Extracting features for meisterfloh/meisterfloh_1173.wav--xx
xx--Extracting features for meisterfloh/meisterfloh_2322.wav--xx
xx--Extracting features for meisterfloh/meisterfloh_2417.wav--xx
xx--Extracting features for achtgesichterambiwasse/achtgesichterambiwasse_2534.wav--xx
xx--Extracting features for serapionsbruederauswahl/serapionsbruederauswahl_1128.wav--xx
xx--Extracting features for achtgesichterambiwasse/achtgesichterambiwasse_0240.wav--xx
xx--Extracting features for achtgesichterambiwasse/achtgesichterambiwasse_0783.wav--xx
xx--Extracting features for achtgesichterambiwasse/achtgesichterambiwasse_1354.wav--xx
xx--Extracting features for achtgesichterambiwasse/achtgesichterambiwasse_1018.wav--xx
xx--Extracting features for meisterfloh/meisterfloh_2028.wav--xx
xx--Extracting features for achtgesichterambiwasse/achtgesichterambiwasse_0977.wav--xx
xx--Extracting features for achtgesichterambiwasse/achtgesichterambiwasse_0809.wav--xx
xx--Extracting features for meisterfloh/me

In [9]:
def load_embeddings_and_rir_from_hdf5(hdf5_file_path, audio_file):
    """Read the stored embedding pair for ``audio_file`` from an HDF5 file.

    Returns ``(embedding_original, embedding_with_echo)`` as NumPy arrays, or
    ``(None, None)`` after printing a notice when the file has no group named
    ``audio_file``. Only the two datasets are read; group attributes are not
    returned.
    """
    with h5py.File(hdf5_file_path, 'r') as store:
        # Guard clause: bail out early when the recording was never stored.
        if audio_file not in store:
            print(f"{audio_file} not found in HDF5 file.")
            return None, None

        entry = store[audio_file]
        return (
            np.array(entry['embedding_original']),
            np.array(entry['embedding_with_echo']),
        )

In [5]:
# Smoke test: add echo to one known recording using a single office RIR.
sample = '../../LibriVox_Kaggle/achtgesichterambiwasse/achtgesichterambiwasse_0000.wav'
office_rir_dir = '../audioData/Office_RIR'
# os.listdir returns entries in arbitrary order; sort so that index 0 picks the
# same RIR file on every run and on every machine.
rir_list = sorted(os.listdir(office_rir_dir))
rir_sample = os.path.join(office_rir_dir, rir_list[0])

aud = add_echo(sample, rir_sample)
aud.shape

(126572,)

In [17]:
# Not read anywhere in this cell (encode_features builds its own dict); kept
# only in case another cell relies on the `audio` name.
audio = {}
audio['inputs'] = aud
# NOTE(review): LinearAdapter is not defined in the cells visible here. Confirm
# its defining cell runs before this one on a fresh kernel.
adapter = LinearAdapter().half()

# Run the encoder once and reuse the result for both shapes; the encoder
# forward pass is the expensive part of this cell.
encoded = encode_features(aud)
adapter(encoded).shape, encoded.shape

(torch.Size([1, 1500, 1280]), torch.Size([1, 1500, 1280]))