In [1]:
import pandas as pd
import numpy as np
import os
import re
import librosa
from IPython.display import display, Audio

In [2]:
def clean_text(s):
    # Remove punctuation
    s = re.sub(r'<[^>]*>', '', s)
    s = re.sub(r'[^\w\s]', '', s)
    # Remove extra spaces
    s = re.sub(r'\s+', ' ', s)
    # Remove leading and trailing spaces
    s = s.strip()
    return s.lower()

def add_dir(s):
    return os.path.join(eval_dir, s)

In [3]:
eval_dir = '/ceph/dpandya/notsofar/nsfd_adap_segments/eval/'
eval_df = pd.read_csv('/ceph/dpandya/notsofar/nsfd_adap_segments/eval_segments.csv')

In [4]:
eval_df['segmented_audio_file'] = eval_df['segmented_audio_file'].apply(add_dir)
eval_df['segmented_text'] = eval_df['segmented_text'].apply(clean_text)

In [17]:
import adapters
import eval_utils as eu
import datasets
from transformers import AutomaticSpeechRecognitionPipeline, WhisperForConditionalGeneration
from transformers.pipelines.pt_utils import KeyDataset
from tqdm import tqdm
import torch

model_dict = {
    'name': "openai/whisper-medium",
    'language': "english",
    'task': 'transcribe'
}

In [18]:
eval_files = {'audio':list(eval_df['segmented_audio_file'])}
eval_ds = datasets.Dataset.from_dict(eval_files).cast_column('audio', datasets.Audio(sampling_rate=16000))
eval_ds = eval_ds.add_column("text", list(eval_df['segmented_text']))

feature_extractor, tokenizer, processor = eu.get_testing_components(model_dict)
whisper_model = WhisperForConditionalGeneration.from_pretrained(
    "openai/whisper-medium",
    dtype=torch.float16)
#whisper_model = adapters.WhisperAdapterModel.from_pretrained(model_dict['name'], language=model_dict['language'], task=model_dict['task'])
#whisper_model.config.max_length=512

# Eval pipeline
eval_pipeline = AutomaticSpeechRecognitionPipeline(
    model=whisper_model,
    feature_extractor=processor.feature_extractor,
    tokenizer=processor.tokenizer,
    device = 'cuda:0',
    chunk_length_s=30,

)
forced_decoder_ids = processor.get_decoder_prompt_ids(language=model_dict['language'], task=model_dict['task'])

Device set to use cuda:0


In [19]:
predictions = []

for prediction in tqdm(
    eval_pipeline(
        #list(eval_df['audio_file']),
        KeyDataset(eval_ds,'audio'),
        batch_size=1,
        generate_kwargs={"forced_decoder_ids": forced_decoder_ids},
        #max_new_tokens=256
    ), total=len(eval_ds), desc='Evaluating ...'
):
    predictions.append(prediction['text'])

eval_col = 'preds_medium'
wer = 'WER'
eval_df['preds_medium'] = predictions
eval_df['preds_medium'] = eval_df['preds_medium'].apply(clean_text)


Evaluating ...: 100%|██████████| 2074/2074 [53:22<00:00,  1.54s/it] 


In [22]:
eval_df

Unnamed: 0,original_audio_file,segmented_audio_file,segmented_text,start,stop,preds_large,WER,preds_medium
0,/ceph/dpandya/notsofar/eval_set/240825.1_eval_...,/ceph/dpandya/notsofar/nsfd_adap_segments/eval...,ok so why we here we got a we got a big budget...,6140.0,30580.0,okay so we got a big budget in the office rega...,0.289474,okay so we got a big budget in the office rega...
1,/ceph/dpandya/notsofar/eval_set/240825.1_eval_...,/ceph/dpandya/notsofar/nsfd_adap_segments/eval...,um so even though we have a big budget we wann...,30680.0,54990.0,so even though we have a big budget we want to...,0.235955,so even though we have a big budget we want to...
2,/ceph/dpandya/notsofar/eval_set/240825.1_eval_...,/ceph/dpandya/notsofar/nsfd_adap_segments/eval...,and macadamia thats right isnt that kind of a ...,54690.0,79110.0,isnt that kind of a waste of money like who ea...,0.317073,isnt that kind of a waste of money like who ea...
3,/ceph/dpandya/notsofar/eval_set/240825.1_eval_...,/ceph/dpandya/notsofar/nsfd_adap_segments/eval...,i think is a snack chocolate bars no ok then a...,79550.0,102030.0,i think its a snack me no me no chocolate bars...,0.627451,i think its a snack chocolate bars okay then a...
4,/ceph/dpandya/notsofar/eval_set/240825.1_eval_...,/ceph/dpandya/notsofar/nsfd_adap_segments/eval...,but it smells so good wouldnt you want the off...,101810.0,125950.0,its not so good would you want that it smells ...,0.424779,it smells so good would you want that it smell...
...,...,...,...,...,...,...,...,...
2069,/ceph/dpandya/notsofar/eval_set/240825.1_eval_...,/ceph/dpandya/notsofar/nsfd_adap_segments/eval...,you know that they use in the movies and uh th...,253840.0,278780.0,you know that they use in the movies or the do...,0.875862,you know that they use in the movies and uh fo...
2070,/ceph/dpandya/notsofar/eval_set/240825.1_eval_...,/ceph/dpandya/notsofar/nsfd_adap_segments/eval...,thats right big big bigfoot is using our equip...,274960.0,299660.0,so we can make the list for our next time and ...,0.909091,okay so we can make the list for next time and...
2071,/ceph/dpandya/notsofar/eval_set/240825.1_eval_...,/ceph/dpandya/notsofar/nsfd_adap_segments/eval...,ron what do you say yes so number one bigfoot ...,300150.0,321590.0,being the being the money person i will try to...,0.715789,being the money person i will try to find out ...
2072,/ceph/dpandya/notsofar/eval_set/240825.1_eval_...,/ceph/dpandya/notsofar/nsfd_adap_segments/eval...,dont make me laugh then how could it be monday...,319650.0,342810.0,dont make me laugh okay so tuesday if we are a...,0.673913,dont make me laugh okay so if we already frida...


In [21]:
eval_df['WER'] = eval_df.apply(lambda x: eu.compute_wer(x['preds_medium'], x['segmented_text']), axis=1)
print(f'WER: {eval_df[wer].mean()}')

eval_df.to_csv('/ceph/dpandya/notsofar/nsfd_adap_segments/baseline_large.csv', index=False)

WER: 0.4117449623708684


In [56]:
wav, sr = librosa.load(os.path.join(eval_dir,eval_df.iloc[2]['segmented_audio_file']), sr=16000)

print(clean_text(eval_df.iloc[2]['segmented_text']))
display(Audio(wav, rate=sr))

and macadamia thats right isnt that kind of a waste of money like who who eats that brazil waste of money we have budget though thats the thing brazil nuts yeah but who eats it oh monkey nuts i do i do healthy people mmhmm thats right they are the best snack thin and healthy look we want healthy bodies healthy brains here i dont really eat but i dont really eat healthy like i dont care for healthy snacks ah you dont care for health mmm no like i i think kind of a waste of money alright so what are you looking for in a snack yeah i think we should have like ice cream ice cream is not a snack
