In [1]:
#! pip install git+https://github.com/openai/whisper.git
#! pip install numpy torch pandas urllib3 torchaudio scipy tqdm matplotlib transformers deepcut jiwer pydub evaluate
#! pip install tf-keras

In [3]:
import os
import re
import numpy as np
import matplotlib.pyplot as plt
import torch
import pandas as pd
from pydub import AudioSegment
from transformers import pipeline
import string

from scipy.io import wavfile
from tqdm.notebook import tqdm

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

## Preprocessing Audio and Subtitles Files

In [8]:
# Retrieve .txt files
def list_files_in_directory(directory):
    file_list = []
    for filename in os.listdir(directory):
        # Only pick up files with .txt extensions (transcript)
        if filename.endswith(".txt"):
            file_list.append(filename.replace(".txt", ""))
    return file_list

# Create the dataframe
def get_reference_df(directory, audio_txt_file):
    txt_file_path = os.path.join(directory, audio_txt_file + ".txt")
    columns = ["start_time", "end_time", "reference"]
    # Read the text file into a DataFrame
    df = pd.read_csv(txt_file_path, sep="\t", header=None, names=columns, quoting=3)

    # Add file name
    df.insert(0, 'file_name', pd.Series([audio_txt_file] * len(df)))

    # Remove quotation marks
    df['reference'] = df['reference'].apply(lambda x : x.replace('"',""))
    
    return df

# Trim audio files
def trim_wav_by_timestamps(directory, wav_file_name, reference_df):
    # Create the output directory if it doesn't exist
    output_dir = "data/sub/"
    os.makedirs(output_dir, exist_ok=True)
    wav_file = os.path.join(directory, wav_file_name + ".wav") # get into data file
    
    # Load the WAV file
    audio = AudioSegment.from_wav(wav_file)
    
    def trim_segments(row):
        start_ms = float(row['start_time']) * 1000  # Convert start time to milliseconds
        end_ms = float(row['end_time']) * 1000      # Convert end time to milliseconds
        trimmed_segment = audio[start_ms:end_ms]
    
        return trimmed_segment
    
    # Iterate over timestamps and trim the audio
    for i, row in reference_df.iterrows():
        trimmed_segment = trim_segments(row)
        output_file = os.path.join(output_dir, wav_file_name + "_" f"trimmed_segment_{i+1}.wav")
        trimmed_segment.export(output_file, format="wav")
        reference_df.at[i, 'trimmed_segment_path'] = output_file
    
    return reference_df

def strip_punctuation(text):
    # Define a regex pattern to remove punctuation
    punctuation_pattern = re.compile(r'[^\w\s]')
    return punctuation_pattern.sub('', text)

def filter_subs_by_lang(reference_df):
    # Helper function that is applied across the rows to filter English text only
    
    def filter_english_only(text):
        # Define a regex pattern to match English words with punctuations
        english_pattern = re.compile(r'\b[A-Za-z][A-Za-z\'\-]*\b')
        # Find all English words in the text
        english_words = english_pattern.findall(text)
        # Join the English words into a single string
        english_text = ' '.join(english_words)
        return english_text
    
    def filter_thai_only(text):
        # Tokenize the string, split by spaces
        list_of_words = text.split()
        # Define a regex pattern to match English words with punctuations
        english_pattern = re.compile(r'\b[A-Za-z][A-Za-z\'\-]*\b')
        # Find all English words in the text
        english_words = english_pattern.findall(text)
        # Remove punctuation from the list of words
        list_of_words = [strip_punctuation(word) for word in list_of_words]
        # Find all Thai words
        thai_words = [word for word in list_of_words if strip_punctuation(word) not in english_words]
        # Concatenate the Thai words into a string
        thai_text = ' '.join(thai_words)
        # Return Thai string
        return thai_text

    reference_df['eng_reference'] = reference_df['reference'].apply(filter_english_only)
    reference_df['thai_reference'] = reference_df['reference'].apply(filter_thai_only)

    return reference_df


def get_combined_audio_table(directory, file_names):
    combined_df = pd.DataFrame()
    for file_name in file_names:
        # Reads the transcript dataframe which has the start_time, end_time of each transcript
        reference_df = get_reference_df(directory, file_name)

        # Split subtitles by language
        reference_df = filter_subs_by_lang(reference_df)

        # Remove 'reference' column
        reference_df = reference_df.drop('reference', axis=1)

        # Uncomment to trim all the .wav file according to the subtitles start_time and end_time
        reference_df = trim_wav_by_timestamps(directory, file_name, reference_df)
        
        # Append the processed DataFrame to the combined DataFrame
        combined_df = pd.concat([combined_df, reference_df], ignore_index=True)

        # Comment out this section if not required
        #combined_df = combined_df.drop(['file_name', 'start_time', 'end_time'], axis=1)
    
    return combined_df

directory = os.path.join(os.getcwd(), "data/train")
file_names = list_files_in_directory(directory)
combined_df_train = get_combined_audio_table(directory, file_names)
combined_df_train

Unnamed: 0,file_name,start_time,end_time,eng_reference,thai_reference,trimmed_segment_path
0,DU_s5HDJc0w,8.839,13.818,Episode This episode we will be inviting a fam...,คำนด นจะไดตอนรบคนทแวะ เวยนมาบอยพอสมควร,data/sub/DU_s5HDJc0w_trimmed_segment_1.wav
1,DU_s5HDJc0w,14.228,18.711,but this will be the first time that we will b...,แตนจะเปนครงแรก ทเราจะไดนงคยกนรปแบบวดโอเตมรปแบบ,data/sub/DU_s5HDJc0w_trimmed_segment_2.wav
2,DU_s5HDJc0w,19.445,21.321,and it s going to be the first time also,its และยงเปนครงแรก,data/sub/DU_s5HDJc0w_trimmed_segment_3.wav
3,DU_s5HDJc0w,21.669,26.559,that this episode will be done in bilingual wi...,ทจะเปนพดสองภาษากบแขกทานน,data/sub/DU_s5HDJc0w_trimmed_segment_4.wav
4,DU_s5HDJc0w,26.760,28.899,Good morning,พดจดาว วฒนปกรณ สวสดครบ,data/sub/DU_s5HDJc0w_trimmed_segment_5.wav
...,...,...,...,...,...,...
2408,EoBVsX--qz8,1259.847,1262.269,Please like share and subscribe,ดแลวชอบ กดไลก แชร ซบสไครบดวย,data/sub/EoBVsX--qz8_trimmed_segment_412.wav
2409,EoBVsX--qz8,1262.269,1264.449,Chollada Channel Chollada Channel,กบ คะ,data/sub/EoBVsX--qz8_trimmed_segment_413.wav
2410,EoBVsX--qz8,1265.109,1266.776,Bye,บาย,data/sub/EoBVsX--qz8_trimmed_segment_414.wav
2411,EoBVsX--qz8,1266.776,1269.002,Please like share and subscribe,อยาลมกดไลก กดแชร หรอวาซบสไครบ,data/sub/EoBVsX--qz8_trimmed_segment_415.wav


In [9]:
combined_df_train[:50]

Unnamed: 0,file_name,start_time,end_time,eng_reference,thai_reference,trimmed_segment_path
0,DU_s5HDJc0w,8.839,13.818,Episode This episode we will be inviting a fam...,คำนด นจะไดตอนรบคนทแวะ เวยนมาบอยพอสมควร,data/sub/DU_s5HDJc0w_trimmed_segment_1.wav
1,DU_s5HDJc0w,14.228,18.711,but this will be the first time that we will b...,แตนจะเปนครงแรก ทเราจะไดนงคยกนรปแบบวดโอเตมรปแบบ,data/sub/DU_s5HDJc0w_trimmed_segment_2.wav
2,DU_s5HDJc0w,19.445,21.321,and it s going to be the first time also,its และยงเปนครงแรก,data/sub/DU_s5HDJc0w_trimmed_segment_3.wav
3,DU_s5HDJc0w,21.669,26.559,that this episode will be done in bilingual wi...,ทจะเปนพดสองภาษากบแขกทานน,data/sub/DU_s5HDJc0w_trimmed_segment_4.wav
4,DU_s5HDJc0w,26.76,28.899,Good morning,พดจดาว วฒนปกรณ สวสดครบ,data/sub/DU_s5HDJc0w_trimmed_segment_5.wav
5,DU_s5HDJc0w,28.899,30.591,Good morning,สวสดคะ,data/sub/DU_s5HDJc0w_trimmed_segment_6.wav
6,DU_s5HDJc0w,30.72,35.48,Actually I ve heard that you give lectures in ...,จรงๆ เคยไดยนพดาวสอนหรอวาเลคเชอรเปน ภาษาองกฤษอย...,data/sub/DU_s5HDJc0w_trimmed_segment_7.wav
7,DU_s5HDJc0w,35.604,37.271,Yes sometimes,มบาง มบางคะ,data/sub/DU_s5HDJc0w_trimmed_segment_8.wav
8,DU_s5HDJc0w,37.271,41.545,Because you went to get your education in Brit...,เพราะคณเรยบจบจากองกฤษใชไหม,data/sub/DU_s5HDJc0w_trimmed_segment_9.wav
9,DU_s5HDJc0w,41.545,46.773,Yes I got my MA in London So yeah sometimes I ...,ใชคะ เรยบจบปโททกรงลอนดอน มสอนเปนภาษาองกฤษบาง,data/sub/DU_s5HDJc0w_trimmed_segment_10.wav


In [21]:
directory = os.path.join(os.getcwd(), "data/test/")

file_names = list_files_in_directory(directory)
combined_df = get_combined_audio_table(directory, file_names)
combined_df

Unnamed: 0,file_name,start_time,end_time,reference,trimmed_segment_path
0,2OF_N9xQOAc,0.000,3.118,Right now I m in Singapore,data/sub/2OF_N9xQOAc_trimmed_segment_1.wav
1,2OF_N9xQOAc,3.118,4.909,Most people when they come to Singapore,data/sub/2OF_N9xQOAc_trimmed_segment_2.wav
2,2OF_N9xQOAc,4.909,6.129,They come to eat dim sum,data/sub/2OF_N9xQOAc_trimmed_segment_3.wav
3,2OF_N9xQOAc,6.129,7.498,Visit Merlion,data/sub/2OF_N9xQOAc_trimmed_segment_4.wav
4,2OF_N9xQOAc,7.498,8.827,Visit Universal,data/sub/2OF_N9xQOAc_trimmed_segment_5.wav
...,...,...,...,...,...
592,b4o5YC_wMXM,1019.264,1022.082,Please press like share and subscribe,data/sub/b4o5YC_wMXM_trimmed_segment_348.wav
593,b4o5YC_wMXM,1022.107,1025.358,If you want me to take you somewhere,data/sub/b4o5YC_wMXM_trimmed_segment_349.wav
594,b4o5YC_wMXM,1025.383,1027.521,to eat or do any activities,data/sub/b4o5YC_wMXM_trimmed_segment_350.wav
595,b4o5YC_wMXM,1027.546,1029.461,you can leave me a comment,data/sub/b4o5YC_wMXM_trimmed_segment_351.wav


In [22]:
combined_df['segment_duration'] = combined_df.apply(lambda x : x['end_time'] - x['start_time'], axis = 1)

In [23]:
combined_df['segment_duration'].describe() # All audio segments are under 10s

count    597.000000
mean       2.282425
std        1.008288
min        0.596000
25%        1.563000
50%        2.079000
75%        2.833000
max        6.487000
Name: segment_duration, dtype: float64

In [24]:
combined_df['segment_duration'].sum()

1362.6079999999988

## Direct Translation with Whisper

In [25]:
import whisper

model = whisper.load_model("medium")

In [26]:
language = "Thai"
options = dict(language=language, beam_size=5, best_of=5)
translate_options = dict(task="translate", **options)

In [27]:
def map_transcription(row):
    segment_wavfile = row['trimmed_segment_path']
    transcription = model.transcribe(segment_wavfile, **translate_options)['text']
    return transcription

In [28]:
transcriptions = combined_df.apply(map_transcription, axis=1)

In [29]:
combined_df['hypothesis'] = transcriptions

In [30]:
combined_df

Unnamed: 0,file_name,start_time,end_time,reference,trimmed_segment_path,segment_duration,hypothesis
0,2OF_N9xQOAc,0.000,3.118,Right now I m in Singapore,data/sub/2OF_N9xQOAc_trimmed_segment_1.wav,3.118,But I hate Singapore!
1,2OF_N9xQOAc,3.118,4.909,Most people when they come to Singapore,data/sub/2OF_N9xQOAc_trimmed_segment_2.wav,1.791,See you in Singapore!
2,2OF_N9xQOAc,4.909,6.129,They come to eat dim sum,data/sub/2OF_N9xQOAc_trimmed_segment_3.wav,1.220,Let's eat som tam.
3,2OF_N9xQOAc,6.129,7.498,Visit Merlion,data/sub/2OF_N9xQOAc_trimmed_segment_4.wav,1.369,Let's see Aon's hand.
4,2OF_N9xQOAc,7.498,8.827,Visit Universal,data/sub/2OF_N9xQOAc_trimmed_segment_5.wav,1.329,
...,...,...,...,...,...,...,...
592,b4o5YC_wMXM,1019.264,1022.082,Please press like share and subscribe,data/sub/b4o5YC_wMXM_trimmed_segment_348.wav,2.818,"Please like, share and subscribe to my channel."
593,b4o5YC_wMXM,1022.107,1025.358,If you want me to take you somewhere,data/sub/b4o5YC_wMXM_trimmed_segment_349.wav,3.251,If you want me to take you out
594,b4o5YC_wMXM,1025.383,1027.521,to eat or do any activities,data/sub/b4o5YC_wMXM_trimmed_segment_350.wav,2.138,Where should I go next?
595,b4o5YC_wMXM,1027.546,1029.461,you can leave me a comment,data/sub/b4o5YC_wMXM_trimmed_segment_351.wav,1.915,Feel free to leave a comment.


In [31]:
# Using Whisper that is finetuned on Thai Speech Datasets
MODEL_NAME = "biodatlab/whisper-th-medium-combined"  
lang = "th"

device = 0 if torch.cuda.is_available() else "cpu"

pipe = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,
    chunk_length_s=30,
    device=device,
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [32]:
def map_transcription_finetuned_asr(row, pipe):
    segment_wavfile = row['trimmed_segment_path']
    transcription = pipe(
        segment_wavfile, 
        generate_kwargs={"language":"<|th|>", "task":"translate"}, 
        batch_size=16
    )["text"]
    
    return transcription

In [33]:
transcriptions_finetuned = combined_df.apply(lambda row: map_transcription_finetuned_asr(row, pipe), axis=1)

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [34]:
transcriptions_finetuned

0                                 Now I'm in Singapore!
1               I'm going to Singapore with my friends.
2                                      Let's eat tipsum
3                                Let's see Lion's Hand!
4                                     It's a universal.
                             ...                       
592     Press like, share and subscribe for me as well.
593                      If you want me to go on a trip
594                           Where can I eat and play?
595                             You can leave a comment
596                                             Bye bye
Length: 597, dtype: object

In [35]:
combined_df['hypothesis_finetuned'] = transcriptions_finetuned
combined_df

Unnamed: 0,file_name,start_time,end_time,reference,trimmed_segment_path,segment_duration,hypothesis,hypothesis_finetuned
0,2OF_N9xQOAc,0.000,3.118,Right now I m in Singapore,data/sub/2OF_N9xQOAc_trimmed_segment_1.wav,3.118,But I hate Singapore!,Now I'm in Singapore!
1,2OF_N9xQOAc,3.118,4.909,Most people when they come to Singapore,data/sub/2OF_N9xQOAc_trimmed_segment_2.wav,1.791,See you in Singapore!,I'm going to Singapore with my friends.
2,2OF_N9xQOAc,4.909,6.129,They come to eat dim sum,data/sub/2OF_N9xQOAc_trimmed_segment_3.wav,1.220,Let's eat som tam.,Let's eat tipsum
3,2OF_N9xQOAc,6.129,7.498,Visit Merlion,data/sub/2OF_N9xQOAc_trimmed_segment_4.wav,1.369,Let's see Aon's hand.,Let's see Lion's Hand!
4,2OF_N9xQOAc,7.498,8.827,Visit Universal,data/sub/2OF_N9xQOAc_trimmed_segment_5.wav,1.329,,It's a universal.
...,...,...,...,...,...,...,...,...
592,b4o5YC_wMXM,1019.264,1022.082,Please press like share and subscribe,data/sub/b4o5YC_wMXM_trimmed_segment_348.wav,2.818,"Please like, share and subscribe to my channel.","Press like, share and subscribe for me as well."
593,b4o5YC_wMXM,1022.107,1025.358,If you want me to take you somewhere,data/sub/b4o5YC_wMXM_trimmed_segment_349.wav,3.251,If you want me to take you out,If you want me to go on a trip
594,b4o5YC_wMXM,1025.383,1027.521,to eat or do any activities,data/sub/b4o5YC_wMXM_trimmed_segment_350.wav,2.138,Where should I go next?,Where can I eat and play?
595,b4o5YC_wMXM,1027.546,1029.461,you can leave me a comment,data/sub/b4o5YC_wMXM_trimmed_segment_351.wav,1.915,Feel free to leave a comment.,You can leave a comment


In [39]:
import string

def strip_punctuation(text):
    # Create a translation table that maps each punctuation character to None
    translator = str.maketrans('', '', string.punctuation)
    # Use the translation table to remove all punctuation from the text
    return text.translate(translator)

In [40]:
combined_df['hypothesis'] = combined_df['hypothesis'].apply(strip_punctuation).apply(lambda x : x.strip())
combined_df['hypothesis_finetuned'] = combined_df['hypothesis_finetuned'].apply(strip_punctuation).apply(lambda x : x.strip())

In [44]:
combined_df

Unnamed: 0,file_name,start_time,end_time,reference,trimmed_segment_path,segment_duration,hypothesis,hypothesis_finetuned
0,2OF_N9xQOAc,0.000,3.118,Right now I m in Singapore,data/sub/2OF_N9xQOAc_trimmed_segment_1.wav,3.118,But I hate Singapore,Now Im in Singapore
1,2OF_N9xQOAc,3.118,4.909,Most people when they come to Singapore,data/sub/2OF_N9xQOAc_trimmed_segment_2.wav,1.791,See you in Singapore,Im going to Singapore with my friends
2,2OF_N9xQOAc,4.909,6.129,They come to eat dim sum,data/sub/2OF_N9xQOAc_trimmed_segment_3.wav,1.220,Lets eat som tam,Lets eat tipsum
3,2OF_N9xQOAc,6.129,7.498,Visit Merlion,data/sub/2OF_N9xQOAc_trimmed_segment_4.wav,1.369,Lets see Aons hand,Lets see Lions Hand
4,2OF_N9xQOAc,7.498,8.827,Visit Universal,data/sub/2OF_N9xQOAc_trimmed_segment_5.wav,1.329,,Its a universal
...,...,...,...,...,...,...,...,...
592,b4o5YC_wMXM,1019.264,1022.082,Please press like share and subscribe,data/sub/b4o5YC_wMXM_trimmed_segment_348.wav,2.818,Please like share and subscribe to my channel,Press like share and subscribe for me as well
593,b4o5YC_wMXM,1022.107,1025.358,If you want me to take you somewhere,data/sub/b4o5YC_wMXM_trimmed_segment_349.wav,3.251,If you want me to take you out,If you want me to go on a trip
594,b4o5YC_wMXM,1025.383,1027.521,to eat or do any activities,data/sub/b4o5YC_wMXM_trimmed_segment_350.wav,2.138,Where should I go next,Where can I eat and play
595,b4o5YC_wMXM,1027.546,1029.461,you can leave me a comment,data/sub/b4o5YC_wMXM_trimmed_segment_351.wav,1.915,Feel free to leave a comment,You can leave a comment


In [53]:
from evaluate import load

wer = load("wer")
wer_score = wer.compute(predictions=combined_df['hypothesis'], references=combined_df['reference'])
print("Word Error Rate:", wer_score)

Word Error Rate: 0.9452316076294278


In [54]:
wer = load("wer")
wer_score = wer.compute(predictions=combined_df['hypothesis'], references=transcriptions_finetuned)
print("Word Error Rate:", wer_score)

Word Error Rate: 0.9441959531416401


## CER

In [41]:

cer = load("cer")
cer_score = cer.compute(predictions=combined_df['hypothesis'], references=combined_df['reference'])
print("Character Error Rate:", cer_score)

Character Error Rate: 0.7598657562782085


In [42]:
cer_score_finetuned = cer.compute(predictions=combined_df['hypothesis'], references=transcriptions_finetuned)
print("Character Error Rate (Finetuned):", cer_score_finetuned)

Character Error Rate (Finetuned): 0.7792978261832028


## Meteor Score

In [45]:
import evaluate 

meteor = evaluate.load('meteor')

[nltk_data] Downloading package wordnet to /home/dhuser/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/dhuser/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/dhuser/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [46]:
def compute_meteor(row, hypothesis_column, reference_column):
    meteor_score = meteor.compute(predictions=[row[hypothesis_column],], references=[row[reference_column],])
    return meteor_score['meteor']

meteor_scores_whisper = combined_df.apply(lambda row : compute_meteor(row, 'hypothesis', 'reference'), axis = 1)
meteor_scores_whisper.describe()

count    597.000000
mean       0.193621
std        0.238468
min        0.000000
25%        0.000000
50%        0.102041
75%        0.328488
max        0.996000
dtype: float64

In [47]:
meteor_scores_whisper_finetuned = combined_df.apply(lambda row : compute_meteor(row, 'hypothesis_finetuned', 'reference'), axis = 1)
meteor_scores_whisper_finetuned.describe()

count    597.000000
mean       0.225605
std        0.228305
min        0.000000
25%        0.050000
50%        0.142857
75%        0.368056
max        0.996000
dtype: float64

## Bleu Scores

In [48]:
bleu = evaluate.load("bleu")

In [49]:
def compute_bleu(row, hypothesis_column, reference_column):
    # Extract the hypothesis and reference
    hypothesis = row[hypothesis_column]
    reference = row[reference_column]

    # Check for empty hypothesis or reference
    if not hypothesis or not reference:
        return 0.0

    # Compute BLEU score
    bleu_score = bleu.compute(predictions=[hypothesis], references=[[reference]])
    return bleu_score['bleu']

bleu_scores_whisper = combined_df.apply(lambda row : compute_bleu(row, 'hypothesis', 'reference'), axis = 1)
bleu_scores_whisper.describe()

count    597.000000
mean       0.022754
std        0.108430
min        0.000000
25%        0.000000
50%        0.000000
75%        0.000000
max        1.000000
dtype: float64

In [51]:
bleu_scores_whisper

0      0.000000
1      0.000000
2      0.000000
3      0.000000
4      0.000000
         ...   
592    0.365555
593    0.840896
594    0.000000
595    0.000000
596    0.000000
Length: 597, dtype: float64

In [52]:
combined_df

Unnamed: 0,file_name,start_time,end_time,reference,trimmed_segment_path,segment_duration,hypothesis,hypothesis_finetuned
0,2OF_N9xQOAc,0.000,3.118,Right now I m in Singapore,data/sub/2OF_N9xQOAc_trimmed_segment_1.wav,3.118,But I hate Singapore,Now Im in Singapore
1,2OF_N9xQOAc,3.118,4.909,Most people when they come to Singapore,data/sub/2OF_N9xQOAc_trimmed_segment_2.wav,1.791,See you in Singapore,Im going to Singapore with my friends
2,2OF_N9xQOAc,4.909,6.129,They come to eat dim sum,data/sub/2OF_N9xQOAc_trimmed_segment_3.wav,1.220,Lets eat som tam,Lets eat tipsum
3,2OF_N9xQOAc,6.129,7.498,Visit Merlion,data/sub/2OF_N9xQOAc_trimmed_segment_4.wav,1.369,Lets see Aons hand,Lets see Lions Hand
4,2OF_N9xQOAc,7.498,8.827,Visit Universal,data/sub/2OF_N9xQOAc_trimmed_segment_5.wav,1.329,,Its a universal
...,...,...,...,...,...,...,...,...
592,b4o5YC_wMXM,1019.264,1022.082,Please press like share and subscribe,data/sub/b4o5YC_wMXM_trimmed_segment_348.wav,2.818,Please like share and subscribe to my channel,Press like share and subscribe for me as well
593,b4o5YC_wMXM,1022.107,1025.358,If you want me to take you somewhere,data/sub/b4o5YC_wMXM_trimmed_segment_349.wav,3.251,If you want me to take you out,If you want me to go on a trip
594,b4o5YC_wMXM,1025.383,1027.521,to eat or do any activities,data/sub/b4o5YC_wMXM_trimmed_segment_350.wav,2.138,Where should I go next,Where can I eat and play
595,b4o5YC_wMXM,1027.546,1029.461,you can leave me a comment,data/sub/b4o5YC_wMXM_trimmed_segment_351.wav,1.915,Feel free to leave a comment,You can leave a comment


In [50]:
bleu_scores_whisper_finetuned = combined_df.apply(lambda row : compute_bleu(row, 'hypothesis_finetuned', 'reference'), axis = 1)
bleu_scores_whisper_finetuned.describe()

count    597.00000
mean       0.01289
std        0.08359
min        0.00000
25%        0.00000
50%        0.00000
75%        0.00000
max        1.00000
dtype: float64