In [1]:
# ! pip install git+https://github.com/openai/whisper.git
#! pip install numpy torch pandas urllib3 torchaudio scipy tqdm matplotlib transformers deepcut jiwer pydub evaluate
#! pip install tf-keras

In [3]:
import os
import re
import numpy as np
import matplotlib.pyplot as plt
import torch
import pandas as pd
from pydub import AudioSegment
from transformers import pipeline
import string
from datasets import Dataset, Audio

from scipy.io import wavfile
from tqdm.notebook import tqdm

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

  from .autonotebook import tqdm as notebook_tqdm


## Preprocessing Audio and Subtitles Files

In [20]:
# Retrieve .txt files
def list_files_in_directory(directory):
    file_list = []
    for filename in os.listdir(directory):
        # Only pick up files with .txt extensions (transcript)
        if filename.endswith(".txt"):
            file_list.append(filename.replace(".txt", ""))
    return file_list

# Create the dataframe
def get_reference_df(directory, audio_txt_file):
    txt_file_path = os.path.join(directory, audio_txt_file + ".txt")
    columns = ["start_time", "end_time", "reference"]
    # Read the text file into a DataFrame
    df = pd.read_csv(txt_file_path, sep="\t", header=None, names=columns, quoting=3)

    # Add file name
    df.insert(0, 'file_name', pd.Series([audio_txt_file] * len(df)))

    # Remove quotation marks
    df['reference'] = df['reference'].apply(lambda x : x.replace('"',""))
    
    return df

# Trim audio files
def trim_wav_by_timestamps(directory, wav_file_name, reference_df):
    # Create the output directory if it doesn't exist
    output_dir = "data/sub/"
    os.makedirs(output_dir, exist_ok=True)
    wav_file = os.path.join(directory, wav_file_name + ".wav") # get into data file
    
    # Load the WAV file
    audio = AudioSegment.from_wav(wav_file)
    
    def trim_segments(row):
        start_ms = float(row['start_time']) * 1000  # Convert start time to milliseconds
        end_ms = float(row['end_time']) * 1000      # Convert end time to milliseconds
        trimmed_segment = audio[start_ms:end_ms]
    
        return trimmed_segment
    
    # Iterate over timestamps and trim the audio
    for i, row in reference_df.iterrows():
        trimmed_segment = trim_segments(row)
        output_file = os.path.join(output_dir, wav_file_name + "_" f"trimmed_segment_{i+1}.wav")
        trimmed_segment.export(output_file, format="wav")
        reference_df.at[i, 'trimmed_segment_path'] = output_file
    
    return reference_df

# Helper function to remove punctuations from original subtitles
def strip_punctuation(text):
    # Create a translation table that maps each punctuation character to None
    translator = str.maketrans('', '', string.punctuation)
    # Use the translation table to remove all punctuation from the text
    return text.translate(translator)

# Filter english text and append it to dataframe
def filter_subs_by_lang(reference_df):
    # Helper function that is applied across the rows to filter english text only
    
    def filter_english_only(text):
        # Define a regex pattern to match English letters, numbers, spaces, and specific punctuation
        english_pattern = re.compile(r'[A-Za-z0-9\s.,!?]+')
        # Find all matches in the text
        english_words_numbers_punctuations = english_pattern.findall(text)
        # Join the matches into a single string
        filtered_text = ' '.join(english_words_numbers_punctuations)
        return filtered_text
    
    def filter_thai_only(text):
        # Remove punctuation from text
        text = strip_punctuation(text)
        # Tokenize the string, split by spaces
        list_of_words = text.split()
        # Define a regex pattern to match English words
        english_pattern = re.compile(r'\b[A-Za-z]+\b')
        # Find all English words in the text
        english_words = english_pattern.findall(text)
        # Find all Thai words
        thai_words = [word for word in list_of_words if word not in english_words]
        # Concatenate the Thai words into a string
        thai_text = ' '.join(thai_words)
        # Return thai string
        return thai_text

    reference_df['eng_reference'] = reference_df['reference'].apply(filter_english_only)
    reference_df['thai_reference'] = reference_df['reference'].apply(filter_thai_only)

    return reference_df


def get_combined_audio_table(directory, file_names):
    combined_df = pd.DataFrame()
    for file_name in file_names:
        # Reads the transcript dataframe which has the start_time, end_time of each transcript
        print(file_name)
        reference_df = get_reference_df(directory, file_name)

        # Split subtitles by language
        reference_df = filter_subs_by_lang(reference_df)

        # Remove 'reference' column
        reference_df = reference_df.drop('reference', axis=1)

        # Uncomment to trim all the .wav file according to the subtitles start_time and end_time
        reference_df = trim_wav_by_timestamps(directory, file_name, reference_df)
        
        # Append the processed DataFrame to the combined DataFrame
        combined_df = pd.concat([combined_df, reference_df], ignore_index=True)

        # Comment out this section if not required
        #combined_df = combined_df.drop(['file_name', 'start_time', 'end_time'], axis=1)
    
    return combined_df

directory = os.path.join(os.getcwd(), "data/")
file_names = list_files_in_directory(directory)
df_test = get_combined_audio_table(directory, file_names)
df_test

8p9rJ-cFHQ0
2OF_N9xQOAc
E-RvXySDc48
5VAwZ7sKNtU
G2CkH0xBj6Y
e0YgFgE10nk
b4o5YC_wMXM


Unnamed: 0,file_name,start_time,end_time,eng_reference,thai_reference,trimmed_segment_path
0,8p9rJ-cFHQ0,0.000,1.800,I may not have thought this way,,data/sub/8p9rJ-cFHQ0_trimmed_segment_1.wav
1,8p9rJ-cFHQ0,1.800,3.880,but as time went by,,data/sub/8p9rJ-cFHQ0_trimmed_segment_2.wav
2,8p9rJ-cFHQ0,4.240,6.600,there is something that teaches us,,data/sub/8p9rJ-cFHQ0_trimmed_segment_3.wav
3,8p9rJ-cFHQ0,6.600,8.600,that there are things we cannot control,,data/sub/8p9rJ-cFHQ0_trimmed_segment_4.wav
4,8p9rJ-cFHQ0,8.600,9.920,and things we can control,,data/sub/8p9rJ-cFHQ0_trimmed_segment_5.wav
...,...,...,...,...,...,...
2075,b4o5YC_wMXM,1019.264,1022.082,"Please press like, share and subscribe.",กดไลค์ กดแชร์ และซับสไครบ์ให้มิวด้วยนะคะ,data/sub/b4o5YC_wMXM_trimmed_segment_348.wav
2076,b4o5YC_wMXM,1022.107,1025.358,"If you want me to take you somewhere,",และถ้าใครอยากให้มิวพาไปเที่ยว,data/sub/b4o5YC_wMXM_trimmed_segment_349.wav
2077,b4o5YC_wMXM,1025.383,1027.521,"to eat, or do any activities",ไปกิน ไปเล่นที่ไหนต่อเนี่ย,data/sub/b4o5YC_wMXM_trimmed_segment_350.wav
2078,b4o5YC_wMXM,1027.546,1029.461,you can leave me a comment.,ก็สามารถคอมเมนต์มาได้เลยนะคะ,data/sub/b4o5YC_wMXM_trimmed_segment_351.wav


In [21]:
import pandas as pd
from pydub import AudioSegment
import os

# Function to combine WAV files with padding and split into multiple files if necessary
def combine_wav_files_with_split(padding_duration_ms, max_duration_seconds, csv_df, output_dir):

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    combined = AudioSegment.silent(duration=0)  # Start with an empty audio segment
    padding = AudioSegment.silent(duration=padding_duration_ms)  # Create padding segment
    file_count = 1
    output_files = []
    new_csv_rows = []

    current_transcription = {
        'eng_reference': [],
        'thai_reference': [],
        'trimmed_segment_path': None
    }

    for index, row in csv_df.iterrows():
        audio = AudioSegment.from_wav(row['trimmed_segment_path'])

        # If adding the next audio exceeds 28 seconds
        if len(combined) + len(audio) + padding_duration_ms > max_duration_seconds * 1000:
            # Export the current combined audio to a new file
            output_filename = f'{output_dir}/combined_output_{file_count}.wav'
            combined.export(output_filename, format='wav')
            output_files.append(output_filename)
            file_count += 1

            # Update CSV with the current transcription information
            current_transcription['eng_reference'] = ' '.join(current_transcription['eng_reference'])
            current_transcription['thai_reference'] = ' '.join(current_transcription['thai_reference'])
            current_transcription['trimmed_segment_path'] = output_filename
            new_csv_rows.append(current_transcription)

            # Start a new combined segment and reset transcription
            combined = AudioSegment.silent(duration=0)
            current_transcription = {
                'eng_reference': [],
                'thai_reference': [],
                'trimmed_segment_path': None
            }

        combined += audio + padding
        current_transcription['eng_reference'].append(row['eng_reference'])
        current_transcription['thai_reference'].append(row['thai_reference'])

    # Export the last combined audio segment if it has any content
    if len(combined) > 0:
        output_filename = f'{output_dir}/combined_output_{file_count}.wav'
        combined.export(output_filename, format='wav')
        output_files.append(output_filename)

        current_transcription['eng_reference'] = ' '.join(current_transcription['eng_reference'])
        current_transcription['thai_reference'] = ' '.join(current_transcription['thai_reference'])
        current_transcription['trimmed_segment_path'] = output_filename
        new_csv_rows.append(current_transcription)

    new_csv_df = pd.DataFrame(new_csv_rows)
    return output_files, new_csv_df

# List of WAV files from the CSV
padding_duration_ms = 1000  # 1 second padding
max_duration_seconds = 28  # Maximum duration of 28 seconds per file

# Combine the WAV files with splitting if necessary
output_files, df_test = combine_wav_files_with_split(padding_duration_ms, max_duration_seconds, combined_df, "test_wav")

In [21]:
df_test = df_test.drop('thai_reference', axis = 1)

In [22]:
df_test

Unnamed: 0,file_name,start_time,end_time,eng_reference,trimmed_segment_path
0,8p9rJ-cFHQ0,0.000,1.800,I may not have thought this way,data/sub/8p9rJ-cFHQ0_trimmed_segment_1.wav
1,8p9rJ-cFHQ0,1.800,3.880,but as time went by,data/sub/8p9rJ-cFHQ0_trimmed_segment_2.wav
2,8p9rJ-cFHQ0,4.240,6.600,there is something that teaches us,data/sub/8p9rJ-cFHQ0_trimmed_segment_3.wav
3,8p9rJ-cFHQ0,6.600,8.600,that there are things we cannot control,data/sub/8p9rJ-cFHQ0_trimmed_segment_4.wav
4,8p9rJ-cFHQ0,8.600,9.920,and things we can control,data/sub/8p9rJ-cFHQ0_trimmed_segment_5.wav
...,...,...,...,...,...
2075,b4o5YC_wMXM,1019.264,1022.082,"Please press like, share and subscribe.",data/sub/b4o5YC_wMXM_trimmed_segment_348.wav
2076,b4o5YC_wMXM,1022.107,1025.358,"If you want me to take you somewhere,",data/sub/b4o5YC_wMXM_trimmed_segment_349.wav
2077,b4o5YC_wMXM,1025.383,1027.521,"to eat, or do any activities",data/sub/b4o5YC_wMXM_trimmed_segment_350.wav
2078,b4o5YC_wMXM,1027.546,1029.461,you can leave me a comment.,data/sub/b4o5YC_wMXM_trimmed_segment_351.wav


In [61]:
df_test.to_csv("combinedtest.csv")

In [8]:
df_test = pd.read_csv("combinedtest.csv", index_col= 0)

In [9]:
df_test.head()

Unnamed: 0,eng_reference,trimmed_segment_path
0,Hello everyone. I'm Anucha Kornsawad. You can ...,test_wav/combined_output_1.wav
1,I was working and studying since I was vocatio...,test_wav/combined_output_2.wav
2,"So, my father worried about me. At first I don...",test_wav/combined_output_3.wav
3,You told us that you didn't like it at first. ...,test_wav/combined_output_4.wav
4,It's not bad. It's not bad. How many hours did...,test_wav/combined_output_5.wav


# Audio Transcription with Whisper

In [23]:
audio_dataset_test = Dataset.from_dict({
        "audio": df_test['trimmed_segment_path'].tolist()
    }
)

# Casting audio column to Audio type
audio_dataset_test = audio_dataset_test.cast_column("audio", Audio())

# Adding transcriptions column
audio_dataset_test = audio_dataset_test.add_column("transcription", np.array(df_test['eng_reference']))

print(audio_dataset_test)

Dataset({
    features: ['audio', 'transcription'],
    num_rows: 2080
})


In [24]:
from transformers import WhisperProcessor, WhisperFeatureExtractor, WhisperTokenizer, WhisperForConditionalGeneration

tokenizer = WhisperTokenizer.from_pretrained("whisper-medium", language="Thai", task="transcribe")
feature_extractor = WhisperFeatureExtractor.from_pretrained("whisper-medium")
processor = WhisperProcessor.from_pretrained("whisper-medium", language="Thai", task="transcribe")                                                                
model = WhisperForConditionalGeneration.from_pretrained("whisper-medium").to("cuda")

forced_decoder_ids = processor.get_decoder_prompt_ids(language="thai", task="transcribe")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [25]:
def map_to_pred(batch):
    audio = batch["audio"]
    input_features = processor(audio["array"], sampling_rate=audio["sampling_rate"], return_tensors="pt").input_features

    with torch.no_grad():
        predicted_ids = model.generate(input_features.to("cuda"), forced_decoder_ids = forced_decoder_ids)[0]

    transcription = processor.decode(predicted_ids, skip_special_tokens = True)
    transcription = processor.tokenizer._normalize(transcription)
    batch["thai_transcription"] = transcription
    return batch

result = audio_dataset_test.map(map_to_pred)

Map: 100%|██████████| 2080/2080 [14:41<00:00,  2.36 examples/s]  


In [26]:
result['thai_transcription']

['เมอกอนไปอาจจะไมไดคดแบบนนะ',
 'แตพอระยะเวลามนเปลยนไปพวต',
 'คอมนมอะไรบางอยางทมนสอนเราวา',
 'มนมสงทเราควบคมไมได',
 'และครบคมได',
 'นสงทเราคกคมได',
 'วากคดวาปยจะทาเหมอนทดทสด',
 'ไมวาจะเปนในภาษณอะไรกตามแต',
 'แตในเรองทมนคบคมไมได',
 'กคดวา พกตองทาใจยอมรบกบมน',
 'และกลลวม',
 'กตองการเตนเยอะ',
 'บอกตวเองวา เรานงโคตรเจงหวะ',
 'เราผานมาไดถงขนาดนไดไง',
 'แลวกใหกาลงใจตวเองละ',
 'แลวเรากร เรากนมาตลอดวา',
 'ทผานมาสบกวาป',
 'เฮยแมงเราทาดทกอยางแลว',
 'เราทาดทสดแลว',
 'ไมวาจะในพฒความเปนแมของรป',
 'ในภาษณความเปนผลยา',
 'ในความเปนเพอนคคด',
 'ทาดทสดแลว',
 'เทาทผหญงคนหนงจะทาไดพบต',
 'และกนสดแลวอะ',
 'เรากตองบอกวา .',
 'ในความเปนพอเปนแม',
 'เพมจะยงคงอยตลอดไป',
 'มนจะไมหายไปไหน',
 'ฉะนนผมไมตองกาวน',
 'โอย เดยวมพอและแมอก',
 'ใชคะ',
 'อยกบคณไมไดหายไปไหนเลย',
 'กพงตววา',
 'ในความสมพนธระหวางพอกบแม',
 'มนอาจจะมเปลยนไปนะ',
 'ผหญงอะ ถามนมแตความถกกลว',
 'แลวมนจะเอาความสขทไหนไปเลยงลกกบพ',
 'เขาเรยกวาอะไรเหรอบา',
 'ประสบการ สอนเรา',
 'ทสดแลวมนไมมใครอยกบเราเทาตวเราเอง',
 'มนสวยเยอะ',
 'นเราขาวกขาวเลยแลวเนย',
 'มนไมไ

In [27]:
df_test['thai'] = result['thai_transcription']
df_test

Unnamed: 0,file_name,start_time,end_time,eng_reference,trimmed_segment_path,thai
0,8p9rJ-cFHQ0,0.000,1.800,I may not have thought this way,data/sub/8p9rJ-cFHQ0_trimmed_segment_1.wav,เมอกอนไปอาจจะไมไดคดแบบนนะ
1,8p9rJ-cFHQ0,1.800,3.880,but as time went by,data/sub/8p9rJ-cFHQ0_trimmed_segment_2.wav,แตพอระยะเวลามนเปลยนไปพวต
2,8p9rJ-cFHQ0,4.240,6.600,there is something that teaches us,data/sub/8p9rJ-cFHQ0_trimmed_segment_3.wav,คอมนมอะไรบางอยางทมนสอนเราวา
3,8p9rJ-cFHQ0,6.600,8.600,that there are things we cannot control,data/sub/8p9rJ-cFHQ0_trimmed_segment_4.wav,มนมสงทเราควบคมไมได
4,8p9rJ-cFHQ0,8.600,9.920,and things we can control,data/sub/8p9rJ-cFHQ0_trimmed_segment_5.wav,และครบคมได
...,...,...,...,...,...,...
2075,b4o5YC_wMXM,1019.264,1022.082,"Please press like, share and subscribe.",data/sub/b4o5YC_wMXM_trimmed_segment_348.wav,กดไลค กดชา แลวก subscribe ใหพมดวยนะคะ
2076,b4o5YC_wMXM,1022.107,1025.358,"If you want me to take you somewhere,",data/sub/b4o5YC_wMXM_trimmed_segment_349.wav,ใครอยากจะใหมวพาไปเทยว
2077,b4o5YC_wMXM,1025.383,1027.521,"to eat, or do any activities",data/sub/b4o5YC_wMXM_trimmed_segment_350.wav,อะ อะ
2078,b4o5YC_wMXM,1027.546,1029.461,you can leave me a comment.,data/sub/b4o5YC_wMXM_trimmed_segment_351.wav,กสาหรบคอมเมนตมาไดเลยนะคะ


In [28]:
df_test.to_csv("hi.csv") # send to jaslyn

## Direct Translation with Whisper

In [29]:
forced_decoder_ids = processor.get_decoder_prompt_ids(language="english", task="translate")

In [35]:
def map_to_pred_translate(batch):
    audio = batch["audio"]
    input_features = processor(audio["array"], sampling_rate=audio["sampling_rate"], return_tensors="pt").input_features

    with torch.no_grad():
        predicted_ids = model.generate(input_features.to("cuda"), forced_decoder_ids = forced_decoder_ids)[0]

    transcription = processor.decode(predicted_ids, skip_special_tokens = True)
    transcription = processor.tokenizer._normalize(transcription)

    batch["eng_translation"] = transcription
    return batch

result = audio_dataset_test.map(map_to_pred_translate)

Map: 100%|██████████| 2080/2080 [08:32<00:00,  4.06 examples/s]  


In [37]:
result['eng_translation']

['i did not think of it that way before',
 'but as time goes by woody .',
 'there are some things that teach us that',
 'there are things that we cannot control',
 'and control',
 'this is what we can control',
 'i thought i would do my best',
 'no matter what part of the story it is',
 'but it is something that cannot be controlled',
 'i think you have to accept it',
 'and then i am done',
 '0 god the end yeah yeah',
 'i told myself i am so cool',
 'how did we get this far',
 'look at',
 'and i always thought that',
 'in the past 10 years',
 'i did everything right',
 'we did our best',
 'whether it is in the family or in the motherhood',
 'in the story of being a wife',
 'in the sense of being a friend',
 'you did the best',
 'as much as a woman can do p wutty',
 'that is lower',
 'i have to say that .',
 'in being a mom and dad',
 'i will always be here for you',
 'it will not disappear',
 'so do not worry',
 'for the me parlor man',
 'time',
 'you are with me everywhere',
 'pink th

## CER

In [41]:
cer = load("cer")
cer_score = cer.compute(predictions=combined_df['hypothesis'], references=combined_df['reference'])
print("Character Error Rate:", cer_score)

Character Error Rate: 0.7598657562782085


In [42]:
cer_score_finetuned = cer.compute(predictions=combined_df['hypothesis'], references=transcriptions_finetuned)
print("Character Error Rate (Finetuned):", cer_score_finetuned)

Character Error Rate (Finetuned): 0.7792978261832028


## Meteor Score

In [45]:
import evaluate 

meteor = evaluate.load('meteor')

[nltk_data] Downloading package wordnet to /home/dhuser/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/dhuser/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/dhuser/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [46]:
def compute_meteor(row, hypothesis_column, reference_column):
    meteor_score = meteor.compute(predictions=[row[hypothesis_column],], references=[row[reference_column],])
    return meteor_score['meteor']

meteor_scores_whisper = combined_df.apply(lambda row : compute_meteor(row, 'hypothesis', 'reference'), axis = 1)
meteor_scores_whisper.describe()

count    597.000000
mean       0.193621
std        0.238468
min        0.000000
25%        0.000000
50%        0.102041
75%        0.328488
max        0.996000
dtype: float64

In [47]:
meteor_scores_whisper_finetuned = combined_df.apply(lambda row : compute_meteor(row, 'hypothesis_finetuned', 'reference'), axis = 1)
meteor_scores_whisper_finetuned.describe()

count    597.000000
mean       0.225605
std        0.228305
min        0.000000
25%        0.050000
50%        0.142857
75%        0.368056
max        0.996000
dtype: float64

## Bleu Scores

In [48]:
bleu = evaluate.load("bleu")

In [49]:
def compute_bleu(row, hypothesis_column, reference_column):
    # Extract the hypothesis and reference
    hypothesis = row[hypothesis_column]
    reference = row[reference_column]

    # Check for empty hypothesis or reference
    if not hypothesis or not reference:
        return 0.0

    # Compute BLEU score
    bleu_score = bleu.compute(predictions=[hypothesis], references=[[reference]])
    return bleu_score['bleu']

bleu_scores_whisper = combined_df.apply(lambda row : compute_bleu(row, 'hypothesis', 'reference'), axis = 1)
bleu_scores_whisper.describe()

count    597.000000
mean       0.022754
std        0.108430
min        0.000000
25%        0.000000
50%        0.000000
75%        0.000000
max        1.000000
dtype: float64

In [51]:
bleu_scores_whisper

0      0.000000
1      0.000000
2      0.000000
3      0.000000
4      0.000000
         ...   
592    0.365555
593    0.840896
594    0.000000
595    0.000000
596    0.000000
Length: 597, dtype: float64

In [52]:
combined_df

Unnamed: 0,file_name,start_time,end_time,reference,trimmed_segment_path,segment_duration,hypothesis,hypothesis_finetuned
0,2OF_N9xQOAc,0.000,3.118,Right now I m in Singapore,data/sub/2OF_N9xQOAc_trimmed_segment_1.wav,3.118,But I hate Singapore,Now Im in Singapore
1,2OF_N9xQOAc,3.118,4.909,Most people when they come to Singapore,data/sub/2OF_N9xQOAc_trimmed_segment_2.wav,1.791,See you in Singapore,Im going to Singapore with my friends
2,2OF_N9xQOAc,4.909,6.129,They come to eat dim sum,data/sub/2OF_N9xQOAc_trimmed_segment_3.wav,1.220,Lets eat som tam,Lets eat tipsum
3,2OF_N9xQOAc,6.129,7.498,Visit Merlion,data/sub/2OF_N9xQOAc_trimmed_segment_4.wav,1.369,Lets see Aons hand,Lets see Lions Hand
4,2OF_N9xQOAc,7.498,8.827,Visit Universal,data/sub/2OF_N9xQOAc_trimmed_segment_5.wav,1.329,,Its a universal
...,...,...,...,...,...,...,...,...
592,b4o5YC_wMXM,1019.264,1022.082,Please press like share and subscribe,data/sub/b4o5YC_wMXM_trimmed_segment_348.wav,2.818,Please like share and subscribe to my channel,Press like share and subscribe for me as well
593,b4o5YC_wMXM,1022.107,1025.358,If you want me to take you somewhere,data/sub/b4o5YC_wMXM_trimmed_segment_349.wav,3.251,If you want me to take you out,If you want me to go on a trip
594,b4o5YC_wMXM,1025.383,1027.521,to eat or do any activities,data/sub/b4o5YC_wMXM_trimmed_segment_350.wav,2.138,Where should I go next,Where can I eat and play
595,b4o5YC_wMXM,1027.546,1029.461,you can leave me a comment,data/sub/b4o5YC_wMXM_trimmed_segment_351.wav,1.915,Feel free to leave a comment,You can leave a comment


In [50]:
bleu_scores_whisper_finetuned = combined_df.apply(lambda row : compute_bleu(row, 'hypothesis_finetuned', 'reference'), axis = 1)
bleu_scores_whisper_finetuned.describe()

count    597.00000
mean       0.01289
std        0.08359
min        0.00000
25%        0.00000
50%        0.00000
75%        0.00000
max        1.00000
dtype: float64