In [1]:
#! pip install git+https://github.com/openai/whisper.git

In [38]:
#!pip install numpy torch pandas urllib3 torchaudio scipy tqdm matplotlib transformers deepcut jiwer pydub evaluate

Collecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl.metadata (9.3 kB)
Collecting datasets>=2.0.0 (from evaluate)
  Downloading datasets-2.19.1-py3-none-any.whl.metadata (19 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from evaluate)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting pyarrow>=12.0.0 (from datasets>=2.0.0->evaluate)
  Downloading pyarrow-16.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.0 kB)
Collecting pyarrow-hotfix (from datasets>=2.0.0->evaluate)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl.metadata (3.6 kB)
Collecting aiohttp (from datasets>=2.0.0->evaluate)
  Downloading aiohttp-3.9.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.5 kB)
Collecting aiosignal>=1.1.2 (from aiohtt

In [70]:
# !pip install tf-keras

Collecting tf-keras
  Downloading tf_keras-2.16.0-py3-none-any.whl.metadata (1.6 kB)
Downloading tf_keras-2.16.0-py3-none-any.whl (1.7 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m0:01[0m:01[0m
[?25hInstalling collected packages: tf-keras
Successfully installed tf-keras-2.16.0


In [1]:
import io
import os
import re
import numpy as np
import matplotlib.pyplot as plt

try:
    import tensorflow  # required in Colab to avoid protobuf compatibility issues
except ImportError:
    pass

import torch
import deepcut
import pandas as pd
import urllib
import tarfile
import whisper
import torchaudio
import wave
from pydub import AudioSegment
from transformers import pipeline

from scipy.io import wavfile
from tqdm.notebook import tqdm

from jiwer import cer

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"


2024-05-15 10:52:34.122066: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-05-15 10:52:34.155424: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  from .autonotebook import tqdm as notebook_tqdm


## Preprocessing Audio and Subtitles Files

In [6]:
def list_files_in_directory(directory):
    file_list = []
    for filename in os.listdir(directory):
        # Only pick up files with .txt extensions (transcript)
        if filename.endswith(".txt"):
            file_list.append(filename.replace(".txt", ""))
    return file_list

def get_reference_df(audio_txt_file):
    columns = ["start_time", "end_time", "reference"]
    txt_file_path = os.path.join("data", audio_txt_file + ".txt")
    # Read the text file into a DataFrame
    df = pd.read_csv(txt_file_path, sep="\t", header=None, names=columns, quoting=3)

    # Add file name
    df.insert(0, 'file_name', pd.Series([audio_txt_file] * len(df)))

    # Remove quotation marks
    df['reference'] = df['reference'].apply(lambda x : x.replace('"',""))
    
    return df

def trim_wav_by_timestamps(wav_file_name, reference_df):
    # Create the output directory if it doesn't exist
    output_dir = "data/sub/"
    os.makedirs(output_dir, exist_ok=True)
    wav_file = os.path.join("data", wav_file_name + ".wav") # get into data file
    
    # Load the WAV file
    audio = AudioSegment.from_wav(wav_file)
    
    def trim_segments(row):
        start_ms = float(row['start_time']) * 1000  # Convert start time to milliseconds
        end_ms = float(row['end_time']) * 1000      # Convert end time to milliseconds
        trimmed_segment = audio[start_ms:end_ms]
    
        return trimmed_segment
    
    # Iterate over timestamps and trim the audio
    for i, row in reference_df.iterrows():
        trimmed_segment = trim_segments(row)
        output_file = os.path.join(output_dir, wav_file_name + "_" f"trimmed_segment_{i+1}.wav")
        trimmed_segment.export(output_file, format="wav")
        reference_df.at[i, 'trimmed_segment_path'] = output_file
    
    return reference_df

def filter_english_subs(reference_df):
    # Helper function that is applied across the rows to filter english text only
    def filter_english_only(text):
        # Define a regex pattern to match English words
        english_pattern = re.compile(r'\b[A-Za-z]+\b')
        # Find all English words in the text
        english_words = english_pattern.findall(text)
        # Join the English words into a single string
        english_text = ' '.join(english_words)
        return english_text

    reference_df['reference'] = reference_df['reference'].apply(filter_english_only)

    return reference_df

def get_combined_audio_table(file_names):
    combined_df = pd.DataFrame()
    for file_name in file_names:
        # Reads the transcript dataframe which has the start_time, end_time of each transcript
        reference_df = get_reference_df(file_name)

        # Retain only English translations in the transcript (reference) column
        reference_df = filter_english_subs(reference_df)

        # Trims all the .wav file according to the subtitles start_time and end_time
        reference_df = trim_wav_by_timestamps(file_name, reference_df)
        
        # Append the processed DataFrame to the combined DataFrame
        combined_df = pd.concat([combined_df, reference_df], ignore_index=True)
    
    return combined_df

In [9]:
directory = os.path.join(os.getcwd(), "data")

file_names = list_files_in_directory(directory)
print(file_names)
combined_df = get_combined_audio_table(file_names)
combined_df

['ScnwIYdmqYw', 'De95Osq7p1c', 'GWXwTJM68hk']


Unnamed: 0,file_name,start_time,end_time,reference,trimmed_segment_path
0,ScnwIYdmqYw,0.708,4.350,Hello this is Dr Supawat from Eternity Clinic,data/sub/ScnwIYdmqYw_trimmed_segment_1.wav
1,ScnwIYdmqYw,4.350,9.554,today we are going to give you some knowledge ...,data/sub/ScnwIYdmqYw_trimmed_segment_2.wav
2,ScnwIYdmqYw,9.554,11.554,Into the genitals,data/sub/ScnwIYdmqYw_trimmed_segment_3.wav
3,ScnwIYdmqYw,21.000,25.426,Many patients say that the injection of foreig...,data/sub/ScnwIYdmqYw_trimmed_segment_4.wav
4,ScnwIYdmqYw,25.426,26.393,Into the genitals,data/sub/ScnwIYdmqYw_trimmed_segment_5.wav
...,...,...,...,...,...
579,GWXwTJM68hk,784.274,786.203,Because it s a special order from me,data/sub/GWXwTJM68hk_trimmed_segment_351.wav
580,GWXwTJM68hk,786.235,788.016,Take a closer look,data/sub/GWXwTJM68hk_trimmed_segment_352.wav
581,GWXwTJM68hk,788.329,790.116,Please leave an answer within,data/sub/GWXwTJM68hk_trimmed_segment_353.wav
582,GWXwTJM68hk,790.141,792.530,Wednesday April,data/sub/GWXwTJM68hk_trimmed_segment_354.wav


In [56]:
combined_df['segment_duration'] = combined_df.apply(lambda x : x['end_time'] - x['start_time'], axis = 1)

In [58]:
combined_df['segment_duration'].describe() # All audio segments are under 10s

count    584.000000
mean       2.102329
std        1.055981
min        0.500000
25%        1.383000
50%        1.858500
75%        2.475250
max        7.442000
Name: segment_duration, dtype: float64

## Direct Translation with Whisper

In [25]:
import whisper

model = whisper.load_model("large-v2")

In [26]:
language = "Thai"
options = dict(language=language, beam_size=5, best_of=5)
translate_options = dict(task="translate", **options)

In [27]:
def map_transcription(row):
    segment_wavfile = row['trimmed_segment_path']
    transcription = model.transcribe(segment_wavfile, **translate_options)['text']
    return transcription

In [28]:
transcriptions = combined_df.apply(map_transcription, axis=1)

In [29]:
combined_df['hypothesis'] = transcriptions

In [30]:
combined_df

Unnamed: 0,file_name,start_time,end_time,reference,trimmed_segment_path,hypothesis,segment_duration
0,ScnwIYdmqYw,0.708,4.350,Hello this is Dr Supawat from Eternity Clinic,data/sub/ScnwIYdmqYw_trimmed_segment_1.wav,"Hello, I'm Dr. Superwat from Eternity Clinic.",3.642
1,ScnwIYdmqYw,4.350,9.554,today we are going to give you some knowledge ...,data/sub/ScnwIYdmqYw_trimmed_segment_2.wav,"Today, I'm going to tell you about Synthetic ...",5.204
2,ScnwIYdmqYw,9.554,11.554,Into the genitals,data/sub/ScnwIYdmqYw_trimmed_segment_3.wav,menopause.,2.000
3,ScnwIYdmqYw,21.000,25.426,Many patients say that the injection of foreig...,data/sub/ScnwIYdmqYw_trimmed_segment_4.wav,many patients have also diagnosed that it's i...,4.426
4,ScnwIYdmqYw,25.426,26.393,Into the genitals,data/sub/ScnwIYdmqYw_trimmed_segment_5.wav,,0.967
...,...,...,...,...,...,...,...
579,GWXwTJM68hk,784.274,786.203,Because it s a special order from me,data/sub/GWXwTJM68hk_trimmed_segment_351.wav,Especially when it comes to special events.,1.929
580,GWXwTJM68hk,786.235,788.016,Take a closer look,data/sub/GWXwTJM68hk_trimmed_segment_352.wav,Please take a good look at it.,1.781
581,GWXwTJM68hk,788.329,790.116,Please leave an answer within,data/sub/GWXwTJM68hk_trimmed_segment_353.wav,ENG SUB by JayBL,1.787
582,GWXwTJM68hk,790.141,792.530,Wednesday April,data/sub/GWXwTJM68hk_trimmed_segment_354.wav,ENG SUBBED BY GIRLS' GENERATION ENG SUB,2.389


In [31]:
combined_df.tail(50)

Unnamed: 0,file_name,start_time,end_time,reference,trimmed_segment_path,hypothesis,segment_duration
534,GWXwTJM68hk,699.707,701.432,There is no almond flakes left,data/sub/GWXwTJM68hk_trimmed_segment_306.wav,I don't feel tired at all.,1.725
535,GWXwTJM68hk,701.448,703.174,Well it is good,data/sub/GWXwTJM68hk_trimmed_segment_307.wav,It's like...,1.726
536,GWXwTJM68hk,703.198,705.174,so milky it s so fine Yes,data/sub/GWXwTJM68hk_trimmed_segment_308.wav,It's like milk.,1.976
537,GWXwTJM68hk,705.206,706.564,It s fine almond milk,data/sub/GWXwTJM68hk_trimmed_segment_309.wav,Thank you for watching.,1.358
538,GWXwTJM68hk,706.589,707.799,So awesome So hot,data/sub/GWXwTJM68hk_trimmed_segment_310.wav,Very hot!,1.21
539,GWXwTJM68hk,707.831,709.245,How many points,data/sub/GWXwTJM68hk_trimmed_segment_311.wav,,1.414
540,GWXwTJM68hk,710.073,711.385,With point two as well,data/sub/GWXwTJM68hk_trimmed_segment_312.wav,1..2..3..,1.312
541,GWXwTJM68hk,712.057,713.182,Where s the rest of the point,data/sub/GWXwTJM68hk_trimmed_segment_313.wav,See you next time.,1.125
542,GWXwTJM68hk,713.206,714.689,It s too hot If it s a bit warm,data/sub/GWXwTJM68hk_trimmed_segment_314.wav,It's too hot.,1.483
543,GWXwTJM68hk,714.714,716.648,I can drink it up in a minute Oh I see,data/sub/GWXwTJM68hk_trimmed_segment_315.wav,This the result of what I should do.,1.934


In [33]:
print("Total audio time (s):", combined_df['segment_duration'].sum())

Total audio time (s): 1227.7599999999993


In [35]:
MODEL_NAME = "biodatlab/whisper-th-medium-combined"  # see alternative model names below
lang = "th"

device = 0 if torch.cuda.is_available() else "cpu"

pipe = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,
    chunk_length_s=30,
    device=device,
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [36]:
def map_transcription_finetuned_asr(row, pipe):
    segment_wavfile = row['trimmed_segment_path']
    transcription = pipe(segment_wavfile, generate_kwargs={"language":"<|th|>", "task":"translate"}, batch_size=16)["text"]
    return transcription

In [38]:
transcriptions_finetuned = combined_df.apply(lambda row: map_transcription_finetuned_asr(row, pipe), axis=1)

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [39]:
transcriptions_finetuned

0         Hello, I'm Dr. Suprawath from Eternity Clinic.
1       Today, we will give you some information abou...
2                                         into the type.
3       Many people have been taken to the treatment ...
4                               in the type of tomatoes,
                             ...                        
579                       especially for special events.
580                                     Watch carefully.
581                           I'm going to the bathroom.
582                                     I'm going to the
583                                   Add 1 Tbsp. of oil
Length: 584, dtype: object

In [41]:
combined_df['hypothesis_finetuned'] = transcriptions_finetuned
combined_df

Unnamed: 0,file_name,start_time,end_time,reference,trimmed_segment_path,hypothesis,segment_duration,hypothesis_finetuned
0,ScnwIYdmqYw,0.708,4.350,Hello this is Dr Supawat from Eternity Clinic,data/sub/ScnwIYdmqYw_trimmed_segment_1.wav,"Hello, I'm Dr. Superwat from Eternity Clinic.",3.642,"Hello, I'm Dr. Suprawath from Eternity Clinic."
1,ScnwIYdmqYw,4.350,9.554,today we are going to give you some knowledge ...,data/sub/ScnwIYdmqYw_trimmed_segment_2.wav,"Today, I'm going to tell you about Synthetic ...",5.204,"Today, we will give you some information abou..."
2,ScnwIYdmqYw,9.554,11.554,Into the genitals,data/sub/ScnwIYdmqYw_trimmed_segment_3.wav,menopause.,2.000,into the type.
3,ScnwIYdmqYw,21.000,25.426,Many patients say that the injection of foreig...,data/sub/ScnwIYdmqYw_trimmed_segment_4.wav,many patients have also diagnosed that it's i...,4.426,Many people have been taken to the treatment ...
4,ScnwIYdmqYw,25.426,26.393,Into the genitals,data/sub/ScnwIYdmqYw_trimmed_segment_5.wav,,0.967,"in the type of tomatoes,"
...,...,...,...,...,...,...,...,...
579,GWXwTJM68hk,784.274,786.203,Because it s a special order from me,data/sub/GWXwTJM68hk_trimmed_segment_351.wav,Especially when it comes to special events.,1.929,especially for special events.
580,GWXwTJM68hk,786.235,788.016,Take a closer look,data/sub/GWXwTJM68hk_trimmed_segment_352.wav,Please take a good look at it.,1.781,Watch carefully.
581,GWXwTJM68hk,788.329,790.116,Please leave an answer within,data/sub/GWXwTJM68hk_trimmed_segment_353.wav,ENG SUB by JayBL,1.787,I'm going to the bathroom.
582,GWXwTJM68hk,790.141,792.530,Wednesday April,data/sub/GWXwTJM68hk_trimmed_segment_354.wav,ENG SUBBED BY GIRLS' GENERATION ENG SUB,2.389,I'm going to the


## WER

In [54]:
from evaluate import load
wer = load("wer")
wer_score = wer.compute(predictions=combined_df['hypothesis'], references=combined_df['reference'])
print("Word Error Rate:", wer_score)

Word Error Rate: 0.9378796245168415


In [55]:
wer_score_finetuned = wer.compute(predictions=combined_df['hypothesis'], references=transcriptions_finetuned)
print("Word Error Rate (Finetuned):", wer_score_finetuned)

Word Error Rate (Finetuned): 0.9350684196404615


## Meteor Score

In [86]:
import string

def strip_punctuation(text):
    # Create a translation table that maps each punctuation character to None
    translator = str.maketrans('', '', string.punctuation)
    # Use the translation table to remove all punctuation from the text
    return text.translate(translator)

In [96]:
combined_df['hypothesis'] = combined_df['hypothesis'].apply(strip_punctuation).apply(lambda x : x.strip())
combined_df['hypothesis_finetuned'] = combined_df['hypothesis_finetuned'].apply(strip_punctuation).apply(lambda x : x.strip())


In [48]:
import evaluate 

meteor = evaluate.load('meteor')

[nltk_data] Downloading package wordnet to /home/dhuser/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/dhuser/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/dhuser/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [102]:
def compute_meteor(row, hypothesis_column, reference_column):
    meteor_score = meteor.compute(predictions=[row[hypothesis_column],], references=[row[reference_column],])
    return meteor_score['meteor']

meteor_scores_whisper = combined_df.apply(lambda row : compute_meteor(row, 'hypothesis', 'reference'), axis = 1)
meteor_scores_whisper.describe()

count    584.000000
mean       0.231022
std        0.267411
min        0.000000
25%        0.000000
50%        0.125000
75%        0.405705
max        0.997685
dtype: float64

In [104]:
meteor_scores_whisper_finetuned = combined_df.apply(lambda row : compute_meteor(row, 'hypothesis_finetuned', 'reference'), axis = 1)
meteor_scores_whisper_finetuned.describe()

count    584.000000
mean       0.248323
std        0.240214
min        0.000000
25%        0.064935
50%        0.172414
75%        0.385483
max        0.997685
dtype: float64

## Bleu Scores

In [105]:
bleu = evaluate.load("bleu")

In [108]:
def compute_bleu(row, hypothesis_column, reference_column):
    # Extract the hypothesis and reference
    hypothesis = row[hypothesis_column]
    reference = row[reference_column]

    # Check for empty hypothesis or reference
    if not hypothesis or not reference:
        return 0.0

    # Compute BLEU score
    bleu_score = bleu.compute(predictions=[hypothesis], references=[[reference]])
    return bleu_score['bleu']

bleu_scores_whisper = combined_df.apply(lambda row : compute_bleu(row, 'hypothesis', 'reference'), axis = 1)
bleu_scores_whisper.describe()

count    584.000000
mean       0.035520
std        0.142855
min        0.000000
25%        0.000000
50%        0.000000
75%        0.000000
max        1.000000
dtype: float64

In [109]:
bleu_scores_whisper_finetuned = combined_df.apply(lambda row : compute_bleu(row, 'hypothesis_finetuned', 'reference'), axis = 1)
bleu_scores_whisper_finetuned.describe()

count    584.000000
mean       0.033423
std        0.137551
min        0.000000
25%        0.000000
50%        0.000000
75%        0.000000
max        1.000000
dtype: float64

# Extracting Audio Embeddings from Whisper

In [112]:
from whisper.audio import load_audio, pad_or_trim, log_mel_spectrogram

In [152]:
mel = log_mel_spectrogram("data/sub/ScnwIYdmqYw_trimmed_segment_1.wav", 80)
mel2 = log_mel_spectrogram("data/sub/ScnwIYdmqYw_trimmed_segment_2.wav", 80)

padded_mel = pad_or_trim(mel, 3000).to("cuda")
padded_mel2 = pad_or_trim(mel2, 3000).to("cuda")
batch_tensor = torch.stack([padded_mel2], dim=0)

In [155]:
model.transcribe("data/sub/ScnwIYdmqYw_trimmed_segment_2.wav", **translate_options)

OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 MiB. GPU 