In [1]:
import os
import re
import glob
import numpy as np
import matplotlib.pyplot as plt
import torch
import pandas as pd
from pydub import AudioSegment
from transformers import pipeline
import numpy as np
import librosa
from datasets import Dataset, Audio

from scipy.io import wavfile
from tqdm.notebook import tqdm

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

  from .autonotebook import tqdm as notebook_tqdm
2024-06-05 09:52:15.727095: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-06-05 09:52:15.759316: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Preprocessing Audio and Transcript Files

In [2]:
def list_files_in_directory(directory):
    file_list = []
    for filename in os.listdir(directory):
        # Only pick up files with .txt extensions (transcript)
        if filename.endswith(".txt"):
            file_list.append(filename.replace(".txt", ""))
    return file_list

def get_reference_df(directory, audio_txt_file):
    txt_file_path = os.path.join(directory, audio_txt_file + ".txt")
    columns = ["start_time", "end_time", "reference"]
    # Read the text file into a DataFrame
    df = pd.read_csv(txt_file_path, sep="\t", header=None, names=columns, quoting=3)

    # Add file name
    df.insert(0, 'file_name', pd.Series([audio_txt_file] * len(df)))

    # Remove quotation marks
    df['reference'] = df['reference'].apply(lambda x : x.replace('"',""))
    
    return df

def trim_wav_by_timestamps(directory, wav_file_name, reference_df):
    # Create the output directory if it doesn't exist
    output_dir = "data/sub/"
    os.makedirs(output_dir, exist_ok=True)
    wav_file = os.path.join(directory, wav_file_name + ".wav") # get into data file
    
    # Load the WAV file
    audio = AudioSegment.from_wav(wav_file)
    
    def trim_segments(row):
        start_ms = float(row['start_time']) * 1000  # Convert start time to milliseconds
        end_ms = float(row['end_time']) * 1000      # Convert end time to milliseconds
        trimmed_segment = audio[start_ms:end_ms]
    
        return trimmed_segment
    
    # Iterate over timestamps and trim the audio
    for i, row in reference_df.iterrows():
        trimmed_segment = trim_segments(row)
        output_file = os.path.join(output_dir, wav_file_name + "_" f"trimmed_segment_{i+1}.wav")
        trimmed_segment.export(output_file, format="wav")
        reference_df.at[i, 'trimmed_segment_path'] = output_file
    
    return reference_df

def filter_english_subs(reference_df):
    # Helper function that is applied across the rows to filter english text only
    
    def filter_english_only(text):
        # Define a regex pattern to match English words
        english_pattern = re.compile(r'\b[A-Za-z]+\b')
        # Find all English words in the text
        english_words = english_pattern.findall(text)
        # Join the English words into a single string
        english_text = ' '.join(english_words)
        return english_text

    reference_df['reference'] = reference_df['reference'].apply(filter_english_only)

    return reference_df

def get_combined_audio_table(directory, file_names):
    combined_df = pd.DataFrame()
    for file_name in file_names:
        # Reads the transcript dataframe which has the start_time, end_time of each transcript
        reference_df = get_reference_df(directory, file_name)

        # Retain only English translations in the transcript (reference) column
        reference_df = filter_english_subs(reference_df)

        # Trims all the .wav file according to the subtitles start_time and end_time
        reference_df = trim_wav_by_timestamps(directory, file_name, reference_df)
        
        # Append the processed DataFrame to the combined DataFrame
        combined_df = pd.concat([combined_df, reference_df], ignore_index=True)
    
    return combined_df

directory = os.path.join(os.getcwd(), "data/test/")
file_names = list_files_in_directory(directory)
df_test = get_combined_audio_table(directory, file_names)
df_test

Unnamed: 0,file_name,start_time,end_time,reference,trimmed_segment_path
0,2OF_N9xQOAc,0.000,3.118,Right now I m in Singapore,data/sub/2OF_N9xQOAc_trimmed_segment_1.wav
1,2OF_N9xQOAc,3.118,4.909,Most people when they come to Singapore,data/sub/2OF_N9xQOAc_trimmed_segment_2.wav
2,2OF_N9xQOAc,4.909,6.129,They come to eat dim sum,data/sub/2OF_N9xQOAc_trimmed_segment_3.wav
3,2OF_N9xQOAc,6.129,7.498,Visit Merlion,data/sub/2OF_N9xQOAc_trimmed_segment_4.wav
4,2OF_N9xQOAc,7.498,8.827,Visit Universal,data/sub/2OF_N9xQOAc_trimmed_segment_5.wav
...,...,...,...,...,...
592,b4o5YC_wMXM,1019.264,1022.082,Please press like share and subscribe,data/sub/b4o5YC_wMXM_trimmed_segment_348.wav
593,b4o5YC_wMXM,1022.107,1025.358,If you want me to take you somewhere,data/sub/b4o5YC_wMXM_trimmed_segment_349.wav
594,b4o5YC_wMXM,1025.383,1027.521,to eat or do any activities,data/sub/b4o5YC_wMXM_trimmed_segment_350.wav
595,b4o5YC_wMXM,1027.546,1029.461,you can leave me a comment,data/sub/b4o5YC_wMXM_trimmed_segment_351.wav


In [3]:
def read_and_append_csvs(directory):
    # Get list of all CSV files in the directory
    all_files = glob.glob(os.path.join(directory, "*.csv"))
    
    # List to hold all DataFrames
    df_list = []
    
    # Read each CSV file and append to the list
    for file in all_files:
        df = pd.read_csv(file, index_col= 0)
        df_list.append(df)
    
    # Concatenate all DataFrames
    combined_df = pd.concat(df_list, ignore_index=True)
    
    return combined_df

df_train = read_and_append_csvs('data')

In [4]:
df_train = df_train[~df_train['eng_reference'].isna()]

In [5]:
df_train = df_train[~df_train['thai_reference'].isna()]

In [7]:
df_train = df_train.drop('meteor_scores', axis = 1)

## Padding and Trimming to 28 seconds of Audio

In [8]:
segment_duration = df_train.apply(lambda x : x['end_time'] - x['start_time'], axis = 1)

print("Total length of audio-transcription pairs:", segment_duration.sum())

Total length of audio-transcription pairs: 12090.383999999995


In [9]:
import pandas as pd
from pydub import AudioSegment
import os

# Function to combine WAV files with padding and split into multiple files if necessary
def combine_wav_files_with_split(padding_duration_ms, max_duration_seconds, csv_df, output_dir):

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    combined = AudioSegment.silent(duration=0)  # Start with an empty audio segment
    padding = AudioSegment.silent(duration=padding_duration_ms)  # Create padding segment
    file_count = 1
    output_files = []
    new_csv_rows = []

    current_transcription = {
        'eng_reference': [],
        'thai_reference': [],
        'trimmed_segment_path': None
    }

    for index, row in csv_df.iterrows():
        audio = AudioSegment.from_wav(row['trimmed_segment_path'])

        # If adding the next audio exceeds 28 seconds
        if len(combined) + len(audio) + padding_duration_ms > max_duration_seconds * 1000:
            # Export the current combined audio to a new file
            output_filename = f'{output_dir}/combined_output_{file_count}.wav'
            combined.export(output_filename, format='wav')
            output_files.append(output_filename)
            file_count += 1

            # Update CSV with the current transcription information
            current_transcription['eng_reference'] = ' '.join(current_transcription['eng_reference'])
            current_transcription['thai_reference'] = ' '.join(current_transcription['thai_reference'])
            current_transcription['trimmed_segment_path'] = output_filename
            new_csv_rows.append(current_transcription)

            # Start a new combined segment and reset transcription
            combined = AudioSegment.silent(duration=0)
            current_transcription = {
                'eng_reference': [],
                'thai_reference': [],
                'trimmed_segment_path': None
            }

        combined += audio + padding
        current_transcription['eng_reference'].append(row['eng_reference'])
        current_transcription['thai_reference'].append(row['thai_reference'])

    # Export the last combined audio segment if it has any content
    if len(combined) > 0:
        output_filename = f'{output_dir}/combined_output_{file_count}.wav'
        combined.export(output_filename, format='wav')
        output_files.append(output_filename)

        current_transcription['eng_reference'] = ' '.join(current_transcription['eng_reference'])
        current_transcription['thai_reference'] = ' '.join(current_transcription['thai_reference'])
        current_transcription['trimmed_segment_path'] = output_filename
        new_csv_rows.append(current_transcription)

    new_csv_df = pd.DataFrame(new_csv_rows)
    return output_files, new_csv_df

# List of WAV files from the CSV
padding_duration_ms = 1000  # 1 second padding
max_duration_seconds = 28  # Maximum duration of 28 seconds per file

# Combine the WAV files with splitting if necessary
output_files, df_train_combined = combine_wav_files_with_split(padding_duration_ms, max_duration_seconds, df_train, "combined_wav")

In [10]:
df_train_combined # Trimmed to 684 rows!

Unnamed: 0,eng_reference,thai_reference,trimmed_segment_path
0,It's been 10 years. Some people can't let go. ...,เหตุการณ์มันผ่านมาแล้ว 10 ปี บางคนปล่อยไม่ได้ ...,combined_wav/combined_output_1.wav
1,"Uh, okay, now. Sometimes. We'll find out. The ...",อะ โอเค ทีนี้ บางครั้ง เราจะเจอแล้วว่า ความคิด...,combined_wav/combined_output_2.wav
2,We have an N. We have bones. We have muscles. ...,เรามีเอ็น เรามีกระดูก เรามีกล้ามเนื้อ เสียไปทุ...,combined_wav/combined_output_3.wav
3,Come into our lives again. Let's just let it g...,เข้ามาในชีวิตของเราอีก ลองปล่อยมันอัตโนมัติไปเ...,combined_wav/combined_output_4.wav
4,I know what's out there. I don't know how to s...,รู้แต่เรื่องข้างนอก ไม่รู้จะปิดยังไง เราล่ะ มั...,combined_wav/combined_output_5.wav
...,...,...,...
679,Leonardo Da Vinci painted it on purpose. I...,ลีโอนาร์โด ดาร์วินชีเขาตั้งใจวาด เก๋จะขยับกล้อ...,combined_wav/combined_output_680.wav
680,This is the famous painting. Another highl...,นี่ค่ะรูปที่ดังมากๆ และอีกไฮไลท์นู่น สุดห้องโถ...,combined_wav/combined_output_681.wav
681,Those are the real diamonds. Don't ask abo...,ที่เห็นนั่นคือเพชรจริงๆ นะคะ อย่าถามราคาว่าเท่...,combined_wav/combined_output_682.wav
682,"9,000 It's over 9,000 years. 9,000 This o...",มีอายุกว่า ปี ปีชิ้นนี้ ขอปิดคลิป ด้วยผู้หญิงท...,combined_wav/combined_output_683.wav


## Casting to Huggingface Datasets Object

In [9]:
audio_dataset_train = Dataset.from_dict({
        "audio": df_train_combined['trimmed_segment_path'].tolist()
    }
)

# Casting audio column to Audio type
audio_dataset_train = audio_dataset_train.cast_column("audio", Audio())

# Adding transcriptions column
audio_dataset_train = audio_dataset_train.add_column("transcription", np.array(df_train_combined['eng_reference']))

print(audio_dataset_train)

Dataset({
    features: ['audio', 'transcription'],
    num_rows: 684
})


In [10]:
audio_dataset_test = Dataset.from_dict({
        "audio": df_test['trimmed_segment_path'].tolist()
    }
)

# Casting audio column to Audio type
audio_dataset_test = audio_dataset_test.cast_column("audio", Audio())

# Adding transcriptions column
audio_dataset_test = audio_dataset_test.add_column("transcription", np.array(df_test['reference']))

print(audio_dataset_test)

Dataset({
    features: ['audio', 'transcription'],
    num_rows: 597
})


In [11]:
from transformers import WhisperProcessor, WhisperFeatureExtractor, WhisperTokenizer

tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-medium", language="Thai", task="translate")
feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-medium")
processor = WhisperProcessor.from_pretrained("openai/whisper-medium", language="Thai", task="translate")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [12]:
def prepare_dataset(batch):
    audio = batch["audio"]

    # compute log-Mel input features from input audio array 
    batch["input_features"] = feature_extractor(audio["array"], sampling_rate=audio["sampling_rate"], return_tensors='pt').input_features[0]

    # encode target text to label ids 
    batch["labels"] = tokenizer(batch["transcription"]).input_ids
    return batch

In [13]:
audio_dataset_train = audio_dataset_train.map(
    prepare_dataset
)

Map: 100%|██████████| 684/684 [01:52<00:00,  6.10 examples/s] 


In [14]:
audio_dataset_test = audio_dataset_test.map(
    prepare_dataset
)

Map: 100%|██████████| 597/597 [00:10<00:00, 57.99 examples/s] 


In [15]:
from transformers import WhisperForConditionalGeneration

model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-medium")



In [16]:
model.generation_config.language = "thai"
model.generation_config.task = "translate"
model.generation_config.forced_decoder_ids = None

In [17]:
import torch
from dataclasses import dataclass
from typing import Any, Dict, List, Union

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any
    decoder_start_token_id: int

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lengths and need different padding methods
        # first treat the audio inputs by simply returning torch tensors
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        # return in pytorch tensors
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # get the tokenized label sequences
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        # pad the labels to max length and return in pytorch tensors
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # if bos token is appended in previous tokenization step,
        # cut bos token here as it's append later anyways
        if (labels[:, 0] == self.decoder_start_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels

        return batch


In [18]:
data_collator = DataCollatorSpeechSeq2SeqWithPadding(
    processor=processor,
    decoder_start_token_id=model.config.decoder_start_token_id,
)

In [19]:
import evaluate

metric = evaluate.load("wer")

def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    # replace -100 with the pad_token_id
    label_ids[label_ids == -100] = tokenizer.pad_token_id

    # we do not want to group tokens when computing the metrics
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    wer = 100 * metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}


In [20]:
torch.cuda.empty_cache()

In [21]:
from transformers import Seq2SeqTrainingArguments
from transformers import Seq2SeqTrainer

training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-medium-thai",  # change to a repo name of your choice
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,  # increase by 2x for every 2x decrease in batch size
    learning_rate=1e-5,
    warmup_steps=500,
    max_steps=1000,
    gradient_checkpointing=True,
    fp16=True,
    evaluation_strategy="steps",
    per_device_eval_batch_size=4,
    predict_with_generate=True,
    generation_max_length=150,
    save_steps=200,
    eval_steps=100,
    logging_steps=100,
    report_to=["tensorboard"],
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    push_to_hub=False,
)

trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=audio_dataset_train,
    eval_dataset=audio_dataset_test,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor.feature_extractor,
)

max_steps is given, it will override any value given in num_train_epochs


In [22]:
trainer.train()

  0%|          | 0/1000 [00:00<?, ?it/s]`use_cache = True` is incompatible with gradient checkpointing. Setting `use_cache = False`...
  0%|          | 3/1000 [00:14<1:19:13,  4.77s/it]

OutOfMemoryError: CUDA out of memory. Tried to allocate 16.00 MiB (GPU 0; 15.70 GiB total capacity; 15.06 GiB already allocated; 8.00 MiB free; 15.33 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
eval_logs = trainer.evaluate()
print(eval_logs)

100%|██████████| 75/75 [02:09<00:00,  1.72s/it]

{'eval_loss': 2.4484760761260986, 'eval_wer': 88.11989100817439, 'eval_runtime': 131.0741, 'eval_samples_per_second': 4.555, 'eval_steps_per_second': 0.572, 'epoch': 33.11258278145695}





In [None]:
from transformers import WhisperForConditionalGeneration

model = WhisperForConditionalGeneration.from_pretrained("whisper-small-thai/checkpoint-1000").to("cuda")
#processor = WhisperProcessor.from_pretrained("whisper-medium-thai/checkpoint-5000")

In [None]:
audio_dataset_test

Dataset({
    features: ['audio', 'transcription', 'input_features', 'labels'],
    num_rows: 597
})

In [None]:
def map_to_pred(batch):

    audio = batch["audio"]

    input_features = processor(audio["array"], sampling_rate=audio["sampling_rate"], return_tensors="pt").input_features

    batch["reference"] = processor.tokenizer._normalize(batch['transcription'])

    with torch.no_grad():

        predicted_ids = model.generate(input_features.to("cuda"))[0]

    transcription = processor.decode(predicted_ids)

    batch["prediction"] = processor.tokenizer._normalize(transcription)

    return batch

In [None]:
result = audio_dataset_test.map(map_to_pred)

Map: 100%|██████████| 597/597 [01:44<00:00,  5.69 examples/s]


In [None]:
from evaluate import load

wer = load("wer")

print(100 * wer.compute(references=result["reference"], predictions=result["prediction"]))

113.06244886828469


In [None]:
pd.DataFrame(result['reference'], result['prediction'])

Unnamed: 0,0
it is now 4 pm,right now i m in singapore
singaporean love,most people when they come to singapore
let us have a 3 on 3,they come to eat dim sum
let us see what is on the menu,visit merlion
i am in verso,visit universal
...,...
please like share and subscribe for me,please press like share and subscribe
who wants me to go on a trip,if you want me to take you somewhere
how do you play next,to eat or do any activities
you can comment,you can leave me a comment
