In [1]:
# ! pip install git+https://github.com/openai/whisper.git
#! pip install numpy torch pandas urllib3 torchaudio scipy tqdm matplotlib transformers deepcut jiwer pydub evaluate
#! pip install tf-keras

In [1]:
import os
import re
import numpy as np
import matplotlib.pyplot as plt
import torch
import pandas as pd
from pydub import AudioSegment
from transformers import pipeline
import string
from datasets import Dataset, Audio

from scipy.io import wavfile
from tqdm.notebook import tqdm

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

  from .autonotebook import tqdm as notebook_tqdm


## Preprocessing Audio and Subtitle Files

In [16]:
# Retrieve .txt files
def list_files_in_directory(directory):
    """Return the base names of the ``.txt`` transcript files in ``directory``.

    Args:
        directory: Path of the folder to scan (not recursive).

    Returns:
        list[str]: One entry per name ending in ".txt", with only that
        trailing extension removed, in the order ``os.listdir`` yields them.
    """
    suffix = ".txt"
    # Slice off only the trailing extension. str.replace would also delete a
    # ".txt" that appears in the middle of a name, producing a stem that no
    # longer matches the file on disk.
    return [
        filename[: -len(suffix)]
        for filename in os.listdir(directory)
        if filename.endswith(suffix)
    ]

# Create the dataframe
def get_reference_df(directory, audio_txt_file):
    """Load one tab-separated transcript file into a DataFrame.

    The file ``<directory>/<audio_txt_file>.txt`` is read without a header
    row; its three tab-separated fields become ``start_time``, ``end_time``
    and ``reference``.

    Args:
        directory: Folder that contains the transcript file.
        audio_txt_file: File name without the ".txt" extension.

    Returns:
        pandas.DataFrame: Columns ``file_name``, ``start_time``, ``end_time``
        and ``reference`` (double-quote characters removed).
    """
    txt_file_path = os.path.join(directory, audio_txt_file + ".txt")
    columns = ["start_time", "end_time", "reference"]
    df = pd.read_csv(
        txt_file_path,
        sep="\t",
        header=None,
        names=columns,
        quoting=3,  # csv.QUOTE_NONE: quote characters are ordinary text
        dtype={"reference": str},
        # Subtitle lines such as "None", "NA" or "null" are real text. With
        # the default NA handling pandas turns them into NaN, which then
        # breaks every string operation on the column. Only empty time
        # fields are treated as missing.
        keep_default_na=False,
        na_values={"start_time": [""], "end_time": [""]},
    )

    # Tag every row with the file it came from.
    df.insert(0, 'file_name', audio_txt_file)

    # Remove quotation marks; fillna guards rows that lack a text field.
    df['reference'] = df['reference'].fillna("").str.replace('"', "", regex=False)

    return df

# Trim audio files
def trim_wav_by_timestamps(directory, wav_file_name, reference_df, output_dir="data/sub/"):
    """Cut one WAV file into per-subtitle clips and record each clip's path.

    For every row of ``reference_df`` the span ``start_time``..``end_time``
    (seconds) is sliced out of ``<directory>/<wav_file_name>.wav`` and
    exported as ``<output_dir>/<wav_file_name>_trimmed_segment_<n>.wav``,
    where ``n`` counts rows from 1 in row order. With the default integer
    index this equals the previous "index label + 1" numbering.

    Args:
        directory: Folder that contains the source WAV file.
        wav_file_name: Source file name without the ".wav" extension.
        reference_df: DataFrame with ``start_time`` and ``end_time`` columns.
        output_dir: Folder the clips are written to; created if missing.
            Defaults to the previously hard-coded "data/sub/".

    Returns:
        pandas.DataFrame: The same ``reference_df`` object, modified in place
        with a ``trimmed_segment_path`` column.
    """
    os.makedirs(output_dir, exist_ok=True)
    wav_file = os.path.join(directory, wav_file_name + ".wav")

    # Load the source audio once; every clip is a slice of it.
    audio = AudioSegment.from_wav(wav_file)

    segment_paths = []
    time_spans = zip(reference_df['start_time'], reference_df['end_time'])
    for segment_number, (start_s, end_s) in enumerate(time_spans, start=1):
        # AudioSegment slices are expressed in milliseconds.
        start_ms = float(start_s) * 1000
        end_ms = float(end_s) * 1000
        output_file = os.path.join(
            output_dir, f"{wav_file_name}_trimmed_segment_{segment_number}.wav"
        )
        audio[start_ms:end_ms].export(output_file, format="wav")
        segment_paths.append(output_file)

    # One column assignment instead of growing the column cell by cell; the
    # column also exists (empty) when there are no rows.
    reference_df['trimmed_segment_path'] = segment_paths

    return reference_df

# Helper function to remove punctuations from original subtitles
def strip_punctuation(text):
    """Return ``text`` with every character of ``string.punctuation`` removed.

    Only the ASCII punctuation listed in ``string.punctuation`` is dropped;
    all other characters are returned unchanged and in order.
    """
    # A code point mapped to None is deleted by str.translate.
    deletions = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(deletions)

# Filter english text and append it to dataframe
def filter_subs_by_lang(reference_df):
    """Split each subtitle line into an English part and a non-English part.

    Two columns are added to ``reference_df`` (which is modified in place):

    * ``eng_reference``: the runs of ASCII letters, digits, whitespace and
      ``. , ! ? '`` found in ``reference``, joined with single spaces and
      with leading/trailing whitespace removed. A line without any such
      characters other than whitespace yields ``''``.
    * ``thai_reference``: the whitespace-separated tokens of ``reference``
      (after ``strip_punctuation``) that are not purely ASCII-letter words.
      Digit-only tokens such as "22" therefore end up here as well.

    Args:
        reference_df: DataFrame with a string ``reference`` column.

    Returns:
        pandas.DataFrame: The same ``reference_df`` object with the two new
        columns.
    """
    # Compiled once per call instead of once per row.
    english_span_pattern = re.compile(r'[A-Za-z0-9\s.,!?\']+')
    english_word_pattern = re.compile(r'\b[A-Za-z]+\b')

    def filter_english_only(text):
        english_spans = english_span_pattern.findall(text)
        # Each span carries its own surrounding whitespace, so a plain join
        # yields runs of spaces, and " " for a Thai-only line (which an
        # `!= ''` check would not catch). Collapse all whitespace instead.
        return ' '.join(' '.join(english_spans).split())

    def filter_thai_only(text):
        text = strip_punctuation(text)
        # A set gives constant-time membership tests for every token.
        english_words = set(english_word_pattern.findall(text))
        return ' '.join(word for word in text.split() if word not in english_words)

    reference_df['eng_reference'] = reference_df['reference'].apply(filter_english_only)
    reference_df['thai_reference'] = reference_df['reference'].apply(filter_thai_only)

    return reference_df


def get_combined_audio_table(directory, file_names):
    """Build one segment-level table covering every transcript/audio pair.

    For each name the transcript is loaded, split into English and Thai
    text, the raw ``reference`` column is dropped, and the matching WAV file
    is cut into per-subtitle clips (which writes files to disk). Each file
    name is printed as it is processed.

    Args:
        directory: Folder that holds the ``.txt`` and ``.wav`` files.
        file_names: Base names (no extension) of the files to process.

    Returns:
        pandas.DataFrame: All per-file tables stacked with a fresh integer
        index; an empty DataFrame when ``file_names`` is empty.
    """
    per_file_frames = []
    for file_name in file_names:
        print(file_name)

        # Transcript rows carrying start_time / end_time / reference.
        reference_df = get_reference_df(directory, file_name)

        # Split subtitles by language, then discard the unsplit text.
        reference_df = filter_subs_by_lang(reference_df)
        reference_df = reference_df.drop('reference', axis=1)

        # Cut the WAV file at each subtitle's start_time / end_time.
        reference_df = trim_wav_by_timestamps(directory, file_name, reference_df)

        per_file_frames.append(reference_df)

    # Concatenate once at the end: concatenating inside the loop re-copies
    # all accumulated rows on every iteration. pd.concat rejects an empty
    # list, hence the guard.
    if not per_file_frames:
        return pd.DataFrame()
    return pd.concat(per_file_frames, ignore_index=True)

# Transcripts and their matching .wav files are read from <cwd>/data/extra_train.
directory = os.path.join(os.getcwd(), "data/extra_train")
# Base names (no extension) of every .txt transcript found in that folder.
file_names = list_files_in_directory(directory)
# Build the segment-level table; this call also writes the trimmed .wav clips to disk.
combined_df = get_combined_audio_table(directory, file_names)
# Last expression of the cell: rendered as the cell output.
combined_df

GqSDS-BEmaw
CUFXxlFAH6M
Syu4jwafwFE
FMY8Kw91d9Y
dA4RQnTbOvg
6VIrFZy0v44
7Lz3koAwBco
ViEf5ueOb30
pkc5rmNo0TI
UzmWCFC9BKA
BXF7_X6aQgo
nK3jBdEeUIg
rLxCA6L4qSQ
MuRUW9aphlM
yp2Fl3tZVKY
mm5RfeABhjI
lT5-smTV2iQ
GR3oR5sFq-M
_g8nYGMmp7c
AiYQdtEfCP8
Wa5msc6siVQ
ZGCfygLwYkc
hbVX25jV_Rk
LKWSsxWkdck
5sKqpNlJuQo
nXev0A_wp44
IwDPstmCSwo
lCtSxw0a0eI
GAJ98oKN_1o
8ZD-3z0ii9Y
O0UrhODMYf0
TkMYWTq5pN8
XYEER_fSpJw
JSLL1Z-Y0Tc
i18vompw6Ts
wdvUCESXfWQ
anhgtR6MTd8
9njVYrbnFnM
re4D5tJ2xT8
241aOF7zA-A
o3Y-H94DbZM
b80uB1a-yk0
gh6bNXyBrtU
njso7AVzXKo
ITAbXovat24
eQxDIhEH8sQ
5TelA81BpFc
HkVdD_Sw6Ro
DDxnz74M-ns
SvAfRYQ8MYY
uK4_Oscw0pk


Unnamed: 0,file_name,start_time,end_time,eng_reference,thai_reference,trimmed_segment_path
0,GqSDS-BEmaw,7.040,8.373,Hello everyone.,,data/sub/GqSDS-BEmaw_trimmed_segment_1.wav
1,GqSDS-BEmaw,8.373,9.895,I'm Anucha Kornsawad.,,data/sub/GqSDS-BEmaw_trimmed_segment_2.wav
2,GqSDS-BEmaw,9.895,11.895,"You can call me Kean, I'm 22 years old.",22,data/sub/GqSDS-BEmaw_trimmed_segment_3.wav
3,GqSDS-BEmaw,11.895,13.784,"Please tell me about yourself, what do you do ...",,data/sub/GqSDS-BEmaw_trimmed_segment_4.wav
4,GqSDS-BEmaw,13.784,17.987,I'm a third year student at university.,,data/sub/GqSDS-BEmaw_trimmed_segment_5.wav
...,...,...,...,...,...,...
21623,uK4_Oscw0pk,3886.712,3888.080,As for whoever likes this clip?,,data/sub/uK4_Oscw0pk_trimmed_segment_1323.wav
21624,uK4_Oscw0pk,3888.080,3889.548,"Don't forget to click like, share,",,data/sub/uK4_Oscw0pk_trimmed_segment_1324.wav
21625,uK4_Oscw0pk,3889.548,3890.048,and subscribe,,data/sub/uK4_Oscw0pk_trimmed_segment_1325.wav
21626,uK4_Oscw0pk,3890.049,3891.083,for us.,,data/sub/uK4_Oscw0pk_trimmed_segment_1326.wav


In [23]:
# Keep only the rows that contain English text. The comparison is made on the
# stripped value so rows whose English part is nothing but whitespace are
# dropped as well.
combined_df = combined_df[combined_df['eng_reference'].str.strip() != '']

In [24]:
import pandas as pd
from pydub import AudioSegment
import os

# Function to combine WAV files with padding and split into multiple files if necessary
def _export_combined_chunk(combined, eng_parts, thai_parts, output_dir, file_number):
    """Write one combined clip to disk and return ``(path, csv_row)`` for it."""
    output_filename = f'{output_dir}/combined_output_{file_number}.wav'
    combined.export(output_filename, format='wav')
    csv_row = {
        'eng_reference': ' '.join(eng_parts),
        'thai_reference': ' '.join(thai_parts),
        'trimmed_segment_path': output_filename,
    }
    return output_filename, csv_row


def combine_wav_files_with_split(padding_duration_ms, max_duration_seconds, csv_df, output_dir):
    """Concatenate short clips into longer chunk files of bounded duration.

    Rows of ``csv_df`` are processed in order. Each clip is appended to the
    current chunk followed by ``padding_duration_ms`` of silence. When the
    next clip plus padding would push the chunk past
    ``max_duration_seconds``, the chunk is first written to
    ``<output_dir>/combined_output_<n>.wav`` (n counts from 1) and a new
    chunk is started. Chunks are formed purely by row order, so one chunk
    may contain clips from different source files.

    A single clip that is longer than the limit is not split: it is written
    as a chunk of its own that exceeds ``max_duration_seconds``.

    Args:
        padding_duration_ms: Silence appended after every clip, in ms.
        max_duration_seconds: Target upper bound for a chunk, in seconds.
        csv_df: DataFrame with ``trimmed_segment_path``, ``eng_reference``
            and ``thai_reference`` columns.
        output_dir: Folder for the chunk files; created if missing.

    Returns:
        tuple[list[str], pandas.DataFrame]: The chunk file paths, and a table
        with one row per chunk holding the space-joined ``eng_reference`` and
        ``thai_reference`` texts plus the chunk path in
        ``trimmed_segment_path``.
    """
    os.makedirs(output_dir, exist_ok=True)

    max_duration_ms = max_duration_seconds * 1000
    padding = AudioSegment.silent(duration=padding_duration_ms)
    output_files = []
    new_csv_rows = []

    # State of the chunk currently being assembled.
    combined = AudioSegment.silent(duration=0)
    eng_parts = []
    thai_parts = []

    for _, row in csv_df.iterrows():
        audio = AudioSegment.from_wav(row['trimmed_segment_path'])

        would_overflow = len(combined) + len(audio) + padding_duration_ms > max_duration_ms
        # Flush only when there is something to flush. Without the length
        # check, an over-long clip arriving while the chunk is still empty
        # would produce a zero-length WAV with an empty transcript row.
        if would_overflow and len(combined) > 0:
            output_filename, csv_row = _export_combined_chunk(
                combined, eng_parts, thai_parts, output_dir, len(output_files) + 1
            )
            output_files.append(output_filename)
            new_csv_rows.append(csv_row)

            combined = AudioSegment.silent(duration=0)
            eng_parts = []
            thai_parts = []

        combined += audio + padding
        eng_parts.append(row['eng_reference'])
        thai_parts.append(row['thai_reference'])

    # Write whatever is left in the final, partially filled chunk.
    if len(combined) > 0:
        output_filename, csv_row = _export_combined_chunk(
            combined, eng_parts, thai_parts, output_dir, len(output_files) + 1
        )
        output_files.append(output_filename)
        new_csv_rows.append(csv_row)

    # Explicit columns keep the schema stable even when no chunk was written.
    new_csv_df = pd.DataFrame(
        new_csv_rows,
        columns=['eng_reference', 'thai_reference', 'trimmed_segment_path'],
    )
    return output_files, new_csv_df

# Chunking settings for the combined training clips
padding_duration_ms = 1000  # 1 second of silence appended after each segment
max_duration_seconds = 28  # Maximum duration of 28 seconds per file

# Concatenate the trimmed segments into chunk files under "combined_wav_3";
# df_train3 gets one row per chunk (joined transcripts + chunk file path)
output_files, df_train3 = combine_wav_files_with_split(padding_duration_ms, max_duration_seconds, combined_df, "combined_wav_3")

In [25]:
# Drop the Thai transcript column. errors='ignore' keeps the cell re-runnable:
# a second execution (column already gone) no longer raises KeyError.
df_train3 = df_train3.drop(columns='thai_reference', errors='ignore')

In [26]:
# Display the chunk-level table as the cell output
df_train3

Unnamed: 0,eng_reference,trimmed_segment_path
0,Hello everyone. I'm Anucha Kornsawad. You can ...,combined_wav_3/combined_output_1.wav
1,I was working and studying since I was vocatio...,combined_wav_3/combined_output_2.wav
2,"So, my father worried about me. At first I don...",combined_wav_3/combined_output_3.wav
3,You told us that you didn't like it at first. ...,combined_wav_3/combined_output_4.wav
4,It's not bad. How many hours did you work? For...,combined_wav_3/combined_output_5.wav
...,...,...
2907,Let's see if the boat crosses the island. They...,combined_wav_3/combined_output_2908.wav
2908,Tagbilaran. Go for 2 hours. We're already ther...,combined_wav_3/combined_output_2909.wav
2909,A city that exceeded my expectations a lot. Be...,combined_wav_3/combined_output_2910.wav
2910,Everyone can come and visit Cebu. which Cebu h...,combined_wav_3/combined_output_2911.wav


In [27]:
# Save the chunk-level table to CSV in the working directory. The DataFrame
# index is written as an unnamed first column (pandas default).
# NOTE(review): pass index=False if downstream readers do not expect that column - confirm.
df_train3.to_csv("combined_train3.csv")