In [2]:
# !pip install libretranslatepy

Collecting libretranslatepy
  Downloading libretranslatepy-2.1.4-py3-none-any.whl.metadata (785 bytes)
Downloading libretranslatepy-2.1.4-py3-none-any.whl (3.5 kB)
Installing collected packages: libretranslatepy
Successfully installed libretranslatepy-2.1.4


In [5]:
import os
import re
import numpy as np
import matplotlib.pyplot as plt
import torch
import pandas as pd
from pydub import AudioSegment
import string

In [8]:
# Retrieve .txt files
def list_files_in_directory(directory):
    file_list = []
    for filename in os.listdir(directory):
        # Only pick up files with .txt extensions (transcript)
        if filename.endswith(".txt"):
            file_list.append(filename.replace(".txt", ""))
    return file_list

# Create the dataframe
def get_reference_df(directory, audio_txt_file):
    txt_file_path = os.path.join(directory, audio_txt_file + ".txt")
    columns = ["start_time", "end_time", "reference"]
    # Read the text file into a DataFrame
    df = pd.read_csv(txt_file_path, sep="\t", header=None, names=columns, quoting=3)

    # Add file name
    df.insert(0, 'file_name', pd.Series([audio_txt_file] * len(df)))

    # Remove quotation marks
    df['reference'] = df['reference'].apply(lambda x : x.replace('"',""))
    
    return df

# Trim audio files
def trim_wav_by_timestamps(directory, wav_file_name, reference_df):
    # Create the output directory if it doesn't exist
    output_dir = "data/sub/"
    os.makedirs(output_dir, exist_ok=True)
    wav_file = os.path.join(directory, wav_file_name + ".wav") # get into data file
    
    # Load the WAV file
    audio = AudioSegment.from_wav(wav_file)
    
    def trim_segments(row):
        start_ms = float(row['start_time']) * 1000  # Convert start time to milliseconds
        end_ms = float(row['end_time']) * 1000      # Convert end time to milliseconds
        trimmed_segment = audio[start_ms:end_ms]
    
        return trimmed_segment
    
    # Iterate over timestamps and trim the audio
    for i, row in reference_df.iterrows():
        trimmed_segment = trim_segments(row)
        output_file = os.path.join(output_dir, wav_file_name + "_" f"trimmed_segment_{i+1}.wav")
        trimmed_segment.export(output_file, format="wav")
        reference_df.at[i, 'trimmed_segment_path'] = output_file
    
    return reference_df

# Helper function to remove punctuations from original subtitles
def strip_punctuation(text):
    # Create a translation table that maps each punctuation character to None
    translator = str.maketrans('', '', string.punctuation)
    # Use the translation table to remove all punctuation from the text
    return text.translate(translator)

# Filter english text and append it to dataframe
def filter_subs_by_lang(reference_df):
    # Helper function that is applied across the rows to filter english text only
    
    def filter_english_only(text):
        # Define a regex pattern to match English words
        english_pattern = re.compile(r'\b[A-Za-z]+\b')
        # Find all English words in the text
        english_words = english_pattern.findall(text)
        # Join the English words into a single string
        english_text = ' '.join(english_words)
        return english_text
    
    def filter_thai_only(text):
        # Remove punctuation from text
        text = strip_punctuation(text)
        # Tokenize the string, split by spaces
        list_of_words = text.split()
        # Define a regex pattern to match English words
        english_pattern = re.compile(r'\b[A-Za-z]+\b')
        # Find all English words in the text
        english_words = english_pattern.findall(text)
        # Find all Thai words
        thai_words = [word for word in list_of_words if word not in english_words]
        # Concatenate the Thai words into a string
        thai_text = ' '.join(thai_words)
        # Return thai string
        return thai_text

    reference_df['eng_reference'] = reference_df['reference'].apply(filter_english_only)
    reference_df['thai_reference'] = reference_df['reference'].apply(filter_thai_only)

    return reference_df


def get_combined_audio_table(directory, file_names):
    combined_df = pd.DataFrame()
    for file_name in file_names:
        # Reads the transcript dataframe which has the start_time, end_time of each transcript
        reference_df = get_reference_df(directory, file_name)

        # Split subtitles by language
        reference_df = filter_subs_by_lang(reference_df)

        # Remove 'reference' column
        reference_df = reference_df.drop('reference', axis=1)

        # Uncomment to trim all the .wav file according to the subtitles start_time and end_time
        reference_df = trim_wav_by_timestamps(directory, file_name, reference_df)
        
        # Append the processed DataFrame to the combined DataFrame
        combined_df = pd.concat([combined_df, reference_df], ignore_index=True)

        # Comment out this section if not required
        #combined_df = combined_df.drop(['file_name', 'start_time', 'end_time'], axis=1)
    
    return combined_df

directory = os.path.join(os.getcwd(), "unprocessed_data/")
file_names = list_files_in_directory(directory)
combined_df = get_combined_audio_table(directory, file_names)
combined_df

Unnamed: 0,file_name,start_time,end_time,eng_reference,thai_reference,trimmed_segment_path
0,sVnGI2S_OuA,0.000,1.768,,เหตุการณ์มันผ่านมาแล้ว 10 ปี,data/sub/sVnGI2S_OuA_trimmed_segment_1.wav
1,sVnGI2S_OuA,1.768,2.869,,บางคนปล่อยไม่ได้,data/sub/sVnGI2S_OuA_trimmed_segment_2.wav
2,sVnGI2S_OuA,2.869,4.938,,นั้นเครื่องมือที่จะหยุดมันได้ คือ,data/sub/sVnGI2S_OuA_trimmed_segment_3.wav
3,sVnGI2S_OuA,5.105,6.706,,ลมหายใจ,data/sub/sVnGI2S_OuA_trimmed_segment_4.wav
4,sVnGI2S_OuA,6.706,8.341,,ถ้าพี่เอาน้องทุกคนมาบอกว่า,data/sub/sVnGI2S_OuA_trimmed_segment_5.wav
...,...,...,...,...,...,...
19063,eVbLXknPOdk,70.640,72.440,,แล้วเราจ่ายไหว เราก็จ่าย,data/sub/eVbLXknPOdk_trimmed_segment_25.wav
19064,eVbLXknPOdk,72.440,75.280,,แล้วอะไรทำให้พฤติกรรมการเงินเราเปลี่ยนไป,data/sub/eVbLXknPOdk_trimmed_segment_26.wav
19065,eVbLXknPOdk,82.360,86.880,,เล่าให้ฟังนิดนึงว่าเราคิดถึงจุดที่มาถึงวันนี้,data/sub/eVbLXknPOdk_trimmed_segment_27.wav
19066,eVbLXknPOdk,86.880,89.120,,ที่กำลังจะเกิดขึ้นในปี 2025,data/sub/eVbLXknPOdk_trimmed_segment_28.wav


In [19]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M", use_auth_token=True, src_lang="tha_Thai")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M", use_auth_token=True).to("cuda").eval()



In [23]:
def translate_to_thai(row):
    article = row['thai_reference']
    inputs = tokenizer(article, return_tensors="pt").to("cuda")
    translated_tokens = model.generate(**inputs, forced_bos_token_id=tokenizer.lang_code_to_id["eng_Latn"], max_length=30)
    return tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]

In [26]:
from tqdm import tqdm
tqdm.pandas()

nllb_translation = combined_df.progress_apply(translate_to_thai, axis=1)

100%|██████████| 19068/19068 [25:45<00:00, 12.34it/s]


In [27]:
nllb_translation

0                                     It's been ten years.
1                                Some people can't let go.
2                              That's the tool to stop it.
3                                                The wind.
4              If you bring all your brothers and sisters,
                               ...                        
19063                          And we can pay. We can pay.
19064              So what changed our financial behavior?
19065    Let me tell you a little bit about how we're t...
19066                      That's going to happen in 2025.
19067          It's about when you decide to end the band.
Length: 19068, dtype: object

In [34]:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("translation", model="Helsinki-NLP/opus-mt-th-en", device = 0)



In [37]:
helsinki_translation = pipe(combined_df['thai_reference'].tolist())

In [38]:
helsinki_translation

[{'translation_text': "It's been 10 years."},
 {'translation_text': "Some people can't let go."},
 {'translation_text': "That's the tool to stop it."},
 {'translation_text': 'Breathe.'},
 {'translation_text': 'If you bring all your sisters to me,'},
 {'translation_text': 'Hey, sis.'},
 {'translation_text': 'Focus.'},
 {'translation_text': 'You read a book.'},
 {'translation_text': 'I took notes.'},
 {'translation_text': 'You work out like this.'},
 {'translation_text': 'All of you.'},
 {'translation_text': 'So what?'},
 {'translation_text': 'Breathe in.'},
 {'translation_text': "I can't breathe."},
 {'translation_text': "I'm most worried about this guy."},
 {'translation_text': '(Laughter)'},
 {'translation_text': "I'm so scared of his book."},
 {'translation_text': 'Oh, my God.'},
 {'translation_text': 'Supported by'},
 {'translation_text': "Ma'am, everyone says hello to Gott."},
 {'translation_text': 'Hello. Hello.'},
 {'translation_text': "Ma'am, you've seen a lot of teams today."},

In [43]:
helsinki_translation_text = [text['translation_text'] for text in helsinki_translation]

In [44]:
helsinki_translation_text

["It's been 10 years.",
 "Some people can't let go.",
 "That's the tool to stop it.",
 'Breathe.',
 'If you bring all your sisters to me,',
 'Hey, sis.',
 'Focus.',
 'You read a book.',
 'I took notes.',
 'You work out like this.',
 'All of you.',
 'So what?',
 'Breathe in.',
 "I can't breathe.",
 "I'm most worried about this guy.",
 '(Laughter)',
 "I'm so scared of his book.",
 'Oh, my God.',
 'Supported by',
 "Ma'am, everyone says hello to Gott.",
 'Hello. Hello.',
 "Ma'am, you've seen a lot of teams today.",
 "Um, I don't know if you have anything in common today.",
 "That's it, that's it, that's it.",
 "It's called Gayla, isn't it?",
 "Now we're Gayla.",
 'Trying to be social.',
 "Gla's the other guy.",
 "Today, you're asking us to go out with you.",
 "Gaya's team is together.",
 "Then we'll go together.",
 'Not this scratch.',
 '(Laughter)',
 "Hello, sir. It's Whit.",
 "Let's keep an eye on the subject right now.",
 'Part of it.',
 'And look at the whole picture, sir.',
 "I'm Arm.

In [48]:
data = {
    'helsinki_translation': helsinki_translation_text,
    'nllb_translation': nllb_translation
}

translated_df = pd.DataFrame(data)

In [49]:
translated_df

Unnamed: 0,helsinki_translation,nllb_translation
0,It's been 10 years.,It's been ten years.
1,Some people can't let go.,Some people can't let go.
2,That's the tool to stop it.,That's the tool to stop it.
3,Breathe.,The wind.
4,"If you bring all your sisters to me,","If you bring all your brothers and sisters,"
...,...,...
19063,And we can afford it. We can pay it.,And we can pay. We can pay.
19064,And what changed our financial behavior?,So what changed our financial behavior?
19065,Tell me a little bit about where we're going t...,Let me tell you a little bit about how we're t...
19066,That's going to happen in 2025.,That's going to happen in 2025.


In [51]:
import evaluate 

meteor = evaluate.load('meteor')

def compute_meteor(row, col1, col2):
    meteor_score = meteor.compute(predictions=[row[col1],], references=[row[col2],])
    return meteor_score['meteor']

meteor_scores = translated_df.apply(lambda row : compute_meteor(row, 'helsinki_translation', 'nllb_translation'), axis = 1)
meteor_scores.describe()

[nltk_data] Downloading package wordnet to /home/dhuser/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/dhuser/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/dhuser/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


count    19068.000000
mean         0.541398
std          0.260823
min          0.000000
25%          0.338442
50%          0.543820
75%          0.738955
max          0.999953
dtype: float64

In [56]:
translated_df['meteor_scores'] = meteor_scores

In [52]:
bleu = evaluate.load("bleu")

In [53]:
def compute_bleu(row, col1, col2):
    # Extract the hypothesis and reference
    hypothesis = row[col1]
    reference = row[col2]

    # Check for empty hypothesis or reference
    if not hypothesis or not reference:
        return 0.0

    # Compute BLEU score
    bleu_score = bleu.compute(predictions=[hypothesis], references=[[reference]])
    return bleu_score['bleu']

bleu_scores = translated_df.apply(lambda row : compute_bleu(row, 'helsinki_translation', 'nllb_translation'), axis = 1)
bleu_scores.describe()

count    19068.000000
mean         0.171685
std          0.283124
min          0.000000
25%          0.000000
50%          0.000000
75%          0.311709
max          1.000000
dtype: float64

In [85]:
translated_df['bleu_scores'] = bleu_scores
translated_df_filtered = translated_df[translated_df['meteor_scores']> 0.9]

In [87]:
import random

def randomly_pick_translation(row):
    def flip(p):
        return 'H' if random.random() < p else 'T'
    
    translation_col = ''
    if flip(0.5) == 'H':
        translation_col = 'helsinki_translation'
    else:
        translation_col = 'nllb_translation'

    return row[translation_col]

english_translations = translated_df_filtered.apply(randomly_pick_translation, axis = 1)

In [67]:
combined_df['meteor_scores'] = meteor_scores

In [71]:
combined_df_filtered = combined_df[combined_df['meteor_scores'] > 0.9]

In [84]:
combined_df_filtered.apply(lambda x : x['end_time'] - x['start_time'], axis = 1).sum()

3774.0000000000045

In [88]:
combined_df_filtered['eng_reference'] = english_translations

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  combined_df_filtered['eng_reference'] = english_translations


In [89]:
combined_df_filtered

Unnamed: 0,file_name,start_time,end_time,eng_reference,thai_reference,trimmed_segment_path,meteor_scores
0,sVnGI2S_OuA,0.000,1.768,It's been 10 years.,เหตุการณ์มันผ่านมาแล้ว 10 ปี,data/sub/sVnGI2S_OuA_trimmed_segment_1.wav,0.997685
1,sVnGI2S_OuA,1.768,2.869,Some people can't let go.,บางคนปล่อยไม่ได้,data/sub/sVnGI2S_OuA_trimmed_segment_2.wav,0.998542
2,sVnGI2S_OuA,2.869,4.938,That's the tool to stop it.,นั้นเครื่องมือที่จะหยุดมันได้ คือ,data/sub/sVnGI2S_OuA_trimmed_segment_3.wav,0.999023
14,sVnGI2S_OuA,21.921,22.856,I'm worried about this guy.,ผมห่วงคนนี้ที่สุดเลย,data/sub/sVnGI2S_OuA_trimmed_segment_15.wav,0.974418
18,sVnGI2S_OuA,29.462,32.132,Supported by,สนับสนุนโดย,data/sub/sVnGI2S_OuA_trimmed_segment_19.wav,0.937500
...,...,...,...,...,...,...,...
19040,eVbLXknPOdk,4.360,7.200,"I think when we retire at 40,",ผมคิดว่าเมื่อเราเกษียณไปตอน 40,data/sub/eVbLXknPOdk_trimmed_segment_2.wav,0.999023
19041,eVbLXknPOdk,7.840,9.120,We have to live.,เราต้องมีชีวิตต่อ,data/sub/eVbLXknPOdk_trimmed_segment_3.wav,0.996000
19052,eVbLXknPOdk,44.080,46.280,I want you to see it right there.,อยากให้มองเห็นตรงนั้น,data/sub/eVbLXknPOdk_trimmed_segment_14.wav,0.933957
19057,eVbLXknPOdk,55.280,57.800,I've seen something new before.,เมื่อก่อนเห็นอะไรใหม่ๆ,data/sub/eVbLXknPOdk_trimmed_segment_19.wav,0.998542


In [90]:
combined_df_filtered.to_csv("train_1.csv")

In [81]:
# Get a list of all files in the directory
all_files = os.listdir("data/sub/")  # Replace "data/sub/" with your directory path

# Extract just the filenames from the DataFrame file paths
df_filenames = combined_df_filtered['trimmed_segment_path'].apply(lambda x: os.path.basename(x))

# Filter out files that are not in the DataFrame
files_to_remove = [file for file in all_files if file not in df_filenames.values]

# Remove the files not in the DataFrame
for file in files_to_remove:
    os.remove(os.path.join("data/sub/", file))  # Replace "data/sub/" with your directory path