In [2]:
from transformers import MarianMTModel, MarianTokenizer
import pandas as pd
import concurrent.futures
from tqdm import tqdm

In [3]:
# Read the sentence-pair training set and preview its first rows.
training_csv = 'Mental Health Training.csv'
data = pd.read_csv(training_csv)
data.head()

Unnamed: 0,s1,s2,similarity
0,"During a particularly stressful week, I made s...","When I feel stressed, I find that stepping awa...",1.0
1,"During a particularly stressful week, I made s...",I often find that I can manage my stress by ta...,0.75
2,"During a particularly stressful week, I made s...",I cope with overwhelming feelings by journalin...,0.5
3,"During a particularly stressful week, I made s...",I like to unwind by listening to music and ref...,0.25
4,"During a particularly stressful week, I made s...",I recently started a new hobby that involves p...,0.0


In [4]:
def load_models(source_lang="en", pivot_lang="fr"):
    """Load the tokenizer/model pairs needed for back-translation.

    Two MarianMT checkpoints are loaded: one translating ``source_lang`` to
    ``pivot_lang`` and one translating back. With the default arguments these
    are ``Helsinki-NLP/opus-mt-en-fr`` and ``Helsinki-NLP/opus-mt-fr-en``.

    Args:
        source_lang: Language code of the original text (default ``"en"``).
        pivot_lang: Language code of the intermediate language (default ``"fr"``).

    Returns:
        A 4-tuple ``(forward_tokenizer, forward_model, backward_tokenizer,
        backward_model)``, in the order expected by ``back_translate``.

    Note:
        Checkpoint names are built as ``Helsinki-NLP/opus-mt-{src}-{tgt}``.
        Not every language pair is necessarily published under that name;
        ``from_pretrained`` will fail for a pair that does not exist.
    """
    forward_name = f"Helsinki-NLP/opus-mt-{source_lang}-{pivot_lang}"
    backward_name = f"Helsinki-NLP/opus-mt-{pivot_lang}-{source_lang}"

    # Forward direction: source -> pivot.
    forward_tokenizer = MarianTokenizer.from_pretrained(forward_name)
    forward_model = MarianMTModel.from_pretrained(forward_name)

    # Backward direction: pivot -> source.
    backward_tokenizer = MarianTokenizer.from_pretrained(backward_name)
    backward_model = MarianMTModel.from_pretrained(backward_name)

    return forward_tokenizer, forward_model, backward_tokenizer, backward_model

In [5]:
def back_translate(text, tokenizer_en_fr, model_en_fr, tokenizer_fr_en, model_fr_en):
    """Paraphrase ``text`` by translating it out and back again.

    The text is run through the first tokenizer/model pair, and the decoded
    result is then run through the second pair.

    Args:
        text: Input passed to the first tokenizer.
        tokenizer_en_fr: Tokenizer for the outbound translation.
        model_en_fr: Model whose ``generate`` produces the outbound translation.
        tokenizer_fr_en: Tokenizer for the return translation.
        model_fr_en: Model whose ``generate`` produces the return translation.

    Returns:
        The first decoded sequence of the return translation. Only element
        ``[0]`` of each decoded batch is kept, so a single text is expected.
    """
    stages = (
        (tokenizer_en_fr, model_en_fr),  # outbound pass
        (tokenizer_fr_en, model_fr_en),  # return pass
    )

    current = text
    for tokenizer, model in stages:
        encoded = tokenizer(current, return_tensors="pt", padding=True, truncation=True)
        generated = model.generate(**encoded)
        # Special tokens are skipped when decoding; keep only the first sequence.
        current = tokenizer.batch_decode(generated, skip_special_tokens=True)[0]

    return current

In [6]:
# Load both translation directions once; the cells below reuse these four objects.
(
    tokenizer_en_fr,
    model_en_fr,
    tokenizer_fr_en,
    model_fr_en,
) = load_models()



In [7]:
# Quick sanity check of the round trip on a single short sentence.
sample_sentence = "My name is Eddie. I love food!"
back_translate(sample_sentence, tokenizer_en_fr, model_en_fr, tokenizer_fr_en, model_fr_en)

'My name is Eddie, I love food!'

In [8]:
def process_s2_rows(row, tokenizer_en_fr, model_en_fr, tokenizer_fr_en, model_fr_en):
    """Build an augmented record for one row by back-translating its ``s2`` field.

    Args:
        row: Mapping-like object indexable by ``'s1'``, ``'s2'`` and ``'similarity'``.
        tokenizer_en_fr, model_en_fr, tokenizer_fr_en, model_fr_en:
            Forwarded unchanged to ``back_translate``.

    Returns:
        A dict with keys ``"s1"`` and ``"similarity"`` copied from ``row`` and
        ``"s2"`` replaced by its back-translated version.
    """
    translators = (tokenizer_en_fr, model_en_fr, tokenizer_fr_en, model_fr_en)
    paraphrased = back_translate(row['s2'], *translators)

    # Only s2 is rewritten; s1 and the similarity label pass through untouched.
    record = {"s1": row['s1']}
    record["s2"] = paraphrased
    record["similarity"] = row['similarity']
    return record

In [9]:
# Back-translate the s2 field of every row of `data` on a small thread pool.
augmented_data = []
total_rows = len(data)

print(total_rows)

with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
    # One task per row; the list holds the futures in the same order as the rows of `data`.
    futures = [
        executor.submit(process_s2_rows, row, tokenizer_en_fr, model_en_fr, tokenizer_fr_en, model_fr_en)
        for _, row in data.iterrows()
    ]

    # Collect in submission order rather than completion order, so that
    # augmented_data[i] always corresponds to row i of `data` and the output
    # is the same from run to run. tqdm still shows overall progress.
    for future in tqdm(futures, total=total_rows, desc="Processing rows"):
        augmented_data.append(future.result())

27402


Processing rows: 100%|██████████| 27402/27402 [16:20:47<00:00,  2.15s/it]     


In [10]:
# Convert the list of augmented records back to a DataFrame
augmented_df = pd.DataFrame(augmented_data)

# Save the augmented dataset. The frame's index is only a positional counter,
# so it is not written out; otherwise it reappears as an extra "Unnamed: 0"
# column when the CSV is read back.
augmented_df.to_csv('augmented_dataset.csv', index=False)