In [None]:
# -*- coding: utf-8 -*-
"""translate_ep_speeches.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1_IH0JaUFfoec2PQqA12_J1x9P58YXWF3
"""

#!pip install -U pip transformers
#!pip install sentencepiece


import pandas as pd

# Per-text language detection results (joined below on `text_id`, supplies `language`).
full_text = pd.read_csv("../04_clean_data/language_detection.csv")

# Sentence-level speech data. Rows without a sentence are dropped because there
# is nothing to translate; drop=True keeps the old index from leaking into the
# frame as an extra `index` column.
data = (
    pd.read_csv("../04_clean_data/missing_speeches_parlee_sents.csv")
    .dropna(subset=['sentence'])
    .reset_index(drop=True)
)

# Build the translation work list without mutating any intermediate frame:
#   1. left-join the detected language onto every sentence,
#   2. keep rows whose language is not English, restricted to the needed columns,
#   3. keep the first occurrence of each distinct sentence,
#   4. renumber the rows 0..n-1 *after* de-duplication so the index has no gaps.
# NOTE(review): because of the left join, sentences whose `text_id` has no match
# in `full_text` get a missing language and pass the `!= 'eng_Latn'` filter --
# confirm whether such rows are expected.
filtered_data = (
    pd.merge(data, full_text, on='text_id', how='left')
    .loc[
        lambda merged: merged['language'] != 'eng_Latn',
        ['text_id', 'session_id', 'id_speaker', 'Sentence_id', 'sentence', 'language'],
    ]
    .drop_duplicates(subset='sentence')
    .reset_index(drop=True)
)
filtered_data

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Distinct language codes found in the data (bare expression; the result is not assigned)
filtered_data['language'].unique()

# Use CUDA when torch reports it as available, otherwise stay on the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

# Checkpoint name shared by the tokenizer and the seq2seq model
checkpoint = 'facebook/nllb-200-distilled-600M'

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
model = model.to(device)

In [20]:
from tqdm import tqdm
import pandas as pd
import json
import time
from datetime import timedelta
import ray
from transformers import pipeline
from datetime import datetime

# Bring Ray up a single time here, before any of the translation loops run
ray.init(ignore_reinit_error=True, num_cpus=20)

# The @ray.remote decorator turns this function into a Ray task,
# so calls can be scheduled via predict.remote(...)
@ray.remote
def predict(pipeline, text_data):
    """Apply ``pipeline`` to ``text_data`` and return its output unchanged.

    Args:
        pipeline: Callable invoked with ``text_data`` as its only argument.
        text_data: Input forwarded to ``pipeline``.

    Returns:
        Whatever ``pipeline(text_data)`` returns.
    """
    output = pipeline(text_data)
    return output

def _load_translated_keys(jsonl_file):
    """Collect the (text_id, Sentence_id) pairs already stored in the output file.

    Args:
        jsonl_file: Open, readable and seekable text file with one JSON object per line.

    Returns:
        set: ``(text_id, Sentence_id)`` tuples, one per non-blank line.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    jsonl_file.seek(0)
    translated_keys = set()
    for line in jsonl_file:
        if not line.strip():
            continue  # tolerate blank lines
        record = json.loads(line)  # parse every line exactly once
        translated_keys.add((record.get('text_id'), record.get('Sentence_id')))
    return translated_keys


# Number of sentences handed to Ray per batch
batch_size = 1000
# Languages before this position in `unique()` are skipped.
# NOTE(review): presumably those were translated in an earlier run -- confirm.
first_language_index = 17

try:
    # 'a+' allows reading back earlier results and appending new ones
    with open('translated_sentences.jsonl', 'a+') as f:
        # Read the already-translated keys ONCE into a set and keep it up to date
        # while writing, instead of re-parsing the whole file for every batch.
        # Assumes this notebook is the only writer of the file while it runs.
        existing_ids = _load_translated_keys(f)

        unique_languages = filtered_data['language'].unique()[first_language_index:]
        for current_language in tqdm(unique_languages, desc="Translating by Language"):
            print(current_language)
            # Filter the DataFrame to only include rows with the current language
            language_specific_data = filtered_data[filtered_data['language'] == current_language]
            total_sentences = len(language_specific_data)

            # Initialize the translation pipeline once per language and place it
            # in the Ray object store so every task receives the same reference
            translation_pipeline = pipeline('translation',
                                            model=model,
                                            tokenizer=tokenizer,
                                            src_lang=current_language,
                                            tgt_lang='eng_Latn',
                                            max_length=500)
            pipe_id = ray.put(translation_pipeline)

            # Process sentences in batches
            for i in tqdm(range(0, total_sentences, batch_size),
                          desc=f"Translating Sentences in {current_language}",
                          leave=False):
                batch_data = language_specific_data.iloc[i:i + batch_size]

                # Keep only the rows whose (text_id, Sentence_id) is not stored yet
                new_batch_data = [
                    (text_id, sentence_id, sentence)
                    for text_id, sentence_id, sentence in zip(
                        batch_data['text_id'].tolist(),
                        batch_data['Sentence_id'].tolist(),
                        batch_data['sentence'].tolist(),
                    )
                    if (text_id, sentence_id) not in existing_ids
                ]
                if not new_batch_data:
                    continue

                start_time = datetime.now()  # Start time of the batch
                print(f"Batch starting at index {i}, Start time: {start_time}")
                new_batch_text_ids, new_batch_sentence_ids, new_batch_sentences = map(list, zip(*new_batch_data))

                # Schedule one Ray task per sentence and wait for all of them
                future_results = [predict.remote(pipe_id, sentence) for sentence in new_batch_sentences]
                translation_output = ray.get(future_results)

                translated_texts = [out[0]["translation_text"] for out in translation_output]

                # Write to JSONL
                for text_id, sentence_id, translated_text in zip(new_batch_text_ids, new_batch_sentence_ids, translated_texts):
                    json.dump({"text_id": text_id, "Sentence_id": sentence_id, "translated_sentence": translated_text}, f)
                    f.write('\n')
                    existing_ids.add((text_id, sentence_id))
                # Push the batch out of Python's buffer so a crash or interrupt
                # does not lose translations that were already computed
                f.flush()

                end_time = datetime.now()  # End time of the batch
                duration = end_time - start_time  # Duration of the batch
                # Clamp so the last (short) batch does not report an index past the data
                batch_end = min(i + batch_size, total_sentences)

                print(f"Batch ending at index {batch_end}, End time: {end_time}, Duration: {duration}, part of: {current_language}")
finally:
    # Shut Ray down even when translation fails or is interrupted
    ray.shutdown()


2023-11-08 11:17:24,653	INFO worker.py:1642 -- Started a local Ray instance.
Translating by Language:   0%|          | 0/20 [00:00<?, ?it/s]

slk_Latn


Translating by Language:   5%|▌         | 1/20 [01:52<35:39, 112.62s/it]

hun_Latn


Translating by Language:  10%|█         | 2/20 [04:29<41:31, 138.41s/it]

ces_Latn




Batch starting at index 7000, Start time: 2023-11-08 11:24:13.522547


In [19]:
# Shut down Ray.
# NOTE(review): this cell's execution count (19) precedes the translation cell's (20),
# so it was presumably run by hand to clear an earlier Ray instance -- confirm
# whether it is still needed.
ray.shutdown()