In [1]:
# -*- coding: utf-8 -*-
"""translate_ep_speeches.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1_IH0JaUFfoec2PQqA12_J1x9P58YXWF3
"""

#!pip install -U pip transformers
#!pip install sentencepiece


import pandas as pd

full_text = pd.read_csv("../04_clean_data/language_detection.csv")
data = pd.read_csv("../04_clean_data/missing_speeches_parlee_sents.csv")


data.dropna(subset=['sentence'], inplace=True)
data.reset_index(inplace = True)

filtered_data = pd.merge(data, full_text, on='text_id', how='left')

filtered_data = filtered_data[filtered_data['language'] != 'eng_Latn']

# Keep only specific columns
filtered_data = filtered_data[['text_id', 'session_id', 'id_speaker', 'Sentence_id', 'sentence','language']]

filtered_data.reset_index(inplace = True)
filtered_data.drop_duplicates(subset='sentence', inplace = True)
filtered_data



Unnamed: 0,index,text_id,session_id,id_speaker,Sentence_id,sentence,language
0,41077,3589,178,1,1,\n,cat_Latn
1,41078,3589,178,1,2,On Amendment No 90,cat_Latn
3,56730,4813,247,1,2,On Amendment No 40,fra_Latn
4,58146,4815,255,1,1,IN THE CHAIR: MRS HOFF,yue_Hant
6,168495,13617,762,1,2,(Parliament adopted the legislative resolution...,kor_Hang
...,...,...,...,...,...,...,...
393919,2073658,179799,14163,1,2,Und da darf ich vielleicht auch erläutern:,deu_Latn
393920,2073659,179799,14163,1,3,Die Stimmerklärungen ganz am Schluss werden in...,deu_Latn
393921,2073660,179799,14163,1,4,Und da wir zehn bis 15 Stimmerklärungen haben ...,deu_Latn
393922,2073661,179799,14163,1,5,"Das ist aber auch kein Skandal, sondern alles ...",deu_Latn


In [2]:
filtered_data['language'].unique()

import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device
checkpoint = 'facebook/nllb-200-distilled-600M'
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint).to(device)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [3]:
from tqdm import tqdm
import pandas as pd
import json
import time
from datetime import timedelta
import ray
from transformers import pipeline
from datetime import datetime

# Initialize Ray cluster once, outside of loops
ray.init(num_cpus=20, ignore_reinit_error=True)

# @ray.remote decorator enables to use this 
# function in distributed setting
@ray.remote
def predict(pipeline, text_data):
    return pipeline(text_data)

# Open the JSONL file to store the translations
with open('translated_sentences.jsonl', 'a+') as f:
    # Loop through each unique language in the 'language' column with a progress bar
    for current_language in tqdm(filtered_data['language'].unique(), desc="Translating by Language"):
        print(current_language)
        # Filter the DataFrame to only include rows with the current language
        language_specific_data = filtered_data[filtered_data['language'] == current_language]

        # Initialize the translation pipeline once per language
        translation_pipeline = pipeline('translation',
                                        model=model,
                                        tokenizer=tokenizer,
                                        src_lang=current_language,
                                        tgt_lang='eng_Latn',
                                        max_length=500)
        pipe_id = ray.put(translation_pipeline)

        batch_size = 1000
        # Process sentences in batches
        for i in tqdm(range(0, len(language_specific_data), batch_size), desc=f"Translating Sentences in {current_language}", leave=False):

            batch_data = language_specific_data.iloc[i:i+batch_size]
            batch_sentences = batch_data['sentence'].tolist()

            # Check existing IDs and filter new batch data
            f.seek(0)
            existing_ids = [(json.loads(line).get('text_id'), json.loads(line).get('Sentence_id')) for line in f]
            new_batch_data = [(row['text_id'], row['Sentence_id'], row['sentence']) for index, row in batch_data.iterrows() if (row['text_id'], row['Sentence_id']) not in existing_ids]
            
            if not new_batch_data:
                continue
            start_time = datetime.now()  # Start time of the batch
            print(f"Batch starting at index {i}, Start time: {start_time}")
            new_batch_text_ids, new_batch_sentence_ids, new_batch_sentences = map(list, zip(*new_batch_data))
            
            # Schedule multiple Ray tasks for translation
            future_results = [predict.remote(pipe_id, sentence) for sentence in new_batch_sentences]
            translation_output = ray.get(future_results)
            
            translated_texts = [out[0]["translation_text"] for out in translation_output]
            
            # Write to JSONL
            for text_id, sentence_id, translated_text in zip(new_batch_text_ids, new_batch_sentence_ids, translated_texts):
                json.dump({"text_id": text_id, "Sentence_id": sentence_id, "translated_sentence": translated_text}, f)
                f.write('\n')
            end_time = datetime.now()  # End time of the batch
            duration = end_time - start_time  # Duration of the batch
            
            print(f"Batch ending at index {i + batch_size}, End time: {end_time}, Duration: {duration}, part of: {current_language}")

# Shutdown Ray cluster after all tasks
ray.shutdown()


2023-11-06 09:07:19,026	INFO worker.py:1642 -- Started a local Ray instance.
Translating by Language:   0%|          | 0/37 [00:00<?, ?it/s]

cat_Latn


Translating by Language:   3%|▎         | 1/37 [00:15<09:20, 15.57s/it]

fra_Latn


Translating by Language:   5%|▌         | 2/37 [03:13<1:04:48, 111.10s/it]

yue_Hant


Translating by Language:   8%|▊         | 3/37 [03:29<38:13, 67.45s/it]   

kor_Hang


Translating by Language:  11%|█         | 4/37 [03:45<26:00, 47.30s/it]

glg_Latn


Translating by Language:  14%|█▎        | 5/37 [04:01<19:07, 35.85s/it]

spa_Latn


Translating by Language:  16%|█▌        | 6/37 [07:16<46:30, 90.02s/it]

deu_Latn


Translating by Language:  19%|█▉        | 7/37 [14:56<1:45:35, 211.19s/it]

arb_Arab


Translating by Language:  22%|██▏       | 8/37 [15:11<1:11:53, 148.74s/it]

bod_Tibt


Translating by Language:  24%|██▍       | 9/37 [15:26<49:53, 106.90s/it]  

krc_Cyrl


Translating by Language:  27%|██▋       | 10/37 [15:42<35:26, 78.75s/it]

zho_Hans


Translating by Language:  30%|██▉       | 11/37 [15:57<25:40, 59.24s/it]

ita_Latn




Batch starting at index 9000, Start time: 2023-11-06 09:24:53.792703




Batch ending at index 10000, End time: 2023-11-06 09:46:05.509506, Duration: 0:21:11.716803, part of: ita_Latn
Batch starting at index 10000, Start time: 2023-11-06 09:46:15.563630


In [None]:
ray.shutdown()