In [1]:
# -*- coding: utf-8 -*-
"""translate_ep_speeches.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1_IH0JaUFfoec2PQqA12_J1x9P58YXWF3
"""

#!pip install -U pip transformers
#!pip install sentencepiece


import pandas as pd

full_text = pd.read_csv("../04_clean_data/language_detection.csv")
data = pd.read_csv("../04_clean_data/missing_speeches_parlee_sents.csv")


data.dropna(subset=['sentence'], inplace=True)
data.reset_index(inplace = True)

filtered_data = pd.merge(data, full_text, on='text_id', how='left')

filtered_data = filtered_data[filtered_data['language'] != 'eng_Latn']

# Keep only specific columns
filtered_data = filtered_data[['text_id', 'session_id', 'id_speaker', 'Sentence_id', 'sentence','language']]

filtered_data.reset_index(inplace = True)
filtered_data.drop_duplicates(subset='sentence', inplace = True)
filtered_data

Unnamed: 0,index,text_id,session_id,id_speaker,Sentence_id,sentence,language
0,41077,3589,178,1,1,\n,cat_Latn
1,41078,3589,178,1,2,On Amendment No 90,cat_Latn
3,56730,4813,247,1,2,On Amendment No 40,fra_Latn
4,58146,4815,255,1,1,IN THE CHAIR: MRS HOFF,yue_Hant
6,168495,13617,762,1,2,(Parliament adopted the legislative resolution...,kor_Hang
...,...,...,...,...,...,...,...
393919,2073658,179799,14163,1,2,Und da darf ich vielleicht auch erläutern:,deu_Latn
393920,2073659,179799,14163,1,3,Die Stimmerklärungen ganz am Schluss werden in...,deu_Latn
393921,2073660,179799,14163,1,4,Und da wir zehn bis 15 Stimmerklärungen haben ...,deu_Latn
393922,2073661,179799,14163,1,5,"Das ist aber auch kein Skandal, sondern alles ...",deu_Latn


In [2]:
filtered_data['language'].unique()

import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device
checkpoint = 'facebook/nllb-200-distilled-600M'
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint).to(device)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [3]:
from tqdm import tqdm
import pandas as pd
import json
import time
from datetime import timedelta
import ray
from transformers import pipeline
from datetime import datetime

# Initialize Ray cluster once, outside of loops
## this takes about 80-100 GB of RAM so be careful when initializing on a "normal" workstation or laptop
ray.init(num_cpus=20, ignore_reinit_error=True)

# @ray.remote decorator enables to use this 
# function in distributed setting
@ray.remote
def predict(pipeline, text_data):
    return pipeline(text_data)

# Open the JSONL file to store the translations
with open('translated_sentences.jsonl', 'a+') as f:
    unique_languages = filtered_data['language'].unique()
    for current_language in tqdm(unique_languages, desc="Translating by Language"):
        print(current_language)
        # Filter the DataFrame to only include rows with the current language
        language_specific_data = filtered_data[filtered_data['language'] == current_language]

        # Initialize the translation pipeline once per language
        translation_pipeline = pipeline('translation',
                                        model=model,
                                        tokenizer=tokenizer,
                                        src_lang=current_language,
                                        tgt_lang='eng_Latn',
                                        max_length=500)
        pipe_id = ray.put(translation_pipeline)

        batch_size = 1000
        # Process sentences in batches
        for i in tqdm(range(0, len(language_specific_data), batch_size), desc=f"Translating Sentences in {current_language}", leave=False):

            batch_data = language_specific_data.iloc[i:i+batch_size]
            batch_sentences = batch_data['sentence'].tolist()

            # Check existing IDs and filter new batch data
            f.seek(0)
            existing_ids = [(json.loads(line).get('text_id'), json.loads(line).get('Sentence_id')) for line in f]
            new_batch_data = [(row['text_id'], row['Sentence_id'], row['sentence']) for index, row in batch_data.iterrows() if (row['text_id'], row['Sentence_id']) not in existing_ids]
            
            if not new_batch_data:
                continue
            start_time = datetime.now()  # Start time of the batch
            print(f"Batch starting at index {i}, Start time: {start_time}")
            new_batch_text_ids, new_batch_sentence_ids, new_batch_sentences = map(list, zip(*new_batch_data))
            
            # Schedule multiple Ray tasks for translation
            future_results = [predict.remote(pipe_id, sentence) for sentence in new_batch_sentences]
            translation_output = ray.get(future_results)
            
            translated_texts = [out[0]["translation_text"] for out in translation_output]
            
            # Write to JSONL
            for text_id, sentence_id, translated_text in zip(new_batch_text_ids, new_batch_sentence_ids, translated_texts):
                json.dump({"text_id": text_id, "Sentence_id": sentence_id, "translated_sentence": translated_text}, f)
                f.write('\n')
            end_time = datetime.now()  # End time of the batch
            duration = end_time - start_time  # Duration of the batch
            
            print(f"Batch ending at index {i + batch_size}, End time: {end_time}, Duration: {duration}, part of: {current_language}")

# Shutdown Ray cluster after all tasks
ray.shutdown()


2023-11-09 10:10:13,807	INFO worker.py:1642 -- Started a local Ray instance.
Translating by Language:   0%|          | 0/37 [00:00<?, ?it/s]

cat_Latn


Translating by Language:   3%|▎         | 1/37 [00:16<10:00, 16.69s/it]

fra_Latn


Translating by Language:   5%|▌         | 2/37 [04:33<1:32:07, 157.93s/it]

yue_Hant


Translating by Language:   8%|▊         | 3/37 [04:49<52:40, 92.94s/it]   

kor_Hang


Translating by Language:  11%|█         | 4/37 [05:04<34:22, 62.51s/it]

glg_Latn


Translating by Language:  14%|█▎        | 5/37 [05:20<24:13, 45.43s/it]

spa_Latn


Translating by Language:  16%|█▌        | 6/37 [09:23<58:15, 112.77s/it]

deu_Latn


Translating by Language:  19%|█▉        | 7/37 [18:42<2:09:14, 258.47s/it]

arb_Arab


Translating by Language:  22%|██▏       | 8/37 [18:58<1:27:42, 181.48s/it]

bod_Tibt


Translating by Language:  24%|██▍       | 9/37 [19:15<1:00:41, 130.06s/it]

krc_Cyrl


Translating by Language:  27%|██▋       | 10/37 [19:33<42:54, 95.37s/it]  

zho_Hans


Translating by Language:  30%|██▉       | 11/37 [19:49<30:51, 71.23s/it]

ita_Latn


Translating by Language:  32%|███▏      | 12/37 [24:25<55:35, 133.43s/it]

lit_Latn


Translating by Language:  35%|███▌      | 13/37 [25:12<42:55, 107.30s/it]

por_Latn


Translating by Language:  38%|███▊      | 14/37 [28:23<50:48, 132.53s/it]

swe_Latn


Translating by Language:  41%|████      | 15/37 [30:42<49:20, 134.55s/it]

pol_Latn


Translating by Language:  43%|████▎     | 16/37 [37:10<1:13:44, 210.68s/it]

ell_Grek


Translating by Language:  46%|████▌     | 17/37 [39:38<1:03:58, 191.91s/it]

slk_Latn


Translating by Language:  49%|████▊     | 18/37 [41:20<52:12, 164.84s/it]  

hun_Latn


Translating by Language:  51%|█████▏    | 19/37 [43:41<47:17, 157.63s/it]

ces_Latn


Translating by Language:  54%|█████▍    | 20/37 [46:07<43:44, 154.36s/it]

dan_Latn


Translating by Language:  57%|█████▋    | 21/37 [47:55<37:23, 140.25s/it]

mlt_Latn


Translating by Language:  59%|█████▉    | 22/37 [48:36<27:37, 110.51s/it]

nld_Latn


Translating by Language:  62%|██████▏   | 23/37 [53:38<39:11, 167.98s/it]

lvs_Latn


Translating by Language:  65%|██████▍   | 24/37 [54:13<27:46, 128.19s/it]

fin_Latn


Translating by Language:  68%|██████▊   | 25/37 [55:55<24:03, 120.27s/it]

est_Latn


Translating by Language:  70%|███████   | 26/37 [56:34<17:35, 95.95s/it] 

slv_Latn


Translating by Language:  73%|███████▎  | 27/37 [58:02<15:34, 93.42s/it]

gle_Latn


Translating by Language:  76%|███████▌  | 28/37 [58:27<10:56, 72.96s/it]

ron_Latn


Translating by Language:  78%|███████▊  | 29/37 [1:02:02<15:25, 115.64s/it]

bul_Cyrl


Translating by Language:  81%|████████  | 30/37 [1:04:18<14:11, 121.59s/it]

oci_Latn


Translating by Language:  84%|████████▍ | 31/37 [1:04:33<08:58, 89.80s/it] 

ind_Latn


Translating by Language:  86%|████████▋ | 32/37 [1:04:49<05:37, 67.52s/it]

ast_Latn


Translating by Language:  89%|████████▉ | 33/37 [1:05:04<03:27, 51.90s/it]

vie_Latn


Translating by Language:  92%|█████████▏| 34/37 [1:05:20<02:02, 40.94s/it]

ilo_Latn


Translating by Language:  95%|█████████▍| 35/37 [1:05:35<01:06, 33.27s/it]

hrv_Latn


Translating by Language:  97%|█████████▋| 36/37 [1:08:06<01:08, 68.47s/it]

kab_Latn


Translating by Language: 100%|██████████| 37/37 [1:08:21<00:00, 110.85s/it]


In [4]:
ray.shutdown()