In [None]:
!pip install nltk
!pip install deep_translator

In [1]:
import os
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize
from deep_translator import GoogleTranslator
# Fetch NLTK's "punkt" resource up front; presumably the model sent_tokenize relies on — confirm against the NLTK docs.
nltk.download("punkt")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ctoruno\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Defining parameters

In [14]:
# Team member running this notebook; compared against the "compiler" column when subsetting.
compiler = "Carlos"
# Batch number to process; batch 1 starts at the first row of the compiler's subset.
batch = 1
# One of two values: "9to5" OR "overnight"
mode = "9to5"

## Defining functions

In [2]:
def trans2english_batch(text, sourcelang):
    """
    This function takes a text in a specific language and returns its equivalent
    in English using the Google translation engine.

    The text is split into sentences, the sentences are sent to the translator as a
    single batch, and the translated sentences are joined back with single spaces.

    Parameters:
        text:       String. Text to translate.
        sourcelang: String. Code of the source language you want to translate the text from.

    Returns:
        The English translation as a single string. Missing or blank input (anything
        that is not a string, or a string containing only whitespace) is returned
        unchanged without calling the API. If tokenization or translation raises, a
        string starting with "Translation through API failed. Reason:" is returned
        instead of propagating the exception.
    """
    # Nothing to translate: keep missing/blank values as they are so they are not
    # recorded as API failures further down the pipeline.
    if not isinstance(text, str) or not text.strip():
        return text
    try:
        sentences  = sent_tokenize(text)
        translated = GoogleTranslator(source = sourcelang, target = "en").translate_batch(sentences)
        return " ".join(translated)
    except Exception as e:
        # Best-effort on purpose: one failing text must not abort the whole batch.
        return f"Translation through API failed. Reason: {e}"

## Reading the data

In [3]:
# os.getlogin() raises OSError when the process has no controlling terminal (which can
# happen for kernels on hosted or remote machines); in that case fall back to the
# local-file branch instead of crashing the cell.
try:
    on_ctoruno_machine = os.getlogin() == "ctoruno"
except OSError:
    on_ctoruno_machine = False

if on_ctoruno_machine:
    master_data = pd.read_parquet("../data/data-extraction-1/master4translation.parquet.gzip")
else:
    master_data = pd.read_parquet("master4translation.parquet.gzip")
master_data["compiler"].value_counts()

compiler
Horacio     193968
Allison     193944
Santiago    193924
Natalia     193291
Cristina    193059
Dalia       193046
Carlos      192896
Artha       192783
Name: count, dtype: int64

## Subsetting the data

In [15]:
# Rows per batch for each supported run mode.
batch_sizes = {"9to5": 1000, "overnight": 7500}
if mode not in batch_sizes:
    # Fail loudly instead of leaving batch_size undefined (or stale from an earlier run).
    raise ValueError(f"mode must be one of {sorted(batch_sizes)}, got {mode!r}")
if batch < 1:
    # A batch below 1 would produce a negative slice start and silently select the wrong rows.
    raise ValueError(f"batch must be 1 or greater, got {batch!r}")
batch_size = batch_sizes[mode]

starting_row    = batch_size*(batch-1)
final_row       = starting_row+batch_size
# Filter first and copy last: only the selected rows are duplicated, not the whole master table.
compiler_subset = (
    master_data
    .loc[(master_data["compiler"] == compiler) & (master_data["is_opinion"] == False)]
    .iloc[starting_row:final_row]
    .copy()
)

## Translating headline, description, and content

In [32]:
def _translate_row(row):
    """Translate the title, description and content of one row, using the row's language_id as source language."""
    return row[["title", "description", "content"]].apply(
        lambda field: trans2english_batch(text = field, sourcelang = row["language_id"])
    )

# Row by row: translate the three text fields and store them in the *_trans columns.
compiler_subset[["title_trans", "description_trans", "content_trans"]] = compiler_subset.apply(_translate_row, axis = 1)

## Saving batch data

In [24]:
# os.getlogin() raises OSError when the process has no controlling terminal (which can
# happen for kernels on hosted or remote machines); in that case save next to the notebook.
try:
    on_ctoruno_machine = os.getlogin() == "ctoruno"
except OSError:
    on_ctoruno_machine = False

# One file per compiler and batch number.
batch_filename = f"EU_trdata_{compiler}_batch_{batch}.parquet.gzip"
if on_ctoruno_machine:
    compiler_subset.to_parquet(f"../data/translation_batches/{batch_filename}", compression = "gzip")
else:
    compiler_subset.to_parquet(batch_filename, compression = "gzip")