In [1]:
import os
import translators as ts
import pandas as pd
import multiprocessing as mp
import nltk
from nltk.tokenize import sent_tokenize
from deep_translator import GoogleTranslator

nltk.download("punkt")

Using region District of Columbia server backend.

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ctoruno\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Defining parameters

In [25]:
compiler = "carlos"
batch    = 25

## Defining functions

In [3]:
def trans2english_batch(text, sourcelang):
    """
    This functions gathers a text in a specific language and it returns its equivalent 
    in English using the Google translation engine. 

    Parameters:
        text:       String. Text to translate.
        sourcelang: String. Code of the source language you want to translate the text from.
    """
    try:
        sentences = sent_tokenize(text)
        batch  = GoogleTranslator(source = sourcelang, target = "en").translate_batch(sentences)
        result = " ".join(batch)
        return result
    except Exception as e:
        out = f"Translation through API failed. Reason: {e}"
        return out

def parallel_translation(df):
    with mp.Pool(5) as pool:
        df[["title_trans", "description_trans", "content_trans"]] = pool.starmap(
            trans2english_batch,
            zip(
                df[["title", "description", "content"]].values.tolist(),
                df["language_id"].values.tolist()
            )
        )
    return df

## Reading the data

In [4]:
if os.getlogin() == "ctoruno":
    master_data = pd.read_parquet("../data/eu-news-batch-unique.parquet.gzip")
else:
    master_data = pd.read_parquet("eu-news-batch-unique.parquet.gzip")
master_data["compiler"].value_counts()

compiler
artha     5114
other     5107
dalia     5064
santi     5049
carlos    4894
Name: count, dtype: int64

## Subsetting the data

In [26]:
starting_row    = 100*(batch-1)
final_row       = starting_row+100
compiler_subset = master_data.loc[master_data["compiler"] == compiler].iloc[starting_row:final_row]

In [27]:
compiler_subset

Unnamed: 0,country,journal,asspillar,language,article_id,title,link,keywords,creator,video_url,description,content,pubDate,image_url,source_id,source_priority,category,language_id,compiler
15949,[finland],https://www.hs.fi/,open_government,finnish,021b546ae5418dea020cce9ad4d13450,Iskut Venäjälle | Moskovaan tehtiin taas lenno...,https://www.hs.fi/ulkomaat/art-2000009751343.html,[Ulkomaat],,,Ukrainan lennokki-iskut Venäjälle ovat lisäänt...,Venäjän pääkaupunkiin Moskovaan tehtiin varhai...,2023-07-30 06:14:00,https://hs.mediadelivery.fi/img/1440/317046584...,hs,139886.0,[top],fi,carlos
15950,[finland],https://www.hs.fi/,open_government,finnish,876a220e8f7131c2071802b72fa044bd,Twitter | Rap-artisti Kanye Westin tili palaut...,https://www.hs.fi/kulttuuri/art-2000009751295....,[Kulttuuri],,,Kanye Westin tili on aiemmin jäädytetty muun m...,"Sosiaalisen median alusta Twitter, nykyisin my...",2023-07-30 05:15:00,https://hs.mediadelivery.fi/img/1440/2b9487610...,hs,139886.0,[environment],fi,carlos
15958,[finland],https://www.hs.fi/,open_government,finnish,1a40f66c1f384e704cf390ab0e29440f,HS Turku | Näin neuvosto­diplomaatit saivat 19...,https://www.hs.fi/kotimaa/turku/art-2000009731...,[Minna Arve],,,,Venäjän pääkonsulaatin henkilökunnan vuosikymm...,2023-07-24 07:09:00,https://hs.mediadelivery.fi/img/1440/1d8334b25...,hs,139886.0,[top],fi,carlos
15964,[finland],https://www.hs.fi/,open_government,finnish,f93c3d1e6185a03889741e90596045c7,Musiikki | Kansalais­aktivistit vaativat Ramms...,https://www.hs.fi/kulttuuri/art-2000009711183....,[Kulttuuri],,,Ahdistelusta epäillyn Till Lindemannin asianaj...,Saksalainen kansalaisaktivistiliike vetoaa Ber...,2023-07-11 08:35:00,https://hs.mediadelivery.fi/img/1440/31348f4e8...,hs,139886.0,[environment],fi,carlos
15967,[finland],https://www.hs.fi/,open_government,finnish,47e667e092973a46b3fd916c4f93629d,Museot | Lapsille suunnattu museo poisti LGBTQ...,https://www.hs.fi/kulttuuri/art-2000009698998....,[Kulttuuri],,,Myös transsukupuolisuutta juhliva juliste pois...,Young V&A -museo eli Lontoon Victoria & Albert...,2023-07-05 12:02:00,https://hs.mediadelivery.fi/img/1440/b00ae7be7...,hs,139886.0,[environment],fi,carlos
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16601,[finland],https://www.is.fi/,fundamental_freedoms,finnish,70a43798b9583112dce3b18b66b54680,Kova ennustus vain pari viikkoa ennen Prigozhi...,https://www.is.fi/ulkomaat/art-2000009805097.html,[Ulkomaat],,,Bellingcatin Christo Grozev arvioi elokuun puo...,Tutkivan journalismin ryhmä Bellingcatin toimi...,2023-08-24 00:11:00,https://is.mediadelivery.fi/img/1440/c026aaff2...,is,55661.0,[top],fi,carlos
16634,[finland],https://www.is.fi/,order_and_security,finnish,f331698f8d0560a715c367882f98783f,Väki­joukko kähmii ja raahaa kahta naista Inti...,https://www.is.fi/ulkomaat/art-2000009732188.html,[Ulkomaat],,,Video kahteen naiseen kohdistuvasta seksuaalis...,Kahteen naiseen kohdistettu seksuaalinen väkiv...,2023-07-20 13:06:00,https://is.mediadelivery.fi/img/1440/b79e89aaa...,is,55661.0,[top],fi,carlos
16636,[finland],https://www.is.fi/,order_and_security,finnish,0fa25aabc906de9b0b4f711afd92ed4d,Joe Biden jyrähti suomalais­toimittajan kysymy...,https://www.is.fi/kotimaa/art-2000009718700.html,[Kotimaa],,,Joe Bidenia haastettiin Presidentinlinnassa si...,Yhdysvaltain presidentti Joe Biden joutui tasa...,2023-07-13 19:31:00,https://is.mediadelivery.fi/img/1440/bd08b9878...,is,55661.0,[top],fi,carlos
16653,[finland],https://www.is.fi/,civil_justice,finnish,dc3136f0eabbf9e84202798ceff34c45,83-vuotiaan Juhanin suojatti on noussut kuning...,https://www.is.fi/ravit/art-2000009656679.html,[Ravit],,,Ikämiehet Juhani Heikkinen ja Risto Airaksinen...,Suomenhevostammojen huipulle on noussut ilahdu...,2023-06-14 18:27:00,https://is.mediadelivery.fi/img/1440/bfe994e9a...,is,55661.0,[top],fi,carlos


## Translating headline, description, and content

In [32]:
compiler_subset[["title_trans", "description_trans", "content_trans"]] = compiler_subset.apply(
    lambda row: row[["title", "description", "content"]].apply(lambda x: trans2english_batch(text = x, sourcelang = row["language_id"])),
    axis = 1
)

In [34]:
setcols = ["country", "journal", "language", "title", "description", "content", "title_trans", "description_trans", "content_trans"]
print("==== Title ====")
print(compiler_subset.loc[compiler_subset.title_trans.str.contains("Translation through API failed") == True, setcols].value_counts("title_trans"))
print("==== Description ====")
print(compiler_subset.loc[compiler_subset.description_trans.str.contains("Translation through API failed") == True, setcols].value_counts("description_trans"))
print("==== Content ====")
print(compiler_subset.loc[compiler_subset.content_trans.str.contains("Translation through API failed") == True, setcols].value_counts("content_trans"))

==== Title ====
title_trans
Translation through API failed. Reason: HTTPSConnectionPool(host='translate.google.com', port=443): Max retries exceeded with url: /m?tl=en&sl=fi&q=Alkoholia+voi+ensi+vuonna+tilata+kotiinkuljetuksella+kaupasta%2C+ravintolasta+ja+Alkosta%2C+jos+hallituksen+tavoite+toteutuu (Caused by SSLError(SSLEOFError(8, '[SSL: UNEXPECTED_EOF_WHILE_READING] EOF occurred in violation of protocol (_ssl.c:1006)')))                                                              1
Translation through API failed. Reason: HTTPSConnectionPool(host='translate.google.com', port=443): Max retries exceeded with url: /m?tl=en&sl=fi&q=Bild%3A+Rammsteinin+toimistoon+hy%C3%B6k%C3%A4ttiin (Caused by SSLError(SSLEOFError(8, '[SSL: UNEXPECTED_EOF_WHILE_READING] EOF occurred in violation of protocol (_ssl.c:1006)')))                                                                                                                                       1
Translation through API failed. Reason: HTTP

In [35]:
compiler_subset.loc[compiler_subset.content_trans.str.contains("Translation through API failed") == True, setcols]

Unnamed: 0,country,journal,language,title,description,content,title_trans,description_trans,content_trans
15950,[finland],https://www.hs.fi/,finnish,Twitter | Rap-artisti Kanye Westin tili palaut...,Kanye Westin tili on aiemmin jäädytetty muun m...,"Sosiaalisen median alusta Twitter, nykyisin my...",Twitter | Rap artist Kanye West's account was ...,Kanye West's account has previously been suspe...,Translation through API failed. Reason: HTTPSC...
15958,[finland],https://www.hs.fi/,finnish,HS Turku | Näin neuvosto­diplomaatit saivat 19...,,Venäjän pääkonsulaatin henkilökunnan vuosikymm...,Translation through API failed. Reason: HTTPSC...,Translation through API failed. Reason: expect...,Translation through API failed. Reason: HTTPSC...
16001,[finland],https://www.hs.fi/,finnish,Turkin jätti-inflaatio | Inflaatio tekee arjes...,,Istanbulilainen yliopisto-opettaja Sevgi lukeu...,Turkey's giant inflation | Inflation makes eve...,Translation through API failed. Reason: expect...,Translation through API failed. Reason: HTTPSC...
16004,[finland],https://www.hs.fi/,finnish,Matkustus | Helsinki-Vantaan turva­tarkastus r...,"Finavia varoittaa, että koko syyskuu on hyvin ...",Ruuhkat Helsinki-Vantaan lentoasemalla jatkuva...,Travel | Helsinki-Vantaa's security check was ...,Finavia warns that the whole of September will...,Translation through API failed. Reason: HTTPSC...
16019,[finland],https://www.hs.fi/,finnish,Politiikka | ”Täydellinen demari” otti yhteen ...,,A ntti Lindtman valittiin perjantai-iltana Sdp...,"Politics | The ""perfect dem"" clashed with Sann...",Translation through API failed. Reason: expect...,Translation through API failed. Reason: HTTPSC...
16059,[finland],https://www.hs.fi/,finnish,HS-analyysi | Ruotsi on turvallisuus­poliittis...,,Tukholma Ruotsi on joutunut huonojen uutisten ...,HS Analysis | Sweden is in a security politica...,Translation through API failed. Reason: expect...,Translation through API failed. Reason: HTTPSC...
16067,[finland],https://www.hs.fi/,finnish,Muistokirjoitus | Tony Bennett oli viimeinen s...,Laulaja Tony Bennett kuoli 21. heinäkuuta koto...,Yhdysvaltalainen viihde- ja jazzlaulaja Tony B...,Obituary | Tony Bennett was the last great Ame...,Singer Tony Bennett died on July 21 at his hom...,Translation through API failed. Reason: HTTPSC...
16072,[finland],https://www.hs.fi/,finnish,Televisioarvio | Netflixin uusi hittisarja Lap...,Pätevä saksalainen jännityssarja kertoo kaappa...,Hannah näyttää saamaansa kirjaa valvontakamera...,Television Review | Netflix's new hit series L...,The competent German suspense series is about ...,Translation through API failed. Reason: HTTPSC...
16077,[finland],https://www.hs.fi/,finnish,Rakentaminen | ”Tilanne on katastrofaalinen” –...,Pääkaupunkiseudun salibandyseurat joutuvat pel...,Salibandyseura Esport Oilersin toiminnanjohtaj...,"Construction | ""The situation is catastrophic""...",Translation through API failed. Reason: HTTPSC...,Translation through API failed. Reason: HTTPSC...
16090,[finland],https://www.hs.fi/,finnish,Rakentaminen | Rakennusalan alho painaa nyt ko...,,"Muurari Ural Agejev napauttaa tiiltä, ja se ka...",Construction | Construction industry Alho is n...,Translation through API failed. Reason: expect...,Translation through API failed. Reason: HTTPSC...


In [24]:
if os.getlogin() == "ctoruno":
    compiler_subset.to_parquet(f"../data/translation_batches/EU_trdata_{compiler}_batch_{batch}.parquet.gzip", compression = "gzip")
else:
    compiler_subset.to_parquet(f"EU_trdata_{compiler}_batch_{batch}.parquet.gzip", compression = "gzip")

## Multiprocessing translation (not working)

In [11]:
# if __name__ == "__main__":
#     compiler_subset_par = parallel_translation(compiler_subset)

## Compiling batches

In [6]:
batches = os.listdir("../data/translation_batches")
batch_data = [pd.read_parquet(f"../data/translation_batches/{x}") for x in batches]
compiled_data = pd.concat(batch_data)
compiled_data.head(10)

Unnamed: 0,country,journal,asspillar,language,article_id,title,link,keywords,creator,video_url,...,pubDate,image_url,source_id,source_priority,category,language_id,compiler,title_trans,description_trans,content_trans
8,[austria],https://www.wienerzeitung.at/,constraints_of_government_powers,german,30c1ac3bca2d3737de521e67cfcaaf11,Queer statt Kaiser,https://www.wienerzeitung.at/a/queer-bad-ischl...,,,,...,2023-10-08 03:00:00,,wienerzeitung,2003474.0,[top],de,artha,Queer instead of emperor,Emperor Franz Joseph abdicates in 2024: Bad Is...,Bad Ischl would almost have remained one villa...
13,[austria],https://www.wienerzeitung.at/,constraints_of_government_powers,german,a07106f5aedf74895b36c832e50ba1ee,Das dreckige Geschäft mit der Reinigung,https://www.wienerzeitung.at/a/das-dreckige-ge...,,,,...,2023-10-02 12:58:39,,wienerzeitung,2003474.0,[top],de,artha,The dirty business of cleaning,Cleaning workers recruited via platforms such ...,Anuya* proudly takes a bunch of a dozen keys o...
18,[austria],https://www.wienerzeitung.at/,constraints_of_government_powers,german,0b0551c60900b9dbc09942d8dc74c1a4,Chronologie einer gescheiterten Rettung,https://www.wienerzeitung.at/nachrichten/polit...,,,,...,2023-06-29 10:00:00,https://www.wienerzeitung.at/_em_daten/_cache/...,wienerzeitung,2003474.0,[top],de,artha,Chronology of a failed rescue,The bad news comes suddenly and unexpectedly. ...,The bad news comes suddenly and unexpectedly. ...
23,[austria],https://www.wienerzeitung.at/,constraints_of_government_powers,german,dbccedd1a05373f358adf7de8aad3374,"Private bauen aus, Energieversorger zögern",https://www.wienerzeitung.at/nachrichten/wirts...,,,,...,2023-06-27 16:30:00,https://www.wienerzeitung.at/_em_daten/_cache/...,wienerzeitung,2003474.0,[top],de,artha,"Private companies are expanding, energy suppli...","More than 1,000 megawatts of photovoltaic capa...","More than 1,000 megawatts of photovoltaic capa..."
25,[austria],https://www.wienerzeitung.at/,constraints_of_government_powers,german,e1705dc80ca543412b88966f32c27db0,Fensterln für ein Bier,https://www.wienerzeitung.at/nachrichten/polit...,,,,...,2023-06-27 15:30:00,https://www.wienerzeitung.at/_em_daten/_cache/...,wienerzeitung,2003474.0,[top],de,artha,Sitting in the window for a beer,It's conspiratorial if you want to buy beer in...,It's conspiratorial if you want to buy beer in...
26,[austria],https://www.wienerzeitung.at/,constraints_of_government_powers,german,2b65437ffac3b7f7a3f258dbe0eed9ed,"""Es war unerträglich heiß und roch nach Blut""",https://www.wienerzeitung.at/nachrichten/polit...,,,,...,2023-06-27 14:55:33,https://www.wienerzeitung.at/_em_daten/_cache/...,wienerzeitung,2003474.0,[politics],de,artha,"""It was unbearably hot and smelled of blood""",He is the last survivor of those terrible even...,He is the last survivor of those terrible even...
31,[austria],https://www.wienerzeitung.at/,constraints_of_government_powers,german,819280cf97424f3467eff11713cc5d33,Time to Say Goodbye!,https://www.wienerzeitung.at/nachrichten/kultu...,,,,...,2023-06-27 06:00:00,https://www.wienerzeitung.at/_em_daten/_cache/...,wienerzeitung,2003474.0,[top],de,artha,Time to say goodbye!,Nineteen stacks. Nineteen stacks lie on the fl...,Nineteen stacks. Nineteen stacks lie on the fl...
33,[austria],https://www.wienerzeitung.at/,constraints_of_government_powers,german,a90c961e6b3816b4757bd03a5f165363,Ein Rückgang auf hohem Niveau,https://www.wienerzeitung.at/nachrichten/polit...,,,,...,2023-06-26 15:57:00,https://www.wienerzeitung.at/_em_daten/_cache/...,wienerzeitung,2003474.0,[top],de,artha,A decline at a high level,These are the same asylum numbers that the ÖVP...,These are the same asylum numbers that the ÖVP...
36,[austria],https://www.wienerzeitung.at/,constraints_of_government_powers,german,d086e5692cdab023b70eda07ace090cf,Dominanz auf allen Ebenen,https://www.wienerzeitung.at/nachrichten/polit...,,,,...,2023-06-24 16:00:00,https://www.wienerzeitung.at/_em_daten/_cache/...,wienerzeitung,2003474.0,[top],de,artha,Dominance on all levels,Greece is divided into 333 municipalities acro...,Greece is divided into 333 municipalities acro...
45,[austria],https://www.wienerzeitung.at/,constraints_of_government_powers,german,38827e8cc5e8634229f43f39abc3de0f,Kontroverse um Mitgefühl für Titan-Team,https://www.wienerzeitung.at/nachrichten/chron...,,,,...,2023-06-22 15:25:15,https://www.wienerzeitung.at/_em_daten/_cache/...,wienerzeitung,2003474.0,[top],de,artha,Titan Team Compassion Controversy,Do we feel more compassion for five men missin...,Do we feel more compassion for five men missin...


In [7]:
compiled_data.to_parquet("../data/eu-news-batch-unique-translated.parquet.gzip", compression = "gzip")