# Setting up and installing everything

In [None]:
%pip install tika==2.6.0

In [None]:
%pip install langdetect

In [None]:
%pip install mtdata

In [None]:
%pip install nlcodec

In [None]:
%pip install rtg

In [None]:
%%bash

# Run this in a terminal rather than in the notebook.
# Starts a container from the image below and publishes its port 6060 on the
# same host port; the container is removed again when it exits.
IMAGE=tgowda/rtg-model:500toEng-v1
docker run --rm --interactive --publish 6060:6060 "${IMAGE}"

In [1]:
import re
import textwrap

import pandas as pd
import tika
from tika import translate

In [16]:
# Quick check that translation works: send one short phrase, ask for English
# ('en') and print whatever comes back.
sample_request_options = {'timeout': 180}
translated = translate.auto_from_buffer(
    'Hola, amigo',
    'en',
    requestOptions=sample_request_options,
)
print(translated)

Hey, buddy.


# Trying with Pixstory data

In [4]:
# Load the CSV produced by the language-detection step.
langdetect_csv = 'pixstory_langdetect.csv'
pixstory = pd.read_csv(langdetect_csv)

In [5]:
# Columns removed before translation; everything else is kept.
columns_to_remove = [
    'Story Primary ID', 'User Primary ID', 'User ID', 'Gender', 'Age', 'Title',
    'Media', 'Account Created Date', 'median_activity', 'least_likely_activity',
    'ADI_age', 'DDI_age', 'ADDI_age', 'ADI_sex', 'DDI_sex', 'ADDI_sex',
    'Interest', 'sport_event', 'Festivals', 'GLAAD', 'ADL', 'sarc',
    'diagnosis_1', 'diagnosis_2', 'diagnosis_3', 'most_likely_activity',
]
pixstory = pixstory.drop(columns=columns_to_remove)

# Saved with the default settings, so the row index is written as the first
# (unnamed) CSV column.
pixstory.to_csv('pixstory_trans.csv')

In [19]:
# Reload the trimmed file and discard the 'Unnamed: 0' column (the unnamed
# first CSV column), then preview the first ten rows.
pixstory_df = pd.read_csv('pixstory_trans.csv').drop(columns=['Unnamed: 0'])
pixstory_df.head(10)

Unnamed: 0,Story ID,Narrative,tika_lan_code,ggl_lan_code
0,STY1659204380,Its too early to determine if the Kings will m...,en,en
1,STY1659204297,#milap #santosh #intercollegiate,it,it
2,STY1659204103,"Oggi come esattamente un anno fa, Jake Dennis ...",it,it
3,STY1659203785,"Pur senza la tanto temuta pioggia, le qualific...",it,it
4,STY1659202826,Manipuri Weighlifter Mirabai Chanu won India's...,en,en
5,STY1659201808,Happy Saturday!,ms,tl
6,STY1659200907,Choco lava cake a delicious indulgence for eve...,en,en
7,STY1659200567,Cristiano Ronaldo's surprise return to Sportin...,en,en
8,STY1659200539,"On Saturday, Punjab Chief Minister Bhagwant Ma...",en,en
9,STY1659200387,"“Djokovic non è una minaccia nazionale, è nell...",it,it


In [20]:
# Decide, for every narrative, whether it is English, unusable, or needs to be
# translated, and store the verdict in the 'Language' column.

# Characters treated as emoji / pictographs, which RTG cannot handle.
# Only dedicated symbol blocks are listed. One wide span such as
# U+263A..U+1F645 would also contain CJK ideographs, kana and Hangul (so
# Chinese, Japanese and Korean narratives would be rejected as "emoticons")
# while still missing many emoji above U+1F645 (e.g. U+1F64F, U+1F680).
# NOTE(review): extend the class if RTG turns out to choke on other symbols.
EMOJI_PATTERN = re.compile(
    '['
    '\u2600-\u27bf'          # Miscellaneous Symbols, Dingbats
    '\u2b00-\u2bff'          # Miscellaneous Symbols and Arrows (e.g. U+2B50)
    '\ufe0f'                 # emoji variation selector
    '\U0001f000-\U0001faff'  # tiles/cards, flags, emoticons, pictographs
    ']'
)


def _classify_language(lan1, lan2, narrative):
    """Return the 'Language' label for one row.

    Parameters
    ----------
    lan1, lan2 : language codes reported by the two detectors
        (columns 'tika_lan_code' and 'ggl_lan_code').
    narrative : the row's text; converted with str() before it is searched.

    Returns
    -------
    str
        'en'           - at least one detector reported English
        'undetectable' - at least one detector reported 'zz'
        'not match'    - the detectors disagree (this includes two missing
                         codes, because NaN != NaN)
        'emoticons'    - the text contains a character from EMOJI_PATTERN
        'OTHER'        - everything else; these rows get translated
    """
    if lan1 == 'en' or lan2 == 'en':
        return 'en'
    if lan1 == 'zz' or lan2 == 'zz':
        return 'undetectable'
    if lan1 != lan2:
        return 'not match'
    if EMOJI_PATTERN.search(str(narrative)):
        return 'emoticons'
    return 'OTHER'


pixstory_df['Language'] = [
    _classify_language(lan1, lan2, narrative)
    for lan1, lan2, narrative in zip(
        pixstory_df['tika_lan_code'],
        pixstory_df['ggl_lan_code'],
        pixstory_df['Narrative'],
    )
]

# English narratives need no translation: copy them to 'translated' as they are.
# The column is created up front so that it exists even when no row is English;
# values already stored for other rows are left untouched on a re-run.
if 'translated' not in pixstory_df.columns:
    pixstory_df['translated'] = None
is_english = pixstory_df['Language'] == 'en'
pixstory_df.loc[is_english, 'translated'] = pixstory_df.loc[is_english, 'Narrative']

In [21]:
# Write the rows labelled as English to their own file.
eng_df = pixstory_df[pixstory_df['Language'].eq('en')]
eng_df.to_csv('eng.csv')

In [8]:
# Write the rows labelled 'OTHER' (the ones that still need translating) to
# their own file.
needs_trans_df = pixstory_df[pixstory_df['Language'].eq('OTHER')]
needs_trans_df.to_csv('pixstory_need.csv')

In [17]:
# Read the to-be-translated rows back in, discard the 'Unnamed: 0' column
# (the unnamed first CSV column) and preview the first twenty rows.
df_need_trans = pd.read_csv('pixstory_need.csv')
df_need_trans = df_need_trans.drop(columns=['Unnamed: 0'])
df_need_trans.head(20)

Unnamed: 0,Story ID,Narrative,tika_lan_code,ggl_lan_code,Language,translated
0,STY1659204297,#milap #santosh #intercollegiate,it,it,OTHER,
1,STY1659204103,"Oggi come esattamente un anno fa, Jake Dennis ...",it,it,OTHER,
2,STY1659203785,"Pur senza la tanto temuta pioggia, le qualific...",it,it,OTHER,
3,STY1659200387,"“Djokovic non è una minaccia nazionale, è nell...",it,it,OTHER,
4,STY1659200328,Purtroppo gli esami effettuati in Germania han...,it,it,OTHER,
5,STY1659200277,L'Europeo femminile in svolgimento in Inghilte...,it,it,OTHER,
6,STY1659200206,"Kim Min-Jae, neo acquisto del Napoli, non è di...",it,it,OTHER,
7,STY1659200093,Ormai la trattativa è conclusa e Robert Lewand...,it,it,OTHER,
8,STY1659200045,Da un lato Sinner ha la possibilità di acceder...,it,it,OTHER,
9,STY1659199969,Novak Djokovic non sa ancora se potrà entrare ...,it,it,OTHER,


In [None]:
# Translate every narrative in df_need_trans to English.
#
# A narrative shorter than CHUNK_WIDTH characters is sent in a single request.
# Longer ones are first cut into pieces of at most CHUNK_WIDTH characters.
# textwrap.wrap cuts at whitespace where it can, so words are not split in half
# (text without spaces is still cut at the width limit), and the translated
# pieces are put back together with single spaces.
# A row whose request raises keeps its empty 'translated' value and is
# reported, so it can be retried later.
CHUNK_WIDTH = 50       # maximum number of characters per request
REQUEST_TIMEOUT = 120  # 'timeout' value handed to tika through requestOptions

# The column may be missing, or may have been read back from CSV as an
# all-NaN float column; make it an object column so it can hold strings.
if 'translated' not in df_need_trans.columns:
    df_need_trans['translated'] = None
df_need_trans['translated'] = df_need_trans['translated'].astype(object)

for i in df_need_trans.index:
    narrative = df_need_trans.loc[i, 'Narrative']
    if pd.isna(narrative):  # nothing to translate
        continue
    narrative = str(narrative)

    if len(narrative) < CHUNK_WIDTH:
        parts = [narrative]
    else:
        parts = textwrap.wrap(narrative, CHUNK_WIDTH)

    try:
        translated_parts = [
            translate.auto_from_buffer(part, 'en', requestOptions={'timeout': REQUEST_TIMEOUT})
            for part in parts
        ]
        df_need_trans.loc[i, 'translated'] = ' '.join(translated_parts)
    except Exception as e:
        # Best effort: report the row and carry on with the next one.
        print(f'row {i}: {e}')
        continue

In [None]:
# Retry pass for narratives that still have no translation.
# May have to be run several times, or with a larger timeout.
RETRY_CHUNK_WIDTH = 30  # smaller pieces than in the main pass
RETRY_TIMEOUT = 180     # 'timeout' value handed to tika through requestOptions

# The main translation pass stores its results in df_need_trans, whereas this
# cell works on pixstory_df. Copy the finished translations over first,
# matching rows on 'Story ID'; otherwise every non-English row looks "failed"
# here and is translated a second time.
if 'translated' not in pixstory_df.columns:
    pixstory_df['translated'] = None
if 'translated' in df_need_trans.columns:
    finished = (
        df_need_trans
        .dropna(subset=['translated'])
        .drop_duplicates(subset='Story ID')  # Series.map needs a unique index
        .set_index('Story ID')['translated']
    )
    pixstory_df['translated'] = pixstory_df['translated'].fillna(
        pixstory_df['Story ID'].map(finished)
    )

failed = pixstory_df[
    pixstory_df['translated'].isna() & pixstory_df['Language'].eq('OTHER')
].index

for i in failed:
    narrative = pixstory_df.loc[i, 'Narrative']
    if pd.isna(narrative):  # len() of a missing (float NaN) value would raise
        continue
    narrative = str(narrative)

    if len(narrative) < 5:
        # Such a short text cannot have failed because of its length;
        # keep it untranslated.
        pixstory_df.loc[i, 'translated'] = narrative
        continue

    try:
        # Pieces are cut at whitespace where possible so words stay intact.
        translated_parts = [
            translate.auto_from_buffer(part, 'en', requestOptions={'timeout': RETRY_TIMEOUT})
            for part in textwrap.wrap(narrative, RETRY_CHUNK_WIDTH)
        ]
        pixstory_df.loc[i, 'translated'] = ' '.join(translated_parts)
    except Exception as e:
        # Best effort: report the row; it stays empty and is picked up by the
        # next run of this cell.
        print(f'row {i}: {e}')

In [None]:
# Display the table without the helper 'Language' column.
# drop() returns a new frame; pixstory_df itself is left unchanged.
pixstory_df.drop('Language', axis='columns')