# Translation script for generating translated data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import string
import re
import nltk
import json

from itertools import chain


from nltk.collocations import *
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import MWETokenizer
from nltk.stem import PorterStemmer
from nltk.probability import *
from nltk import word_tokenize, pos_tag
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import f1_score, confusion_matrix, classification_report, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

## Load the data

In [None]:
# Load the raw training documents and their labels (in that order).
trainData, trainLabel = map(pd.read_csv, (
    './Training Dataset-20190429/train_data.csv',
    './Training Dataset-20190429/train_label.csv',
))

## Translating non-English text into English

In [1]:
# Path to the Google Cloud service-account key used by the translation client.
# The variable is set before the client library is imported/used below.
# NOTE(review): this is an absolute, machine-specific path and it overwrites
# any GOOGLE_APPLICATION_CREDENTIALS already present in the environment --
# confirm this is intended before running on another machine.
import os
os.environ["GOOGLE_APPLICATION_CREDENTIALS"]="/Users/dt/Documents/key.json"

# `translate` provides the translation client; `detect` guesses the language of a string.
from google.cloud import translate
from langdetect import detect


# Translate a text into the target language (English by default)
def translate_text(text, target = 'en', client = None):
    """Translate ``text`` into ``target`` and return the translated string.

    The input, the translation and the detected source language are printed
    for every call.

    Parameters
    ----------
    text : str
        Text to translate. The response is indexed as a single mapping, so a
        single string is expected here.
    target : str, optional
        Target language code passed as ``target_language`` (default ``'en'``).
    client : object, optional
        Object exposing ``translate(text, target_language=...)``. When omitted,
        one ``translate.Client()`` is created on first use and reused by all
        later calls.

    Returns
    -------
    str
        The ``'translatedText'`` entry of the client's response.
    """
    if client is None:
        # This function is called once per non-English row, so keep a single
        # client on the function object instead of constructing one per call.
        client = getattr(translate_text, "_client", None)
        if client is None:
            client = translate.Client()
            translate_text._client = client

    translation = client.translate(
        text,
        target_language=target
    )
    print('Text: ', translation['input'] )
    print('Translation: ', translation['translatedText'])
    print('Detected source: ', translation['detectedSourceLanguage'])
    return translation['translatedText']


# Detect the language of each text
def get_language_id(dataDf):
    """Return a new DataFrame with the detected language of every text.

    Parameters
    ----------
    dataDf : pandas.DataFrame
        Must contain the columns ``"trn_id"`` and ``"text"``.

    Returns
    -------
    pandas.DataFrame
        Columns ``"trn_id"``, ``"text"`` and ``"lang"``, one row per input row
        in the same order, with a fresh default index. ``"lang"`` holds the
        value returned by ``detect`` or the string ``'error'`` when detection
        raised.
    """
    lang = []
    for text in dataDf["text"]:
        try:
            lang.append(detect(text))
        except Exception:
            # Best effort: a text that cannot be classified is tagged 'error'.
            # Catching Exception (not a bare except) lets KeyboardInterrupt /
            # SystemExit stop a long run.
            lang.append('error')

    # Build from raw values so the input index is dropped and each column
    # keeps its own dtype.
    return pd.DataFrame({
        "trn_id": dataDf["trn_id"].values,
        "text": dataDf["text"].values,
        "lang": lang,
    })

# Translate every non-English text, keeping the original text next to its translation
def translate_non_eng_dic(dataLangDf):
    """Translate the rows whose ``"lang"`` is not ``"en"``.

    Parameters
    ----------
    dataLangDf : pandas.DataFrame
        Must contain the columns ``"text"`` and ``"lang"``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``dataLangDf`` in which ``"text"`` is renamed to
        ``"trans_text"`` (translated via ``translate_text`` for every row whose
        language is not ``"en"``, unchanged otherwise) and a new column
        ``"org_text"`` holds the original text.
    """
    translated = dataLangDf.copy()

    # Index labels of every row not tagged as English (this includes rows
    # tagged with any other value, e.g. a detection failure marker).
    nonEnglishIdx = dataLangDf.index[dataLangDf["lang"] != "en"]
    for idx in nonEnglishIdx:
        translated.loc[idx, "text"] = translate_text(dataLangDf.loc[idx, "text"])

    translated["org_text"] = dataLangDf["text"]
    return translated.rename(columns={"text": "trans_text"})

In [None]:
def write_csvfiles_for_translated_data(data, recLen = 100000):
    """Translate ``data`` segment by segment and write one CSV file per segment.

    Each segment of at most ``recLen`` rows is passed through
    ``get_language_id`` and ``translate_non_eng_dic`` and written to
    ``trainDaTrans<start>to<end>.csv`` in the current working directory, where
    ``<start>``/``<end>`` are the row positions of the segment. The last
    segment is shorter when ``len(data)`` is not a multiple of ``recLen``;
    an input shorter than ``recLen`` produces a single file.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to translate; must provide what ``get_language_id`` reads.
    recLen : int, optional
        Maximum number of rows per output file (default 100000).

    Raises
    ------
    ValueError
        If ``recLen`` is not positive.
    """
    if recLen <= 0:
        raise ValueError("recLen must be a positive integer")

    totalRecLen = len(data)

    for segRecLenL in range(0, totalRecLen, recLen):
        # Clamp the right edge so the final (possibly short) segment is kept.
        segRecLenR = min(segRecLenL + recLen, totalRecLen)
        segData = data.iloc[segRecLenL:segRecLenR]
        # detect the language of every text in the segment
        segDataLang = get_language_id(segData)
        # translate the non-English texts, keeping the originals alongside
        segDataLangTrans = translate_non_eng_dic(segDataLang)
        fileName = "trainDaTrans" + str(segRecLenL) + "to"  + str(segRecLenR) +  ".csv"
        # write dataframe into csv file
        segDataLangTrans.to_csv(fileName)
            

In [None]:
def union_transfiles_dataframe(segRecLenL = 0, segRecLenR = 100000, totalRecLen = 650000):
    """Read the per-segment translation CSV files and stack them into one frame.

    Files are named ``trainDaTrans<start>to<end>.csv`` and are read from the
    current working directory. Segments start at ``segRecLenL``, each spans
    ``segRecLenR - segRecLenL`` rows, and the last one is clamped to
    ``totalRecLen``.

    Parameters
    ----------
    segRecLenL : int, optional
        Start position of the first segment (default 0).
    segRecLenR : int, optional
        End position of the first segment (default 100000).
    totalRecLen : int, optional
        Total number of records covered by the files (default 650000).

    Returns
    -------
    pandas.DataFrame
        All segments concatenated in order with a fresh 0..n-1 index; an empty
        DataFrame when there is no segment to read.

    Raises
    ------
    ValueError
        If ``segRecLenR`` is not greater than ``segRecLenL``.
    """
    segRecLen = segRecLenR - segRecLenL
    if segRecLen <= 0:
        raise ValueError("segRecLenR must be greater than segRecLenL")

    frames = []
    for left in range(segRecLenL, totalRecLen, segRecLen):
        # Clamp the right edge so a short final segment is still read.
        right = min(left + segRecLen, totalRecLen)
        fileName = "trainDaTrans" + str(left) + "to"  + str(right) +  ".csv"
        # The first CSV column is the index written by DataFrame.to_csv.
        frames.append(pd.read_csv(fileName, index_col=[0]))

    if not frames:
        return pd.DataFrame()
    # Concatenate once rather than growing a frame inside the loop.
    return pd.concat(frames, ignore_index=True)

In [None]:
def write_dic_into_json(documentDic, fileName):
    """Serialize ``documentDic`` as JSON into ``<fileName>.json``.

    Parameters
    ----------
    documentDic : dict
        JSON-serializable mapping to write.
    fileName : str
        Output path without extension; ``".json"`` is appended.
    """
    fileName = fileName + ".json"
    # The with-block closes the file; no explicit close is needed afterwards.
    with open(fileName, 'w') as outfile:
        json.dump(documentDic, outfile)

In [None]:
# One-off step: translate the raw data and write the per-segment CSV files.
# write_csvfiles_for_translated_data(trainData)

# Load the previously written translation segments back into one frame.
trainDataTrans = union_transfiles_dataframe()

# Rows whose original language was not English, reduced to trn_id plus the
# translated text (exposed under the column name "text").
trainDataTransNoteng = (
    trainDataTrans.loc[trainDataTrans["lang"] != 'en', ["trn_id", "trans_text"]]
    .rename(columns={"trans_text": "text"})
)

# Run language detection again on the translated text; any trn_id whose
# translation is still not detected as English goes on the drop list.
langDf = get_language_id(trainDataTransNoteng)
dropList = list(langDf.loc[langDf["lang"] != 'en', "trn_id"].values)

# Final frame: trn_id and translated text (as "text") for every document
# that is not on the drop list.
trainDataTransReshape = (
    trainDataTrans.loc[~trainDataTrans["trn_id"].isin(dropList), ["trn_id", "trans_text"]]
    .rename(columns={"trans_text": "text"})
)

In [None]:
# Map every trn_id to its text. Pairing the two columns directly avoids
# transposing the frame into one column per document and then unwrapping
# one-element lists; for a repeated trn_id the last row wins.
trDataDic = dict(zip(trainDataTransReshape["trn_id"], trainDataTransReshape["text"]))

# write the dictionary into json file
# write_dic_into_json(trDataDic, "trDataDic")