In [1]:
import mysql.connector
import numpy as np
import pandas as pd
import string
import re
from pickle import dump
from unicodedata import normalize
from numpy import array
from pickle import load
from numpy.random import shuffle

In [2]:
def load_data(lang_key = 'EN'):
    db_writer = mysql.connector.connect(
        host="35.232.80.174",
        database="masters",
        user="root",
        passwd="MBBmasters!")

    query = "SELECT a.text as EN, b.text as PT FROM masters.Translations left join masters.Sentences a on a.sentence_id=Translations.sentence_id_1 left join masters.Sentences b on b.sentence_id=Translations.sentence_id_2 where Translations.sentence_id_1 in (select sentence_id from masters.Sentences where language_key='EN') AND trim(b.language_key)='PT'"     
    cursor = db_writer.cursor()
    cursor.execute(query)

    sql_text_data = pd.DataFrame(cursor.fetchall())
    sql_text_data.columns = cursor.column_names

#     data = cursor.fetchall()
#     data = list(data)
    # Close the session
    db_writer.close()

    # Show the data
    print(sql_text_data.head())
    return sql_text_data

In [3]:
sentences_df = load_data()

                        EN                          PT
0  I have lost my passport     Eu perdi meu passaporte
1   Someone stole my money  Alguém roubou meu dinheiro
2                     Help                     Socorro
3      May I have the bill         Pode trazer a conta
4     I would like dessert    Eu gostaria de sobremesa


In [4]:
def remove_punctuations(text):
    for punctuation in string.punctuation:
        text = text.strip().lower()
        text = text.replace(punctuation, '')
    return text


In [5]:
# clean a list of lines
def clean_pairs(lines):
    cleaned = list()
    # prepare regex for char filtering
    re_print = re.compile('[^%s]' % re.escape(string.printable))
    # prepare translation table for removing punctuation
    table = str.maketrans('', '', string.punctuation)
    for pair in lines:
        clean_pair = list()
        for line in pair:
            # normalize unicode characters
            line = normalize('NFD', line).encode('ascii', 'ignore')
            line = line.decode('UTF-8')
            # tokenize on white space
            line = line.split()
            # convert to lowercase
            line = [word.lower() for word in line]
            # remove punctuation from each token
            line = [word.translate(table) for word in line]
            # remove non-printable chars form each token
            line = [re_print.sub('', w) for w in line]
            # remove tokens with numbers in them
            line = [word for word in line if word.isalpha()]
            # store as string
            clean_pair.append(' '.join(line))
        cleaned.append(clean_pair)
    return array(cleaned)

In [6]:
# save a list of clean sentences to file
def save_clean_data(sentences, filename):
    dump(sentences, open(filename, 'wb'))
    print('Saved: %s' % filename)

In [7]:
#Test data
arr = clean_pairs(array(sentences_df))
df = pd.DataFrame(arr)
df = df.rename(index=str, columns={0:'EN',1:"PT"})

df.head()

Unnamed: 0,EN,PT
0,i have lost my passport,eu perdi meu passaporte
1,someone stole my money,alguem roubou meu dinheiro
2,help,socorro
3,may i have the bill,pode trazer a conta
4,i would like dessert,eu gostaria de sobremesa


In [8]:
save_clean_data(df, "../data/en_pt_test.pkl")

Saved: ../data/en_pt_test.pkl


#### Training data (from por.txt) (Data downloaded from http://www.manythings.org/anki/)

In [9]:
# split a loaded document into sentences
def to_pairs(doc):
    lines = doc.strip().split('\n')
    pairs = [line.split('\t') for line in  lines]
    return pairs

In [10]:
#load doc into memory
def load_doc(filename):
    file = open(filename, mode='rt', encoding='utf8')
    text = file.read()
    file.close()
    return text

In [11]:
# load training dataset
filename = '../Test/por-eng/mini_por.txt'

doc = load_doc(filename)
# split into english-german pairs
pairs = to_pairs(doc)
# clean sentences
clean_data = clean_pairs(pairs)
# save clean pairs to file
save_clean_data(clean_data, '../data/english-portuguese_training.pkl')
# spot check
for i in range(100):
    print('[%s] => [%s]' % (clean_data[i,0], clean_data[i,1]))

Saved: ../data/english-portuguese_training.pkl
[go] => [vai]
[go] => [va]
[hi] => [oi]
[run] => [corre]
[run] => [corra]
[run] => [corram]
[run] => [corre]
[run] => [corra]
[run] => [corram]
[who] => [quem]
[wow] => [uau]
[wow] => [nossa]
[wow] => [wow]
[fire] => [fogo]
[help] => [ajuda]
[help] => [socorro]
[jump] => [pule]
[jump] => [pulem]
[jump] => [pule]
[stop] => [pare]
[stop] => [parem]
[wait] => [espere]
[wait] => [espere]
[wait] => [esperem]
[go on] => [va]
[hello] => [oi]
[hello] => [alo]
[hello] => [ola]
[i ran] => [eu corri]
[i see] => [estou vendo]
[i try] => [eu tento]
[i try] => [tento]
[i won] => [ganhei]
[i won] => [eu venci]
[oh no] => [ah nao]
[relax] => [relaxe]
[relax] => [relaxa]
[smile] => [sorria]
[smile] => [sorriam]
[attack] => [atacar]
[attack] => [ataquem]
[attack] => [ataque]
[cheers] => [saude]
[get up] => [levantese]
[get up] => [levantemse]
[get up] => [levantate]
[get up] => [levantese]
[get up] => [levantate]
[go now] => [va agora]
[got it] => [entendi]

In [12]:
clean_data.shape

(5001, 2)

In [13]:
# reduce dataset size
#n_sentences = 10000
#ds = clean_data[:n_sentences, :]
# random shuffle
shuffle(clean_data)
# split into train/test
train, test = clean_data[:100000], clean_data[100000:]
# save
save_clean_data(clean_data, '../data/english-portuguese-training-both.pkl')
save_clean_data(train, '../data/english-portuguese-training.pkl')
save_clean_data(test, '../data/english-portuguese-validation.pkl')

Saved: ../data/english-portuguese-training-both.pkl
Saved: ../data/english-portuguese-training.pkl
Saved: ../data/english-portuguese-validation.pkl


In [14]:
# load a clean dataset
def load_clean_sentences(filename):
    return load(open(filename, 'rb'))

In [15]:
print(train.shape, test.shape)

(5001, 2) (0, 2)
