In [1]:
import collections
import helper
import numpy as np
import project_tests as tests
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import GRU, Input, Dense, TimeDistributed, Activation, RepeatVector, Bidirectional
from keras.layers.embeddings import Embedding
from keras.optimizers import Adam
from keras.losses import sparse_categorical_crossentropy

Using TensorFlow backend.


In [2]:
english_sentences = helper.load_data('source_files/small_vocab_en.txt')
french_sentences = helper.load_data('source_files/small_vocab_fr.txt')
print('Dataset Loaded')

Dataset Loaded


In [3]:
for sample_i in range(2):
    print('small_vocab_en Line {}:  {}'.format(sample_i + 1, english_sentences[sample_i]))
    print('small_vocab_fr Line {}:  {}'.format(sample_i + 1, french_sentences[sample_i]))

small_vocab_en Line 1:  new jersey is sometimes quiet during autumn , and it is snowy in april .
small_vocab_fr Line 1:  new jersey est parfois calme pendant l' automne , et il est neigeux en avril .
small_vocab_en Line 2:  the united states is usually chilly during july , and it is usually freezing in november .
small_vocab_fr Line 2:  les états-unis est généralement froid en juillet , et il gèle habituellement en novembre .


In [4]:
english_words_counter = collections.Counter([word for sentence in english_sentences for word in sentence.split()])
french_words_counter = collections.Counter([word for sentence in french_sentences for word in sentence.split()])
print('{} English words.'.format(len([word for sentence in english_sentences for word in sentence.split()])))
print('{} unique English words.'.format(len(english_words_counter)))
print('10 Most common words in the English dataset:')
print('"' + '" "'.join(list(zip(*english_words_counter.most_common(10)))[0]) + '"')
print()
print('{} French words.'.format(len([word for sentence in french_sentences for word in sentence.split()])))
print('{} unique French words.'.format(len(french_words_counter)))
print('10 Most common words in the French dataset:')
print('"' + '" "'.join(list(zip(*french_words_counter.most_common(10)))[0]) + '"')

1823250 English words.
227 unique English words.
10 Most common words in the English dataset:
"is" "," "." "in" "it" "during" "the" "but" "and" "sometimes"

1961295 French words.
355 unique French words.
10 Most common words in the French dataset:
"est" "." "," "en" "il" "les" "mais" "et" "la" "parfois"


In [5]:
def tokenize(x):
    x_tk = Tokenizer(char_level = False)
    x_tk.fit_on_texts(x)
    return x_tk.texts_to_sequences(x), x_tk
text_sentences = [
    'The quick brown fox jumps over the lazy dog .',
    'By Jove , my quick study of lexicography won a prize .',
    'This is a short sentence .']
text_tokenized, text_tokenizer = tokenize(text_sentences)
print(text_tokenizer.word_index)
print()
for sample_i, (sent, token_sent) in enumerate(zip(text_sentences, text_tokenized)):
    print('Sequence {} in x'.format(sample_i + 1))
    print('  Input:  {}'.format(sent))
    print('  Output: {}'.format(token_sent))

{'the': 1, 'quick': 2, 'a': 3, 'brown': 4, 'fox': 5, 'jumps': 6, 'over': 7, 'lazy': 8, 'dog': 9, 'by': 10, 'jove': 11, 'my': 12, 'study': 13, 'of': 14, 'lexicography': 15, 'won': 16, 'prize': 17, 'this': 18, 'is': 19, 'short': 20, 'sentence': 21}

Sequence 1 in x
  Input:  The quick brown fox jumps over the lazy dog .
  Output: [1, 2, 4, 5, 6, 7, 1, 8, 9]
Sequence 2 in x
  Input:  By Jove , my quick study of lexicography won a prize .
  Output: [10, 11, 12, 2, 13, 14, 15, 16, 3, 17]
Sequence 3 in x
  Input:  This is a short sentence .
  Output: [18, 19, 3, 20, 21]


In [6]:
def pad(x, length=None):
    if length is None:
        length = max([len(sentence) for sentence in x])
    return pad_sequences(x, maxlen = length, padding = 'post')
tests.test_pad(pad)
# Pad Tokenized output
test_pad = pad(text_tokenized)
for sample_i, (token_sent, pad_sent) in enumerate(zip(text_tokenized, test_pad)):
    print('Sequence {} in x'.format(sample_i + 1))
    print('  Input:  {}'.format(np.array(token_sent)))
    print('  Output: {}'.format(pad_sent))

Sequence 1 in x
  Input:  [1 2 4 5 6 7 1 8 9]
  Output: [1 2 4 5 6 7 1 8 9 0]
Sequence 2 in x
  Input:  [10 11 12  2 13 14 15 16  3 17]
  Output: [10 11 12  2 13 14 15 16  3 17]
Sequence 3 in x
  Input:  [18 19  3 20 21]
  Output: [18 19  3 20 21  0  0  0  0  0]


In [7]:
def preprocess(x, y):
    preprocess_x, x_tk = tokenize(x)
    preprocess_y, y_tk = tokenize(y)
    preprocess_x = pad(preprocess_x)
    preprocess_y = pad(preprocess_y)
# Keras's sparse_categorical_crossentropy function requires the labels to be in 3 dimensions
    preprocess_y = preprocess_y.reshape(*preprocess_y.shape, 1)
    return preprocess_x, preprocess_y, x_tk, y_tk

preproc_english_sentences, preproc_french_sentences, english_tokenizer, french_tokenizer =\
    preprocess(english_sentences, french_sentences)
    
max_english_sequence_length = preproc_english_sentences.shape[1]
max_french_sequence_length = preproc_french_sentences.shape[1]
english_vocab_size = len(english_tokenizer.word_index)
french_vocab_size = len(french_tokenizer.word_index)
print('Data Preprocessed')
print("Max English sentence length:", max_english_sequence_length)
print("Max French sentence length:", max_french_sequence_length)
print("English vocabulary size:", english_vocab_size)
print("French vocabulary size:", french_vocab_size)

Data Preprocessed
Max English sentence length: 15
Max French sentence length: 21
English vocabulary size: 199
French vocabulary size: 344


In [8]:
def logits_to_text(logits, tokenizer):
    index_to_words = {id: word for word, id in tokenizer.word_index.items()}
    index_to_words[0] = '<PAD>'
    return ' '.join([index_to_words[prediction] for prediction in np.argmax(logits, 1)])
print('`logits_to_text` function loaded.')

`logits_to_text` function loaded.


In [None]:
import pandas as pd
def simple_model(input_shape, output_sequence_length, english_vocab_size, french_vocab_size):
    learning_rate = 1e-3
    input_seq = Input(input_shape[1:])
    rnn = GRU(64, return_sequences = True)(input_seq)
    logits = TimeDistributed(Dense(french_vocab_size))(rnn)
    model = Model(input_seq, Activation('softmax')(logits))
    model.compile(loss = sparse_categorical_crossentropy, 
                 optimizer = Adam(learning_rate), 
                 metrics = ['accuracy'])
    
    return model
tests.test_simple_model(simple_model)
tmp_x = pad(preproc_english_sentences, max_french_sequence_length)
print(tmp_x[0])
# print(tmp_x.shape)
tmp_x = tmp_x.reshape((-1, preproc_french_sentences.shape[-2], 1))

print(tmp_x.shape)
# Train the neural network
simple_rnn_model = simple_model(
    tmp_x.shape,
    max_french_sequence_length,
    english_vocab_size,
    french_vocab_size)

print(tmp_x[0])
# simple_rnn_model.fit(tmp_x, preproc_french_sentences, batch_size=1024, epochs=10, validation_split=0.2)
simple_rnn_model.fit(tmp_x, preproc_french_sentences, batch_size=1024, epochs=10)
# # Print prediction(s)
# print(logits_to_text(simple_rnn_model.predict(tmp_x[:1])[0], french_tokenizer))

[17 23  1  8 67  4 39  7  3  1 55  2 44  0  0  0  0  0  0  0  0]
(137861, 21, 1)
[[17]
 [23]
 [ 1]
 [ 8]
 [67]
 [ 4]
 [39]
 [ 7]
 [ 3]
 [ 1]
 [55]
 [ 2]
 [44]
 [ 0]
 [ 0]
 [ 0]
 [ 0]
 [ 0]
 [ 0]
 [ 0]
 [ 0]]
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Epoch 1/10


In [None]:
# import os
# import pickle

# import pandas as pd
# import numpy as np

# from utils import Logger
# # from translator import EnglishNovel, FrenchNovel
# from translator import FR_Minutes, EN_Minutes, ES_Minutes, PT_Minutes, IT_Minutes 

In [None]:
# log_path = '/Users/davidhaase/Documents/Flatiron/Projects/machine-translator/logs/'
# pickle_path = '/Users/davidhaase/Documents/Flatiron/Projects/machine-translator/pickles/'
# # path_to_en = '/Users/davidhaase/Documents/Flatiron/Projects/machine-translator/source_files/c1-EN.txt'
# # path_to_fr = '/Users/davidhaase/Documents/Flatiron/Projects/machine-translator/source_files/c1-FR.txt'
# path_to_minutes = '/Users/davidhaase/Documents/Flatiron/Projects/machine-translator/source_files/europarl_raw/ep-00-01-18'



In [None]:
# languages = [FR_Minutes, EN_Minutes, ES_Minutes, PT_Minutes, IT_Minutes]
  
# FR = FR_Minutes(path_to_minutes)
# EN = EN_Minutes(path_to_minutes)
# ES = ES_Minutes(path_to_minutes)
# PT = PT_Minutes(path_to_minutes)
# IT = IT_Minutes(path_to_minutes)
# minutes = {'Français': FR.sentences, 'English': EN.sentences, 'Italiano': IT.sentences, 'Portugês':PT.sentences, 'Español': ES.sentences}

# df = pd.DataFrame.from_dict(minutes, orient='index').T
# df.to_csv('output/languages.csv')
# EN = EnglishNovel(path_to_en)
# FR = EnglishNovel(path_to_fr)
# novels = [EN, FR]
# for novel in novels:
#     novel.clean()

In [None]:
# data_file = 'source_files/languages - languages.csv'
# df = pd.read_csv(data_file, usecols=['Français', 'English'])
# df.head()

In [None]:
# FR = EuroParl(path_to_fr)
# FR.get_sentences()

In [None]:
# for novel in novels:
#     print(len(novel.get_sentences()))
# for index, sent in enumerate(FR.get_sentences()[0:10]):
#     print(index, sent, '\n')

In [None]:
# en_df = pd.DataFrame.from_dict(EN.get_sentences())
# fr_df = pd.DataFrame.from_dict(FR.get_sentences())

In [None]:
# fr_df.head()