### The code in this notebook is borrowed from "TensorFlow Core" at: https://www.tensorflow.org/tutorials/text/nmt_with_attention  
### There may be some local change for study purposes.

In [1]:
from __future__ import absolute_import, division, print_function, unicode_literals

import tensorflow as tf

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

from sklearn.model_selection import train_test_split

import unicodedata
import re
import numpy as np
import os
import io
import time

In [49]:
# Download the file
path_to_zip = tf.keras.utils.get_file(
    fname = 'spa-eng.zip', 
    origin = 'http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip', 
    extract = True)
filepath = os.path.dirname(path_to_zip) + '/spa-eng/spa.txt'

In [50]:
def unicode_to_ascii(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')

def preprocess_sentence(w):
    w = unicode_to_ascii(w.lower().strip())
    
    # creating a space between a word and the punctuation following it
    # eg: "he is a boy." => "he is a boy ."
    # Reference:- https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation
    w = re.sub(r'([?.!,¿])', r' \1 ', w)
    w = re.sub(r'[""]', " ", w)
    
    # replace everything with space except (a-z, A-Z, ".", "?", "!", ",")
    w = re.sub(r'[^a-zA-Z?.,!¿]', ' ', w)
    
    w.rstrip().strip()
    
    # add a start and an end token to the sentence
    # so that the model know when to start and stop
    w = '<start> ' + w + ' <end>'
    return w

In [51]:
en_sentence = u'May I borrow your book?'
sp_sentence = u'¿Peudo tomar prestado este libro?'

print(preprocess_sentence(en_sentence))
print(preprocess_sentence(sp_sentence).encode('utf-8'))

<start> may i borrow your book ?  <end>
b'<start>  \xc2\xbf peudo tomar prestado este libro ?  <end>'


In [107]:
my = "nǐ céngjīng duì wǒ shuōguò"
print(preprocess_sentence(my))

<start> ni cengjing dui wo shuoguo <end>


In [64]:
# remove the accent & clean sentences & return word pairs [eng, spn]
def create_dataset(path, num_exmaples):
    lines = io.open(path, encoding='UTF-8').read().strip().split('\n')
    word_pairs = [[preprocess_sentence(w) for w in l.split('\t')] for l in lines[:num_exmaples]]
    return zip(*word_pairs)

In [104]:
en, sp = create_dataset(filepath2, None)
print(en[-1])
print(sp[-1])

ValueError: too many values to unpack (expected 2)

In [106]:
io.open(filepath2, encoding='UTF-8').read().strip().split('\n')

['Hi.\t嗨。\tCC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #891077 (Martha)',
 'Hi.\t你好。\tCC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #4857568 (musclegirlxyp)',
 'Run.\t你用跑的。\tCC-BY 2.0 (France) Attribution: tatoeba.org #4008918 (JSakuragi) & #3748344 (egg0073)',
 'Wait!\t等等！\tCC-BY 2.0 (France) Attribution: tatoeba.org #1744314 (belgavox) & #4970122 (wzhd)',
 'Hello!\t你好。\tCC-BY 2.0 (France) Attribution: tatoeba.org #373330 (CK) & #4857568 (musclegirlxyp)',
 'I try.\t让我来。\tCC-BY 2.0 (France) Attribution: tatoeba.org #20776 (CK) & #5092185 (mirrorvan)',
 'I won!\t我赢了。\tCC-BY 2.0 (France) Attribution: tatoeba.org #2005192 (CK) & #5102367 (mirrorvan)',
 'Oh no!\t不会吧。\tCC-BY 2.0 (France) Attribution: tatoeba.org #1299275 (CK) & #5092475 (mirrorvan)',
 'Cheers!\t乾杯!\tCC-BY 2.0 (France) Attribution: tatoeba.org #487006 (human600) & #765577 (Martha)',
 'Got it?\t你懂了吗？\tCC-BY 2.0 (France) Attribution: tatoeba.org #455353 (FeuDRenais) & #7768205 (jiangche)',
 'He ran.

In [68]:
def max_length(tensor):
    return max(len(t) for t in tensor)

In [69]:
def tokenize(lang):
    lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
    
    # get word to index dictionary for sequences
    lang_tokenizer.fit_on_texts(lang)
    tensor = lang_tokenizer.texts_to_sequences(lang)
    
    # pad converted sequences
    tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, padding='post')
    
    return tensor, lang_tokenizer

In [75]:
def load_dataset(path, num_examples=None):
    targ_lang, inp_lang = create_dataset(path, num_examples)
    
    inp_tensor, inp_token = tokenize(inp_lang)
    targ_tensor, targ_token = tokenize(targ_lang)
    return inp_tensor, targ_tensor, inp_token, targ_token

In [76]:
num_expl = 30000

In [102]:
filepath2 = os.getcwd() + '/data/cmn-eng/cmn.txt'

In [103]:
input_tensor, target_tensor, input_token, target_token = load_dataset(filepath2, num_expl)

ValueError: too many values to unpack (expected 2)

In [81]:
max_length_input = max_length(input_tensor)
max_length_target = max_length(target_tensor)

In [82]:
input_tensor_train, input_tensor_valid, target_tensor_train, target_tensor_valid = train_test_split(input_tensor, target_tensor, test_size=0.2)

print(len(input_tensor_train), len(input_tensor_valid), len(target_tensor_train), len(target_tensor_valid))

24000 6000 24000 6000


In [88]:
def convert(token, tensor):
    for t in tensor:
        if t != 0:
            print(f"{t} -----> {token.index_word[t]}")

In [91]:
print('Input index ------> input language')
print('==================================')
convert(input_token, input_tensor_train[0])
print()
convert(target_token, target_tensor_train[0])

Input index ------> input language
1 -----> <start>
133 -----> deja
11 -----> que
18 -----> lo
2412 -----> arregle
3 -----> .
2 -----> <end>

1 -----> <start>
46 -----> let
17 -----> me
583 -----> fix
20 -----> that
3 -----> .
2 -----> <end>


In [95]:
BUFFER_SIZE = len(input_tensor_train)
BATCH_SIZE = 64
steps_per_epoch = BUFFER_SIZE
embedding_dim = 256
units = 1024
vocab_input_size = len(input_token.word_index) + 1
vocab_target_size = len(target_token.word_index) + 1

dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

In [93]:
example_input_batch

<ShuffleDataset shapes: ((16,), (11,)), types: (tf.int32, tf.int32)>