### The code in this notebook is borrowed from "TensorFlow Core" at: https://www.tensorflow.org/tutorials/text/nmt_with_attention  
### There may be some local changes for study purposes.

In [17]:
from __future__ import absolute_import, division, print_function, unicode_literals

import tensorflow as tf

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

from sklearn.model_selection import train_test_split

from bs4 import BeautifulSoup
import requests

import unicodedata
import re
import numpy as np
import os
import io
import time

In [18]:
# Download the Spanish-English corpus archive and ask Keras to extract it.
path_to_zip = tf.keras.utils.get_file(
    fname='spa-eng.zip',
    origin='http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip',
    extract=True)

# The corpus file is expected under a 'spa-eng' directory beside the zip.
# Build the path with os.path.join rather than string concatenation so the
# separators are correct on every platform.
filepath = os.path.join(os.path.dirname(path_to_zip), 'spa-eng', 'spa.txt')

In [38]:

def unicode_to_ascii(s):
    """Return ``s`` with its combining marks removed.

    The string is decomposed with NFKD normalization and every character
    whose Unicode category is ``Mn`` (nonspacing mark) is dropped, so accented
    letters lose their accents. All other characters are kept, which means
    non-Latin text (e.g. CJK characters) is NOT converted to ASCII despite
    the function's name.
    """
    decomposed = unicodedata.normalize('NFKD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

def preprocess_sentence(w):
    """Normalize one sentence and wrap it in ``<start>`` / ``<end>`` tokens.

    Steps, in order:
      1. lowercase, strip, and remove accents via ``unicode_to_ascii``;
      2. surround each of ``? . ! , ¿`` with spaces;
      3. replace double-quote characters with a space;
      4. insert a space between adjacent non-ASCII letters (e.g. Chinese
         characters) so each becomes its own token;
      5. collapse runs of whitespace and strip the ends;
      6. prepend ``'<start> '`` and append ``' <end>'``.

    Args:
        w: the raw sentence (``str``).

    Returns:
        The cleaned sentence as a single space-separated string.
    """
    w = unicode_to_ascii(w.lower().strip())

    # creating a space between a word and the punctuation following it
    # eg: "he is a boy." => "he is a boy ."
    # Reference:- https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation
    w = re.sub(r'([?.!,¿])', r' \1 ', w)
    w = re.sub(r'[""]', " ", w)

    # add space between chinese characters without affecting english letters:
    # [^a-z\W\d_] matches a word character that is not a-z, a digit or "_",
    # and the zero-width match sits between two such characters.
    w = re.sub(r'(?<=[^a-z\W\d_])(?=[^a-z\W\d_])', ' ', w)

    # The padding above leaves doubled and trailing spaces. Strings are
    # immutable, so the cleaned value must be assigned back (the previous
    # bare `w.rstrip().strip()` discarded its result and had no effect).
    w = re.sub(r'\s+', ' ', w).strip()

    # add a start and an end token to the sentence
    # so that the model know when to start and stop
    w = '<start> ' + w + ' <end>'
    return w


#### Use this function once we have a tab-separated dataset file; for now, use the scraping function below
```python
# remove the accent & clean sentences & return word pairs [eng, spn]
def create_dataset(path, num_exmaples):
    """Read a tab-separated corpus file and preprocess every sentence.

    Args:
        path: path of a UTF-8 text file with one tab-separated sentence
            group per line.
        num_exmaples: number of leading lines to use; ``None`` uses all
            lines. (The parameter name is kept as-is for compatibility.)

    Returns:
        ``zip(*word_pairs)``: an iterator yielding one tuple per column of
        the file, each holding the preprocessed sentences of that column.
    """
    # The context manager closes the file even if reading or decoding fails.
    with io.open(path, encoding='UTF-8') as corpus:
        lines = corpus.read().strip().split('\n')
    word_pairs = [[preprocess_sentence(w) for w in l.split('\t')] for l in lines[:num_exmaples]]
    return zip(*word_pairs)
```

In [39]:
def create_dataset(url, timeout=30):
    """Scrape the lyric blocks from ``url`` and preprocess each of their lines.

    Args:
        url: address of the page to download.
        timeout: seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook.

    Returns:
        A list with one inner list per HTML element of class ``"ltf"``;
        each inner list holds the preprocessed lines of that element's text.
        Callers in this notebook unpack the result into two values, i.e.
        they assume the page has exactly two such elements.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of silently parsing an error page.
    response.raise_for_status()

    # Name the parser explicitly: otherwise BeautifulSoup picks whichever
    # parser is installed (and warns), which can vary between machines.
    soup = BeautifulSoup(response.text, 'html.parser')
    blocks = soup.find_all(class_="ltf")
    return [[preprocess_sentence(w) for w in block.get_text().split('\n')] for block in blocks]

In [40]:
# Page whose elements of class "ltf" are scraped by create_dataset().
url = "https://lyricstranslate.com/en/nǐ-zěnme-shuō-你怎么说-nǐ-zěnme-shuō.html"
aa, bb = create_dataset(url)

# Preview the last preprocessed line of each scraped block, one per line.
print(aa[-1], bb[-1], sep='\n')

<start> 把 我 的 爱 情 还 给 我 <end>
<start> ba wo de aiqing hai gei wo <end>


In [41]:
def max_length(tensor):
    """Return the length of the longest element of ``tensor``.

    ``tensor`` is any iterable of sized items. An empty iterable raises
    ``ValueError`` (from ``max``).
    """
    return max(map(len, tensor))

In [42]:
def tokenize(lang):
    """Fit a Keras tokenizer on ``lang`` and encode it as padded id sequences.

    Args:
        lang: the texts to fit the tokenizer on and to encode.

    Returns:
        ``(tensor, lang_tokenizer)``: the padded id sequences and the fitted
        tokenizer that produced them.
    """
    # filters='' turns off the tokenizer's character filtering, so markers
    # such as "<start>" and the separated punctuation are kept as tokens.
    lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
    lang_tokenizer.fit_on_texts(lang)

    # Map every text to its list of word ids, then pad at the end ('post')
    # so that all rows share one length.
    sequences = lang_tokenizer.texts_to_sequences(lang)
    padded = tf.keras.preprocessing.sequence.pad_sequences(sequences, padding='post')

    return padded, lang_tokenizer

In [43]:
def load_dataset(path, num_examples=None):
    """Build tokenized input/target tensors from the dataset at ``path``.

    Args:
        path: forwarded unchanged to ``create_dataset``.
        num_examples: optional cap on how many leading sentences of each
            language are used; ``None`` (the default) uses all of them.

    Returns:
        ``(inp_tensor, targ_tensor, inp_token, targ_token)``: the padded id
        sequences for both languages followed by their fitted tokenizers.
    """
    inp_lang, targ_lang = create_dataset(path)

    # Honor the optional cap; it was previously accepted but silently ignored.
    if num_examples is not None:
        inp_lang = inp_lang[:num_examples]
        targ_lang = targ_lang[:num_examples]

    inp_tensor, inp_token = tokenize(inp_lang)
    targ_tensor, targ_token = tokenize(targ_lang)
    return inp_tensor, targ_tensor, inp_token, targ_token

In [44]:
# Scrape, preprocess and tokenize both sides of the corpus in one call.
(input_tensor,
 target_tensor,
 input_token,
 target_token) = load_dataset(url)

In [45]:
# Longest row on each side of the corpus (rows were padded in tokenize()).
max_length_input, max_length_target = max_length(input_tensor), max_length(target_tensor)

In [46]:
# Hold out 20% of the sentence pairs for validation. random_state is fixed so
# the split (and everything derived from it) is the same on every run.
input_tensor_train, input_tensor_valid, target_tensor_train, target_tensor_valid = train_test_split(
    input_tensor, target_tensor, test_size=0.2, random_state=42)

print(len(input_tensor_train), len(input_tensor_valid), len(target_tensor_train), len(target_tensor_valid))

15 4 15 4


In [47]:
def convert(token, tensor):
    """Print ``"<id> -----> <word>"`` for every non-zero id in ``tensor``.

    Args:
        token: object whose ``index_word`` mapping translates ids to words.
        tensor: iterable of ids; entries equal to 0 are skipped.
    """
    for idx in tensor:
        if idx == 0:
            continue
        word = token.index_word[idx]
        print(f"{idx} -----> {word}")

In [48]:
# Header, then the id -> word mapping of the first training pair:
# input sentence first, a blank line, then the target sentence.
print('Input index ------> input language', '==================================', sep='\n')
convert(input_token, input_tensor_train[0])
print()
convert(target_token, target_tensor_train[0])

Input index ------> input language
1 -----> <start>
42 -----> 连
43 -----> 名
44 -----> 字
4 -----> 你
14 -----> 都
6 -----> 说
45 -----> 错
2 -----> <end>

1 -----> <start>
33 -----> lian
34 -----> mingzi
4 -----> ni
8 -----> dou
9 -----> shuo
35 -----> cuo
2 -----> <end>


In [36]:
# Shuffle buffer covers the whole training set.
BUFFER_SIZE = len(input_tensor_train)
BATCH_SIZE = 4
# Number of full batches per epoch. The dataset below is batched with
# drop_remainder=True, so an epoch yields BUFFER_SIZE // BATCH_SIZE batches,
# not BUFFER_SIZE of them.
steps_per_epoch = BUFFER_SIZE // BATCH_SIZE
embedding_dim = 256
units = 1024
# +1 presumably accounts for id 0, which convert() treats as padding.
vocab_input_size = len(input_token.word_index) + 1
vocab_target_size = len(target_token.word_index) + 1

dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

In [37]:
# Pull a single batch from the dataset and display the shapes of its
# input and target halves.
_batch_iter = iter(dataset)
example_input_batch, example_target_batch = next(_batch_iter)
(example_input_batch.shape, example_target_batch.shape)

(TensorShape([4, 18]), TensorShape([4, 16]))