### Creating a Seq2Seq LSTM Sequence-to-Sequence model for translating Japanese to English using Movie Subtitle Translations

In [1]:
import os
import json
import unicodedata
import re
import io
import time
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from nltk.translate.meteor_score import single_meteor_score
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore")

In [2]:
# get JSON file of sentence pairs
benchmark_directory = os.getcwd()
os.chdir(os.path.join(benchmark_directory, './..'))
sentences_file = open('./subtitle_corpus.json', 'r')
sentences_json = json.load(sentences_file)['translations']

In [59]:
# Converts the unicode file to ascii
def unicode_to_ascii(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn')

def preprocess_sentence(w):
    w = unicode_to_ascii(w.lower().strip())

    # creating a space between a word and the punctuation following it
    # eg: "he is a boy." => "he is a boy ."
    # References:
    #   https://stackoverflow.com/questions/36640587/how-to-remove-chinese-punctuation-in-python
    #   http://www.localizingjapan.com/blog/2012/01/20/regular-expressions-for-japanese-text/
    w = re.sub(r"([、。･.?!,])", r" \1 ", w)
    w = re.sub(r'[" "]+', " ", w)

    # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")
    w = re.sub(r"^[[一-龠ぁ-ゔァ-ヴーａ-ｚＡ-Ｚ０-９々〆〤。.?!,a-zA-Z]]+", " ", w)

    w = w.strip()

    # adding a start and an end token to the sentence
    # so that the model know when to start and stop predicting.
    w = '<start> ' + w + ' <end>'
    return w

In [60]:
example = sentences_json[8]['j']
converted_example = unicode_to_ascii(example)
processed = preprocess_sentence(converted_example)
print(example)
print(processed)

わぁ~! いつも すみません。 いいのよ~。
<start> わぁ~ ! いつも すみません 。 いいのよ~ 。 <end>


In [61]:
sentences_json[:10]

[{'e': "you are back, aren't you, harold?", 'j': 'あなたは戻ったのね ハロルド?'},
 {'e': 'my opponent is shark.', 'j': '俺の相手は シャークだ。'},
 {'e': 'this is one thing in exchange for another.', 'j': '引き換えだ ある事とある物の'},
 {'e': "yeah, i'm fine.", 'j': 'もういいよ ごちそうさま ううん'},
 {'e': "don't come to the office anymore. don't call me either.",
  'j': 'もう会社には来ないでくれ 電話もするな'},
 {'e': 'looks beautiful.', 'j': 'きれいだ。'},
 {'e': 'get him out of here, because i will fucking kill him.',
  'j': '連れて行け 殺しそうだ わかったか?'},
 {'e': 'you killed him!', 'j': '殺したのか!'},
 {'e': 'okay, then who?', 'j': 'わぁ~! いつも すみません。 いいのよ~。'},
 {'e': 'it seems a former employee...', 'j': 'カンパニーの元社員が'}]

In [62]:
# 1. Clean the sentences
# 2. Return word pairs in the format: [ENGLISH, JAPANESE]
def create_dataset(json, num_examples):
#     word_pairs = [
#                  preprocess_sentence(w) for w in l.split('\t') for l['e'] in json[:num_examples],
#                  preprocess_sentence(w) for w in l.split('\t') for l['j'] in json[:num_examples]
#                  ]
    word_pairs = [[preprocess_sentence(sentence['e']), preprocess_sentence(sentence['j'])]
                 for sentence in json[:num_examples]]

    return zip(*word_pairs)

In [67]:
test_sen = create_dataset(sentences_json, 10)
en, jp = test_sen
print(en[0])
print(jp[0])

<start> you are back , aren't you , harold ? <end>
<start> あなたは戻ったのね ハロルト ? <end>


In [68]:
def tokenize(lang):
    lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
    lang_tokenizer.fit_on_texts(lang)

    tensor = lang_tokenizer.texts_to_sequences(lang)

    tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor,
                                                         padding='post')

    return tensor, lang_tokenizer

In [69]:
def load_dataset(json, num_examples=None):
    # creating cleaned input, output pairs
    targ_lang, inp_lang = create_dataset(json, num_examples)

    input_tensor, inp_lang_tokenizer = tokenize(inp_lang)
    target_tensor, targ_lang_tokenizer = tokenize(targ_lang)

    return input_tensor, target_tensor, inp_lang_tokenizer, targ_lang_tokenizer

In [71]:
# Try experimenting with the size of that dataset
num_examples = 30000
input_tensor, target_tensor, inp_lang, targ_lang = load_dataset(sentences_json, num_examples)

# Calculate max_length of the target tensors
max_length_targ, max_length_inp = target_tensor.shape[1], input_tensor.shape[1]

In [72]:
# Creating training and validation sets using an 80-20 split
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(input_tensor, target_tensor, test_size=0.2)

# Show length
print(len(input_tensor_train), len(target_tensor_train), len(input_tensor_val), len(target_tensor_val))

NameError: name 'train_test_split' is not defined