In [29]:
import tensorflow as tf

import io
import os
import unicodedata
import re
import mojimoji
from spacy.lang.ja import Japanese
from sklearn.model_selection import train_test_split

### Import dataset

In [2]:
def create_dataset(path, num_examples):
    lines = io.open(path, encoding='UTF-8').read().strip().split('\n')

    word_pairs = [[w for w in l.split('\t')]  for l in lines[:num_examples]]

    return zip(*word_pairs)

In [3]:
# move current directory to get access to dataset directory
benchmark_directory = os.getcwd()
os.chdir(os.path.join(benchmark_directory, './..'))

In [4]:
# start with 30000 for local training
en, jp = create_dataset('./datasets/jesc-corpus.txt', 30000)

### Preprocess Text

In [7]:
# Tokenize Japanese text (since Japanese doesn't naturally put spaces between words)

# reference: https://github.com/WorksApplications/SudachiPy
# Load SudachiPy with split mode B: "国家公務員" => ['国家', '公務員']
# default is split mode A: "国家公務員" => ['国家公務員']
# NOTE: this may be worth adjusting in future training
jcfg = {"split_mode": "B"}
j_tokenizer = Japanese(meta={"tokenizer": {"config": jcfg}})

def tokenize_jp_sentence(text):
    return " ".join([i.text for i in j_tokenizer(text)])

In [8]:
print(jp[0])
print(tokenize_jp_sentence(jp[0]))

あなたは戻ったのね ハロルド?
あなた は 戻っ た の ね ハロルド ?


In [9]:
# convert any half-width katakana to normal-width katakana using mojimoji library
def norm_kt(text):
    return mojimoji.han_to_zen(text)

In [10]:
print("ﾆｭｰﾗﾙﾈｯﾄﾜｰｸ: " + norm_kt("ﾆｭｰﾗﾙﾈｯﾄﾜｰｸ"))

ﾆｭｰﾗﾙﾈｯﾄﾜｰｸ: ニューラルネットワーク


In [11]:
# convert unicode to ascii
def jp_unicode_to_ascii(text):
    return ''.join(ascii_text for ascii_text in unicodedata.normalize('NFKD', text))

# remove any accented characters for English-language text
def en_unicode_to_ascii(text):
    return ''.join(ascii_text for ascii_text in unicodedata.normalize('NFKD', text)
                   .encode('ascii', 'ignore').decode('utf-8', 'ignore'))

In [12]:
print(en_unicode_to_ascii("It's in my résumé."))
print(jp_unicode_to_ascii("それは履歴書にあります。"))

It's in my resume.
それは履歴書にあります。


In [18]:
# keep only Kanji, Hiragana, Katakana, numerals, and common punctuation: ("。", "、", "?", "!", "！"))
def jp_preprocessing_and_spacing(text):
    text = re.sub(r"([。、?!！])", r" \1", text)
    pattern = r"[^\u3041-\u309F\u30A1-\u30FF\uFF66-\uFF9F\u4E00-\u9FD0\u309B\u3099\uFF9E\u309C\u309A\uFF9F?!！\s、。.,0-9]+"
    text = re.sub(pattern, '', text).rstrip().strip()

    # add spaces between words and punctuation
    text = re.sub(r'[" "]+', " ", text)
    # remove interpunct (黒丸)
    text = text.replace("・" , "")

    text = text.lower()

    return text

# remove special characters and place spaces between words and punctuation
def en_preprocessing_and_spacing(text):
    text = en_unicode_to_ascii(text.lower().strip())

    # Reference:- https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation
    text = re.sub(r"([?.!,])", r" \1 ", text)
    text = re.sub(r'[" "]+', " ", text)

    # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")
    text = re.sub(r"[^a-zA-Z?.!,]+", " ", text)
    text = text.strip()

    return text

In [19]:
print(en_preprocessing_and_spacing('Hello, email@world!'))
print(jp_preprocessing_and_spacing('こんにちは、エメール＠世界！'))

hello , email world !
こんにちは 、エメール世界 ！


### Normalize Text

In [20]:
# utilize preprocessing functions and mark start and end of sentences
def normalize_text(japanese_text, english_text):
    
    inputs = []
    targets = []
    
    for jp_text, en_text in zip(japanese_text, english_text):

        # normalize Japanese
        jp_text = jp_unicode_to_ascii(jp_text)
        jp_text = jp_preprocessing_and_spacing(jp_text)
        jp_text = tokenize_jp_sentence(jp_text)
        jp_text = norm_kt(jp_text)

        jp_text = "<start> " + jp_text + " <end>"
        
        inputs.append(jp_text)
        
        # normalize English
        en_text = en_unicode_to_ascii(en_text)
        en_text = en_preprocessing_and_spacing(en_text)

        en_text = "<start> " + en_text + " <end>"
        targets.append(en_text)

    return inputs, targets

In [21]:
inputs, targets = normalize_text(jp, en)

In [23]:
for i in range(5):
    print(inputs[i])
    print(targets[i])

<start> あなた　は　戻っ　た　の　ね　ハロルド　？ <end>
<start> you are back , aren t you , harold ? <end>
<start> 俺　の　相手　は　シャーク　だ　。 <end>
<start> my opponent is shark . <end>
<start> 引き換え　だ　ある　事　と　ある　物　の <end>
<start> this is one thing in exchange for another . <end>
<start> もう　いい　よ　ごちそう　さま　ううん <end>
<start> yeah , i m fine . <end>
<start> もう　会社　に　は　来　ない　で　くれ　電話　も　する　な <end>
<start> don t come to the office anymore . don t call me either . <end>


### Tokenize text

In [24]:
def tokenize(lang):
    # vectorize a text corpus
    lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(
        filters=' ')

    # updates internal vocabulary based on a corpus
    lang_tokenizer.fit_on_texts(lang)

    # Transforms each text in texts to a sequence of integers.
    tensor = lang_tokenizer.texts_to_sequences(lang)

    # Pads sequences to the same length.
    tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor,
                                                          padding='post')
    return tensor, lang_tokenizer

In [25]:
print(jp[9])
print(en[9])
tokenize([inputs[9], targets[9]])

カンパニーの元社員が
it seems a former employee...


(array([[2, 4, 3, 0, 0, 0, 0, 0, 0, 0],
        [2, 5, 6, 7, 8, 9, 1, 1, 1, 3]], dtype=int32),
 <keras_preprocessing.text.Tokenizer at 0x7faba2ee41d0>)

In [27]:
input_tensor, input_lang_tokenizer = tokenize(jp)
target_tensor, target_lang_tokenizer = tokenize(en)

### Create input and target datasets

In [54]:
input_tensor, input_lang_tokenize = tokenize(jp)
target_tensor, target_lang_tokenize = tokenize(en)

In [55]:
# Calculate max_length of the target tensors
max_length_targ, max_length_inp = target_tensor.shape[1], input_tensor.shape[1]

In [56]:
# Creating train-test-validation splits
# Reference: https://datascience.stackexchange.com/questions/15135/train-test-validation-set-splitting-in-sklearn
train_ratio = 0.75
validation_ratio = 0.15
test_ratio = 0.10

# train is 75% of the entire data set
input_tensor_train, input_tensor_test, target_tensor_train, target_tensor_test = train_test_split(input_tensor, target_tensor, test_size=1 - train_ratio, random_state=1)

# test is 10% of the initial data set
# validation is 15% of the initial data set
input_tensor_val, input_tensor_test, target_tensor_val, target_tensor_test = train_test_split(input_tensor_test, target_tensor_test, test_size=test_ratio/(test_ratio + validation_ratio)) 

# Show length
print(len(input_tensor_train), len(target_tensor_train), len(input_tensor_val), len(target_tensor_val), len(input_tensor_test), len(target_tensor_test))

22500 22500 4500 4500 3000 3000


In [57]:
def convert(lang, tensor):
  for t in tensor:
    if t!=0:
      print ("%d ----> %s" % (t, lang.index_word[t]))

In [58]:
print ("Input Language; index to word mapping")
convert(input_lang_tokenizer, input_tensor_train[1])
print ()
print ("Target Language; index to word mapping")
convert(target_lang_tokenizer, target_tensor_train[1])

Input Language; index to word mapping
7401 ----> 僕はただ...
37 ----> .

Target Language; index to word mapping
20 ----> it's
589 ----> just...
