In [7]:
import tensorflow as tf

import io
import os
import unicodedata
import re
import mojimoji

### Import dataset

In [3]:
def create_dataset(path, num_examples):
    lines = io.open(path, encoding='UTF-8').read().strip().split('\n')

    word_pairs = [[w for w in l.split('\t')]  for l in lines[:num_examples]]

    return zip(*word_pairs)

In [None]:
# move current directory to get access to dataset directory
benchmark_directory = os.getcwd()
os.chdir(os.path.join(benchmark_directory, './..'))

In [4]:
# start with 30000 for local training
en, jp = create_dataset('./datasets/jesc-corpus.txt', 30000)

### Preprocess Text

In [8]:
# convert any half-width katakana to normal-width katakana using mojimoji library
def norm_kt(text):
    return mojimoji.han_to_zen(text)

In [10]:
print("ﾆｭｰﾗﾙﾈｯﾄﾜｰｸ: " + norm_kt("ﾆｭｰﾗﾙﾈｯﾄﾜｰｸ"))

ﾆｭｰﾗﾙﾈｯﾄﾜｰｸ: ニューラルネットワーク


In [14]:
# convert unicode to ascii
def jp_unicode_to_ascii(text):
    return ''.join(ascii_text for ascii_text in unicodedata.normalize('NFKD', text))

# remove any accented characters for English-language text
def en_unicode_to_ascii(text):
    return ''.join(ascii_text for ascii_text in unicodedata.normalize('NFKD', text)
                   .encode('ascii', 'ignore').decode('utf-8', 'ignore'))

In [15]:
print(en_unicode_to_ascii("It's in my résumé."))
print(jp_unicode_to_ascii("それは履歴書にあります。"))

It's in my resume.
それは履歴書にあります。


In [30]:
# keep only Kanji, Hiragana, Katakana, numerals, and common punctuation: ("。", "、", "?", "!", "！"))
def jp_preprocessing_and_spacing(text):
    text = re.sub(r"([。、?!！])", r" \1", text)
    pattern = r"[^\u3041-\u309F\u30A1-\u30FF\uFF66-\uFF9F\u4E00-\u9FD0\u309B\u3099\uFF9E\u309C\u309A\uFF9F?!！\s、。.,0-9]+"
    text = re.sub(pattern, '', text).rstrip().strip()

    # add spaces between words and punctuation
    text = re.sub(r'[" "]+', " ", text)
    # remove interpunct (黒丸)
    text = text.replace("・" , "")

    text = text.lower()

    return text

# remove special characters and place spaces between words and punctuation
def en_preprocessing_and_spacing(text):
    text = en_unicode_to_ascii(text.lower().strip())

    # Reference:- https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation
    text = re.sub(r"([?.!,])", r" \1 ", text)
    text = re.sub(r'[" "]+', " ", text)

    # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")
    text = re.sub(r"[^a-zA-Z?.!,]+", " ", text)
    text = text.strip()

    return text

In [31]:
print(en_preprocessing_and_spacing('Hello, email@world!'))
print(jp_preprocessing_and_spacing('こんにちは、エメール＠世界！'))

hello , email world !
こんにちは 、エメール世界 ！


### Normalize Text

In [37]:
# utilize preprocessing functions and mark start and end of sentences
def normalize_text(japanese_text, english_text):
    
    inputs = []
    targets = []
    
    for jp_text, en_text in zip(japanese_text, english_text):

        # normalize Japanese
        jp_text = jp_unicode_to_ascii(jp_text)
        jp_text = jp_preprocessing(jp_text)
        jp_text = norm_kt(jp_text)

        jp_text = "<start> " + jp_text + " <end>"
        
        inputs.append(jp_text)
        
        # normalize English
        en_text = en_unicode_to_ascii(en_text)
        en_text = en_preprocessing(en_text)

        en_text = "<start> " + en_text + " <end>"
        targets.append(en_text)

    return inputs, targets

In [38]:
inputs, targets = normalize_text(jp, en)

In [41]:
for i in range(10):
    print(inputs[i])
    print(targets[i])

<start> あなたは戻ったのね　ハロルド　？ <end>
<start> you are back , aren t you , harold ? <end>
<start> 俺の相手は　シャークだ　。 <end>
<start> my opponent is shark . <end>
<start> 引き換えだ　ある事とある物の <end>
<start> this is one thing in exchange for another . <end>
<start> もういいよ　ごちそうさま　ううん <end>
<start> yeah , i m fine . <end>
<start> もう会社には来ないでくれ　電話もするな <end>
<start> don t come to the office anymore . don t call me either . <end>
<start> きれいだ　。 <end>
<start> looks beautiful . <end>
<start> 連れて行け　殺しそうだ　わかったか　？ <end>
<start> get him out of here , because i will fucking kill him . <end>
<start> 殺したのか　！ <end>
<start> you killed him ! <end>
<start> わぁ　！　いつも　すみません　。　いいのよ　。 <end>
<start> okay , then who ? <end>
<start> カンパニーの元社員が <end>
<start> it seems a former employee . . . <end>
