In [13]:
import io
import re
from tensorflow import keras
from tensorflow.keras import layers
import math
from typing import Tuple
import string

In [14]:
with open('jpn.txt',encoding='UTF-8') as f:
    lines = f.read().split("\n")[:-1]
text_pairs=[]
for line in lines:
        en,jp=line.split("\t",1)
        jp = re.sub('\t' , "",jp)
        jp = re.sub('\W+' , "",jp)
        en = re.sub('[%s]' % re.escape(string.punctuation) , "",en)
        jp = "[start]" + jp + "[end]"
        text_pairs.append((en,jp))
      

In [15]:
import random
for i in range(3):
    print(random.choice(text_pairs))

('Snow has been falling steadily since this morning', '[start]朝から休みなく雪が降り続いている[end]')
('Tom soon adapted himself to school life', '[start]トムはすぐに学校に馴染んだ[end]')
('We dont talk a lot', '[start]そんなに話しないよ[end]')


In [16]:
random.shuffle(text_pairs)
totaldatalen = len(text_pairs)
num_val_samples = int(0.15*len(text_pairs))

In [17]:
train_samples = totaldatalen - 2 * num_val_samples
train_pairs = text_pairs[:train_samples]
val_pairs= text_pairs[train_samples:train_samples+ num_val_samples]
test_pairs= text_pairs[train_samples + num_val_samples:]

In [18]:
import tensorflow as tf
import string 
import re

In [19]:
from tensorflow.keras import layers
def custom_standarization(input_string):
    lowercase = tf.strings.lower(input_string)
    return lowercase
vocab_size = 15000
sequence_length=20
source_vectorization= layers.TextVectorization(
    max_tokens=vocab_size,
    output_mode = "int",
    output_sequence_length = sequence_length,
    standardize = custom_standarization,
)
target_vectorization= layers.TextVectorization(
    max_tokens=vocab_size,
    output_mode = "int",
    output_sequence_length = sequence_length+1,
    split = 'character'  
)
          

In [20]:
train_english_texts = [pair[0] for pair in train_pairs]
train_jp_texts = [pair[1] for pair in train_pairs]
source_vectorization.adapt(train_english_texts)
target_vectorization.adapt(train_jp_texts)

In [21]:
batch_size= 64
def format_dataset(eng,jp):
    eng = source_vectorization(eng)
    jp = target_vectorization(jp)
    return ({
        "english": eng,
        "japanese":jp[:,:-1],
    },jp[:,1:])
def make_dataset(pairs):
    eng_texts , jp_texts = zip(*pairs)
    eng_texts = list (eng_texts)
    jp_texts = list(jp_texts)
    dataset = tf.data.Dataset.from_tensor_slices((eng_texts,jp_texts))
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(format_dataset,num_parallel_calls=4)
    return dataset.shuffle(2048).prefetch(16).cache()
train_ds = make_dataset(train_pairs)
val_ds = make_dataset(val_pairs)

In [18]:
for inputs,targets in train_ds.take(1):
    print(f"inputs['english'].shape: {inputs['english'].shape}")
    print ({targets.shape})

inputs['english'].shape: (64, 20)
{TensorShape([64, 20])}


In [19]:
from tensorflow import keras
from tensorflow.keras import layers

In [20]:
embed_size = 256
latent_dim = 1024

In [23]:
from tensorflow import keras
from tensorflow.keras import layers

class TransformerEncoder(layers.Layer):
    def __init__(self,embed_size,dense_dim,num_heads,**kwargs):
        super().__init__(**kwargs)
        self.embed_size = embed_size
        self.dense_dim = dense_dim
        self.num_heads= num_heads
        self.attention = layers.MultiHeadAttention(num_heads=num_heads,key_dim=embed_size)
        self.dense_proj = keras.Sequential([layers.Dense(dense_dim,activation="relu"),layers.Dense(embed_size),])
        self.layernorm_1= layers.LayerNormalization()
        self.layernorm_2= layers.LayerNormalization()
    def call(self,inputs,mask=None):
        if mask is not None:
            mask = mask[:,tf.newaxis,:]
        attention_output= self.attention(inputs,inputs,attention_mask=mask)
        proj_input= self.layernorm_1(inputs+ attention_output)
        proj_output = self.dense_proj(proj_input)
        return  self.layernorm_2(proj_input + proj_output)
    def get_config(self):
        config = super().get_config()
        config.update({
            "embed_size": self.embed_size,
            "num_heads":  self.num_heads,
            "dense_dim": self.dense_dim,
        })
        return config
  
    

In [24]:
class PositionalEmbedding(layers.Layer):
    def __init__(self,sequence_length,input_dim,output_dim,**kwargs):
                    super().__init__(**kwargs)
                    self.token_embeddings = layers.Embedding(
                    input_dim=input_dim,output_dim=output_dim)
                    self.position_embeddings = layers.Embedding(
                    input_dim=sequence_length,output_dim=output_dim)
                    self.sequence_length=sequence_length
                    self.output_dim=output_dim
    def call(self,inputs):
            length= tf.shape(inputs)[-1]
            positions = tf.range(start=0,limit=length,delta=1)
            embedded_tokens = self.token_embeddings(inputs)
            embedded_positions = self.position_embeddings(positions)
            return embedded_tokens + embedded_positions
    def compute_mask(self,inputs,mask=None):
            return tf.math.not_equal(inputs,0)
    def get_config(self):
            config = super(PositionalEmbedding,self).get_config()
            config.update({
                       "output_dim":self.output_dim,
                       "sequence_length":self.sequence_length,
                       "input_dim":self.input_dim
                   })
            return config
            

In [25]:
class TransformerDecoder(layers.Layer):
    def __init__(self,embed_size,dense_dim,num_heads,**kwargs):
        super().__init__(**kwargs)
        self.embed_size = embed_size
        self.dense_dim = dense_dim
        self.num_heads= num_heads
        self.attention_1 = layers.MultiHeadAttention(num_heads=num_heads,key_dim=embed_size)
        self.attention_2 = layers.MultiHeadAttention(num_heads=num_heads,key_dim=embed_size)
        self.dense_proj = keras.Sequential([layers.Dense(dense_dim,activation="relu"),layers.Dense(embed_size),])
        self.layernorm_1= layers.LayerNormalization()
        self.layernorm_2= layers.LayerNormalization()
        self.layernorm_3= layers.LayerNormalization()
        self.supports_masking = True
    def get_config(self):
        config = super().get_config()
        config.update({
            "embed_size": self.embed_size,
            "num_heads":  self.num_heads,
            "dense_dim": self.dense_dim,
        })
        return config
    
    def get_causal_attention_mask(self,inputs):
        input_shape = tf.shape(inputs)
        batch_size,sequence_length = input_shape[0],input_shape[1]
        i = tf.range(sequence_length)[:,tf.newaxis]
        j = tf.range(sequence_length)
        mask= tf.cast(i >= j , dtype="int32")
        mask = tf.reshape(mask,(1,input_shape[1],input_shape[1]))
        mult = tf.concat(
            [tf.expand_dims(batch_size,-1),
                   tf.constant([1,1],dtype=tf.int32)],axis=0)
        return tf.tile(mask,mult)
    def call(self,inputs,encoder_outputs,mask=None):
        casual_mask = self.get_causal_attention_mask(inputs)
        if mask is not None:
            padding_mask = tf.cast(
            mask[:,tf.newaxis,:],dtype="int32")
            padding_mask= tf.minimum(padding_mask,casual_mask)
        attention_output_1 = self.attention_1(
        query=inputs,value=inputs,key=inputs,attention_mask=casual_mask)
        attention_output_1= self.layernorm_1(inputs+attention_output_1)
        attention_output_2= self.attention_2(
        query=attention_output_1,value=encoder_outputs,key=encoder_outputs,attention_mask=padding_mask)
        attention_output_2=self.layernorm_2(attention_output_1+attention_output_2)
        proj_output = self.dense_proj(attention_output_2)
        return self.layernorm_3(attention_output_2 + proj_output)


In [26]:
embed_size = 256
dense_dim = 2048
num_heads = 8
vocab_size = 15000
sequence_length=20
encoder_inputs = keras.Input(shape=(None,),dtype="int64",name="english")
x=PositionalEmbedding(sequence_length,vocab_size,embed_size)(encoder_inputs)
encoder_outputs = TransformerEncoder(embed_size,dense_dim,num_heads)(x)

decoder_inputs = keras.Input(shape=(None,),dtype="int64",name="japanese")
x=PositionalEmbedding(sequence_length,vocab_size,embed_size)(decoder_inputs)
x=TransformerDecoder(embed_size,dense_dim,num_heads)(x,encoder_outputs)
x= layers.Dropout(0.5)(x)
decoder_outputs = layers.Dense(vocab_size,activation="softmax")(x)
transformer= keras.Model([encoder_inputs,decoder_inputs],decoder_outputs)
    
    

In [27]:
transformer.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 english (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 japanese (InputLayer)          [(None, None)]       0           []                               
                                                                                                  
 positional_embedding (Position  (None, None, 256)   3845120     ['english[0][0]']                
 alEmbedding)                                                                                     
                                                                                                  
 positional_embedding_1 (Positi  (None, None, 256)   3845120     ['japanese[0][0]']           

In [28]:
transformer.compile(optimizer="rmsprop",loss="sparse_categorical_crossentropy",metrics=["accuracy"])
transformer.fit(train_ds,epochs=40,validation_data=val_ds)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<keras.callbacks.History at 0x2bace337a00>

In [30]:
import numpy as np
jp_vocab= target_vectorization.get_vocabulary()
jp_index_lookup = dict(zip(range(len(jp_vocab)),jp_vocab))
max_decoded = 20

def decode_sequence(input_sentence):
    tokenized_input_sentence = source_vectorization([input_sentence])
    decoded_sentence = "[start]"
    for i in range(max_decoded):
        tokenized_target_sentence = target_vectorization([decoded_sentence])[:,:-1]
        predictions = transformer([tokenized_input_sentence,tokenized_target_sentence])
        sampled_token_index = np.argmax(predictions[0,i,:])
        sampled_token = jp_index_lookup[sampled_token_index]
        decoded_sentence += " " + sampled_token
        if sampled_token == "[end]":
            break
    return re.sub('\W+' , "",decoded_sentence)
test_eng_texts= [pair[0] for pair in test_pairs]
for _ in range (20):
    input_sentence = random.choice(test_eng_texts)
    print("-")
    print(input_sentence)
    print(decode_sequence(input_sentence))
          

-
He has learned to be patient
starttart彼に彼彼でのはでをだはだなははだ
-
I have low blood pressure
starttart血血血血さでではししさししだがだ
-
I dont mind your staying here
starttartいだここでしししこしんしよしとし
-
Dont go near the water until you learn how to swim
starttart私んな理のんまましまますでしいし
-
That road is too narrow for a car to drive on
starttartそはあはのははで道でうで車でまで
-
Inhale and exhale
starttartやしやとしししではしすしてししが
-
She comes from a good family
starttart彼は彼はでなでで彼で女だよで女だ
-
He is working hard in order to pass the entrance examination
starttart彼の彼のにのの試試だのの入をはい
-
Dont waste time
starttart無は時もでしではさすしすすで間な
-
How old is this temple
starttartこしこははははeこでなでてででで
-
This book is too difficult for me to read
starttartこにこのをっじで私ででではさでで
-
The storm knocked out power
starttart嵐は嵐のははのさささにしなさでし
-
I think Tom is a very kind man
starttartトしトトにのさささだムだてだムだ
-
The goods were transported by ship
starttart料品料品でがでがあししししさでさ
-
Tom told me to shut the gate
starttartトトト理がにをを私をムししししし
-
Was Tom the one who broke the window
starttartトのトのでをさをトしムさ人eムだ
-
If I had time I wou

In [22]:
target_vectorization.get_vocabulary()

['',
 '[UNK]',
 't',
 'a',
 'd',
 'e',
 'n',
 's',
 'r',
 'い',
 'は',
 'た',
 'の',
 'な',
 'て',
 'に',
 'っ',
 'し',
 'を',
 'で',
 'か',
 'が',
 'る',
 'す',
 'と',
 'だ',
 'ま',
 'ん',
 'こ',
 'ト',
 'よ',
 'う',
 'く',
 'れ',
 'ム',
 'も',
 '彼',
 'ら',
 'り',
 'あ',
 'き',
 '私',
 'ー',
 'そ',
 'さ',
 'け',
 'ち',
 'ス',
 'ン',
 'お',
 'ど',
 '日',
 'つ',
 'せ',
 '女',
 'え',
 '行',
 'ア',
 'リ',
 '人',
 '何',
 'め',
 'わ',
 '見',
 '一',
 'ラ',
 'み',
 'ゃ',
 'ね',
 'メ',
 '時',
 '今',
 '本',
 'や',
 'ろ',
 '思',
 '分',
 '言',
 '話',
 'じ',
 '手',
 '事',
 '出',
 'ば',
 '間',
 '気',
 'べ',
 '食',
 '好',
 'ょ',
 'ル',
 '大',
 'イ',
 '語',
 '来',
 '子',
 'フ',
 'ッ',
 '家',
 '車',
 'ご',
 '前',
 '生',
 'ク',
 '中',
 '知',
 '１',
 '自',
 '年',
 'ド',
 'テ',
 '学',
 '誰',
 '上',
 '会',
 '僕',
 'レ',
 '方',
 '明',
 '君',
 'ほ',
 '入',
 'カ',
 'コ',
 'タ',
 'バ',
 '電',
 '当',
 '持',
 '物',
 'ぐ',
 'ボ',
 'げ',
 '聞',
 'ず',
 '合',
 '買',
 'パ',
 '０',
 '金',
 '仕',
 '部',
 '達',
 'ジ',
 '下',
 'チ',
 'へ',
 '書',
 '全',
 'キ',
 'ぎ',
 '少',
 '３',
 '理',
 '目',
 '屋',
 '着',
 '昨',
 '父',
 '要',
 '待',
 '後',
 'ビ',
 '２',
 'オ',
 'シ',
 