In [1]:
import tensorflow as tf
import tensorflow.keras as keras
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from keras.utils.vis_utils import plot_model
import numpy as np
import codecs
from keras.callbacks import ModelCheckpoint
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [3]:
import warnings
warnings.filterwarnings('ignore')

# **Read Dataset**

In [4]:
!gdown --id 101S8yZESRK5YL0a886tTz1hnCFzhaRxe

Downloading...
From: https://drive.google.com/uc?id=101S8yZESRK5YL0a886tTz1hnCFzhaRxe
To: /content/en-fa_MT_dataset.csv
100% 55.4M/55.4M [00:01<00:00, 50.9MB/s]


In [5]:
data = pd.read_csv('en-fa_MT_dataset.csv')

In [6]:
data.head(3)

Unnamed: 0,en,fa
0,raspy breathing .,صداي خر خر .
1,dad .,پدر .
2,maybe its the wind .,شايد صداي باد باشه .


In [7]:
len(data)

612086

In [8]:
# Per-sentence count of space characters in the raw English / Persian text.
data['en_size'], data['fa_size'] = (
    data[column].str.count(' ') for column in ('en', 'fa')
)

In [9]:
data.head()

Unnamed: 0,en,fa,en_size,fa_size
0,raspy breathing .,صداي خر خر .,2,3
1,dad .,پدر .,1,1
2,maybe its the wind .,شايد صداي باد باشه .,4,4
3,no .,نه .,1,1
4,stop please stop .,دست نگه داريد خواهش ميکنم دست نگه داريد .,3,8


In [10]:
# Drop every character that is neither a word character nor whitespace.
# regex=True is explicit because pandas >= 2.0 treats the pattern as a literal
# string by default, which would leave all punctuation in place.
data['en_no_punctuation'] = data['en'].str.replace(r'[^\w\s]', '', regex=True)
# Lower-case the text and wrap it in the sequence boundary markers.
data['en_no_punctuation'] = '<start> ' + data["en_no_punctuation"].str.lower() + ' <end>'

In [11]:
# Strip punctuation from the Persian text and add the boundary markers.
# regex=True is explicit because pandas >= 2.0 treats the pattern as a literal
# string by default; `\w` is Unicode-aware, so Persian letters are kept.
data['fa_no_punctuation'] = '<start> ' + data['fa'].str.replace(r'[^\w\s]', '', regex=True) + ' <end>'

In [12]:
data.head()

Unnamed: 0,en,fa,en_size,fa_size,en_no_punctuation,fa_no_punctuation
0,raspy breathing .,صداي خر خر .,2,3,<start> raspy breathing <end>,<start> صداي خر خر <end>
1,dad .,پدر .,1,1,<start> dad <end>,<start> پدر <end>
2,maybe its the wind .,شايد صداي باد باشه .,4,4,<start> maybe its the wind <end>,<start> شايد صداي باد باشه <end>
3,no .,نه .,1,1,<start> no <end>,<start> نه <end>
4,stop please stop .,دست نگه داريد خواهش ميکنم دست نگه داريد .,3,8,<start> stop please stop <end>,<start> دست نگه داريد خواهش ميکنم دست نگه داري...


In [13]:
data.loc[1, 'fa_no_punctuation']

'<start> پدر  <end>'

In [14]:
# Work on the first 150,000 sentence pairs only.
en_data, fa_data = (
    data[column].values[0:150_000]
    for column in ('en_no_punctuation', 'fa_no_punctuation')
)

In [15]:
# Collapse every run of whitespace into a single space (punctuation removal
# leaves double spaces behind). The pattern is a raw string so `\s` is not
# parsed as an invalid string escape.
# NOTE(review): str() turns a missing (NaN) row into the literal text 'nan' —
# confirm the corpus has no missing rows or drop them upstream.
en_data = [re.sub(r'\s+', ' ', str(sentence)) for sentence in en_data]

In [16]:
# Collapse every run of whitespace into a single space (punctuation removal
# leaves double spaces behind). The pattern is a raw string so `\s` is not
# parsed as an invalid string escape.
# NOTE(review): str() turns a missing (NaN) row into the literal text 'nan' —
# confirm the corpus has no missing rows or drop them upstream.
fa_data = [re.sub(r'\s+', ' ', str(sentence)) for sentence in fa_data]

In [17]:
en_data[0:5]

['<start> raspy breathing <end>',
 '<start> dad <end>',
 '<start> maybe its the wind <end>',
 '<start> no <end>',
 '<start> stop please stop <end>']

In [18]:
fa_data[0:5]

['<start> صداي خر خر <end>',
 '<start> پدر <end>',
 '<start> شايد صداي باد باشه <end>',
 '<start> نه <end>',
 '<start> دست نگه داريد خواهش ميکنم دست نگه داريد <end>']

In [19]:
def max_length(tensor):
    """Return the length of the longest element of `tensor`.

    `tensor` is any iterable of sized items; an empty iterable raises
    ValueError (from `max`).
    """
    return max(map(len, tensor))

In [20]:
def tokenize(lang):
    """Fit a Keras `Tokenizer` on `lang` and encode it as padded id sequences.

    Args:
        lang: iterable of sentences (strings).

    Returns:
        (tensor, tokenizer): the id sequences padded at the end to a common
        length, and the fitted tokenizer.

    Note: with the default `Tokenizer()` settings the `<start>` / `<end>`
    markers are stored in the vocabulary as `start` / `end`.
    """
    fitted_tokenizer = Tokenizer()
    fitted_tokenizer.fit_on_texts(lang)
    id_sequences = fitted_tokenizer.texts_to_sequences(lang)
    padded = pad_sequences(id_sequences, padding='post')
    return padded, fitted_tokenizer

In [21]:
def load_dataset():
    """Tokenize the module-level `en_data` (input) and `fa_data` (target).

    Returns:
        (input_tensor, target_tensor, input_tokenizer, target_tokenizer)
    """
    source_ids, source_tokenizer = tokenize(en_data)
    target_ids, target_tokenizer = tokenize(fa_data)
    return source_ids, target_ids, source_tokenizer, target_tokenizer

In [22]:
input_tensor, target_tensor, input_lang_tokenizer, target_lang_tokenizer = load_dataset()

In [23]:
input_tensor

array([[    2, 17589,  1808, ...,     0,     0,     0],
       [    2,   325,     1, ...,     0,     0,     0],
       [    2,   173,    30, ...,     0,     0,     0],
       ...,
       [    2,   272,    23, ...,     0,     0,     0],
       [    2,   272,    23, ...,     0,     0,     0],
       [    2,    90,    19, ...,     0,     0,     0]], dtype=int32)

In [27]:
print(input_tensor.shape)
print(target_tensor.shape)

(150000, 36)
(150000, 32)


In [28]:
# Padded sequence lengths of the target (Persian) and input (English) tensors.
max_length_targ = max_length(target_tensor)
max_length_inp = max_length(input_tensor)

In [29]:
max_length_targ, max_length_inp

(32, 36)

In [30]:
# Hold out 20% of the sentence pairs. The seed is fixed so that the split
# (and therefore every downstream number) is reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    input_tensor, target_tensor, test_size=0.2, random_state=42
)

In [31]:
def convert_tensor_to_word(lang_tokenizer, tensor):
    """Print `id :  word` for every non-zero id in `tensor`.

    Zero entries (the padding value) are skipped; words are looked up in
    `lang_tokenizer.index_word`. Nothing is returned.
    """
    for token_id in tensor:
        if token_id == 0:
            continue
        print(token_id, ': ', lang_tokenizer.index_word[token_id])

In [32]:
convert_tensor_to_word(input_lang_tokenizer, input_tensor[0])

2 :  start
17589 :  raspy
1808 :  breathing
1 :  end


In [33]:
input_tensor[0]

array([    2, 17589,  1808,     1,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0],
      dtype=int32)

In [34]:
input_tensor.shape

(150000, 36)

In [35]:
list(target_lang_tokenizer.word_index)[0:20]

['start',
 'end',
 'را',
 'من',
 'به',
 'تو',
 'و',
 'که',
 'از',
 'اين',
 'اون',
 'يک',
 'ما',
 'در',
 'با',
 'كه',
 'نه',
 'هم',
 'براي',
 'بود']

In [36]:
# --- Training configuration ---
# NOTE(review): the recorded outputs further down show a leading dimension of
# 64, so they appear to come from a run with a different BATCH_SIZE.
BATCH_SIZE = 1024
embedding_dim = 256
units = 1024

# Shuffle buffer spans the whole training set; a partial last batch is not counted.
BUFFER_SIZE = len(X_train)
steps_per_epoch = len(X_train) // BATCH_SIZE

# +1 because tokenizer ids start at 1 and id 0 is used for padding.
vocab_inp_size = 1 + len(input_lang_tokenizer.word_index)
vocab_targ_size = 1 + len(target_lang_tokenizer.word_index)

In [37]:
# Shuffle the training pairs, then group them into fixed-size batches
# (the incomplete final batch is dropped).
dataset = (
    tf.data.Dataset.from_tensor_slices((X_train, y_train))
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)

In [None]:
class Encoder(keras.Model):
    """Embedding + GRU encoder for the source sentence.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each embedding vector.
        enc_units: number of GRU units.
        batch_size: batch size used when building the zero initial state.
    """

    def __init__(self, vocab_size, embedding_dim, enc_units, batch_size):
        super().__init__()
        self.batch_size = batch_size
        self.enc_units = enc_units
        self.embedding = keras.layers.Embedding(vocab_size, embedding_dim)
        # Return both the per-timestep outputs (used by attention) and the final state.
        self.gru = keras.layers.GRU(self.enc_units, return_sequences=True, return_state=True)

    def call(self, x, hidden):
        """Embed the token ids `x` and run the GRU starting from `hidden`.

        Returns:
            (output, state): the GRU outputs for every timestep and the final state.
        """
        x = self.embedding(x)
        output, state = self.gru(x, initial_state=hidden)
        return output, state

    def initialize_hidden_state(self):
        """Return an all-zero initial state of shape (batch_size, enc_units)."""
        return tf.zeros((self.batch_size, self.enc_units))

    # Original (misspelled) name kept as an alias so existing callers keep working.
    initilize_hidden_state = initialize_hidden_state

In [None]:
encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)

In [None]:
encoder

<__main__.Encoder at 0x7f309411c0d0>

In [None]:
simple_hidden = encoder.initilize_hidden_state()
simple_hidden

<tf.Tensor: shape=(64, 1024), dtype=float32, numpy=
array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)>

In [None]:
example_input_batch, example_target_batch = next(iter(dataset))

In [None]:
encoder(example_input_batch, simple_hidden)

(<tf.Tensor: shape=(64, 36, 1024), dtype=float32, numpy=
 array([[[ 4.76778811e-03,  4.27769228e-05,  6.47662440e-04, ...,
          -9.46064387e-03, -2.78141652e-03, -1.52624901e-02],
         [ 4.69231186e-03, -2.65270146e-03, -4.60728770e-03, ...,
          -7.93961156e-03,  3.41843884e-03, -5.37993945e-03],
         [-1.30449748e-03, -6.89162826e-03,  1.69798988e-03, ...,
           1.20301777e-03, -2.15936359e-03, -5.85939642e-03],
         ...,
         [-5.66742732e-04, -1.73659122e-03,  1.18973833e-02, ...,
          -3.74847581e-03, -8.98322184e-03,  1.18433516e-02],
         [-5.66839240e-04, -1.73657713e-03,  1.18971877e-02, ...,
          -3.74859222e-03, -8.98294710e-03,  1.18434522e-02],
         [-5.66906878e-04, -1.73656992e-03,  1.18970610e-02, ...,
          -3.74865602e-03, -8.98277201e-03,  1.18434951e-02]],
 
        [[ 4.76778811e-03,  4.27769228e-05,  6.47662440e-04, ...,
          -9.46064387e-03, -2.78141652e-03, -1.52624901e-02],
         [-5.16955974e-04,  3.

In [None]:
simple_output, simple_states = encoder(example_input_batch, simple_hidden)

In [None]:
class Attention(keras.layers.Layer):
    """Additive attention: score = V(tanh(W1(values) + W2(query)))."""

    def __init__(self, units):
        super().__init__()
        self.W1 = keras.layers.Dense(units)
        self.W2 = keras.layers.Dense(units)
        self.V = keras.layers.Dense(1)

    def call(self, query, values):
        """Attend over `values` using `query`.

        Returns:
            (context_vector, attention_weights): `values` summed over axis 1
            with softmax weights, and the weights themselves.
        """
        # Insert an axis at position 1 so the query broadcasts against every step of `values`.
        query_expanded = tf.expand_dims(query, 1)
        projected = self.W1(values) + self.W2(query_expanded)
        score = self.V(tf.nn.tanh(projected))
        # Normalise the scores along axis 1.
        attention_weights = tf.nn.softmax(score, axis=1)
        context_vector = tf.reduce_sum(attention_weights * values, axis=1)
        return context_vector, attention_weights

In [None]:
attention_layer = Attention(10)
attention_layer(simple_hidden, simple_output)

(<tf.Tensor: shape=(64, 1024), dtype=float32, numpy=
 array([[-0.00144463, -0.00295026,  0.00912169, ..., -0.0033915 ,
         -0.00630622,  0.00736675],
        [ 0.0004775 ,  0.00076909,  0.00818022, ..., -0.00259093,
         -0.00583887,  0.00577949],
        [ 0.00047016, -0.00100788,  0.00978865, ..., -0.00231028,
         -0.00753438,  0.0081187 ],
        ...,
        [ 0.00062883, -0.0015618 ,  0.00886149, ..., -0.00193209,
         -0.00646909,  0.0079165 ],
        [ 0.00136512, -0.00261301,  0.01090115, ..., -0.0030711 ,
         -0.00766282,  0.00884432],
        [-0.0011033 , -0.00249512,  0.01094301, ..., -0.00198215,
         -0.0074179 ,  0.00666509]], dtype=float32)>,
 <tf.Tensor: shape=(64, 36, 1), dtype=float32, numpy=
 array([[[0.02783762],
         [0.02813881],
         [0.02766192],
         ...,
         [0.02767112],
         [0.02767109],
         [0.02767108]],
 
        [[0.0278667 ],
         [0.02756146],
         [0.02817884],
         ...,
         [0.

In [None]:
attention_result, attention_weights = attention_layer(simple_hidden, simple_output)

In [None]:
class Decoder(keras.Model):
    """Single-step GRU decoder that attends over the encoder outputs.

    Args:
        vocab_size: target vocabulary size (embedding rows and output logits).
        embedding_dim: size of each embedding vector.
        dec_units: number of GRU units (also the attention projection size).
        batch_size: stored on the instance; not used by `call`.
    """

    def __init__(self, vocab_size, embedding_dim, dec_units, batch_size):
        super().__init__()
        self.batch_size = batch_size
        self.dec_units = dec_units
        self.embedding = keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = keras.layers.GRU(self.dec_units, return_sequences=True, return_state=True)
        self.fc = keras.layers.Dense(vocab_size)
        self.attention = Attention(self.dec_units)

    def call(self, x, hidden, enc_output):
        """Run one decoding step.

        Returns:
            (logits, state, attention_weights): vocabulary logits, the GRU
            state after this step, and the attention weights.
        """
        context_vector, attention_weights = self.attention(hidden, enc_output)
        embedded = self.embedding(x)
        # Prepend the context vector to the token embedding along the feature axis.
        gru_input = tf.concat([tf.expand_dims(context_vector, 1), embedded], axis=-1)
        # NOTE(review): the GRU is not given `hidden` as its initial state;
        # `hidden` only reaches the output through attention — confirm intended.
        output, state = self.gru(gru_input)
        # Flatten the leading axes so the dense layer sees one row per output step.
        flat_output = tf.reshape(output, (-1, output.shape[2]))
        logits = self.fc(flat_output)
        return logits, state, attention_weights

In [None]:
decoder = Decoder(vocab_targ_size, embedding_dim, units, BATCH_SIZE)

In [None]:
decoder(tf.random.uniform((BATCH_SIZE, 1)), simple_hidden, simple_output)

(<tf.Tensor: shape=(64, 14063), dtype=float32, numpy=
 array([[-0.0018821 , -0.00126402,  0.00187614, ...,  0.001965  ,
         -0.00266947, -0.00211682],
        [-0.00188992, -0.00151722,  0.00239011, ...,  0.00195522,
         -0.00265795, -0.00207566],
        [-0.00188682, -0.00117242,  0.00215933, ...,  0.00189119,
         -0.00303163, -0.00205385],
        ...,
        [-0.0018289 , -0.00126717,  0.00188937, ...,  0.0020097 ,
         -0.00264076, -0.00219136],
        [-0.00179501, -0.00110777,  0.00216814, ...,  0.00179939,
         -0.00272797, -0.0020465 ],
        [-0.00163617, -0.00099326,  0.0019545 , ...,  0.00171457,
         -0.0026966 , -0.00203061]], dtype=float32)>,
 <tf.Tensor: shape=(64, 1024), dtype=float32, numpy=
 array([[ 0.00957308, -0.00541056, -0.00229575, ..., -0.00043806,
         -0.00710973,  0.00463084],
        [ 0.00906679, -0.00590708, -0.00245306, ..., -0.00011788,
         -0.00704469,  0.0044682 ],
        [ 0.00920943, -0.00516213, -0.00242189

In [None]:
optimizer = keras.optimizers.Adam()
loss_object = keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

In [None]:
def loss_function(real, pred):
    """Cross-entropy between `real` ids and `pred` logits, ignoring padding.

    Positions where `real` is 0 contribute zero loss; the result is the mean
    over all positions (padded ones included in the denominator).
    """
    is_real_token = tf.math.logical_not(tf.math.equal(real, 0))
    per_token_loss = loss_object(real, pred)
    weights = tf.cast(is_real_token, dtype=per_token_loss.dtype)
    return tf.reduce_mean(per_token_loss * weights)

In [None]:
# NOTE(review): checkpoint_dir is not referenced by the save/restore cells
# visible in this notebook (they use file_prefix='test1' and
# latest_checkpoint('')) — wire it in at both places or remove it.
checkpoint_dir = 'chckpnts'
# Track the optimizer and both models so they are saved and restored together.
checkpoint = tf.train.Checkpoint(optimizer=optimizer, encoder=encoder, decoder=decoder)

In [None]:
def train_step(inp, targ, enc_hidden):
    """Run one teacher-forced optimisation step on a single batch.

    Args:
        inp: batch of padded source id sequences.
        targ: batch of padded target id sequences (column 0 is the start token).
        enc_hidden: initial encoder state for this batch.

    Returns:
        The summed per-step loss divided by the target sequence length.
    """
    loss = 0
    start_id = target_lang_tokenizer.word_index['start']
    with tf.GradientTape() as tape:
        enc_output, enc_hidden = encoder(inp, enc_hidden)
        dec_hidden = enc_hidden
        # Size the start-token column from the batch itself instead of the
        # global BATCH_SIZE, so the step works for whatever batch it is given.
        dec_input = tf.expand_dims([start_id] * int(targ.shape[0]), 1)
        for t in range(1, targ.shape[1]):
            predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)
            loss += loss_function(targ[:, t], predictions)
            # Teacher forcing: feed the ground-truth token as the next input.
            dec_input = tf.expand_dims(targ[:, t], 1)
    batch_loss = (loss / int(targ.shape[1]))
    variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))
    return batch_loss

In [None]:
EPOCH = 10
for epoch in range(EPOCH):
    enc_hidden = encoder.initilize_hidden_state()
    total_loss = 0
    num_batches = 0
    for inp, targ in dataset.take(steps_per_epoch):
        batch_loss = train_step(inp, targ, enc_hidden)
        total_loss += batch_loss
        num_batches += 1
        print('Epoch: ', epoch)
        print('Loss: ', batch_loss.numpy())
    # Report the epoch average; total_loss was previously accumulated but never shown.
    if num_batches:
        print('Epoch: ', epoch, ' mean loss: ', (total_loss / num_batches).numpy())
    # One checkpoint per completed epoch.
    checkpoint.save(file_prefix='test1')

Epoch:  0
Loss:  2.2077267
Epoch:  0
Loss:  2.332972
Epoch:  0
Loss:  2.2280352
Epoch:  0
Loss:  1.8233036
Epoch:  0
Loss:  1.6429102
Epoch:  0
Loss:  1.5432467
Epoch:  0
Loss:  1.6914159
Epoch:  0
Loss:  1.6284164
Epoch:  0
Loss:  1.7932659
Epoch:  0
Loss:  1.9295249
Epoch:  0
Loss:  1.8394992
Epoch:  0
Loss:  1.79862
Epoch:  0
Loss:  1.8266114
Epoch:  0
Loss:  1.8102815
Epoch:  0
Loss:  1.8831841
Epoch:  0
Loss:  1.6486939
Epoch:  0
Loss:  1.961117
Epoch:  0
Loss:  1.6935047
Epoch:  0
Loss:  1.7654332
Epoch:  0
Loss:  1.5875679
Epoch:  0
Loss:  1.7287428
Epoch:  0
Loss:  1.7666519
Epoch:  0
Loss:  1.6757268
Epoch:  0
Loss:  1.8113145
Epoch:  0
Loss:  1.5425848
Epoch:  0
Loss:  1.6027446
Epoch:  0
Loss:  1.616174
Epoch:  0
Loss:  1.7288326
Epoch:  0
Loss:  1.5646371
Epoch:  0
Loss:  1.6300225
Epoch:  0
Loss:  1.6634853
Epoch:  0
Loss:  1.7320808
Epoch:  0
Loss:  1.7001557
Epoch:  0
Loss:  1.5792702
Epoch:  0
Loss:  1.7471089
Epoch:  0
Loss:  1.8210549
Epoch:  0
Loss:  1.5323147
Epoch:

KeyboardInterrupt: ignored

In [None]:
def preprocess_senetence(w):
    """Normalise a raw English sentence the same way the training data was.

    Steps mirror the training pipeline: lower-case, drop every character that
    is neither a word character nor whitespace, wrap in `<start>` / `<end>`,
    then collapse whitespace runs to a single space.

    Args:
        w: raw input sentence.

    Returns:
        The cleaned sentence, e.g. 'Hello.' -> '<start> hello <end>'.
    """
    w = re.sub(r'[^\w\s]', '', w.lower())
    # The marker was previously misspelled '<strat>'.
    w = '<start> ' + w + ' <end>'
    return re.sub(r'\s+', ' ', w)

In [None]:
def evaluate(sentence):
    """Greedily translate one English sentence with the trained models.

    Args:
        sentence: raw English input.

    Returns:
        (result, sentence): the predicted target words, each followed by a
        space (including the closing `end` token when it is produced), and the
        preprocessed input sentence.
    """
    sentence = preprocess_senetence(sentence)

    # The fitted tokenizers store the boundary markers as 'start' / 'end'.
    input_index = input_lang_tokenizer.word_index
    start_id, end_id = input_index['start'], input_index['end']

    # texts_to_sequences applies the tokenizer's own normalisation and skips
    # words that are not in the vocabulary instead of raising KeyError.
    token_ids = input_lang_tokenizer.texts_to_sequences([sentence])[0]
    # Guarantee the encoder input is framed like the training sequences.
    if not token_ids or token_ids[0] != start_id:
        token_ids.insert(0, start_id)
    if token_ids[-1] != end_id:
        token_ids.append(end_id)

    inputs = keras.preprocessing.sequence.pad_sequences([token_ids], maxlen=max_length_inp, padding='post')
    inputs = tf.convert_to_tensor(inputs)
    result = ''
    hidden = [tf.zeros((1, units))]
    enc_out, enc_hidden = encoder(inputs, hidden)
    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([target_lang_tokenizer.word_index['start']], 0)
    for _ in range(max_length_targ):
        predictions, dec_hidden, _attention = decoder(dec_input, dec_hidden, enc_out)
        predicted_id = int(tf.argmax(predictions[0]).numpy())
        word = target_lang_tokenizer.index_word.get(predicted_id)
        if word is None:
            # Id 0 is the padding value and has no vocabulary entry; stop decoding.
            break
        result += word + ' '
        if word == 'end':
            return result, sentence
        # Feed the prediction back in as the next decoder input.
        dec_input = tf.expand_dims([predicted_id], 0)
    return result, sentence

In [None]:
checkpoint.restore(tf.train.latest_checkpoint(''))

<tensorflow.python.training.tracking.util.InitializationOnlyStatus at 0x7f41321bd090>

In [None]:
evaluate('hello.')

NameError: ignored