In [87]:
import tensorflow as tf
import tensorflow.keras as keras
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from keras.utils.vis_utils import plot_model
import numpy as np
import codecs
from keras.callbacks import ModelCheckpoint
import matplotlib.pyplot as plt
%matplotlib inline

In [88]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [89]:
import warnings
warnings.filterwarnings('ignore')

### **Read en,fa data**

In [None]:
%%time
with open("TEP-fa.txt") as f:
    fa_file = f.read() 

with open("TEP-en.txt") as f:
    en_file = f.read()    

CPU times: user 184 ms, sys: 77.5 ms, total: 262 ms
Wall time: 259 ms


In [None]:
fa_data = fa_file.strip().split('\n')
en_data = en_file.strip().split('\n')

In [None]:
len(fa_data)

612086

In [None]:
len(en_data)

612086

In [None]:
data = pd.DataFrame(columns=['en','fa'])
data['fa'] = fa_data
data['en'] = en_data

In [None]:
data.head()

Unnamed: 0,en,fa
0,raspy breathing .,صداي خر خر .
1,dad .,پدر .
2,maybe its the wind .,شايد صداي باد باشه .
3,no .,نه .
4,stop please stop .,دست نگه داريد خواهش ميکنم دست نگه داريد .


### **Export DataFrame to CSV**

In [None]:
data.to_csv('en-fa_MT_dataset.csv', index=False)

In [None]:
from google.colab import drive
drive.mount('/content/drive')

!cp -r "/content/en-fa_MT_dataset.csv" "/content/drive/MyDrive/en-fa_MT_dataset.csv"

Mounted at /content/drive


# **Read Dataset**

In [5]:
!gdown --id 101S8yZESRK5YL0a886tTz1hnCFzhaRxe

Downloading...
From: https://drive.google.com/uc?id=101S8yZESRK5YL0a886tTz1hnCFzhaRxe
To: /content/en-fa_MT_dataset.csv
100% 55.4M/55.4M [00:00<00:00, 175MB/s]


In [70]:
data = pd.read_csv('en-fa_MT_dataset.csv')

In [7]:
data.head(3)

Unnamed: 0,en,fa
0,raspy breathing .,صداي خر خر .
1,dad .,پدر .
2,maybe its the wind .,شايد صداي باد باشه .


In [8]:
len(data)

612086

In [9]:
data['en_size'] = data['en'].str.count(' ')
data['fa_size'] = data['fa'].str.count(' ')

In [10]:
data.head()

Unnamed: 0,en,fa,en_size,fa_size
0,raspy breathing .,صداي خر خر .,2,3
1,dad .,پدر .,1,1
2,maybe its the wind .,شايد صداي باد باشه .,4,4
3,no .,نه .,1,1
4,stop please stop .,دست نگه داريد خواهش ميکنم دست نگه داريد .,3,8


In [11]:
data['en_no_punctuation'] = data['en'].str.replace('[^\w\s]','')
data['en_no_punctuation'] = '<start> ' + data["en_no_punctuation"].str.lower() + ' <end>'

In [12]:
data['fa_no_punctuation'] = '<start> ' + data['fa'].str.replace('[^\w\s]','') + ' <end>'

In [13]:
data.head()

Unnamed: 0,en,fa,en_size,fa_size,en_no_punctuation,fa_no_punctuation
0,raspy breathing .,صداي خر خر .,2,3,<start> raspy breathing <end>,<start> صداي خر خر <end>
1,dad .,پدر .,1,1,<start> dad <end>,<start> پدر <end>
2,maybe its the wind .,شايد صداي باد باشه .,4,4,<start> maybe its the wind <end>,<start> شايد صداي باد باشه <end>
3,no .,نه .,1,1,<start> no <end>,<start> نه <end>
4,stop please stop .,دست نگه داريد خواهش ميکنم دست نگه داريد .,3,8,<start> stop please stop <end>,<start> دست نگه داريد خواهش ميکنم دست نگه داري...


In [14]:
data.loc[1, 'fa_no_punctuation']

'<start> پدر  <end>'

In [15]:
en_data = data['en_no_punctuation'].values[:200_000]
fa_data = data['fa_no_punctuation'].values[:200_000]

In [16]:
del data

In [17]:
en_data = [re.sub('\s+', ' ', str(sentence)) for sentence in en_data]

In [18]:
fa_data = [re.sub('\s+', ' ', str(sentence)) for sentence in fa_data]

In [19]:
en_data[0:5]

['<start> raspy breathing <end>',
 '<start> dad <end>',
 '<start> maybe its the wind <end>',
 '<start> no <end>',
 '<start> stop please stop <end>']

In [20]:
fa_data[0:5]

['<start> صداي خر خر <end>',
 '<start> پدر <end>',
 '<start> شايد صداي باد باشه <end>',
 '<start> نه <end>',
 '<start> دست نگه داريد خواهش ميکنم دست نگه داريد <end>']

In [21]:
def max_length(tensor):
    return max(len(t) for t in tensor)

In [22]:
def tokenize(lang):
    lang_tokenizer = Tokenizer(filters='')
    lang_tokenizer.fit_on_texts(lang)
    tensor = lang_tokenizer.texts_to_sequences(lang)
    tensor = pad_sequences(tensor, padding='post')
    return tensor, lang_tokenizer

In [23]:
def load_dataset():
    inp_lang = fa_data
    targ_lang = en_data
    input_tensor, input_lang_tokenizer = tokenize(inp_lang)
    target_tensor, target_lang_tokenizer = tokenize(targ_lang)
    return input_tensor, target_tensor, input_lang_tokenizer, target_lang_tokenizer

In [24]:
input_tensor, target_tensor, input_lang_tokenizer, target_lang_tokenizer = load_dataset()

In [25]:
input_tensor

array([[    1,   477,  3187, ...,     0,     0,     0],
       [    1,   226,     2, ...,     0,     0,     0],
       [    1,   136,   477, ...,     0,     0,     0],
       ...,
       [    1, 10717,    71, ...,     0,     0,     0],
       [    1,    42,   193, ...,     0,     0,     0],
       [    1,   106, 59716, ...,     0,     0,     0]], dtype=int32)

In [26]:
print(input_tensor.shape)
print(target_tensor.shape)

(200000, 32)
(200000, 36)


In [27]:
del en_data
del fa_data

In [28]:
max_length_targ, max_length_inp = max_length(target_tensor), max_length(input_tensor)

In [29]:
max_length_targ, max_length_inp

(36, 32)

In [30]:
X_train, X_test, y_train, y_test = train_test_split(input_tensor, target_tensor, test_size=0.2)

In [31]:
def convert_tensor_to_word(lang_tokenizer, tensor):
    for t in tensor:
        if t != 0:
            print(t, ': ', lang_tokenizer.index_word[t])

In [32]:
convert_tensor_to_word(input_lang_tokenizer, input_tensor[2])

1 :  <start>
136 :  شايد
477 :  صداي
1259 :  باد
28 :  باشه
2 :  <end>


In [33]:
input_tensor[0]

array([   1,  477, 3187, 3187,    2,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0],
      dtype=int32)

In [34]:
input_tensor.shape

(200000, 32)

In [35]:
list(input_lang_tokenizer.word_index)[0:20]

['<start>',
 '<end>',
 'من',
 'را',
 'به',
 'تو',
 'و',
 'که',
 'از',
 'اين',
 'اون',
 'يک',
 'ما',
 'در',
 'با',
 'كه',
 'نه',
 'هم',
 'براي',
 'بود']

In [36]:
del input_tensor
del target_tensor

In [37]:
BUFFER_SIZE = len(X_train)
BATCH_SIZE = 256
steps_per_epoch = len(X_train) // BATCH_SIZE
embedding_dim = 256
units = 1024
vocab_inp_size = len(input_lang_tokenizer.word_index) + 1
vocab_targ_size = len(target_lang_tokenizer.word_index) + 1

In [38]:
dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

In [39]:
class Encoder(keras.Model):
    def __init__(self, vocab_size, embedding_dim, enc_units, batch_size):
        super(Encoder, self).__init__()
        self.batch_size = batch_size
        self.enc_units = enc_units
        self.embedding = keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = keras.layers.GRU(self.enc_units, return_sequences=True, return_state=True)
    def call(self, x, hidden):
        x = self.embedding(x)
        output, state = self.gru(x, initial_state=hidden)
        return output, state
    def initilize_hidden_state(self):
        return tf.zeros((self.batch_size, self.enc_units))

In [40]:
encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)

In [41]:
encoder

<__main__.Encoder at 0x7f2ee1d08f10>

In [42]:
simple_hidden = encoder.initilize_hidden_state()
simple_hidden

<tf.Tensor: shape=(256, 1024), dtype=float32, numpy=
array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)>

In [43]:
example_input_batch, example_target_batch = next(iter(dataset))

In [44]:
encoder(example_input_batch, simple_hidden)

(<tf.Tensor: shape=(256, 32, 1024), dtype=float32, numpy=
 array([[[ 3.77016491e-03, -7.40111759e-03, -3.10534355e-03, ...,
           1.84245946e-04,  9.45575908e-03, -4.22568672e-04],
         [ 1.43094165e-02, -4.95086098e-03, -3.59178614e-03, ...,
          -1.08001940e-03,  7.01352675e-03, -7.49554671e-03],
         [ 9.17888712e-03, -3.51509079e-03, -5.13092242e-03, ...,
           1.74724672e-03, -6.96828123e-04,  1.49085163e-03],
         ...,
         [ 1.51304603e-02,  2.33308859e-02, -5.16554015e-03, ...,
           7.14046415e-03, -4.95023909e-04, -1.21821631e-02],
         [ 1.51304938e-02,  2.33308673e-02, -5.16557135e-03, ...,
           7.14048836e-03, -4.95011976e-04, -1.21820290e-02],
         [ 1.51305087e-02,  2.33308487e-02, -5.16558904e-03, ...,
           7.14050699e-03, -4.95001557e-04, -1.21819461e-02]],
 
        [[ 3.77016491e-03, -7.40111759e-03, -3.10534355e-03, ...,
           1.84245946e-04,  9.45575908e-03, -4.22568672e-04],
         [-6.59216568e-03, -6

In [45]:
simple_output, simple_states = encoder(example_input_batch, simple_hidden)

In [46]:
class Attention(keras.layers.Layer):
    def __init__ (self, units):
        super(Attention, self).__init__()
        self.W1 = keras.layers.Dense(units)
        self.W2 = keras.layers.Dense(units)
        self.V = keras.layers.Dense(1)
    def call (self, query, values):
        hidden_with_time_axis = tf.expand_dims(query, 1)
        score = self.V(tf.nn.tanh(self.W1(values) + self.W2(hidden_with_time_axis)))
        atteion_weights = tf.nn.softmax(score, axis=1)
        context_vector = atteion_weights * values
        context_vector = tf.reduce_sum(context_vector, axis=1)
        return context_vector, atteion_weights

In [47]:
attention_layer = Attention(10)
attention_layer(simple_hidden, simple_output)

(<tf.Tensor: shape=(256, 1024), dtype=float32, numpy=
 array([[ 1.18238162e-02,  1.67556182e-02, -3.51891154e-03, ...,
          5.10962168e-03, -4.74922417e-05, -9.99994855e-03],
        [ 1.19330883e-02,  1.81259401e-02, -5.69641171e-03, ...,
          4.88073146e-03,  3.88423447e-04, -1.03118354e-02],
        [ 1.11963954e-02,  1.54621759e-02, -2.21119891e-03, ...,
          4.07443289e-03, -6.54630858e-05, -1.11262035e-02],
        ...,
        [ 1.08942855e-02,  1.62885282e-02, -3.99995409e-03, ...,
          4.91558947e-03,  6.79101940e-05, -8.63143988e-03],
        [ 9.26167704e-03,  1.29014654e-02, -4.56997845e-03, ...,
          3.23833758e-03,  7.04381964e-04, -9.59435757e-03],
        [ 1.04670543e-02,  1.60419047e-02, -2.98135681e-03, ...,
          4.16330900e-03,  2.13870150e-03, -9.21541080e-03]], dtype=float32)>,
 <tf.Tensor: shape=(256, 32, 1), dtype=float32, numpy=
 array([[[0.03014959],
         [0.03025215],
         [0.03051675],
         ...,
         [0.03147958]

In [48]:
attention_result, attention_weights = attention_layer(simple_hidden, simple_output)

In [49]:
class Decoder(keras.Model):
    def __init__ (self, vocab_size, embedding_dim, dec_units, batch_size):
        super(Decoder, self).__init__()
        self.batch_size = batch_size
        self.dec_units = dec_units
        self.embedding = keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = keras.layers.GRU(self.dec_units, return_sequences=True, return_state=True)
        self.fc = keras.layers.Dense(vocab_size)
        self.attention = Attention(self.dec_units)
    def call(self, x, hidden, enc_output):
        context_vector, attention_weights = self.attention(hidden, enc_output)
        x = self.embedding(x)
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)
        output, state = self.gru(x)
        output = tf.reshape(output, (-1, output.shape[2]))
        x = self.fc(output)
        return x, state, attention_weights

In [50]:
decoder = Decoder(vocab_targ_size, embedding_dim, units, BATCH_SIZE)

In [51]:
decoder(tf.random.uniform((BATCH_SIZE, 1)), simple_hidden, simple_output)

(<tf.Tensor: shape=(256, 37100), dtype=float32, numpy=
 array([[ 0.00162454,  0.00144953,  0.00163456, ..., -0.00168079,
          0.00150761, -0.00140324],
        [ 0.00173804,  0.00148182,  0.00159424, ..., -0.00155479,
          0.00160442, -0.00139779],
        [ 0.00157346,  0.00146153,  0.00152603, ..., -0.00165587,
          0.00156326, -0.00140716],
        ...,
        [ 0.00157429,  0.00137736,  0.00165561, ..., -0.001668  ,
          0.00154553, -0.00113606],
        [ 0.00151637,  0.00124735,  0.00150236, ..., -0.00167397,
          0.00145433, -0.00105091],
        [ 0.00163552,  0.00145066,  0.00164072, ..., -0.00168059,
          0.00162405, -0.00128105]], dtype=float32)>,
 <tf.Tensor: shape=(256, 1024), dtype=float32, numpy=
 array([[ 0.0172008 , -0.00677849,  0.00447173, ...,  0.00470538,
         -0.00102777,  0.00762323],
        [ 0.01748273, -0.00730359,  0.00454241, ...,  0.00517327,
         -0.00202286,  0.00811886],
        [ 0.0170823 , -0.0069825 ,  0.004374

In [52]:
optimizer = keras.optimizers.Adam()
loss_object = keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

In [53]:
def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)
    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask
    return tf.reduce_mean(loss_)

In [54]:
def train_step(inp, targ, enc_hidden):
    loss = 0
    with tf.GradientTape() as tape:
        enc_output, enc_hidden = encoder(inp, enc_hidden)
        dec_hidden = enc_hidden
        dec_input = tf.expand_dims([target_lang_tokenizer.word_index['<start>']] * BATCH_SIZE, 1)
        for t in range(1, targ.shape[1]):
            predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)
            loss += loss_function(targ[:, t], predictions)
            dec_input = tf.expand_dims(targ[:, t], 1)
    batch_loss = (loss / int(targ.shape[1]))
    variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))
    return batch_loss

In [55]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [56]:
def checkpoint(model, name=None):
    if name is not None:
        model.save_weights('/content/drive/MyDrive/fa2en_MT_weights/{}.h5'.format(name))
    else:
        raise NotImplementedError

In [None]:
EPOCH = 10
log_every = 50
for epoch in range(EPOCH):
    enc_hidden = encoder.initilize_hidden_state()
    total_loss = 0
    for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
        batch_loss = train_step(inp, targ, enc_hidden)
        total_loss += batch_loss
        print('Epoch: ', epoch)
        print('Loss: ', batch_loss.numpy())
        if batch % log_every == 0:
            checkpoint(encoder, 'encoder')
            checkpoint(decoder, 'decoder')

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Epoch:  5
Loss:  0.6261297
Epoch:  5
Loss:  0.63228166
Epoch:  5
Loss:  0.5477145
Epoch:  5
Loss:  0.59479386
Epoch:  5
Loss:  0.6139003
Epoch:  5
Loss:  0.6408852
Epoch:  5
Loss:  0.59989804
Epoch:  5
Loss:  0.65430045
Epoch:  5
Loss:  0.5754805
Epoch:  5
Loss:  0.6002171
Epoch:  5
Loss:  0.6542004
Epoch:  5
Loss:  0.5736279
Epoch:  5
Loss:  0.5544191
Epoch:  5
Loss:  0.5596962
Epoch:  5
Loss:  0.6216301
Epoch:  5
Loss:  0.5571698
Epoch:  5
Loss:  0.6395873
Epoch:  5
Loss:  0.6093684
Epoch:  5
Loss:  0.58724
Epoch:  5
Loss:  0.5959436
Epoch:  5
Loss:  0.62102944
Epoch:  5
Loss:  0.6420165
Epoch:  5
Loss:  0.67710763
Epoch:  5
Loss:  0.5513896
Epoch:  5
Loss:  0.63660884
Epoch:  5
Loss:  0.6048305
Epoch:  5
Loss:  0.5615454
Epoch:  5
Loss:  0.62983525
Epoch:  5
Loss:  0.5609554
Epoch:  5
Loss:  0.55303603
Epoch:  5
Loss:  0.611983
Epoch:  5
Loss:  0.60429054
Epoch:  5
Loss:  0.64048547
Epoch:  5
Loss:  0.56453097
Epoch:  

## Load encoder and decoder weights

In [104]:
!gdown --id 1XgkMG7Jx7ZrX0X6TWQDQ87gcT9Dr2zNx
!gdown --id 1-H5e2RgWv1myhEt8Kz6aptLXwkhMivQt

Downloading...
From: https://drive.google.com/uc?id=1XgkMG7Jx7ZrX0X6TWQDQ87gcT9Dr2zNx
To: /content/encoder.h5
100% 76.9M/76.9M [00:00<00:00, 306MB/s]
Downloading...
From: https://drive.google.com/uc?id=1-H5e2RgWv1myhEt8Kz6aptLXwkhMivQt
To: /content/decoder.h5
100% 227M/227M [00:00<00:00, 259MB/s]


In [105]:
encoder.load_weights('encoder.h5')
decoder.load_weights('decoder.h5')

In [140]:
def preprocess_senetence(w):
    w = '<start> ' + w + ' <end>'
    return w

In [141]:
def evaluate(sentence):
    sentence = preprocess_senetence(sentence)
    inputs = [input_lang_tokenizer.word_index[i] for i in sentence.split(' ')]
    inputs = keras.preprocessing.sequence.pad_sequences([inputs], maxlen=max_length_inp, padding='post')
    inputs = tf.convert_to_tensor(inputs)
    result = ''
    hidden = [tf.zeros((1, units))]
    enc_out, enc_hidden = encoder(inputs, hidden)
    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([target_lang_tokenizer.word_index['<start>']], 0)
    for t in range(max_length_targ):
        predictions, dec_hidden, attention_weights = decoder(dec_input, dec_hidden, enc_out)
        attention_weights = tf.reshape(attention_weights, (-1, ))
        predicted_id = tf.argmax(predictions[0]).numpy()
        result += target_lang_tokenizer.index_word[predicted_id] + ' '
        if target_lang_tokenizer.index_word[predicted_id] == '<end>':
            return result, sentence
        dec_input = tf.expand_dims([predicted_id], 0)
    return result, sentence

In [142]:
evaluate('بله')

('yes <end> ', '<start> بله <end>')

In [143]:
evaluate('شايد صداي باد باشه')

('maybe the wind or the wind <end> ', '<start> شايد صداي باد باشه <end>')

In [144]:
evaluate('سلام')

('hello <end> ', '<start> سلام <end>')

In [145]:
evaluate('ما قرار بود تصميماتو با هم بگيريم')

('we were supposed to each other convincing <end> ',
 '<start> ما قرار بود تصميماتو با هم بگيريم <end>')

In [146]:
evaluate('مگه تو تصميمه ديگه اي ميگرفتي')

('have you have made anything different <end> ',
 '<start> مگه تو تصميمه ديگه اي ميگرفتي <end>')

In [147]:
evaluate('شروع شد')

('its begun <end> ', '<start> شروع شد <end>')

In [148]:
evaluate('تو ميخواي اونجا چيکار کني')

('what are you going to do <end> ', '<start> تو ميخواي اونجا چيکار کني <end>')

In [149]:
evaluate('امروز درس بزرگی یاد گرفتم')

('i have great lesson has decided <end> ',
 '<start> امروز درس بزرگی یاد گرفتم <end>')

In [150]:
evaluate('کجا')

('where <end> ', '<start> کجا <end>')

In [151]:
evaluate('بردهها را بکشيد')

('kill the slaves <end> ', '<start> بردهها را بکشيد <end>')

In [152]:
evaluate('ما ميريم دنبالش')

('were going to him <end> ', '<start> ما ميريم دنبالش <end>')

In [153]:
evaluate('آن سريع ترين راه عبور است')

('that the quickest way of the quickest way <end> ',
 '<start> آن سريع ترين راه عبور است <end>')

In [154]:
evaluate('من نميدونم براي چي بايد بجنگم')

('i dont know what to fight <end> ',
 '<start> من نميدونم براي چي بايد بجنگم <end>')