In [1]:
import tensorflow as tf
import tensorflow.keras as keras
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from keras.utils.vis_utils import plot_model
import numpy as np
import codecs
from keras.callbacks import ModelCheckpoint
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [3]:
import warnings
warnings.filterwarnings('ignore')

### **Read en,fa data**

In [None]:
%%time
with open("TEP-fa.txt") as f:
    fa_file = f.read() 

with open("TEP-en.txt") as f:
    en_file = f.read()    

CPU times: user 184 ms, sys: 77.5 ms, total: 262 ms
Wall time: 259 ms


In [None]:
fa_data = fa_file.strip().split('\n')
en_data = en_file.strip().split('\n')

In [None]:
len(fa_data)

612086

In [None]:
len(en_data)

612086

In [None]:
data = pd.DataFrame(columns=['en','fa'])
data['fa'] = fa_data
data['en'] = en_data

In [None]:
data.head()

Unnamed: 0,en,fa
0,raspy breathing .,صداي خر خر .
1,dad .,پدر .
2,maybe its the wind .,شايد صداي باد باشه .
3,no .,نه .
4,stop please stop .,دست نگه داريد خواهش ميکنم دست نگه داريد .


### **Export DataFrame to CSV**

In [None]:
data.to_csv('en-fa_MT_dataset.csv', index=False)

In [None]:
from google.colab import drive
drive.mount('/content/drive')

!cp -r "/content/en-fa_MT_dataset.csv" "/content/drive/MyDrive/en-fa_MT_dataset.csv"

Mounted at /content/drive


# **Read Dataset**

In [4]:
!gdown --id 101S8yZESRK5YL0a886tTz1hnCFzhaRxe

Downloading...
From: https://drive.google.com/uc?id=101S8yZESRK5YL0a886tTz1hnCFzhaRxe
To: /content/en-fa_MT_dataset.csv
100% 55.4M/55.4M [00:00<00:00, 178MB/s]


In [82]:
data = pd.read_csv('en-fa_MT_dataset.csv')

In [6]:
data.head(3)

Unnamed: 0,en,fa
0,raspy breathing .,صداي خر خر .
1,dad .,پدر .
2,maybe its the wind .,شايد صداي باد باشه .


In [7]:
len(data)

612086

In [8]:
data['en_size'] = data['en'].str.count(' ')
data['fa_size'] = data['fa'].str.count(' ')

In [9]:
data.head()

Unnamed: 0,en,fa,en_size,fa_size
0,raspy breathing .,صداي خر خر .,2,3
1,dad .,پدر .,1,1
2,maybe its the wind .,شايد صداي باد باشه .,4,4
3,no .,نه .,1,1
4,stop please stop .,دست نگه داريد خواهش ميکنم دست نگه داريد .,3,8


In [10]:
data['en_no_punctuation'] = data['en'].str.replace('[^\w\s]','')
data['en_no_punctuation'] = '<start> ' + data["en_no_punctuation"].str.lower() + ' <end>'

In [11]:
data['fa_no_punctuation'] = '<start> ' + data['fa'].str.replace('[^\w\s]','') + ' <end>'

In [12]:
data.head()

Unnamed: 0,en,fa,en_size,fa_size,en_no_punctuation,fa_no_punctuation
0,raspy breathing .,صداي خر خر .,2,3,<start> raspy breathing <end>,<start> صداي خر خر <end>
1,dad .,پدر .,1,1,<start> dad <end>,<start> پدر <end>
2,maybe its the wind .,شايد صداي باد باشه .,4,4,<start> maybe its the wind <end>,<start> شايد صداي باد باشه <end>
3,no .,نه .,1,1,<start> no <end>,<start> نه <end>
4,stop please stop .,دست نگه داريد خواهش ميکنم دست نگه داريد .,3,8,<start> stop please stop <end>,<start> دست نگه داريد خواهش ميکنم دست نگه داري...


In [13]:
data.loc[1, 'fa_no_punctuation']

'<start> پدر  <end>'

In [14]:
en_data = data['en_no_punctuation'].values[:200_000]
fa_data = data['fa_no_punctuation'].values[:200_000]

In [15]:
del data

In [16]:
en_data = [re.sub('\s+', ' ', str(sentence)) for sentence in en_data]

In [17]:
fa_data = [re.sub('\s+', ' ', str(sentence)) for sentence in fa_data]

In [18]:
en_data[0:5]

['<start> raspy breathing <end>',
 '<start> dad <end>',
 '<start> maybe its the wind <end>',
 '<start> no <end>',
 '<start> stop please stop <end>']

In [19]:
fa_data[0:5]

['<start> صداي خر خر <end>',
 '<start> پدر <end>',
 '<start> شايد صداي باد باشه <end>',
 '<start> نه <end>',
 '<start> دست نگه داريد خواهش ميکنم دست نگه داريد <end>']

In [20]:
def max_length(tensor):
    return max(len(t) for t in tensor)

In [21]:
def tokenize(lang):
    lang_tokenizer = Tokenizer(filters='')
    lang_tokenizer.fit_on_texts(lang)
    tensor = lang_tokenizer.texts_to_sequences(lang)
    tensor = pad_sequences(tensor, padding='post')
    return tensor, lang_tokenizer

In [22]:
def load_dataset():
    inp_lang = en_data
    targ_lang = fa_data
    input_tensor, input_lang_tokenizer = tokenize(inp_lang)
    target_tensor, target_lang_tokenizer = tokenize(targ_lang)
    return input_tensor, target_tensor, input_lang_tokenizer, target_lang_tokenizer

In [23]:
input_tensor, target_tensor, input_lang_tokenizer, target_lang_tokenizer = load_dataset()

In [24]:
input_tensor

array([[    1, 20984,  1476, ...,     0,     0,     0],
       [    1,   318,     2, ...,     0,     0,     0],
       [    1,   174,    30, ...,     0,     0,     0],
       ...,
       [    1,    13,     4, ...,     0,     0,     0],
       [    1,    58,    24, ...,     0,     0,     0],
       [    1,    68,     6, ...,     0,     0,     0]], dtype=int32)

In [25]:
print(input_tensor.shape)
print(target_tensor.shape)

(200000, 36)
(200000, 32)


In [26]:
del en_data
del fa_data

In [27]:
max_length_targ, max_length_inp = max_length(target_tensor), max_length(input_tensor)

In [28]:
max_length_targ, max_length_inp

(32, 36)

In [29]:
X_train, X_test, y_train, y_test = train_test_split(input_tensor, target_tensor, test_size=0.2)

In [30]:
def convert_tensor_to_word(lang_tokenizer, tensor):
    for t in tensor:
        if t != 0:
            print(t, ': ', lang_tokenizer.index_word[t])

In [31]:
convert_tensor_to_word(input_lang_tokenizer, input_tensor[2])

1 :  <start>
174 :  maybe
30 :  its
4 :  the
1458 :  wind
2 :  <end>


In [32]:
input_tensor[0]

array([    1, 20984,  1476,     2,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0],
      dtype=int32)

In [33]:
input_tensor.shape

(200000, 36)

In [34]:
list(input_lang_tokenizer.word_index)[0:20]

['<start>',
 '<end>',
 'you',
 'the',
 'i',
 'to',
 'a',
 'it',
 'and',
 'of',
 'is',
 'that',
 'in',
 'me',
 'what',
 'this',
 'we',
 'my',
 'your',
 'have']

In [35]:
del input_tensor
del target_tensor

In [36]:
BUFFER_SIZE = len(X_train)
BATCH_SIZE = 200
steps_per_epoch = len(X_train) // BATCH_SIZE
embedding_dim = 256
units = 1024
vocab_inp_size = len(input_lang_tokenizer.word_index) + 1
vocab_targ_size = len(target_lang_tokenizer.word_index) + 1

In [37]:
dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

In [38]:
class Encoder(keras.Model):
    def __init__(self, vocab_size, embedding_dim, enc_units, batch_size):
        super(Encoder, self).__init__()
        self.batch_size = batch_size
        self.enc_units = enc_units
        self.embedding = keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = keras.layers.GRU(self.enc_units, return_sequences=True, return_state=True)
    def call(self, x, hidden):
        x = self.embedding(x)
        output, state = self.gru(x, initial_state=hidden)
        return output, state
    def initilize_hidden_state(self):
        return tf.zeros((self.batch_size, self.enc_units))

In [39]:
encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)

In [40]:
encoder

<__main__.Encoder at 0x7f644d4a3310>

In [41]:
simple_hidden = encoder.initilize_hidden_state()
simple_hidden

<tf.Tensor: shape=(200, 1024), dtype=float32, numpy=
array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)>

In [42]:
example_input_batch, example_target_batch = next(iter(dataset))

In [43]:
encoder(example_input_batch, simple_hidden)

(<tf.Tensor: shape=(200, 36, 1024), dtype=float32, numpy=
 array([[[-5.2777231e-03, -4.8777731e-03,  2.5587322e-03, ...,
          -1.0248774e-02,  5.9589311e-03,  5.7505779e-03],
         [-1.4479983e-02, -5.0848732e-03,  4.0809410e-03, ...,
           2.8903973e-03,  5.9600850e-03, -3.0573828e-03],
         [-5.2719391e-03, -2.0395624e-03,  2.3599302e-03, ...,
           7.8763654e-03,  4.3845642e-03, -1.4996913e-03],
         ...,
         [ 9.0218605e-03, -1.0795330e-02,  9.4768433e-03, ...,
           1.6218087e-02, -8.1156269e-03,  1.4065650e-02],
         [ 9.0218103e-03, -1.0795333e-02,  9.4767986e-03, ...,
           1.6218072e-02, -8.1156250e-03,  1.4065676e-02],
         [ 9.0217786e-03, -1.0795335e-02,  9.4767706e-03, ...,
           1.6218059e-02, -8.1156231e-03,  1.4065693e-02]],
 
        [[-5.2777231e-03, -4.8777731e-03,  2.5587322e-03, ...,
          -1.0248774e-02,  5.9589311e-03,  5.7505779e-03],
         [-3.1269204e-03, -6.7732772e-03, -6.1075338e-03, ...,
        

In [44]:
simple_output, simple_states = encoder(example_input_batch, simple_hidden)

In [45]:
class Attention(keras.layers.Layer):
    def __init__ (self, units):
        super(Attention, self).__init__()
        self.W1 = keras.layers.Dense(units)
        self.W2 = keras.layers.Dense(units)
        self.V = keras.layers.Dense(1)
    def call (self, query, values):
        hidden_with_time_axis = tf.expand_dims(query, 1)
        score = self.V(tf.nn.tanh(self.W1(values) + self.W2(hidden_with_time_axis)))
        atteion_weights = tf.nn.softmax(score, axis=1)
        context_vector = atteion_weights * values
        context_vector = tf.reduce_sum(context_vector, axis=1)
        return context_vector, atteion_weights

In [46]:
attention_layer = Attention(10)
attention_layer(simple_hidden, simple_output)

(<tf.Tensor: shape=(200, 1024), dtype=float32, numpy=
 array([[ 0.0058201 , -0.00942161,  0.00640603, ...,  0.01268562,
         -0.00661358,  0.00912093],
        [ 0.00737156, -0.01016865,  0.00805544, ...,  0.01325531,
         -0.0070998 ,  0.01236563],
        [ 0.004738  , -0.00674086,  0.00431343, ...,  0.0085616 ,
         -0.00351958,  0.00846427],
        ...,
        [ 0.00577961, -0.00850583,  0.00749395, ...,  0.0108787 ,
         -0.00584806,  0.0100695 ],
        [ 0.00438281, -0.00514495,  0.00696491, ...,  0.00774326,
         -0.00428016,  0.00834264],
        [ 0.00602061, -0.00913087,  0.00621669, ...,  0.01116116,
         -0.00450776,  0.01079305]], dtype=float32)>,
 <tf.Tensor: shape=(200, 36, 1), dtype=float32, numpy=
 array([[[0.02804267],
         [0.02866533],
         [0.02836319],
         ...,
         [0.02770694],
         [0.02770694],
         [0.02770693]],
 
        [[0.02811116],
         [0.02767248],
         [0.02743405],
         ...,
         [

In [47]:
attention_result, attention_weights = attention_layer(simple_hidden, simple_output)

In [48]:
class Decoder(keras.Model):
    def __init__ (self, vocab_size, embedding_dim, dec_units, batch_size):
        super(Decoder, self).__init__()
        self.batch_size = batch_size
        self.dec_units = dec_units
        self.embedding = keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = keras.layers.GRU(self.dec_units, return_sequences=True, return_state=True)
        self.fc = keras.layers.Dense(vocab_size)
        self.attention = Attention(self.dec_units)
    def call(self, x, hidden, enc_output):
        context_vector, attention_weights = self.attention(hidden, enc_output)
        x = self.embedding(x)
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)
        output, state = self.gru(x)
        output = tf.reshape(output, (-1, output.shape[2]))
        x = self.fc(output)
        return x, state, attention_weights

In [49]:
decoder = Decoder(vocab_targ_size, embedding_dim, units, BATCH_SIZE)

In [50]:
decoder(tf.random.uniform((BATCH_SIZE, 1)), simple_hidden, simple_output)

(<tf.Tensor: shape=(200, 59717), dtype=float32, numpy=
 array([[-0.00068839,  0.00177633, -0.00131601, ..., -0.00150491,
         -0.00055517, -0.00022684],
        [-0.00076747,  0.00195568, -0.00125803, ..., -0.00166468,
         -0.00054042, -0.00024197],
        [-0.00072248,  0.00189996, -0.00120766, ..., -0.00126339,
         -0.00068507, -0.00024817],
        ...,
        [-0.00074209,  0.00194128, -0.00126508, ..., -0.00141771,
         -0.0005609 , -0.00023583],
        [-0.00055961,  0.00174428, -0.00128144, ..., -0.00129504,
         -0.00059595, -0.00031038],
        [-0.0006819 ,  0.00190099, -0.00131103, ..., -0.00151373,
         -0.00067483, -0.00026306]], dtype=float32)>,
 <tf.Tensor: shape=(200, 1024), dtype=float32, numpy=
 array([[-0.00098163,  0.00684141, -0.0016221 , ..., -0.00259868,
         -0.00184307, -0.00210538],
        [-0.00141947,  0.00717833, -0.0016638 , ..., -0.00278007,
         -0.00289647, -0.00213855],
        [-0.00093132,  0.00658672, -0.002106

In [51]:
optimizer = keras.optimizers.Adam()
loss_object = keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

In [52]:
def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)
    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask
    return tf.reduce_mean(loss_)

In [53]:
def train_step(inp, targ, enc_hidden):
    loss = 0
    with tf.GradientTape() as tape:
        enc_output, enc_hidden = encoder(inp, enc_hidden)
        dec_hidden = enc_hidden
        dec_input = tf.expand_dims([target_lang_tokenizer.word_index['<start>']] * BATCH_SIZE, 1)
        for t in range(1, targ.shape[1]):
            predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)
            loss += loss_function(targ[:, t], predictions)
            dec_input = tf.expand_dims(targ[:, t], 1)
    batch_loss = (loss / int(targ.shape[1]))
    variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))
    return batch_loss

In [54]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [55]:
def checkpoint(model, name=None):
    if name is not None:
        model.save_weights('/content/drive/MyDrive/en2fa_MT_weights/{}.h5'.format(name))
    else:
        raise NotImplementedError

In [56]:
EPOCH = 10
log_every = 50
for epoch in range(EPOCH):
    enc_hidden = encoder.initilize_hidden_state()
    total_loss = 0
    for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
        batch_loss = train_step(inp, targ, enc_hidden)
        total_loss += batch_loss
        print('Epoch: ', epoch)
        print('Loss: ', batch_loss.numpy())
        if batch % log_every == 0:
            checkpoint(encoder, 'encoder')
            checkpoint(decoder, 'decoder')

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Epoch:  8
Loss:  0.6191947
Epoch:  8
Loss:  0.6191947
Epoch:  8
Loss:  0.55477107
Epoch:  8
Loss:  0.55477107
Epoch:  8
Loss:  0.53790486
Epoch:  8
Loss:  0.53790486
Epoch:  8
Loss:  0.5421505
Epoch:  8
Loss:  0.5421505
Epoch:  8
Loss:  0.58570343
Epoch:  8
Loss:  0.58570343
Epoch:  8
Loss:  0.62236696
Epoch:  8
Loss:  0.62236696
Epoch:  8
Loss:  0.55759853
Epoch:  8
Loss:  0.55759853
Epoch:  8
Loss:  0.5631861
Epoch:  8
Loss:  0.5631861
Epoch:  8
Loss:  0.5502275
Epoch:  8
Loss:  0.5502275
Epoch:  8
Loss:  0.6060231
Epoch:  8
Loss:  0.6060231
Epoch:  8
Loss:  0.57753795
Epoch:  8
Loss:  0.57753795
Epoch:  8
Loss:  0.53980964
Epoch:  8
Loss:  0.53980964
Epoch:  8
Loss:  0.5667476
Epoch:  8
Loss:  0.5667476
Epoch:  8
Loss:  0.5702552
Epoch:  8
Loss:  0.5702552
Epoch:  8
Loss:  0.5937152
Epoch:  8
Loss:  0.5937152
Epoch:  8
Loss:  0.5913311
Epoch:  8
Loss:  0.5913311
Epoch:  8
Loss:  0.54163086
Epoch:  8
Loss:  0.54163086
E

In [69]:
!gdown --id 1lc7L6fOGOz2fhibx1QixQeJOgM9FI-jM
!gdown --id 1fE7z3T_fNCyZAFKrvMh2RP4L4uUcEj5i

Downloading...
From: https://drive.google.com/uc?id=1lc7L6fOGOz2fhibx1QixQeJOgM9FI-jM
To: /content/encoder.h5
100% 53.8M/53.8M [00:00<00:00, 298MB/s]
Downloading...
From: https://drive.google.com/uc?id=1fE7z3T_fNCyZAFKrvMh2RP4L4uUcEj5i
To: /content/decoder.h5
100% 343M/343M [00:01<00:00, 262MB/s]


In [70]:
encoder.load_weights('encoder.h5')
decoder.load_weights('decoder.h5')

In [118]:
def preprocess_senetence(w):
    w = re.sub(r"[,\.]", "", w)
    w = re.sub(r'[" "]+', " ", w)
    w = w.rstrip().strip().lower()
    w = '<start> ' + w + ' <end>'
    return w

In [119]:
def evaluate(sentence):
    sentence = preprocess_senetence(sentence)
    inputs = [input_lang_tokenizer.word_index[i] for i in sentence.split(' ')]
    inputs = keras.preprocessing.sequence.pad_sequences([inputs], maxlen=max_length_inp, padding='post')
    inputs = tf.convert_to_tensor(inputs)
    result = ''
    hidden = [tf.zeros((1, units))]
    enc_out, enc_hidden = encoder(inputs, hidden)
    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([target_lang_tokenizer.word_index['<start>']], 0)
    for t in range(max_length_targ):
        predictions, dec_hidden, attention_weights = decoder(dec_input, dec_hidden, enc_out)
        attention_weights = tf.reshape(attention_weights, (-1, ))
        predicted_id = tf.argmax(predictions[0]).numpy()
        result += target_lang_tokenizer.index_word[predicted_id] + ' '
        if target_lang_tokenizer.index_word[predicted_id] == '<end>':
            return result, sentence
        dec_input = tf.expand_dims([predicted_id], 0)
    return result, sentence

In [132]:
evaluate('dad')

('بابا <end> ', '<start> dad <end>')

In [133]:
evaluate('maybe its the wind')

('شايد اين باد باشه <end> ', '<start> maybe its the wind <end>')

In [75]:
evaluate('stop please stop')

('صبركنید صبركنید صبركنید <end> ', '<start> stop please stop <end>')

In [84]:
evaluate('you told me we made payments to hollander')

('تو به من گفته بودي که ما به هلندر پرداخت کرديم <end> ',
 '<start> you told me we made payments to hollander <end>')

In [77]:
evaluate('i have great lessons today')

('امروز صبح دارم <end> ', '<start> i have great lessons today <end>')

In [134]:
evaluate('boss ')

('رئيس <end> ', '<start> boss <end>')

In [80]:
evaluate('zodiac')

('زودياک <end> ', '<start> zodiac <end>')

In [122]:
evaluate('last night,  I saw a dream')

('ديشب خواب ديدم <end> ', '<start> last night i saw a dream <end>')

In [113]:
evaluate('would you have made anything different')

('تو تصميمه ديگه اي داري <end> ',
 '<start> would you have made anything different <end>')

In [121]:
evaluate('you lied to me , dan')

('تو به من دروغ گفتي دن <end> ', '<start> you lied to me dan <end>')

In [125]:
evaluate('good morning , pinkerton')

('صبح بخير پينکرتون <end> ', '<start> good morning pinkerton <end>')

In [126]:
evaluate('names charlie prince , i expect youve heard of me')

('من چارلي پرينسم فکر کنم از من شنيدم <end> ',
 '<start> names charlie prince i expect youve heard of me <end>')

In [131]:
evaluate('he is not what i expected')

('اون چيزي نيست که انتظارش را کردم <end> ',
 '<start> he is not what i expected <end>')

In [136]:
evaluate('cows are going to be fat')

('گاوها چاق خواهند بود <end> ', '<start> cows are going to be fat <end>')