In [51]:
import tensorflow as tf
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from keras.utils.vis_utils import plot_model
import numpy as np
import codecs
from keras.callbacks import ModelCheckpoint
import matplotlib.pyplot as plt
%matplotlib inline

In [52]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [64]:
import warnings
warnings.filterwarnings('ignore')

### **Read en,fa Data**

In [79]:
%%time
with open("TEP-fa.txt") as f:
    fa_file = f.read() 

with open("TEP-en.txt") as f:
    en_file = f.read()    

CPU times: user 186 ms, sys: 43.8 ms, total: 230 ms
Wall time: 231 ms


In [152]:
fa_data = fa_file.strip().split('\n')
en_data = en_file.strip().split('\n')[0:len(fa_data)]

In [153]:
len(fa_data)

306345

In [154]:
len(en_data)

306345

In [155]:
data = pd.DataFrame(columns=['en','fa'])
data['fa'] = fa_data
data['en'] = en_data

In [156]:
data.head()

Unnamed: 0,en,fa
0,raspy breathing .,صداي خر خر .
1,dad .,پدر .
2,maybe its the wind .,شايد صداي باد باشه .
3,no .,نه .
4,stop please stop .,دست نگه داريد خواهش ميکنم دست نگه داريد .


### **Export DataFrame to CSV**

In [128]:
data.to_csv('MT_dataset.csv', index=False)

# **Read Dataset**

In [129]:
data = pd.read_csv('MT_dataset.csv')

In [130]:
data.head(3)

Unnamed: 0,en,fa
0,raspy breathing .,صداي خر خر .
1,dad .,پدر .
2,maybe its the wind .,شايد صداي باد باشه .


In [131]:
len(data)

306345

In [132]:
data['en_size'] = data['en'].str.count(' ')
data['fa_size'] = data['fa'].str.count(' ')

In [133]:
data.head()

Unnamed: 0,en,fa,en_size,fa_size
0,raspy breathing .,صداي خر خر .,2,3
1,dad .,پدر .,1,1
2,maybe its the wind .,شايد صداي باد باشه .,4,4
3,no .,نه .,1,1
4,stop please stop .,دست نگه داريد خواهش ميکنم دست نگه داريد .,3,8


In [134]:
data['en_no_punctuation'] = data['en'].str.replace('[^\w\s]','')
data['en_no_punctuation'] = '<strat> ' + data["en_no_punctuation"].str.lower() + ' <end>'

In [135]:
data['fa_no_punctuation'] = '<strat> ' + data['fa'].str.replace('[^\w\s]','') + ' <end>'

In [136]:
data.head()

Unnamed: 0,en,fa,en_size,fa_size,en_no_punctuation,fa_no_punctuation
0,raspy breathing .,صداي خر خر .,2,3,<strat> raspy breathing <end>,<strat> صداي خر خر <end>
1,dad .,پدر .,1,1,<strat> dad <end>,<strat> پدر <end>
2,maybe its the wind .,شايد صداي باد باشه .,4,4,<strat> maybe its the wind <end>,<strat> شايد صداي باد باشه <end>
3,no .,نه .,1,1,<strat> no <end>,<strat> نه <end>
4,stop please stop .,دست نگه داريد خواهش ميکنم دست نگه داريد .,3,8,<strat> stop please stop <end>,<strat> دست نگه داريد خواهش ميکنم دست نگه داري...


In [137]:
data.loc[1, 'fa_no_punctuation']

'<strat> پدر  <end>'

In [138]:
texts_en = data['en_no_punctuation'].values
texts_fa = data['fa_no_punctuation'].values

In [139]:
texts_en = [re.sub('\s+', ' ', str(sentence)) for sentence in texts_en]

In [140]:
texts_fa = [re.sub('\s+', ' ', str(sentence)) for sentence in texts_fa]

In [141]:
texts_en[0:5]

['<strat> raspy breathing <end>',
 '<strat> dad <end>',
 '<strat> maybe its the wind <end>',
 '<strat> no <end>',
 '<strat> stop please stop <end>']

In [142]:
texts_fa[0:5]

['<strat> صداي خر خر <end>',
 '<strat> پدر <end>',
 '<strat> شايد صداي باد باشه <end>',
 '<strat> نه <end>',
 '<strat> دست نگه داريد خواهش ميکنم دست نگه داريد <end>']

In [143]:
def max_length(tensor):
    return max(len(t) for t in tensor)

In [144]:
def tokenize(lang):
    lang_tokenizer = Tokenizer()
    lang_tokenizer.fit_on_texts(lang)
    tensor = lang_tokenizer.texts_to_sequences(lang)
    tensor = pad_sequences(tensor, padding='post')
    return tensor, lang_tokenizer

In [145]:
def load_dataset(num_examples=None):
    inp_lang = texts_en
    targ_lang = texts_fa
    input_tensor, input_lang_tokenizer = tokenize(inp_lang)
    target_tensor, target_lang_tokenizer = tokenize(targ_lang)
    return input_tensor, target_tensor, input_lang_tokenizer, target_lang_tokenizer

In [146]:
input_tensor, target_tensor, input_lang_tokenizer, target_lang_tokenizer = load_dataset(20000)

In [147]:
input_tensor

array([[    2, 27435,  1516, ...,     0,     0,     0],
       [    2,   302,     1, ...,     0,     0,     0],
       [    2,   174,    31, ...,     0,     0,     0],
       ...,
       [    2,    11,    12, ...,     0,     0,     0],
       [    2,    28,    25, ...,     0,     0,     0],
       [    2,     9,  2244, ...,     0,     0,     0]], dtype=int32)

In [148]:
max_length_targ, max_length_inp = max_length(target_tensor), max_length(input_tensor)

In [149]:
max_length_targ, max_length_inp

(32, 36)

In [150]:
X_train, X_test, y_train, y_test = train_test_split(input_tensor, target_tensor, test_size=0.2)

In [151]:
def convert_tensor_to_word(lang_tokenizer, tensor):
    for t in tensor:
        if t != 0:
            print(t, ': ', lang_tokenizer.index_word[t])

In [46]:
convert_tensor_to_word(input_lang_tokenizer, input_tensor[0])

2 :  strat
14038 :  raspy
2150 :  breathing
1 :  end


In [47]:
input_tensor[0]

array([    2, 14038,  2150,     1,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0],
      dtype=int32)

In [48]:
list(target_lang_tokenizer.word_index)[0:20]

['strat',
 'end',
 'nan',
 'را',
 'من',
 'به',
 'تو',
 'که',
 'و',
 'از',
 'اين',
 'اون',
 'يک',
 'ما',
 'در',
 'با',
 'كه',
 'نه',
 'براي',
 'بود']