In [1]:
import tensorflow as tf
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from keras.utils.vis_utils import plot_model
import numpy as np
import codecs
from keras.callbacks import ModelCheckpoint
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [3]:
import warnings
warnings.filterwarnings('ignore')

In [4]:
# Empty two-column frame; the loading cells below fill 'en' and 'fa' row by row.
column_names = ['en', 'fa']
df = pd.DataFrame(columns=column_names)
df

Unnamed: 0,en,fa


In [5]:
# Maximum number of lines read from each corpus file by the two loading cells below.
limit = 100_000

In [None]:
%%time
with codecs.open("TEP-en.txt", "r", "utf-8") as text:
    for i,line in enumerate(text):
        df.at[i,'en'] = line.strip()[:-1]
        if(i==(limit-1)):
            break

CPU times: user 8min 18s, sys: 11.2 s, total: 8min 29s
Wall time: 8min 29s


In [None]:
%%time
with codecs.open("TEP-fa.txt", "r", "utf-8") as text:
    for i,line in enumerate(text):
        df.at[i,'fa'] = line.strip()[:-1]
        if(i==(limit-1)):
            break

CPU times: user 1.06 s, sys: 16 ms, total: 1.07 s
Wall time: 1.07 s


### **Export DataFrame to CSV**

In [None]:
# The row index is written as the first, unnamed CSV column (pandas default);
# the reading section removes that column again after loading.
df.to_csv('en_to_fa.csv', index=True)

# **Read Dataset**

In [6]:
# Load the exported sentence pairs; the saved index comes back as an extra first column.
data = pd.read_csv(filepath_or_buffer='en_to_fa.csv')

In [7]:
# Remove the index column that to_csv wrote (read back as 'Unnamed: 0').
# Dropping by name instead of by position keeps this cell idempotent:
# re-running it is a no-op rather than silently discarding the 'en' column.
data = data.drop(columns=['Unnamed: 0'], errors='ignore')

In [8]:
# Preview the first three sentence pairs.
data.iloc[:3]

Unnamed: 0,en,fa
0,raspy breathing,صداي خر خر
1,dad,پدر
2,maybe its the wind,شايد صداي باد باشه


In [9]:
# Number of sentence pairs loaded.
data.shape[0]

100000

In [10]:
# '<lang>_size' is the number of space characters in each sentence, not a
# true word count; rows with a missing sentence get NaN.
for lang in ('en', 'fa'):
    data[f'{lang}_size'] = data[lang].str.count(' ')

In [11]:
# Check the new size columns on the first five rows.
data.head(n=5)

Unnamed: 0,en,fa,en_size,fa_size
0,raspy breathing,صداي خر خر,2.0,3.0
1,dad,پدر,1.0,1.0
2,maybe its the wind,شايد صداي باد باشه,4.0,4.0
3,no,نه,1.0,1.0
4,stop please stop,دست نگه داريد خواهش ميکنم دست نگه داريد,3.0,8.0


In [12]:
# Remove every character that is neither a word character nor whitespace,
# then lowercase. regex=True is required: from pandas 2.0 on, str.replace
# treats the pattern as a literal string by default, which would leave all
# punctuation in place. The raw string avoids invalid-escape warnings.
data['en_no_punctuation'] = (
    data['en']
    .str.replace(r'[^\w\s]', '', regex=True)
    .str.lower()
)

In [13]:
# Strip punctuation from the Persian side and wrap each sentence in the
# sequence markers. regex=True is required: from pandas 2.0 on, str.replace
# treats the pattern as a literal string by default.
# NOTE(review): '<strat>' looks like a typo for '<start>'; the spelling is
# kept because later cells may look the marker up under this exact name.
# NOTE(review): the pattern also removes zero-width non-joiners (U+200C),
# which are neither \w nor \s — confirm this is intended for Persian text.
fa_clean = data['fa'].str.replace(r'[^\w\s]', '', regex=True)
data['fa_no_punctuation'] = '<strat> ' + fa_clean + ' <end>'

In [14]:
# Inspect the cleaned columns on the first five rows.
data.head(n=5)

Unnamed: 0,en,fa,en_size,fa_size,en_no_punctuation,fa_no_punctuation
0,raspy breathing,صداي خر خر,2.0,3.0,raspy breathing,<strat> صداي خر خر <end>
1,dad,پدر,1.0,1.0,dad,<strat> پدر <end>
2,maybe its the wind,شايد صداي باد باشه,4.0,4.0,maybe its the wind,<strat> شايد صداي باد باشه <end>
3,no,نه,1.0,1.0,no,<strat> نه <end>
4,stop please stop,دست نگه داريد خواهش ميکنم دست نگه داريد,3.0,8.0,stop please stop,<strat> دست نگه داريد خواهش ميکنم دست نگه داري...


In [15]:
# Look at one raw cell to see the exact spacing around the markers.
data.at[1, 'fa_no_punctuation']

'<strat> پدر  <end>'

In [23]:
# Pull the cleaned sentences out of the frame as NumPy arrays.
texts_en = data['en_no_punctuation'].to_numpy()
texts_fa = data['fa_no_punctuation'].to_numpy()

In [24]:
# Collapse every run of whitespace into a single space. The raw string avoids
# the invalid '\s' escape sequence that newer Python versions warn about.
# NOTE(review): str() turns a missing value (NaN) into the literal text 'nan';
# confirm whether such rows should be dropped before tokenising.
texts_en = [re.sub(r'\s+', ' ', str(sentence)) for sentence in texts_en]

In [25]:
# Collapse every run of whitespace into a single space (this removes the
# double space left before '<end>'). The raw string avoids the invalid '\s'
# escape sequence that newer Python versions warn about.
# NOTE(review): str() turns a missing value (NaN) into the literal text 'nan';
# confirm whether such rows should be dropped before tokenising.
texts_fa = [re.sub(r'\s+', ' ', str(sentence)) for sentence in texts_fa]

In [33]:
# First five normalised English sentences.
texts_en[:5]

['raspy breathing ', 'dad ', 'maybe its the wind ', 'no ', 'stop please stop ']

In [32]:
# First five normalised Persian sentences, including the markers.
texts_fa[:5]

['<strat> صداي خر خر <end>',
 '<strat> پدر <end>',
 '<strat> شايد صداي باد باشه <end>',
 '<strat> نه <end>',
 '<strat> دست نگه داريد خواهش ميکنم دست نگه داريد <end>']

In [27]:
# English (source) side: build the vocabulary, map sentences to integer ids,
# and force every sequence to exactly `maxlen_en` ids.
maxlen_en = 20

tokenizer_en = Tokenizer()
tokenizer_en.fit_on_texts(texts_en)
en_data = tokenizer_en.texts_to_sequences(texts_en)

# Keras defaults, spelled out: zeros are added at the front and over-long
# sentences lose their leading tokens.
padd_en_data = pad_sequences(
    en_data,
    maxlen=maxlen_en,
    padding='pre',
    truncating='pre',
)

In [30]:
# Persian (target) side: build the vocabulary, map sentences to integer ids,
# and force every sequence to exactly `maxlen_fa` ids.
# NOTE(review): Tokenizer's default filters include '<' and '>', so the
# markers are presumably stored as 'strat' and 'end' — confirm where they
# are looked up later.
maxlen_fa = 20

tokenizer_fa = Tokenizer()
tokenizer_fa.fit_on_texts(texts_fa)
fa_data = tokenizer_fa.texts_to_sequences(texts_fa)

# Pad and truncate at the end. With the default truncating='pre', any target
# longer than `maxlen_fa` loses its first tokens, including the leading
# start marker.
padd_fa_data = pad_sequences(
    fa_data,
    maxlen=maxlen_fa,
    padding='post',
    truncating='post',
)