### Loading Dataset

In [17]:
import pandas as pd

In [24]:
# Read the tab-separated corpus; the file has no header row, so the three
# columns are named explicitly.
Dataset = pd.read_csv(
    "hin.txt",
    sep="\t",
    header=None,
    names=["English", "Hindi", "Extra"],
)

In [25]:
# Preview the first rows to confirm the three columns were parsed as expected.
Dataset.head()

Unnamed: 0,English,Hindi,Extra
0,Wow!,वाह!,CC-BY 2.0 (France) Attribution: tatoeba.org #5...
1,Help!,बचाओ!,CC-BY 2.0 (France) Attribution: tatoeba.org #4...
2,Jump.,उछलो.,CC-BY 2.0 (France) Attribution: tatoeba.org #6...
3,Jump.,कूदो.,CC-BY 2.0 (France) Attribution: tatoeba.org #6...
4,Jump.,छलांग.,CC-BY 2.0 (France) Attribution: tatoeba.org #6...


In [26]:
# Discard every row that has a missing value in any column.
Dataset = Dataset.dropna(axis="index", how="any")

In [27]:
# (rows, columns) remaining after the rows with missing values were dropped.
Dataset.shape

(2773, 3)

### Splitting into English and Hindi sentence lists (English sentences of at most 300 characters)

In [28]:
# Build two parallel lists of sentence pairs. A pair is kept only when its
# English sentence is at most 300 characters long (len() counts characters).
input_sentences = []
output_sentences = []
for english, hindi in zip(Dataset["English"], Dataset["Hindi"]):
    if len(english) > 300:
        continue
    input_sentences.append(english)
    output_sentences.append(hindi)

In [29]:
# Both lists must have the same length because they hold parallel pairs.
print(len(input_sentences), len(output_sentences), sep="\n")

2773
2773


### Keeping at most the first 20,000 sentences

In [30]:
# Cap both parallel lists at their first 20,000 entries; shorter lists are
# left unchanged by the slice.
input_sentences, output_sentences = input_sentences[:20000], output_sentences[:20000]

In [31]:
# Re-check the sizes after truncation; the two counts must still match.
print(len(input_sentences), len(output_sentences), sep="\n")

2773
2773


In [32]:
# Spot-check the first English sentence before any preprocessing.
input_sentences[0]

'Wow!'

In [33]:
# Spot-check the matching first Hindi sentence before any preprocessing.
output_sentences[0]

'वाह!'

### Pre-processing Text

#### Lower-casing

In [34]:
# Lower-case every English sentence, updating the list in place.
input_sentences[:] = [sentence.lower() for sentence in input_sentences]

In [35]:
# Apply the same lower-casing to the Hindi side, updating the list in place.
# NOTE(review): presumably this only affects Latin letters embedded in the
# Hindi text, since Devanagari has no letter case - confirm.
output_sentences[:] = [sentence.lower() for sentence in output_sentences]

#### Expand contractions

In [36]:
from pycontractions import Contractions

In [37]:
# Create the contraction-expansion helper from the local embeddings file
# named below (presumably pre-trained word2vec vectors, going by the file
# name - confirm).
cont = Contractions('GoogleNews-vectors-negative300.bin')

In [38]:
# Load the helper's models explicitly before it is used.
# NOTE(review): presumably this avoids lazy loading on the first
# expand_texts() call - confirm against the pycontractions documentation.
cont.load_models()

In [39]:
# Expand English contractions in every input sentence.
# expand_texts() accepts a list of texts and yields one expanded string per
# text, so the whole corpus goes through a single call and the yielded strings
# are stored directly.
# Do not wrap the result in str(list(...)): that stores the repr of a
# one-element list (e.g. "['help!']"), which adds brackets and quotes and
# backslash-escapes non-printable characters inside the sentence.
input_sentences = list(cont.expand_texts(input_sentences, precise=True))

In [40]:
# Run the same contraction pass over the Hindi sentences, storing each yielded
# string directly.
# Do not wrap the result in str(list(...)): that stores the repr of a
# one-element list, which adds brackets and quotes and backslash-escapes
# non-printable characters (e.g. zero-width joiners), leaving stray letters in
# the text once punctuation is stripped.
# NOTE(review): the contraction rules are presumably English-only, so this
# pass may be a no-op on Devanagari text - confirm whether it is needed.
output_sentences = list(cont.expand_texts(output_sentences, precise=True))

#### Remove Special characters (includes double-quotes)

In [41]:
import string

In [42]:
# Set of ASCII punctuation characters to strip from the sentences.
special = {symbol for symbol in string.punctuation}

In [43]:
# Rebuild each English sentence from the characters that are not in `special`,
# updating the list in place.
input_sentences[:] = [
    "".join(ch for ch in sentence if ch not in special)
    for sentence in input_sentences
]

In [44]:
# Rebuild each Hindi sentence from the characters that are not in `special`,
# updating the list in place.
# NOTE(review): `special` is built from string.punctuation, so Devanagari
# marks such as the danda would pass through unchanged - confirm whether that
# is intended.
output_sentences[:] = [
    "".join(ch for ch in sentence if ch not in special)
    for sentence in output_sentences
]

In [45]:
# Spot-check an English sentence after punctuation removal.
input_sentences[1]

'help'

In [46]:
# Spot-check the matching Hindi sentence after punctuation removal.
output_sentences[1]

'बचाओ'

#### Removing Numbers

In [47]:
from string import digits

In [48]:
# Translation table that deletes the ASCII digits 0-9 (each maps to None).
remove_digits_english = str.maketrans(dict.fromkeys(digits))

In [49]:
# Delete ASCII digits from every English sentence, updating the list in place.
input_sentences[:] = [
    sentence.translate(remove_digits_english) for sentence in input_sentences
]

In [50]:
# Translation table that deletes the ten Devanagari digits.
# Only the digits are listed. str.maketrans treats every character of its
# third argument literally, so regex-style "[...]" brackets around the digits
# would make the table delete '[' and ']' as well.
remove_digits_hindi = str.maketrans('', '', "०१२३४५६७८९")

In [51]:
# Delete digits from every Hindi sentence, updating the list in place: first
# the characters covered by the Hindi table, then the ASCII digits.
output_sentences[:] = [
    sentence.translate(remove_digits_hindi).translate(remove_digits_english)
    for sentence in output_sentences
]

#### Removing Extra spaces

In [52]:
# Collapse every run of whitespace to one space and trim both ends.
# split() with no arguments already discards leading and trailing whitespace.
input_sentences[:] = [" ".join(sentence.split()) for sentence in input_sentences]

In [53]:
# Collapse every run of whitespace to one space and trim both ends.
# split() with no arguments already discards leading and trailing whitespace.
output_sentences[:] = [" ".join(sentence.split()) for sentence in output_sentences]

### Saving File

In [54]:
import numpy as np

In [55]:
# Persist both cleaned sentence lists; np.save appends ".npy" to each name.
for file_stem, sentences in (
    ("English_text", input_sentences),
    ("Hindi_text", output_sentences),
):
    np.save(file_stem, sentences)