In [1]:
import os
import string
from string import digits
import matplotlib.pyplot as plt
%matplotlib inline
import re

import pandas as pd 
import numpy as np
import seaborn as sns
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from keras.layers import Input, LSTM, Embedding, Dense
from keras.models import Model

In [2]:
df= pd.read_csv('/kaggle/input/nepali-data/Eng_Nep.csv')
df.head()

Unnamed: 0,english_sentence,nepali_sentence
0,politicians do not have permission to do what ...,राजनीतिज्ञ गरिन आवश्यक के गर्न अनुमति छैन।
1,"I'd like to tell you about one such child,","म एक यस्तो बच्चा बारेमा बताउन चाहन्छु,"
2,This percentage is even greater than the perce...,यो प्रतिशत भारत प्रतिशत भन्दा पनि ठूलो छ।
3,what we really mean is that they're bad at not...,हामी साँच्चै के मतलब तिनीहरूले ध्यान छैन मा हु...
4,.The ending portion of these Vedas is called U...,यी Vedas को अन्त्य भाग उपनिषद् भनिन्छ।


In [4]:
df.drop_duplicates(inplace=True)
df['nepali']=df['nepali_sentence'].apply(lambda x: True if re.search('[A-Za-z]',x) else False )
df.drop(df[df['nepali'] == True].index, inplace=True)

In [5]:
df.head()

Unnamed: 0,english_sentence,nepali_sentence,nepali
0,politicians do not have permission to do what ...,राजनीतिज्ञ गरिन आवश्यक के गर्न अनुमति छैन।,False
1,"I'd like to tell you about one such child,","म एक यस्तो बच्चा बारेमा बताउन चाहन्छु,",False
2,This percentage is even greater than the perce...,यो प्रतिशत भारत प्रतिशत भन्दा पनि ठूलो छ।,False
3,what we really mean is that they're bad at not...,हामी साँच्चै के मतलब तिनीहरूले ध्यान छैन मा हु...,False
5,The then Governor of Kashmir resisted transfer...,"त कश्मीर को राज्यपाल स्थानान्तरण प्रतिरोध, तर ...",False


In [6]:
df['english_sentence'] = df['english_sentence'].apply(lambda x:x.lower())
df['nepali_sentence'] = df['nepali_sentence'].apply(lambda x:x.lower())

In [7]:
# Remove quotes
df['english_sentence']=df['english_sentence'].apply(lambda x: re.sub("'", '', x))
df['nepali_sentence']=df['nepali_sentence'].apply(lambda x: re.sub("'", '', x))

In [8]:
exclude = set(string.punctuation) # Set of all special characters
# Remove all the special characters
df['english_sentence']=df['english_sentence'].apply(lambda x: ''.join(ch for ch in x if ch not in exclude))
df['nepali_sentence']=df['nepali_sentence'].apply(lambda x: ''.join(ch for ch in x if ch not in exclude))

In [9]:
# Remove all numbers from text
remove_digits = str.maketrans('', '', digits)
df['english_sentence']=df['english_sentence'].apply(lambda x: x.translate(remove_digits))
df['nepali_sentence']=df['nepali_sentence'].apply(lambda x: x.translate(remove_digits))

df['nepali_sentence'] = df['nepali_sentence'].apply(lambda x: re.sub("[२३०८१५७९४६]", "", x))

# Remove extra spaces
df['english_sentence']=df['english_sentence'].apply(lambda x: x.strip())
df['nepali_sentence']=df['nepali_sentence'].apply(lambda x: x.strip())
df['english_sentence']=df['english_sentence'].apply(lambda x: re.sub(" +", " ", x))
df['nepali_sentence']=df['nepali_sentence'].apply(lambda x: re.sub(" +", " ", x))

In [10]:
# Add start and end tokens to target sequences
df['nepali_sentence'] = df['nepali_sentence'].apply(lambda x : 'START_ '+ x + ' _END')


In [11]:
### Get English and nepali Vocabulary
all_eng_words=set()
for eng in df['english_sentence']:
    for word in eng.split():
        if word not in all_eng_words:
            all_eng_words.add(word)

all_nepali_words=set()
for nep in df['nepali_sentence']:
    for word in nep.split():
        if word not in all_nepali_words:
            all_nepali_words.add(word)

In [12]:
df['length_eng_sentence']=df['english_sentence'].apply(lambda x:len(x.split(" ")))
df['length_nep_sentence']=df['nepali_sentence'].apply(lambda x:len(x.split(" ")))

In [13]:
df=df[df['length_eng_sentence']<=20]
df=df[df['length_nep_sentence']<=20]

In [15]:
print("maximum length of NepaliSentence ",max(df['length_nep_sentence']))
print("maximum length of English Sentence ",max(df['length_eng_sentence']))

maximum length of NepaliSentence  20
maximum length of English Sentence  20


In [17]:
max_length_src=max(df['length_nep_sentence'])
max_length_tar=max(df['length_eng_sentence'])
# max_length_src,max_length_tar

(20, 20)

In [18]:
input_words = sorted(list(all_eng_words))
target_words = sorted(list(all_nepali_words))
num_encoder_tokens = len(all_eng_words)
num_decoder_tokens = len(all_nepali_words)
num_encoder_tokens, num_decoder_tokens

(11198, 10821)

In [19]:
num_decoder_tokens += 1 #for zero padding

# num_encoder_tokens = num_encoder_tokens + 1 

In [20]:
input_token_index = dict([(word, i+1) for i, word in enumerate(input_words)])
target_token_index = dict([(word, i+1) for i, word in enumerate(target_words)])

In [21]:
reverse_input_char_index = dict((i, word) for word, i in input_token_index.items())
reverse_target_char_index = dict((i, word) for word, i in target_token_index.items())

In [22]:
X, y = df['english_sentence'], df['nepali_sentence']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2,random_state=42)
X_train.shape, X_test.shape

((7942,), (1986,))

In [None]:
# encoder_input_data = np.zeros(
#     (len(df.english_sentence), max_length_src),
#     dtype='float32')
# decoder_input_data = np.zeros(
#     (len(df.nepali_sentence), max_length_tar),
#     dtype='float32')
# decoder_target_data = np.zeros(
#     (len(df.nepali_sentence), max_length_tar, num_decoder_tokens),
#     dtype='float32')

In [None]:
# for i, (input_text, target_text) in enumerate(zip(df.english_sentence, df.nepali_sentence)):
#     for t, word in enumerate(input_text.split()):
#         encoder_input_data[i, t] = input_token_index[word]
#     for t, word in enumerate(target_text.split()):
#         # decoder_target_data is ahead of decoder_input_data by one timestep
#         decoder_input_data[i, t] = target_token_index[word]
#         if t > 0:
#             # decoder_target_data will be ahead by one timestep
#             # and will not include the start character.
#             decoder_target_data[i, t - 1, target_token_index[word]] = 1.

In [None]:
def generate_batch(X = X_train, y = y_train, batch_size = 128):
    ''' Generate a batch of data '''
    while True:
        for j in range(0, len(X), batch_size):
            encoder_input_data = np.zeros((batch_size, max_length_src),dtype='float32')
            decoder_input_data = np.zeros((batch_size, max_length_tar),dtype='float32')
            decoder_target_data = np.zeros((batch_size, max_length_tar, num_decoder_tokens),dtype='float32')
            for i, (input_text, target_text) in enumerate(zip(X[j:j+batch_size], y[j:j+batch_size])):
                for t, word in enumerate(input_text.split()):
                    encoder_input_data[i, t] = input_token_index[word] # encoder input seq
                for t, word in enumerate(target_text.split()):
                    if t<len(target_text.split())-1:
                        decoder_input_data[i, t] = target_token_index[word] # decoder input seq
                    if t>0:
                        # decoder target sequence (one hot encoded)
                        # does not include the START_ token
                        # Offset by one timestep
                        decoder_target_data[i, t - 1, target_token_index[word]] = 1.
            yield([encoder_input_data, decoder_input_data], decoder_target_data)

In [None]:
embedding_size = 50

In [31]:
Input(shape=(None,))

<tf.Tensor 'input_1:0' shape=(None, None) dtype=float32>

In [None]:
encoder_inputs = Input(shape=(None,))
en_x=  Embedding(num_encoder_tokens, embedding_size)(encoder_inputs)
encoder = LSTM(50, return_state=True)
encoder_outputs, state_h, state_c = encoder(en_x)
# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

In [None]:
# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(None,))

dex=  Embedding(num_decoder_tokens, embedding_size)

final_dex= dex(decoder_inputs)


decoder_lstm = LSTM(50, return_sequences=True, return_state=True)

decoder_outputs, _, _ = decoder_lstm(final_dex,
                                     initial_state=encoder_states)

decoder_dense = Dense(num_decoder_tokens, activation='softmax')

decoder_outputs = decoder_dense(decoder_outputs)

model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['acc'])

In [None]:
model.summary()

In [None]:
train_samples = len(X_train)
val_samples = len(X_test)
batch_size = 128
epochs = 20

In [None]:
model.fit_generator(generator = generate_batch(X_train, y_train, batch_size = batch_size),
                    steps_per_epoch = train_samples//batch_size,
                    epochs=epochs,
                    validation_data = generate_batch(X_test, y_test, batch_size = batch_size),
                    validation_steps = val_samples//batch_size)

In [None]:
# model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
#           batch_size=128,
#           epochs=20,
#           validation_split=0.05)

In [None]:
encoder_model = Model(encoder_inputs, encoder_states)
encoder_model.summary()

In [None]:
decoder_state_input_h = Input(shape=(50,))
decoder_state_input_c = Input(shape=(50,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

final_dex2= dex(decoder_inputs)

decoder_outputs2, state_h2, state_c2 = decoder_lstm(final_dex2, initial_state=decoder_states_inputs)
decoder_states2 = [state_h2, state_c2]
decoder_outputs2 = decoder_dense(decoder_outputs2)
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs2] + decoder_states2)

# Reverse-lookup token index to decode sequences back to
# something readable.
reverse_input_char_index = dict(
    (i, char) for char, i in input_token_index.items())
reverse_target_char_index = dict(
    (i, char) for char, i in target_token_index.items())

In [None]:
def decode_sequence(input_seq):
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)
    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1,1))
    # Populate the first character of target sequence with the start character.
    target_seq[0, 0] = target_token_index['START_']

    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + states_value)

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += ' '+sampled_char

        # Exit condition: either hit max length
        # or find stop character.
        if (sampled_char == '_END' or
           len(decoded_sentence) > 52):
            stop_condition = True

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1,1))
        target_seq[0, 0] = sampled_token_index

        # Update states
        states_value = [h, c]

    return decoded_sentence

In [None]:
# for seq_index in [14077,10000,5000,1000,18000]:
#     input_seq = encoder_input_data[seq_index: seq_index + 1]
#     decoded_sentence = decode_sequence(input_seq)
#     print('-')
#     print('Input sentence:', df.english_sentence[seq_index: seq_index + 1])
#     print('Decoded sentence:', decoded_sentence)

In [None]:
train_gen = generate_batch(X_train, y_train, batch_size = 1)
k=-1


In [None]:
k+=1
(input_seq, actual_output), _ = next(train_gen)
decoded_sentence = decode_sequence(input_seq)
print('Input English sentence:', X_train[k:k+1].values[0])
print('Actual Nepali Translation:', y_train[k:k+1].values[0][6:-4])
print('Predicted Nepali Translation:', decoded_sentence[:-4])

In [None]:
k+=1
(input_seq, actual_output), _ = next(train_gen)
decoded_sentence = decode_sequence(input_seq)
print('Input English sentence:', X_train[k:k+1].values[0])
print('Actual Nepali Translation:', y_train[k:k+1].values[0][6:-4])
print('Predicted Nepali Translation:', decoded_sentence[:-4])

In [None]:
k+=1
(input_seq, actual_output), _ = next(train_gen)
decoded_sentence = decode_sequence(input_seq)
print('Input English sentence:', X_train[k:k+1].values[0])
print('Actual Nepali Translation:', y_train[k:k+1].values[0][6:-4])
print('Predicted Nepali Translation:', decoded_sentence[:-4])

In [None]:
k+=1
(input_seq, actual_output), _ = next(train_gen)
decoded_sentence = decode_sequence(input_seq)
print('Input English sentence:', X_train[k:k+1].values[0])
print('Actual Nepali Translation:', y_train[k:k+1].values[0][6:-4])
print('Predicted Nepali Translation:', decoded_sentence[:-4])

In [None]:
k+=1
(input_seq, actual_output), _ = next(train_gen)
decoded_sentence = decode_sequence(input_seq)
print('Input English sentence:', X_train[k:k+1].values[0])
print('Actual Nepali Translation:', y_train[k:k+1].values[0][6:-4])
print('Predicted Nepali Translation:', decoded_sentence[:-4])

In [None]:
k+=1
(input_seq, actual_output), _ = next(train_gen)
decoded_sentence = decode_sequence(input_seq)
print('Input English sentence:', X_train[k:k+1].values[0])
print('Actual Nepali Translation:', y_train[k:k+1].values[0][6:-4])
print('Predicted Nepali Translation:', decoded_sentence[:-4])