In [1]:
import re
from typing import List
import pandas as pd
from pandas import DataFrame
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense
import numpy as np

2022-09-23 08:03:00.069872: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-09-23 08:03:00.069901: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
# Load the training XML (Latin-script SourceName / Devanagari TargetName
# pairs) into a DataFrame.
data = pd.read_xml(path_or_buffer='nlp_data/NEWS2018_M-EnHi_trn.xml')

In [3]:
# Display the loaded frame (pandas elides the middle rows of a long frame).
data

Unnamed: 0,ID,SourceName,TargetName
0,1,aabhaa,आभा
1,2,aabheer,आभीर
2,3,aabhijaat,आभिजात
3,4,aabid,आबिद
4,5,aabshar,आबशर
...,...,...,...
12932,12933,zulm ko jala doonga,ज़ुल्म को जला दूँगा
12933,12934,zunheboto,जुन्हेबोटो
12934,12935,zurich financial services,ज़्यूरिक फ़ाइनेंशियल सर्विसेज़
12935,12936,zurna,ज़ुर्ना


In [4]:
# Size of the loaded frame as a (rows, columns) tuple.
data.shape

(12937, 3)

In [5]:
# Count missing values per column (isna is the canonical name for isnull).
data.isna().sum()

ID            0
SourceName    0
TargetName    0
dtype: int64

In [6]:
# Module-level accumulators: `preprocessing` appends to these, and the
# vocabulary / one-hot cells further down read them.
input_texts = []
target_texts = []

def preprocessing(dataframe: DataFrame,
                  column: str
                 ) -> List:
    """
    Cleans one text column of a dataframe and wraps each entry in
    start ('\\t') and end ('\\n') markers.

    Args:
        dataframe (Dataframe):
            The dataframe. Any index is accepted; rows are processed in
            their stored order.
        column (str):
            The column you want to process.
            If column = SourceName, every character that is not an ASCII
            letter is replaced by a space, the text is lower-cased, runs of
            whitespace are collapsed to single spaces, and the result is
            appended to the module-level ``input_texts``.
            If column = TargetName, the same steps are applied except that
            only the digits 0-9 are replaced by spaces, and the result is
            appended to the module-level ``target_texts``.

    Returns:
        list:
            The processed sentences produced by this call, in row order.
            The same entries are also appended to the matching
            module-level list, so calling this twice for one column
            duplicates that list's contents.

    Raises:
        ValueError:
            If ``column`` is neither "SourceName" nor "TargetName".
    """
    if column == "SourceName":
        pattern, sink = '[^a-zA-Z]', input_texts
    elif column == "TargetName":
        pattern, sink = '[0-9]', target_texts
    else:
        raise ValueError(
            "column must be 'SourceName' or 'TargetName', got %r" % (column,))

    processed = []
    # Iterate over the values themselves: dataframe[column][i] is a *label*
    # lookup and fails on any frame whose index is not 0..n-1.
    for raw_text in dataframe[str(column)]:
        cleaned = ' '.join(re.sub(pattern, ' ', raw_text).lower().split())
        processed.append('\t' + cleaned + '\n')

    sink.extend(processed)
    return processed

In [7]:
# Populate `input_texts` from the source-name column; the trailing ';'
# keeps the cell from echoing any return value.
preprocessing(dataframe=data, column='SourceName');

In [8]:
# Populate `target_texts` from the target-name column; the trailing ';'
# keeps the cell from echoing any return value.
preprocessing(dataframe=data, column='TargetName');

In [9]:
# Training configuration.
#   batch_size  - training batch size
#   epochs      - number of training epochs
#   latent_dim  - latent dimensionality of the encoding space
#   num_samples - number of training samples (not referenced by the
#                 data-preparation cells visible in this notebook section)
batch_size, epochs = 64, 100
latent_dim = 256
num_samples = 10_000

In [10]:
# input_characters, target_characters are the characters in the data
input_characters = set()
target_characters = set()

In [11]:
# adding unique characters from both the languages
for word in input_texts:
    for char in word:
        if char not in input_characters:
            input_characters.add(char)
for word in target_texts:
    for char in word:
        if char not in target_characters:
            target_characters.add(char)

In [12]:
# Vocabulary sizes and the longest text on each side (lengths count the
# '\t' / '\n' markers); these set the dimensions of the one-hot arrays.
num_encoder_tokens, num_decoder_tokens = len(input_characters), len(target_characters)
max_encoder_seq_length = max(map(len, input_texts))
max_decoder_seq_length = max(map(len, target_texts))

In [13]:
# Print the dataset statistics, one "label value" line each.
for label, value in (
    ('Number of samples:', len(input_texts)),
    ('Number of unique input tokens:', num_encoder_tokens),
    ('Number of unique output tokens:', num_decoder_tokens),
    ('Max sequence length for inputs:', max_encoder_seq_length),
    ('Max sequence length for outputs:', max_decoder_seq_length),
):
    print(label, value)

Number of samples: 12937
Number of unique input tokens: 29
Number of unique output tokens: 81
Max sequence length for inputs: 63
Max sequence length for outputs: 64


In [14]:
# Map each character to its one-hot column index. The vocabularies are sets,
# and the iteration order of a set of strings changes between interpreter
# runs (string hash randomization), so sort first: the same data then yields
# the same indices on every kernel restart.
input_token_index = {
    char: index for index, char in enumerate(sorted(input_characters))}
target_token_index = {
    char: index for index, char in enumerate(sorted(target_characters))}

In [15]:
# creating arrays for encoder_input
# Allocate the zero-filled one-hot arrays, each shaped
# (samples, padded sequence length, vocabulary size).
# Source and target texts are consumed pairwise when the arrays are filled,
# so a count mismatch would silently misalign or drop rows; fail early.
assert len(input_texts) == len(target_texts), (
    'input_texts and target_texts must have the same number of entries')
num_pairs = len(input_texts)

encoder_input_data = np.zeros(
    (num_pairs, max_encoder_seq_length, num_encoder_tokens),
    dtype='float32')
decoder_input_data = np.zeros(
    (num_pairs, max_decoder_seq_length, num_decoder_tokens),
    dtype='float32')
decoder_target_data = np.zeros(
    (num_pairs, max_decoder_seq_length, num_decoder_tokens),
    dtype='float32')

In [16]:
# One hot encoding
for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
    for t, char in enumerate(input_text):
        encoder_input_data[i, t, input_token_index[char]] = 1.
    encoder_input_data[i, t + 1:, input_token_index[' ']] = 1.
    
    for t, char in enumerate(target_text):
        # offset by one, "teacher forcing"
        decoder_input_data[i, t, target_token_index[char]] = 1.
        if t>0:
            # decoder_target_data ahead of decoder_input_data by one timestamp
            decoder_target_data[i, t - 1, target_token_index[char]] = 1.
            
    decoder_input_data[i, t + 1:, target_token_index[' ']] = 1.
    decoder_target_data[i, t:, target_token_index[' ']] = 1.