In [255]:
import tensorflow as tf
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Model
import numpy as np
import json

In [256]:
# Hyperparameters, grouped by what they control.

# Vocabulary sizes: (source language, target language)
input_vocab_size, output_vocab_size = 114, 80

# Network dimensions
embedding_dim = 100  # width of each GloVe embedding vector
hidden_units = 32    # number of units in every LSTM layer

# Data shape and training schedule
sequence_length = 35  # maximum sequence length
batch_size = 32       # examples per batch
num_epochs = 150      # number of training epochs

In [257]:
# Location of the downloaded GloVe vectors; change this if the file lives elsewhere.
glove_path = 'GloVe/glove.6B.100d.txt'
# Despite the name this is a dict: it maps each GloVe token to its vector.
embedding_matrix = dict()

In [258]:
# Read the GloVe text file line by line and fill the token -> vector lookup.
with open(glove_path, 'r', encoding='utf-8') as glove_file:
    for row in glove_file:
        fields = row.split()
        # The first field is the token; every remaining field is one float component.
        embedding_matrix[fields[0]] = np.asarray(fields[1:], dtype='float32')

In [259]:
# Allocate one zero-filled table per language: one row per token id,
# one column per embedding dimension.
source_embedding_matrix = np.zeros(shape=(input_vocab_size, embedding_dim))
target_embedding_matrix = np.zeros(shape=(output_vocab_size, embedding_dim))

In [260]:
# Load the dataset.
# The encoding is pinned to UTF-8 (as for the GloVe file) so that decoding does not
# depend on the platform's default text encoding.
with open('card.json', 'r', encoding='utf-8') as json_file:
    dataset = json.load(json_file)

In [261]:
# Build token -> integer id lookups:
#   source_tokenizer covers the question tokens (input sequences)
#   target_tokenizer covers the query tokens (output sequences)
# Ids are handed out in order of first appearance, starting at 1.
source_tokenizer = {}
target_tokenizer = {}

for example in dataset:
    question_toks = example["question_toks"]
    for token in question_toks:
        # setdefault only inserts tokens that have not been seen yet
        source_tokenizer.setdefault(token, len(source_tokenizer) + 1)
# Next unused source id
index_s = len(source_tokenizer) + 1

for example in dataset:
    query_toks = example["query_toks"]
    for token in query_toks:
        target_tokenizer.setdefault(token, len(target_tokenizer) + 1)
# Next unused target id
index_t = len(target_tokenizer) + 1

In [262]:
def _glove_vector(word):
    """Return the GloVe vector for `word`, or None when the token is unknown.

    The exact token is tried first. If that misses and the token is a string,
    its lower-cased form is tried as well: the glove.6B files are an uncased
    vocabulary, so capitalised tokens would otherwise never match and would
    keep the all-zero row.
    """
    vector = embedding_matrix.get(word)
    if vector is None and isinstance(word, str):
        vector = embedding_matrix.get(word.lower())
    return vector


# Copy the pretrained vector of every known token into the row of its id.
# Tokens that are still unknown keep their zero-initialised row.
for word, i in source_tokenizer.items():
    embedding_vector = _glove_vector(word)
    if embedding_vector is not None:
        source_embedding_matrix[i] = embedding_vector

for word, i in target_tokenizer.items():
    embedding_vector = _glove_vector(word)
    if embedding_vector is not None:
        target_embedding_matrix[i] = embedding_vector

In [263]:
# Collect the raw token lists of every example.
question_tokens = [sample["question_toks"] for sample in dataset]
query_tokens = [sample["query_toks"] for sample in dataset]

# Replace every token by its integer id; sequence lengths are still unequal here.
encoder_input_data_nopad = []
for sentence in question_tokens:
    encoder_input_data_nopad.append([source_tokenizer[token] for token in sentence])

decoder_input_data_nopad = []
for sentence in query_tokens:
    decoder_input_data_nopad.append([target_tokenizer[token] for token in sentence])


In [264]:
# Pad the sequences to a consistent length
def pad_sequences(sequences, max_length):
    """Return a new list holding a padded or truncated copy of every sequence.

    Args:
        sequences: iterable of lists of token ids.
        max_length: target length of each returned sequence.

    Returns:
        A list of lists. A sequence shorter than `max_length` gets trailing
        zeros appended; any other sequence is sliced to `[:max_length]`.
    """
    return [
        seq + [0] * (max_length - len(seq)) if len(seq) < max_length else seq[:max_length]
        for seq in sequences
    ]

# Bring the encoder and decoder inputs to sequence_length and convert them to arrays.
encoder_input_data = np.array(pad_sequences(encoder_input_data_nopad, sequence_length))
decoder_input_data = np.array(pad_sequences(decoder_input_data_nopad, sequence_length))

In [265]:
# Inspect the padded decoder inputs; the trailing 0 entries are the padding added by pad_sequences.
decoder_input_data

array([[ 1,  2,  3, ...,  0,  0,  0],
       [ 1,  2,  3, ...,  0,  0,  0],
       [ 1,  8,  9, ...,  0,  0,  0],
       ...,
       [ 1, 71,  6, ...,  0,  0,  0],
       [ 1,  8,  9, ...,  0,  0,  0],
       [ 1,  8,  9, ...,  0,  0,  0]])

In [266]:
# decoder_target_data: the decoder input shifted one step to the left,
# so position t holds the token found at position t + 1 of the input.
target_vocab = {0: 0}

# np.roll returns a shifted copy; the first column wraps around into the last
# column, which is then overwritten with the fill id.
decoder_target_data = np.roll(decoder_input_data, -1, axis=1)
decoder_target_data[:, -1] = target_vocab[0]

In [267]:
# Define the encoder
encoder_inputs = tf.keras.layers.Input(shape=(sequence_length,))
# mask_zero=True: id 0 is only produced by padding (token ids start at 1), so the
# padded timesteps are masked and the LSTM's final state comes from the last real
# token instead of from a run of padding steps.
encoder_embedding = Embedding(
    input_vocab_size,
    embedding_dim,
    weights=[source_embedding_matrix],  # pretrained GloVe rows
    trainable=False,                    # keep the pretrained vectors frozen
    mask_zero=True,
)(encoder_inputs)
# Only the final hidden/cell states are used; they seed the decoder.
encoder_lstm = LSTM(hidden_units, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]

In [268]:
# Define the decoder
decoder_inputs = tf.keras.layers.Input(shape=(sequence_length,))
# mask_zero=True: id 0 is only produced by padding (token ids start at 1), so the
# padded timesteps are masked. The mask propagates through the LSTM and Dense
# layers, which keeps padded positions from dominating the training loss.
decoder_embedding = Embedding(
    output_vocab_size,
    embedding_dim,
    weights=[target_embedding_matrix],  # pretrained GloVe rows
    trainable=False,                    # keep the pretrained vectors frozen
    mask_zero=True,
)(decoder_inputs)
# return_sequences=True gives one output per timestep; the decoder starts from
# the encoder's final states.
decoder_lstm = LSTM(hidden_units, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
# Per-timestep probability distribution over the target vocabulary.
decoder_dense = Dense(output_vocab_size, activation='softmax')
output = decoder_dense(decoder_outputs)

In [269]:
# Wire the encoder and decoder inputs to the softmax output, then configure training.
# The sparse loss takes integer token ids as targets rather than one-hot vectors.
model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=output)
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam')

In [270]:
# Print the model's layers with their output shapes and parameter counts.
model.summary()

Model: "model_6"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_13 (InputLayer)       [(None, 35)]                 0         []                            
                                                                                                  
 input_14 (InputLayer)       [(None, 35)]                 0         []                            
                                                                                                  
 embedding_12 (Embedding)    (None, 35, 100)              11400     ['input_13[0][0]']            
                                                                                                  
 embedding_13 (Embedding)    (None, 35, 100)              8000      ['input_14[0][0]']            
                                                                                            

In [271]:
# Show the shapes of the targets and of both model inputs, in that order.
for array in (decoder_target_data, encoder_input_data, decoder_input_data):
    print(array.shape)

(80, 35)
(80, 35)
(80, 35)


In [272]:
# Fit the model on (encoder input, decoder input) -> decoder target,
# keeping 20% of the examples aside for validation.
model.fit(
    x=[encoder_input_data, decoder_input_data],
    y=decoder_target_data,
    batch_size=batch_size,
    epochs=num_epochs,
    validation_split=0.2,
)

Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/150
Epoch 72/150
Epoch 73/150
Epoch 74/150
Epoch 75/150
Epoch 76/150
Epoch 77/150
Epoch 78

Epoch 84/150
Epoch 85/150
Epoch 86/150
Epoch 87/150
Epoch 88/150
Epoch 89/150
Epoch 90/150
Epoch 91/150
Epoch 92/150
Epoch 93/150
Epoch 94/150
Epoch 95/150
Epoch 96/150
Epoch 97/150
Epoch 98/150
Epoch 99/150
Epoch 100/150
Epoch 101/150
Epoch 102/150
Epoch 103/150
Epoch 104/150
Epoch 105/150
Epoch 106/150
Epoch 107/150
Epoch 108/150
Epoch 109/150
Epoch 110/150
Epoch 111/150
Epoch 112/150
Epoch 113/150
Epoch 114/150
Epoch 115/150
Epoch 116/150
Epoch 117/150
Epoch 118/150
Epoch 119/150
Epoch 120/150
Epoch 121/150
Epoch 122/150
Epoch 123/150
Epoch 124/150
Epoch 125/150
Epoch 126/150
Epoch 127/150
Epoch 128/150
Epoch 129/150
Epoch 130/150
Epoch 131/150
Epoch 132/150
Epoch 133/150
Epoch 134/150
Epoch 135/150
Epoch 136/150
Epoch 137/150
Epoch 138/150
Epoch 139/150
Epoch 140/150
Epoch 141/150
Epoch 142/150
Epoch 143/150
Epoch 144/150
Epoch 145/150
Epoch 146/150
Epoch 147/150
Epoch 148/150
Epoch 149/150
Epoch 150/150


<keras.src.callbacks.History at 0x1b2580f1650>