In [273]:
import tensorflow as tf
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Model
import numpy as np
import json

In [274]:
# Hyperparameters, grouped by what they control

# -- vocabulary / embedding sizes --
embedding_dim = 100       # width of each GloVe vector
input_vocab_size = 114    # vocabulary size for the source language
output_vocab_size = 80    # vocabulary size for the target language

# -- model / data shape --
sequence_length = 35      # maximum sequence length
hidden_units = 32         # units in each LSTM layer

# -- training loop --
num_epochs = 200          # number of training epochs
batch_size = 32           # examples per batch

In [275]:
# Location of the downloaded GloVe vectors (adjust to your local copy)
glove_path = 'GloVe/glove.6B.100d.txt'

# Despite the name this is a plain dict; it starts empty and is filled with
# token -> vector entries when the GloVe file is read
embedding_matrix = dict()

In [276]:
# Read the GloVe text file one line at a time: the first whitespace-separated
# field is the token, the remaining fields are its vector components.
with open(glove_path, mode='r', encoding='utf-8') as glove_file:
    for record in glove_file:
        fields = record.split()
        # Store the components as a float32 array keyed by the token
        embedding_matrix[fields[0]] = np.asarray(fields[1:], dtype='float32')

In [277]:
# Allocate zero-filled embedding tables for the source and target languages:
# one row per token id, embedding_dim columns each.
source_embedding_matrix, target_embedding_matrix = (
    np.zeros((vocab_rows, embedding_dim))
    for vocab_rows in (input_vocab_size, output_vocab_size)
)

In [278]:
# Load the dataset.
# The encoding is given explicitly: without it, open() decodes with the
# platform's locale codec, so non-ASCII tokens could be read differently (or
# fail to decode) from one machine to another.
with open('card.json', 'r', encoding='utf-8') as json_file:
    dataset = json.load(json_file)

In [279]:
# Map input and output tokens to integer ids.
#   source_tokenizer : ids for the input sequences  ("question_toks")
#   target_tokenizer : ids for the output sequences ("query_toks")
# Ids start at 1 and are handed out in order of first appearance.

def _index_tokens(examples, field):
    """Return a dict mapping each distinct token under ``field`` to a 1-based id.

    ``examples`` is iterated in order and ``example[field]`` is read for each
    one; a token receives its id the first time it is seen.
    """
    token_ids = {}
    for example in examples:
        for token in example[field]:
            if token not in token_ids:
                token_ids[token] = len(token_ids) + 1
    return token_ids


source_tokenizer = _index_tokens(dataset, "question_toks")
target_tokenizer = _index_tokens(dataset, "query_toks")

# Next unused id of each vocabulary (one past the largest id assigned above)
index_s = len(source_tokenizer) + 1
index_t = len(target_tokenizer) + 1

In [280]:
def _copy_pretrained_vectors(token_index, vectors, matrix):
    """Copy the pretrained vector of every known token into its row of ``matrix``.

    Parameters
    ----------
    token_index : dict
        Maps token -> row index in ``matrix``.
    vectors : dict
        Maps token -> pretrained vector (the dict filled from the GloVe file).
    matrix : numpy.ndarray
        Embedding table, modified in place. Rows of tokens that have no
        pretrained vector are left untouched.

    A token is looked up exactly as written first. If that misses, its
    lowercase form is tried: glove_path points at a glove.6B file, whose
    vocabulary is uncased, so a token containing capital letters would
    otherwise never match and would keep an all-zero row.
    """
    for token, row in token_index.items():
        vector = vectors.get(token)
        if vector is None and isinstance(token, str):
            vector = vectors.get(token.lower())
        if vector is not None:
            matrix[row] = vector


# Fill the source and target tables from the same pretrained vectors
_copy_pretrained_vectors(source_tokenizer, embedding_matrix, source_embedding_matrix)
_copy_pretrained_vectors(target_tokenizer, embedding_matrix, target_embedding_matrix)

In [281]:
def _encode(sentences, token_ids):
    """Replace every token of every sentence with its integer id from ``token_ids``."""
    return [[token_ids[token] for token in sentence] for sentence in sentences]


# Gather the raw token lists of every example
question_tokens, query_tokens = [], []
for example in dataset:
    question_tokens.append(example["question_toks"])
    query_tokens.append(example["query_toks"])

# Integer-encoded sequences, still of varying length (not padded yet)
encoder_input_data_nopad = _encode(question_tokens, source_tokenizer)
decoder_input_data_nopad = _encode(query_tokens, target_tokenizer)


In [282]:
# Pad the sequences to a consistent length
def pad_sequences(sequences, max_length):
    """Return new lists, each exactly ``max_length`` items long.

    A sequence shorter than ``max_length`` is right-padded with 0; any other
    sequence is cut down to its first ``max_length`` items. The input
    sequences themselves are not modified.
    """
    return [
        seq + [0] * (max_length - len(seq)) if len(seq) < max_length else seq[:max_length]
        for seq in sequences
    ]

# Pad/truncate both id lists to sequence_length and convert them to NumPy arrays
encoder_input_data = np.array(pad_sequences(encoder_input_data_nopad, sequence_length))
decoder_input_data = np.array(pad_sequences(decoder_input_data_nopad, sequence_length))

In [283]:
# Inspect the padded decoder input ids (a bare last expression displays the array repr)
decoder_input_data

array([[ 1,  2,  3, ...,  0,  0,  0],
       [ 1,  2,  3, ...,  0,  0,  0],
       [ 1,  8,  9, ...,  0,  0,  0],
       ...,
       [ 1, 71,  6, ...,  0,  0,  0],
       [ 1,  8,  9, ...,  0,  0,  0],
       [ 1,  8,  9, ...,  0,  0,  0]])

In [284]:
# decoder_target_data: the decoder input moved one position to the left, so the
# entry at step t is the decoder input token of step t + 1. The last column is
# set to the value stored under key 0 of target_vocab (0).
target_vocab = {0: 0}

decoder_target_data = decoder_input_data.copy()
decoder_target_data[:, :-1] = decoder_input_data[:, 1:]
decoder_target_data[:, -1] = target_vocab[0]

In [285]:
# Define the encoder
encoder_inputs = tf.keras.layers.Input(shape=(sequence_length,))
# mask_zero=True marks id 0 as padding. The sequences are right-padded with 0
# and the tokenizers hand out ids starting at 1, so 0 never denotes a real
# token. With the mask, the LSTM skips padded steps and the states given to the
# decoder come from the last real token instead of from a run of padding.
encoder_embedding = Embedding(
    input_vocab_size,
    embedding_dim,
    weights=[source_embedding_matrix],  # start from the GloVe-filled table
    trainable=False,                    # keep the pretrained vectors frozen
    mask_zero=True,
)(encoder_inputs)
# Only the final hidden and cell states are used; they seed the decoder
encoder_lstm = LSTM(hidden_units, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]

In [286]:
# Define the decoder
decoder_inputs = tf.keras.layers.Input(shape=(sequence_length,))
# mask_zero=True marks id 0 as padding. The sequences are right-padded with 0
# and the tokenizers hand out ids starting at 1, so 0 never denotes a real
# token. The mask propagates through the LSTM and Dense layers, so padded
# timesteps are skipped and excluded from the training loss.
decoder_embedding = Embedding(
    output_vocab_size,
    embedding_dim,
    weights=[target_embedding_matrix],  # start from the GloVe-filled table
    trainable=False,                    # keep the pretrained vectors frozen
    mask_zero=True,
)(decoder_inputs)
# return_sequences=True: one output per timestep
decoder_lstm = LSTM(hidden_units, return_sequences=True, return_state=True)
# The decoder starts from the encoder's final hidden and cell states
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
# Softmax over the target vocabulary at every timestep
decoder_dense = Dense(output_vocab_size, activation='softmax')
output = decoder_dense(decoder_outputs)

In [287]:
# Assemble the model from (encoder ids, decoder ids) to the decoder's softmax
# output, then compile it with integer-label cross-entropy and Adam
model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=output)
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam')

In [288]:
# Print the layer-by-layer summary (output shapes, parameter counts, connections)
model.summary()

Model: "model_7"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_15 (InputLayer)       [(None, 35)]                 0         []                            
                                                                                                  
 input_16 (InputLayer)       [(None, 35)]                 0         []                            
                                                                                                  
 embedding_14 (Embedding)    (None, 35, 100)              11400     ['input_15[0][0]']            
                                                                                                  
 embedding_15 (Embedding)    (None, 35, 100)              8000      ['input_16[0][0]']            
                                                                                            

In [289]:
# Print the array shapes in this order: targets, encoder inputs, decoder inputs
for array in (decoder_target_data, encoder_input_data, decoder_input_data):
    print(array.shape)

(80, 35)
(80, 35)
(80, 35)


In [290]:
# Train the model; validation_split=0.2 holds out a fifth of the examples for validation
model.fit(
    x=[encoder_input_data, decoder_input_data],
    y=decoder_target_data,
    epochs=num_epochs,
    batch_size=batch_size,
    validation_split=0.2,
)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

Epoch 84/200
Epoch 85/200
Epoch 86/200
Epoch 87/200
Epoch 88/200
Epoch 89/200
Epoch 90/200
Epoch 91/200
Epoch 92/200
Epoch 93/200
Epoch 94/200
Epoch 95/200
Epoch 96/200
Epoch 97/200
Epoch 98/200
Epoch 99/200
Epoch 100/200
Epoch 101/200
Epoch 102/200
Epoch 103/200
Epoch 104/200
Epoch 105/200
Epoch 106/200
Epoch 107/200
Epoch 108/200
Epoch 109/200
Epoch 110/200
Epoch 111/200
Epoch 112/200
Epoch 113/200
Epoch 114/200
Epoch 115/200
Epoch 116/200
Epoch 117/200
Epoch 118/200
Epoch 119/200
Epoch 120/200
Epoch 121/200
Epoch 122/200
Epoch 123/200
Epoch 124/200
Epoch 125/200
Epoch 126/200
Epoch 127/200
Epoch 128/200
Epoch 129/200
Epoch 130/200
Epoch 131/200
Epoch 132/200
Epoch 133/200
Epoch 134/200
Epoch 135/200
Epoch 136/200
Epoch 137/200
Epoch 138/200
Epoch 139/200
Epoch 140/200
Epoch 141/200
Epoch 142/200
Epoch 143/200
Epoch 144/200
Epoch 145/200
Epoch 146/200
Epoch 147/200
Epoch 148/200
Epoch 149/200
Epoch 150/200
Epoch 151/200
Epoch 152/200
Epoch 153/200
Epoch 154/200
Epoch 155/200
Epoch 15

Epoch 166/200
Epoch 167/200
Epoch 168/200
Epoch 169/200
Epoch 170/200
Epoch 171/200
Epoch 172/200
Epoch 173/200
Epoch 174/200
Epoch 175/200
Epoch 176/200
Epoch 177/200
Epoch 178/200
Epoch 179/200
Epoch 180/200
Epoch 181/200
Epoch 182/200
Epoch 183/200
Epoch 184/200
Epoch 185/200
Epoch 186/200
Epoch 187/200
Epoch 188/200
Epoch 189/200
Epoch 190/200
Epoch 191/200
Epoch 192/200
Epoch 193/200
Epoch 194/200
Epoch 195/200
Epoch 196/200
Epoch 197/200
Epoch 198/200
Epoch 199/200
Epoch 200/200


<keras.src.callbacks.History at 0x1b267e4f010>