In [2]:
import tensorflow as tf
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Model
import numpy as np
import json

In [3]:
# Define hyperparameters
input_vocab_size = 116 # Vocabulary size for the source language
output_vocab_size = 82 # Vocabulary size for the target language
embedding_dim = 100  # Dimension of the GloVe embeddings
hidden_units = 32 # Number of units in LSTM layers
sequence_length = 37 # Maximum sequence length
batch_size = 32 # Batch size
num_epochs = 200 # Number of training epochs

In [4]:
glove_path = 'GloVe/glove.6B.100d.txt'  # Adjust the path to your downloaded GloVe file
embedding_matrix = {}  # Create an empty dictionary to store the embeddings

In [5]:
embedding_matrix

{}

In [6]:
with open(glove_path, 'r', encoding='utf-8') as file:
    for line in file:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embedding_matrix[word] = coefs #(coeff == Vectors)

In [7]:
# Create an embedding matrix for the source and target languages
source_embedding_matrix = np.zeros((input_vocab_size, embedding_dim))
target_embedding_matrix = np.zeros((output_vocab_size, embedding_dim))

In [8]:
source_embedding_matrix

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [9]:
##load the dataset
with open('card.json', 'r') as json_file:
    dataset = json.load(json_file)

In [10]:
#mapping input and output sequences to integers 
#input sequence = source_tokenizer
#output sequence = target_tokenizer

index_s = 3
index_t = 3
source_tokenizer = {'<PAD>':0,
                   '<start>':1,
                   '<end>':2}
target_tokenizer = {'<PAD>':0,
                   '<start>':1,
                   '<end>':2}


for i in dataset:
    question_toks = i["question_toks"]
    # print(query_toks)
    for j in question_toks:
        # print(j)
        
        if j not in source_tokenizer:
            source_tokenizer[j] = index_s
            index_s += 1
# print((frequency))
for i in dataset:
    query_toks = i["query_toks"]
    # print(query_toks)
    for j in query_toks:
        # print(j)
        
        if j not in target_tokenizer:
            target_tokenizer[j] = index_t
            index_t += 1

In [11]:
source_tokenizer

{'<PAD>': 0,
 '<start>': 1,
 '<end>': 2,
 'How': 3,
 'many': 4,
 'accounts': 5,
 'do': 6,
 'we': 7,
 'have': 8,
 '?': 9,
 'Count': 10,
 'the': 11,
 'number': 12,
 'of': 13,
 '.': 14,
 'Show': 15,
 'ids': 16,
 ',': 17,
 'customer': 18,
 'names': 19,
 'for': 20,
 'all': 21,
 'What': 22,
 'are': 23,
 'account': 24,
 'and': 25,
 'other': 26,
 'details': 27,
 'with': 28,
 'name': 29,
 '338': 30,
 'is': 31,
 'first': 32,
 'last': 33,
 'phone': 34,
 '162': 35,
 'Give': 36,
 'full': 37,
 'who': 38,
 'has': 39,
 'does': 40,
 'Art': 41,
 'Turcotte': 42,
 'Return': 43,
 'that': 44,
 'each': 45,
 'there': 46,
 'id': 47,
 'most': 48,
 'how': 49,
 'this': 50,
 'person': 51,
 'least': 52,
 'fewest': 53,
 'customers': 54,
 'without': 55,
 'an': 56,
 'not': 57,
 'any': 58,
 'distinct': 59,
 'hold': 60,
 'phones': 61,
 'email': 62,
 'Aniyah': 63,
 'Feest': 64,
 'cards': 65,
 'card': 66,
 'type': 67,
 'codes': 68,
 'numbers': 69,
 'types': 70,
 'date': 71,
 'valid': 72,
 'from': 73,
 'to': 74,
 "'4560596

In [12]:
target_tokenizer_rev = {v: k for k, v in target_tokenizer.items()}
target_tokenizer

{'<PAD>': 0,
 '<start>': 1,
 '<end>': 2,
 'SELECT': 3,
 'count': 4,
 '(': 5,
 '*': 6,
 ')': 7,
 'FROM': 8,
 'Accounts': 9,
 'account_id': 10,
 ',': 11,
 'customer_id': 12,
 'account_name': 13,
 'other_account_details': 14,
 'WHERE': 15,
 '=': 16,
 '``': 17,
 '338': 18,
 "''": 19,
 'T2.customer_first_name': 20,
 'T2.customer_last_name': 21,
 'T2.customer_phone': 22,
 'AS': 23,
 'T1': 24,
 'JOIN': 25,
 'Customers': 26,
 'T2': 27,
 'ON': 28,
 'T1.customer_id': 29,
 'T2.customer_id': 30,
 'T1.account_name': 31,
 '162': 32,
 'Art': 33,
 'AND': 34,
 'Turcotte': 35,
 'GROUP': 36,
 'BY': 37,
 'ORDER': 38,
 'DESC': 39,
 'LIMIT': 40,
 '1': 41,
 'ASC': 42,
 'NOT': 43,
 'IN': 44,
 'customer_first_name': 45,
 'customer_last_name': 46,
 'EXCEPT': 47,
 'T1.customer_first_name': 48,
 'T1.customer_last_name': 49,
 'DISTINCT': 50,
 'customer_phone': 51,
 'customer_email': 52,
 'Aniyah': 53,
 'Feest': 54,
 'Customers_cards': 55,
 'card_id': 56,
 'card_type_code': 57,
 'card_number': 58,
 'date_valid_from

In [13]:
for word, i in source_tokenizer.items():
    embedding_vector = embedding_matrix.get(word)
    if embedding_vector is not None:
        source_embedding_matrix[i] = embedding_vector

for word, i in target_tokenizer.items():
    embedding_vector = embedding_matrix.get(word)
    if embedding_vector is not None:
        target_embedding_matrix[i] = embedding_vector

In [14]:
question_tokens = [example["question_toks"] for example in dataset]
query_tokens = [example["query_toks"] for example in dataset]

encoder_input_data_nopad = [[1] + [source_tokenizer[word] for word in sentence] + [2] for sentence in question_tokens]


decoder_input_data_nopad = [[1] + [target_tokenizer[word] for word in sentence] + [2] for sentence in query_tokens]


In [15]:
# Pad the sequences to a consistent length
def pad_sequences(sequences, max_length):
    padded_sequences = []
    for sequence in sequences:
        if len(sequence) < max_length:
            padded_sequence = sequence + [0] * (max_length - len(sequence))
        else:
            padded_sequence = sequence[:max_length]
        padded_sequences.append(padded_sequence)
    return padded_sequences

# Pad the encoder and decoder inputs
encoder_input_data = pad_sequences(encoder_input_data_nopad, sequence_length)
decoder_input_data = pad_sequences(decoder_input_data_nopad, sequence_length)
encoder_input_data = np.array(encoder_input_data)
decoder_input_data = np.array(decoder_input_data)

In [16]:
decoder_input_data[1]

array([1, 3, 4, 5, 6, 7, 8, 9, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [17]:
encoder_input_data[1]

array([ 1, 10, 11, 12, 13,  5, 14,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0])

In [18]:
#decoder_target_data
target_vocab = {
    0 : 0
}

decoder_target_data = np.zeros_like(decoder_input_data)
decoder_target_data[:, :-1] = decoder_input_data[:, 1:]
decoder_target_data[:, -1] = target_vocab[0]

In [19]:
decoder_target_data[1]

array([3, 4, 5, 6, 7, 8, 9, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [20]:
# Define the encoder
encoder_inputs = tf.keras.layers.Input(shape=(sequence_length,))
encoder_embedding = Embedding(input_vocab_size, embedding_dim, weights=[source_embedding_matrix], trainable=False)(encoder_inputs)
encoder_lstm = LSTM(hidden_units, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]

In [21]:
# Define the decoder
decoder_inputs = tf.keras.layers.Input(shape=(sequence_length,))
decoder_embedding = Embedding(output_vocab_size, embedding_dim, weights=[target_embedding_matrix], trainable=False)(decoder_inputs)
decoder_lstm = LSTM(hidden_units, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(output_vocab_size, activation='softmax')
output = decoder_dense(decoder_outputs)

In [22]:
# Build and compile the model
model = Model([encoder_inputs, decoder_inputs], output)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [23]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 37)]                 0         []                            
                                                                                                  
 input_2 (InputLayer)        [(None, 37)]                 0         []                            
                                                                                                  
 embedding (Embedding)       (None, 37, 100)              11600     ['input_1[0][0]']             
                                                                                                  
 embedding_1 (Embedding)     (None, 37, 100)              8200      ['input_2[0][0]']             
                                                                                              

In [24]:
print(decoder_target_data.shape)
print(encoder_input_data.shape)
print(decoder_input_data.shape)

(80, 37)
(80, 37)
(80, 37)


In [25]:
# Train the model
model.fit([encoder_input_data, decoder_input_data], decoder_target_data, batch_size=batch_size, epochs=num_epochs, validation_split=0.2)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200


Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78/200
Epoch 79/200
Epoch 80/200
Epoch 81/200
Epoch 82/200
Epoch 83/200
Epoch 84/200
Epoch 85/200
Epoch 86/200
Epoch 87/200
Epoch 88/200
Epoch 89/200
Epoch 90/200
Epoch 91/200
Epoch 92/200
Epoch 93/200
Epoch 94/200
Epoch 95/200
Epoch 96/200
Epoch 97/200
Epoch 98/200
Epoch 99/200
Epoch 100/200
Epoch 101/200
Epoch 102/200
Epoch 103/200
Epoch 104/200
Epoch 105/200
Epoch 106/200
Epoch 107/200
Epoch 108/200
Epoch 109/200
Epoch 110/200
Epoch 111/200
Epoch 112/200
Epoch 113/200
Epoch 114/200
Epoch 115/200
Epoch 116/200
Epoch 117/200


Epoch 118/200
Epoch 119/200
Epoch 120/200
Epoch 121/200
Epoch 122/200
Epoch 123/200
Epoch 124/200
Epoch 125/200
Epoch 126/200
Epoch 127/200
Epoch 128/200
Epoch 129/200
Epoch 130/200
Epoch 131/200
Epoch 132/200
Epoch 133/200
Epoch 134/200
Epoch 135/200
Epoch 136/200
Epoch 137/200
Epoch 138/200
Epoch 139/200
Epoch 140/200
Epoch 141/200
Epoch 142/200
Epoch 143/200
Epoch 144/200
Epoch 145/200
Epoch 146/200
Epoch 147/200
Epoch 148/200
Epoch 149/200
Epoch 150/200
Epoch 151/200
Epoch 152/200
Epoch 153/200
Epoch 154/200
Epoch 155/200
Epoch 156/200
Epoch 157/200
Epoch 158/200
Epoch 159/200
Epoch 160/200
Epoch 161/200
Epoch 162/200
Epoch 163/200
Epoch 164/200
Epoch 165/200
Epoch 166/200
Epoch 167/200
Epoch 168/200
Epoch 169/200
Epoch 170/200
Epoch 171/200
Epoch 172/200
Epoch 173/200
Epoch 174/200


Epoch 175/200
Epoch 176/200
Epoch 177/200
Epoch 178/200
Epoch 179/200
Epoch 180/200
Epoch 181/200
Epoch 182/200
Epoch 183/200
Epoch 184/200
Epoch 185/200
Epoch 186/200
Epoch 187/200
Epoch 188/200
Epoch 189/200
Epoch 190/200
Epoch 191/200
Epoch 192/200
Epoch 193/200
Epoch 194/200
Epoch 195/200
Epoch 196/200
Epoch 197/200
Epoch 198/200
Epoch 199/200
Epoch 200/200


<keras.src.callbacks.History at 0x2312f8d9ad0>

In [26]:
model = tf.keras.Sequential()

In [27]:
dummy_input = np.array([ 8,  9, 10, 11,  3, 12,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0])
_ = model(dummy_input)

In [28]:
##saving model
model.save('seq2seq_model.h5')

  saving_api.save_model(


In [29]:
from tensorflow.keras.models import load_model
model = load_model('seq2seq_model.h5')




In [44]:
# Load your tokenizers, embeddings, and model as described in the previous response

# Define the start token and initial decoder input
start_token = target_tokenizer['<start>']
end_token = source_tokenizer['<end>']
initial_decoder_input = np.zeros((1, 37))
initial_decoder_input[0, 0] = start_token


In [84]:
# User input
user_question = "Count the total number of accounts"
user_question_tokens = user_question.split()
user_question_sequence = [1] + [source_tokenizer.get(word,0) for word in user_question_tokens] + [2]
user_question_sequence = pad_sequences([user_question_sequence], sequence_length)
# Load your tokenizers, embeddings, and model as described in your setup

# Define the start and end tokens
start_token = target_tokenizer['<start>']
end_token = target_tokenizer['<end>']

# Initialize variables for prediction
predicted_sql_tokens = []
current_input = np.zeros((1, sequence_length))
current_input[0, 0] = start_token

while len(predicted_sql_tokens) < sequence_length:
    # Predict the next token
    predictions = model.predict([user_question_sequence, current_input])
    predicted_token_index = np.argmax(predictions[0, -1, :])

    # Convert the token index to the corresponding word
    predicted_word = [key for key, value in target_tokenizer.items() if value == predicted_token_index][0]

    # Append the predicted token to the sequence
    predicted_sql_tokens.append(predicted_word)

    # Prepare the input for the next prediction
    current_input[0, len(predicted_sql_tokens)] = predicted_token_index

    # Check for the end token
    if predicted_word == '<end>':
        break

# Convert Predicted Tokens to SQL Query
predicted_sql_query = ' '.join(predicted_sql_tokens)

# Execute or Display the SQL Query as needed


IndexError: index 1 is out of bounds for axis 0 with size 1

In [77]:
user_question_tokens

['Count', 'the', 'total', 'number', 'of', 'accounts']

In [78]:
print(predictions)

[[[  1.  10.  11. 105.  12.  13.   5.   2.   0.   0.   0.   0.   0.   0.
     0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
     0.   0.   0.   0.   0.   0.   0.   0.   0.]]

 [[  1.   3.   3.   3.   3.   3.   3.   3.   3.   3.   3.   3.   3.   3.
     3.   3.   3.   3.   3.   3.   3.   3.   3.   3.   3.   3.   3.   3.
     3.   3.   3.   3.   3.   3.   3.   3.   3.]]]


In [79]:
print(user_question_sequence)

[[1, 10, 11, 105, 12, 13, 5, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]


In [80]:
print(predicted_word)

SELECT


In [81]:
print(predicted_token_index)

3


In [82]:
print(predicted_sql_tokens)

['SELECT', 'SELECT', 'SELECT', 'SELECT', 'SELECT', 'SELECT', 'SELECT', 'SELECT', 'SELECT', 'SELECT', 'SELECT', 'SELECT', 'SELECT', 'SELECT', 'SELECT', 'SELECT', 'SELECT', 'SELECT', 'SELECT', 'SELECT', 'SELECT', 'SELECT', 'SELECT', 'SELECT', 'SELECT', 'SELECT', 'SELECT', 'SELECT', 'SELECT', 'SELECT', 'SELECT', 'SELECT', 'SELECT', 'SELECT', 'SELECT', 'SELECT', 'SELECT']
