In [163]:
# pip install tensorflow

In [164]:
import tensorflow as tf
import json
import numpy as np
from sklearn.model_selection import train_test_split

In [165]:
#model parameters
max_input_sequence_length = 35
max_output_sequence_length = 35
input_vocab_size = 113
output_vocab_size = 79
embedding_dim = 128
lstm_units = 64

In [166]:
# Create the Encoder
encoder_inputs = tf.keras.layers.Input(shape=(max_input_sequence_length,))
encoder_embedding = tf.keras.layers.Embedding(input_vocab_size, embedding_dim)(encoder_inputs)
encoder_lstm = tf.keras.layers.LSTM(units=lstm_units, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)

In [167]:
# Create the Decoder
decoder_inputs = tf.keras.layers.Input(shape=(max_output_sequence_length,))
decoder_embedding = tf.keras.layers.Embedding(output_vocab_size, embedding_dim)(decoder_inputs)
decoder_lstm = tf.keras.layers.LSTM(units=lstm_units, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=[state_h, state_c])

In [168]:
output_vocab = tf.keras.layers.Dense(output_vocab_size, activation='softmax')(decoder_outputs)


In [169]:
model = tf.keras.models.Model(inputs=[encoder_inputs, decoder_inputs], outputs=output_vocab)


In [170]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])


In [171]:
model.summary()


Model: "model_8"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_19 (InputLayer)       [(None, 35)]                 0         []                            
                                                                                                  
 input_20 (InputLayer)       [(None, 35)]                 0         []                            
                                                                                                  
 embedding_17 (Embedding)    (None, 35, 128)              14464     ['input_19[0][0]']            
                                                                                                  
 embedding_18 (Embedding)    (None, 35, 128)              10112     ['input_20[0][0]']            
                                                                                            

In [172]:
#monitoring training progress
log_dir = "C:/Users/DEBASHISH THAKUR/Desktop/project"
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)


In [173]:
##load the dataset
with open('C:/Users/DEBASHISH THAKUR/Desktop/project/card.json', 'r') as json_file:
    dataset = json.load(json_file)

In [174]:
##questions and their corresponding queries
train_encoder_inputs = [example["question_toks"] for example in dataset]
train_decoder_inputs = [example["query_toks"] for example in dataset]

In [175]:
max_sequence_length = 35

# Pad the sequences to a consistent length
def pad_sequences(sequences, max_length):
    padded_sequences = []
    for sequence in sequences:
        if len(sequence) < max_length:
            padded_sequence = sequence + ["<PAD>"] * (max_length - len(sequence))
        else:
            padded_sequence = sequence[:max_length]
        padded_sequences.append(padded_sequence)
    return padded_sequences

# Pad the encoder and decoder inputs
train_encoder_inputs = pad_sequences(train_encoder_inputs, max_sequence_length)
train_decoder_inputs = pad_sequences(train_decoder_inputs, max_sequence_length)

In [176]:
index = 1
vocabulary = {}

for i in dataset:
    query_toks = i["query_toks"]
    # print(query_toks)
    for j in query_toks:
        # print(j)
        
        if j not in vocabulary:
            vocabulary[j] = index
            index += 1


In [177]:
train_decoder_outputs = []
for query_data in dataset: 
    # Extract the SQL query tokens from the dataset
    sql_query_tokens = query_data["query_toks"]

    # Convert SQL query tokens to numerical sequences based on the vocabulary
    query_numeric_sequence = [vocabulary[token] for token in sql_query_tokens]

    # Append the numerical sequence to train_decoder_outputs
    train_decoder_outputs.append(query_numeric_sequence)
train_decoder_outputs = pad_sequences(train_decoder_outputs, max_sequence_length)
# print(train_decoder_outputs)

In [178]:
# Split the dataset into training (80%) and temporary (20%) sets
X_train, X_temp, y_train_inputs, y_temp = train_test_split(
    train_encoder_inputs,
    train_decoder_inputs,
    train_size=0.80,
    test_size=0.20,  # Adjust the ratio for the temporary set
    random_state=None  # Set a random seed for reproducibility
)

# Split the temporary set into validation (50%) and testing (50%) sets
X_val, X_test, y_val_inputs, y_test_inputs = train_test_split(
    X_temp,
    y_temp,
    test_size=0.50,  # Half of the temporary set becomes validation, and half becomes testing
    random_state=42
)

# If you also have y_train_outputs, y_val_outputs, and y_test_outputs, split them similarly.


In [179]:
# X_val = pad_sequences(X_val, max_sequence_length)
# X_test = pad_sequences(X_test, max_sequence_length)
# y_val_inputs = pad_sequences(y_val_inputs, max_sequence_length)
# y_test_inputs = pad_sequences(y_test_inputs, max_sequence_length)

In [180]:
X_train = np.array(X_train)
X_val = np.array(X_val)
X_test = np.array(X_test)

y_train_outputs = np.array(train_decoder_outputs)[:64] ##ground truth but not aligned with X_train and y_train_inputs

y_train_inputs = np.array(y_train_inputs)
y_val_inputs = np.array(y_val_inputs)
y_test_inputs = np.array(y_test_inputs)

In [181]:
# Train the model
num_epochs = 10  # Number of training epochs
batch_size = 8  # Batch size

history = model.fit(
    [X_train, y_train_inputs],  # Input data
    y_train_outputs,            # Target data (if available)
    validation_data=([X_val, y_val_inputs], y_val_inputs),  # Validation data
    epochs=num_epochs,
    batch_size=batch_size,
    verbose=2  # Set verbosity level
)


Epoch 1/10


ValueError: in user code:

    File "C:\Users\DEBASHISH THAKUR\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\src\engine\training.py", line 1377, in train_function  *
        return step_function(self, iterator)
    File "C:\Users\DEBASHISH THAKUR\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\src\engine\training.py", line 1360, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "C:\Users\DEBASHISH THAKUR\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\src\engine\training.py", line 1349, in run_step  **
        outputs = model.train_step(data)
    File "C:\Users\DEBASHISH THAKUR\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\src\engine\training.py", line 1127, in train_step
        loss = self.compute_loss(x, y, y_pred, sample_weight)
    File "C:\Users\DEBASHISH THAKUR\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\src\engine\training.py", line 1185, in compute_loss
        return self.compiled_loss(
    File "C:\Users\DEBASHISH THAKUR\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\src\engine\compile_utils.py", line 277, in __call__
        loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    File "C:\Users\DEBASHISH THAKUR\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\src\losses.py", line 143, in __call__
        losses = call_fn(y_true, y_pred)
    File "C:\Users\DEBASHISH THAKUR\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\src\losses.py", line 270, in call  **
        return ag_fn(y_true, y_pred, **self._fn_kwargs)
    File "C:\Users\DEBASHISH THAKUR\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\src\losses.py", line 2221, in categorical_crossentropy
        return backend.categorical_crossentropy(
    File "C:\Users\DEBASHISH THAKUR\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\src\backend.py", line 5575, in categorical_crossentropy
        target.shape.assert_is_compatible_with(output.shape)

    ValueError: Shapes (8, 35) and (8, 35, 79) are incompatible


In [None]:
print(y_train_outputs.shape)
print(X_train.shape)
print(X_val.shape)
print(X_test.shape)
print(y_train_inputs.shape)
print(y_val_inputs.shape)
print(y_test_inputs.shape)

In [None]:
X_train


In [None]:
y_train_inputs

In [None]:
y_train_outputs.shape

In [None]:
y_val_outputs