###Importing Libraries

In [5]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [2]:
import tensorflow as tf
import numpy as np
import unicodedata
import re

###Reading the Data

In [7]:
%cd drive/MyDrive/CS425/Chatbot_Project

/content/drive/MyDrive/CS425/Chatbot_Project


In [8]:
file = open('./data/output_truncate_20.txt','r').read()

In [9]:
raw_data = [f.split('\t') for f in file.split('\n')]    #separating questions and answers
questions = [x[0] for x in raw_data]
answers = [x[1] if len(x) > 1 else "" for x in raw_data]

In [10]:
print("Question: ", questions[0])
print("Answer: ", answers[0])

Question:  What is the relationship between very low Mg2+ levels, PTH levels, and Ca2+ levels?
Answer:  Very low Mg2+ levels correspond to low PTH levels which in turn results in low Ca2+ levels.


###Tokenizing

In [11]:
def tokenize(lang):
    lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
    lang_tokenizer.fit_on_texts(lang)
    tensor = lang_tokenizer.texts_to_sequences(lang)
    tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, padding='post')
    return tensor, lang_tokenizer

###PreProcessing

In [12]:
def preprocess_sentence(sentence):
    sentence = ''.join(c for c in unicodedata.normalize('NFD', sentence) if unicodedata.category(c) != 'Mn')
    sentence = sentence.lower().strip()
    sentence = re.sub(r"([?.!,¿])", r" \1 ", sentence)
    sentence = re.sub(r'[" "]+', " ", sentence)
    sentence = re.sub(r"[^a-zA-Z?.!,¿]+", " ", sentence)
    sentence = '<start> ' + sentence + ' <end>'
    return sentence

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [13]:
pre_questions = [preprocess_sentence(w) for w in questions] #processing all the quesstions
pre_answers = [preprocess_sentence(w) for w in answers] #processing all the answers

In [14]:
data = pre_answers, pre_questions

In [15]:
def prepare_data(data):
    targ_lang, inp_lang = data

    input_tensor, inp_lang_tokenizer = tokenize(inp_lang)
    target_tensor, targ_lang_tokenizer = tokenize(targ_lang)

    return input_tensor, target_tensor, inp_lang_tokenizer, targ_lang_tokenizer

In [18]:
input_tensor, target_tensor, inp_lang, targ_lang = prepare_data(data)

# Calculate max_length of the target tensors
max_length_targ, max_length_inp = target_tensor.shape[1], input_tensor.shape[1]

In [19]:
print(max_length_targ)
print(max_length_inp)

28
24


### Downloading the Tokenizers

In [20]:
import pickle

def save_tokenizer(tokenizer, filename):
    with open(filename, 'wb') as handle:
        pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

save_tokenizer(inp_lang, 'input_tokenizer.pkl')
save_tokenizer(targ_lang, 'target_tokenizer.pkl')

###Splitting the Data

In [21]:
import numpy as np
from sklearn.model_selection import train_test_split

# Split the data into 80% train, 10% test, 10% val
input_tensor_train, input_tensor_test_val, target_tensor_train, target_tensor_test_val = train_test_split(
    input_tensor, target_tensor, test_size=0.2, random_state=42)

input_tensor_test, input_tensor_val, target_tensor_test, target_tensor_val = train_test_split(
    input_tensor_test_val, target_tensor_test_val, test_size=0.5, random_state=42)

print(f'Train size: {len(input_tensor_train)}')
print(f'Test size: {len(input_tensor_test)}')
print(f'Validation size: {len(input_tensor_val)}')


Train size: 11906
Test size: 1488
Validation size: 1489


###Defining the PipeLine

In [22]:
BUFFER_SIZE = len(input_tensor_train)
BATCH_SIZE = 128
steps_per_epoch = len(input_tensor_train)//BATCH_SIZE
embedding_dim = 200
units = 750
vocab_inp_size = len(inp_lang.word_index)+1
vocab_tar_size = len(targ_lang.word_index)+1

dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

example_input_batch, example_target_batch = next(iter(dataset))
example_input_batch.shape, example_target_batch.shape


(TensorShape([128, 24]), TensorShape([128, 28]))

In [23]:
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        super(Encoder, self).__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.lstm = tf.keras.layers.LSTM(self.enc_units,
                                         return_sequences=True,
                                         return_state=True,
                                         recurrent_initializer='glorot_uniform')
        self.dropout = tf.keras.layers.Dropout(0.2)

    def call(self, x, hidden):
      x = self.embedding(x)
      x = self.dropout(x)  # Apply dropout to the input
      output, state_h, state_c = self.lstm(x, initial_state=hidden)  # Use LSTM with state_h and state_c
      state = [state_h, state_c]
      return output, state

    def initialize_hidden_state(self):
        return [tf.zeros((self.batch_sz, self.enc_units)),
                tf.zeros((self.batch_sz, self.enc_units))]

encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)

In [24]:
import tensorflow as tf

class Attention(tf.keras.layers.Layer):
    def __init__(self, units, use_scaling=True, use_masking=False, dropout_rate=0.1):
        super(Attention, self).__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)
        self.use_scaling = use_scaling
        self.use_masking = use_masking
        self.dropout = tf.keras.layers.Dropout(dropout_rate)
        self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-6)

    def call(self, query, values, mask=None):
        query_with_time_axis = tf.expand_dims(query, 1)
        score = self.V(tf.nn.tanh(self.W1(query_with_time_axis) + self.W2(values)))

        if self.use_scaling:
            score = score / tf.math.sqrt(tf.cast(tf.shape(query_with_time_axis)[-1], tf.float32))

        if self.use_masking and mask is not None:
            score += (1 - mask) * -1e9  # Apply masking to scores

        attention_weights = tf.nn.softmax(score, axis=1)
        attention_weights = self.dropout(attention_weights)

        context_vector = attention_weights * values
        context_vector = tf.reduce_sum(context_vector, axis=1)
        context_vector = self.layer_norm(context_vector)

        return context_vector, attention_weights

In [26]:
class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
        super(Decoder, self).__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.lstm = tf.keras.layers.LSTM(self.dec_units,
                                         return_sequences=True,
                                         return_state=True,
                                         recurrent_initializer='glorot_uniform')
        self.fc = tf.keras.layers.Dense(vocab_size)
        self.attention = Attention(self.dec_units)

    def call(self, x, hidden, enc_output):
        context_vector, attention_weights = self.attention(hidden[0], enc_output)

        x = self.embedding(x)

        # Concatenate context vector and embedding
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)

        # Passing the concatenated vector to the LSTM
        output, state_h, state_c = self.lstm(x, initial_state=hidden)  # Use LSTM with state_h and state_c

        state = [state_h, state_c]

        output = tf.reshape(output, (-1, output.shape[2]))
        x = self.fc(output)

        return x, state, attention_weights

decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)

###Adjusting Learning Rates

In [27]:
initial_learning_rate = 0.001 #adaptive learning rate
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate, decay_steps=1000, decay_rate=0.9, staircase=True
)

# Define the optimizer with adaptive learning rate
optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)

# Define your loss function
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none'
)

def loss_function(real, pred):  #defining loss function
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    return tf.reduce_mean(loss_)


###Defining Train Step

In [28]:
@tf.function
def train_step(inp, targ, enc_hidden):
    loss = 0

    with tf.GradientTape() as tape:
        enc_output, enc_hidden = encoder(inp, enc_hidden)

        # Initialize LSTM's initial state
        dec_hidden = [enc_hidden[0][:, :units], enc_hidden[1][:, :units]]

        dec_input = tf.expand_dims([targ_lang.word_index['<start>']] * BATCH_SIZE, 1)

        # Teacher forcing - feeding the target as the next input
        for t in range(1, targ.shape[1]):
            # Passing enc_output to the decoder
            predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)

            loss += loss_function(targ[:, t], predictions)

            # Using teacher forcing
            dec_input = tf.expand_dims(targ[:, t], 1)

    batch_loss = (loss / int(targ.shape[1]))

    variables = encoder.trainable_variables + decoder.trainable_variables

    gradients = tape.gradient(loss, variables)

    optimizer.apply_gradients(zip(gradients, variables))

    return batch_loss

In [29]:
@tf.function
def validation_step(inp, targ, enc_hidden):
    val_loss = 0

    val_samples = 0

    enc_output, enc_hidden = encoder(inp, enc_hidden)

    # Initialize LSTM's initial state
    dec_hidden = [enc_hidden[0][:, :units], enc_hidden[1][:, :units]]

    dec_input = tf.expand_dims([targ_lang.word_index['<start>']] * BATCH_SIZE, 1)

    for t in range(1, targ.shape[1]):
        predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)
        loss = loss_function(targ[:, t], predictions)
        val_loss += loss
        val_samples += 1
        dec_input = tf.expand_dims(targ[:, t], 1)

    val_loss /= val_samples

    return val_loss

###Training the Pipeline

In [30]:
import matplotlib.pyplot as plt
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam

# Define early stopping
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Define the optimizer with gradient clipping
optimizer = Adam(learning_rate=lr_schedule, clipvalue=1.0)  # Added gradient clipping

EPOCHS = 15

train_losses = []
val_losses = []
for epoch in range(1, EPOCHS + 1):
    enc_hidden = encoder.initialize_hidden_state()
    total_loss = 0

    # Training loop
    for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
        batch_loss = train_step(inp, targ, enc_hidden)
        total_loss += batch_loss

    num_samples = 0
    for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
        enc_output, enc_hidden = encoder(inp, enc_hidden)
        dec_hidden = enc_hidden
        dec_input = tf.expand_dims([targ_lang.word_index['<start>']] * BATCH_SIZE, 1)

        for t in range(1, targ.shape[1]):
            predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)
            predicted_id = tf.argmax(predictions, axis=-1)
            num_samples += 1
            dec_input = tf.expand_dims(targ[:, t], 1)

    validation_dataset = tf.data.Dataset.from_tensor_slices((input_tensor_val, target_tensor_val))
    validation_dataset = validation_dataset.batch(BATCH_SIZE, drop_remainder=True)

    val_loss = 0
    val_samples = 0

    for (batch, (inp, targ)) in enumerate(validation_dataset):
        enc_hidden = encoder.initialize_hidden_state()  # Initialize hidden state for each batch
        val_batch_loss = validation_step(inp, targ, enc_hidden)
        val_loss += val_batch_loss
        val_samples += 1

    val_loss /= val_samples

    if epoch % 1 == 0:
        train_losses.append(total_loss / steps_per_epoch)
        val_losses.append(val_loss)
        print('Epoch:{:3d} Loss:{:.4f} Val Loss:{:.4f}'.format(
            epoch, total_loss / steps_per_epoch,  val_loss))

    # Implement early stopping
    if early_stopping.on_epoch_end(epoch, logs={'val_loss': val_loss}):
        print("Early stopping triggered.")
        break

# Plotting the accuracy and loss graphs
plt.figure(figsize=(12, 6))

# Plot training and validation losses
plt.plot(range(1, len(train_losses) + 1), train_losses, label='Train')
plt.plot(range(1, len(val_losses) + 1), val_losses, label='Validation')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('Training and Validation Loss')
plt.legend()

plt.tight_layout()
plt.show()


Epoch:  1 Loss:2.8500 Val Loss:2.7992


AttributeError: ignored

In [None]:
encoder.save("encoder_dropOutwithNormalization")
decoder.save("decoder_dropOutwithNormalization")



In [None]:
!zip -r "encoder_dropOutwithNormalization.zip" "encoder_dropOutwithNormalization"

  adding: encoder_dropOutwithNormalization/ (stored 0%)
  adding: encoder_dropOutwithNormalization/variables/ (stored 0%)
  adding: encoder_dropOutwithNormalization/variables/variables.data-00000-of-00001 (deflated 7%)
  adding: encoder_dropOutwithNormalization/variables/variables.index (deflated 39%)
  adding: encoder_dropOutwithNormalization/assets/ (stored 0%)
  adding: encoder_dropOutwithNormalization/saved_model.pb (deflated 90%)
  adding: encoder_dropOutwithNormalization/fingerprint.pb (stored 0%)
  adding: encoder_dropOutwithNormalization/keras_metadata.pb (deflated 81%)


In [None]:
!zip -r "decoder_dropOutwithNormalization.zip" "decoder_dropOutwithNormalization"

  adding: decoder_dropOutwithNormalization/ (stored 0%)
  adding: decoder_dropOutwithNormalization/variables/ (stored 0%)
  adding: decoder_dropOutwithNormalization/variables/variables.data-00000-of-00001 (deflated 7%)
  adding: decoder_dropOutwithNormalization/variables/variables.index (deflated 54%)
  adding: decoder_dropOutwithNormalization/assets/ (stored 0%)
  adding: decoder_dropOutwithNormalization/saved_model.pb (deflated 90%)
  adding: decoder_dropOutwithNormalization/fingerprint.pb (stored 0%)
  adding: decoder_dropOutwithNormalization/keras_metadata.pb (deflated 86%)


In [None]:
def remove_tags(sentence):
    return sentence.split("<start>")[-1].split("<end>")[0]

In [None]:
def evaluate(sentence):
    sentence = preprocess_sentence(sentence)

    inputs = [inp_lang.word_index[i] for i in sentence.split(' ')]
    inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs],
                                                         maxlen=max_length_inp,
                                                         padding='post')
    inputs = tf.convert_to_tensor(inputs)

    result = ''

    hidden = [tf.zeros((1, units)), tf.zeros((1, units))]
    enc_out, enc_hidden = encoder(inputs, hidden)

    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([targ_lang.word_index['<start>']], 0)

    for t in range(max_length_targ):
        predictions, dec_hidden, attention_weights = decoder(dec_input,
                                                             dec_hidden,
                                                             enc_out)
        attention_weights = tf.reshape(attention_weights, (-1, ))

        predicted_id = tf.argmax(predictions[0]).numpy()

        result += targ_lang.index_word[predicted_id] + ' '

        if targ_lang.index_word[predicted_id] == '<end>':
            return remove_tags(result), remove_tags(sentence)

        # the predicted ID is fed back into the model
        dec_input = tf.expand_dims([predicted_id], 0)

    return remove_tags(result), remove_tags(sentence)


### Testing some random questions

In [1]:
def test(question):
    answer, question = evaluate(question)
    print('Question:', question)
    print('Predicted answer:', answer)

test("Relationship between very low Mg2+ levels, PTH levels, and Ca2+ levels")


NameError: ignored

In [None]:
test("What is the most severe form of β-thalassemia and what are the typical symptoms associated with it")


Question:  what is the most severe form of thalassemia and what are the typical symptoms associated with it 
Predicted answer: the most common cause of seizure in adults is thalassemia . 


In [None]:
test("Hello")
test("How are you doing?")
test("What is your age?")
test("Do you have a tv?")
test("Do you like rain?")

Question:  hello 
Predicted answer: greetings ! 
Question:  how are you doing ? 
Predicted answer: fine , and you ? 
Question:  what is your age ? 
Predicted answer: i am still young by your standards . 
Question:  do you have a tv ? 
Predicted answer: yes , i do . 
Question:  do you like rain ? 
Predicted answer: yes , i love traveling and exploring new places . 


In [None]:
test("I am afraid")

Question:  i am afraid 
Predicted answer: why ? do i frighten you ? try not to be too scared . what are you afraid of ? 


In [None]:
test("I am feeling sick")

Question:  i am feeling sick 
Predicted answer: oh , really ? 


In [None]:
test("Sorry")

Question:  sorry 
Predicted answer: yeah , so do i . 


In [None]:
test("hi, how are you doing?")

Question:  hi , how are you doing ? 
Predicted answer: i m fine . how about yourself ? 


In [None]:
test("i'm pretty good. thanks for asking.")

Question:  i m pretty good . thanks for asking . 
Predicted answer: no problem . so how have you been ? 


In [None]:
test("i've been great. what about you?")

Question:  i ve been great . what about you ? 
Predicted answer: i ve been good . i m in school right now . 


In [None]:
test("what school do you go to?")

Question:  what school do you go to ? 
Predicted answer: i go to pcc . 


In [None]:
test("I don't know")

Question:  i don t know 
Predicted answer: i like the ones i can sing along with . 


In [None]:
test("nice to meet you")

Question:  nice to meet you 
Predicted answer: thank you . 


In [None]:
test("What are your hobbies?")

Question:  what are your hobbies ? 
Predicted answer: i enjoy reading books and playing the guitar . 


In [None]:
test("You are rude")

Question:  you are rude 
Predicted answer: yep . i always behave in socially unacceptable ways . 


In [None]:
test("I love you")

Question:  i love you 
Predicted answer: i love you , too . 
