### Install packages

### Import libraries

In [None]:
import string
import matplotlib
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from nltk import word_tokenize

from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


### Get data from GitHub

In [None]:
# topical_chat.csv as hosted in the SQuAD-bot GitHub repository.
csv_url = 'https://raw.githubusercontent.com/davisjrule/SQuAD-bot/main/topical_chat.csv'

# Read the CSV and discard the two columns that are not used downstream.
df = pd.read_csv(csv_url).drop(columns=['conversation_id', 'sentiment'])

# Within the first 5000 rows, even rows are treated as prompts and odd rows as replies.
questions = df.iloc[0:5000:2]
responses = df.iloc[1:5000:2]

questions.head()


Unnamed: 0,message
0,Are you a fan of Google or Microsoft?
2,"I'm not a huge fan of Google, but I use it a..."
4,"Yeah, their services are good. I'm just not a..."
6,Did you know Google had hundreds of live goat...
8,I like Google Chrome. Do you use it as well f...


In [None]:
# Fixed seed for the shuffle: without it every kernel restart yields a different
# pair order, hence a different pairs[:1000] training subset and vocabulary.
SHUFFLE_SEED = 42

# Column 0 is the message text (the only column left after the drops above);
# taking it as a Series gives plain 1-D arrays for both sides of each pair.
messages = df.iloc[:5000, 0]
prompts = messages.iloc[0::2].to_numpy()
replies = messages.iloc[1::2].to_numpy()

# An odd number of rows would leave a prompt without a reply; keep complete pairs only.
pair_count = min(len(prompts), len(replies))

data = pd.DataFrame({
    'questions': prompts[:pair_count],
    'answers': replies[:pair_count],
})
# NOTE(review): consecutive rows are paired without regard to conversation
# boundaries (conversation_id was dropped earlier) - confirm this is intended.
data = data.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)
data

Unnamed: 0,questions,answers
0,I had not heard that but it's actually quite ...,thats interesting
1,Ha! While trying a suit filed against Eminem!...,yeah. I think he chose wisely. I am not sure ...
2,Yes he was. It was nice chatting with you as ...,You too! Have a nice weekend!
3,lol! bye now!,Crazy that early humans used to have to battl...
4,Yes so pandas do not feel threatened I think....,Yeah so much that he once hiked in a full cos...
...,...,...
2495,I already looked up that jellyfish on my phon...,Tell me about it apparently there are more U....
2496,Yep it was definitely a fad haha. So was Just...,"Hello there, so have you seen any of the Star..."
2497,"haha nope I have ridden one, I guess they evo...",They went from multi-toed to single-toed. I g...
2498,That is true. It does play a role still. Plus...,Thomas Jefferson didn't seem to agree with al...


### Process data

In [None]:
def clean_text(text):
  """Normalise a raw utterance before tokenisation.

  The argument is coerced to ``str`` and lower-cased, a fixed list of English
  contractions is expanded, and every ``string.punctuation`` character except
  the hyphen is removed.

  Args:
    text: Any object; it is converted with ``str()``.

  Returns:
    The cleaned, lower-case string.
  """
  # Order matters: the specific "can't"/"won't" rewrites must run before the
  # generic "n't" rule.
  contractions = (
    ("can't", "cannot"),
    ("won't", "will not"),
    ("i'm", "i am"),
    ("'ve", " have"),
    ("'ll", " will"),
    ("n't", " not"),
    ("'d", " would"),
    ("'s", " is"),
    ("'re", " are"),
  )
  cleaned = str(text).lower()
  for short_form, expansion in contractions:
    cleaned = cleaned.replace(short_form, expansion)

  # Delete the remaining punctuation in a single pass, keeping hyphens.
  strip_table = str.maketrans('', '', string.punctuation.replace('-', ''))
  return cleaned.translate(strip_table)

In [None]:
# One (question, answer) tuple per row of the shuffled frame.
pairs = list(zip(data['questions'], data['answers']))

# Preview a handful of pairs instead of echoing the whole list into the cell output.
pairs[:5]


[(" I had not heard that but it's actually quite touching that he could find a way. Speaking of piano, Freddie mercury used the same piano to record bohemian rhapsody as paul mccartney used to record hey jude.",
  ' thats interesting '),
 (' Ha! While trying a suit filed against Eminem! I heard about that. Speaking of Eminem, crazy he wanted to be a comic book artist growing up. What might have been had he pursued that instead if rap!',
  ' yeah. I think he chose wisely. I am not sure how successful he would have been at compliment rap though.'),
 (' Yes he was. It was nice chatting with you as well. Have a nice night!!!',
  ' You too! Have a nice weekend!'),
 (' lol! bye now!',
  " Crazy that early humans used to have to battle giant sloths! Who would've thought?"),
 (' Yes so pandas do not feel threatened I think. Did you know Sean Bean is scared of flying?',
  ' Yeah so much that he once hiked in a full costume to the filming site'),
 (" I agree, I am sure other countries do that. I

In [None]:
import numpy as np
import re

input_docs = []
target_docs = []
input_tokens = set()
target_tokens = set()

# Only the first 1000 pairs are used to build the corpus and the vocabularies.
for raw_input, raw_target in pairs[:1000]:
  # Encoder side: keep the cleaned sentence and collect its tokens.
  input_doc = clean_text(raw_input)
  input_docs.append(input_doc)
  input_tokens.update(re.findall(r"[\w']+|[^\s\w]", input_doc))

  # Decoder side: re-join the tokens with single spaces and wrap the sentence
  # in the <START>/<END> markers, so a plain .split() recovers the tokens later.
  target_words = re.findall(r"[\w']+|[^\s\w]", clean_text(raw_target))
  target_doc = '<START> ' + " ".join(target_words) + ' <END>'
  target_docs.append(target_doc)
  target_tokens.update(target_doc.split())

# Sorted lists give every token a stable position, used as its integer id.
input_tokens = sorted(input_tokens)
target_tokens = sorted(target_tokens)
num_encoder_tokens = len(input_tokens)
num_decoder_tokens = len(target_tokens)

# target_tokens

In [None]:
# Inspect the sorted encoder vocabulary (prints the complete list).
print(input_tokens)

['-', '001', '1', '10', '100', '1000', '10000', '11', '1110', '12', '125', '13', '141', '15', '15th', '165', '16th', '17', '175', '1760', '180', '1800', '1853', '19', '1900', '1900s', '1904', '1912', '1918', '1920s', '1928', '1934', '1936', '1938', '1940', '1940s', '1941', '1949', '1950', '1950s', '1953', '196', '1963', '1968', '1969', '1970s', '1971', '1984', '1989', '1991', '1992', '1994', '1997', '1998', '1mb', '1st', '2', '20', '200', '2000', '2001', '2003', '2005', '2012', '2016', '2018', '2020', '20th', '2100', '219', '224', '2263', '24', '2430', '25', '270', '274', '3', '30', '300', '300000', '309', '309000000', '30s', '31', '312000', '324', '38', '3rd', '4', '40', '400', '41', '43', '45', '4673', '49', '495', '4th', '5', '50', '500k', '51', '55', '6', '622', '628', '63', '65', '68', '69', '6939', '7', '70', '70k', '75', '8', '80', '84', '86', '9', '900', '93', 'a', 'aaron', 'abide', 'abilities', 'able', 'about', 'absentee', 'absolutely', 'absorbs', 'abstract', 'absurd', 'abut',

In [None]:
# Forward lookups: token -> integer id (its position in the sorted vocabulary).
# NOTE(review): id 0 is therefore a real token, while the Embedding layers in
# this notebook are created with mask_zero=True - confirm that is acceptable.
input_features_dict = {token: index for index, token in enumerate(input_tokens)}
target_features_dict = {token: index for index, token in enumerate(target_tokens)}

# Reverse lookups: integer id -> token, used to turn predicted ids back into words.
reverse_input_features_dict = {
    index: token for token, index in input_features_dict.items()}
reverse_target_features_dict = {
    index: token for token, index in target_features_dict.items()}


In [None]:
# Longest prompt / reply, counted with the same tokenisation that fills the
# tensors below. Replies are split on whitespace so that '<START>' and '<END>'
# each count as one token (the regex would split each marker into three).
max_encoder_seq_length = max(
    len(re.findall(r"[\w']+|[^\s\w]", input_doc)) for input_doc in input_docs)
max_decoder_seq_length = max(len(target_doc.split()) for target_doc in target_docs)

# One-hot tensors indexed as [sample, timestep, token id]; unused timesteps stay all-zero.
encoder_input_data = np.zeros(
    (len(input_docs), max_encoder_seq_length, num_encoder_tokens),
    dtype='float32')
decoder_input_data = np.zeros(
    (len(input_docs), max_decoder_seq_length, num_decoder_tokens),
    dtype='float32')
decoder_target_data = np.zeros(
    (len(input_docs), max_decoder_seq_length, num_decoder_tokens),
    dtype='float32')

for line, (input_doc, target_doc) in enumerate(zip(input_docs, target_docs)):
    for timestep, token in enumerate(re.findall(r"[\w']+|[^\s\w]", input_doc)):
        # Mark the token seen at this sample and timestep.
        encoder_input_data[line, timestep, input_features_dict[token]] = 1.

    for timestep, token in enumerate(target_doc.split()):
        decoder_input_data[line, timestep, target_features_dict[token]] = 1.
        if timestep > 0:
            # The target is the decoder input shifted one step earlier, so it
            # never contains the leading <START> marker.
            decoder_target_data[line, timestep - 1, target_features_dict[token]] = 1.



In [None]:
# Longest prompt / reply, counted with the same tokenisation that fills the
# arrays below. Replies are split on whitespace so that '<START>' and '<END>'
# each count as one token (the regex would split each marker into three).
max_encoder_seq_length = max(
    len(re.findall(r"[\w']+|[^\s\w]", input_doc)) for input_doc in input_docs)
max_decoder_seq_length = max(len(target_doc.split()) for target_doc in target_docs)

# Model inputs are integer token ids indexed as [sample, timestep]; positions
# past the end of a sentence keep the initial value 0.
encoder_input_data = np.zeros(
    (len(input_docs), max_encoder_seq_length), dtype='int32'
)
decoder_input_data = np.zeros(
    (len(input_docs), max_decoder_seq_length), dtype='int32'
)
# The training target stays one-hot: [sample, timestep, token id].
decoder_output_data = np.zeros(
    (len(input_docs), max_decoder_seq_length, num_decoder_tokens),
    dtype='int32'
)

for line, (input_doc, target_doc) in enumerate(zip(input_docs, target_docs)):
  for timestep, token in enumerate(re.findall(r"[\w']+|[^\s\w]", input_doc)):
    encoder_input_data[line, timestep] = input_features_dict[token]
  for timestep, token in enumerate(target_doc.split()):
    decoder_input_data[line, timestep] = target_features_dict[token]
    if timestep > 0:
      # The target is the decoder input shifted one step earlier, so it never
      # contains the leading <START> marker.
      decoder_output_data[line, timestep - 1, target_features_dict[token]] = 1


In [None]:
from tensorflow.keras import layers , activations , models , preprocessing

In [None]:
import tensorflow as tf

# Encoder: token ids -> 200-d embeddings -> LSTM; only its final states are kept.
encoder_inputs = tf.keras.layers.Input(shape=(max_encoder_seq_length,))
encoder_embedding = tf.keras.layers.Embedding(
    input_dim=num_encoder_tokens, output_dim=200, mask_zero=True
)(encoder_inputs)
encoder_outputs, state_h, state_c = tf.keras.layers.LSTM(
    units=200, return_state=True
)(encoder_embedding)
encoder_states = [state_h, state_c]

# Decoder: starts from the encoder states and emits a softmax over the target
# vocabulary at every timestep.
decoder_inputs = tf.keras.layers.Input(shape=(None,))
decoder_embedding = tf.keras.layers.Embedding(
    input_dim=num_decoder_tokens, output_dim=200, mask_zero=True
)(decoder_inputs)
decoder_lstm = tf.keras.layers.LSTM(
    units=200, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = tf.keras.layers.Dense(
    units=num_decoder_tokens, activation=tf.keras.activations.softmax)
output = decoder_dense(decoder_outputs)

# Training model: (encoder ids, decoder ids) -> per-step token distribution.
model = tf.keras.models.Model(inputs=[encoder_inputs, decoder_inputs], outputs=output)
model.compile(
    optimizer=tf.keras.optimizers.RMSprop(),
    loss='categorical_crossentropy',
)

model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 62)]         0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 embedding (Embedding)          (None, 62, 200)      638200      ['input_1[0][0]']                
                                                                                                  
 embedding_1 (Embedding)        (None, None, 200)    652600      ['input_2[0][0]']                
                                                                                              

In [None]:
# Train on the integer-encoded inputs against the one-hot targets, then save the model.
model.fit(
    x=[encoder_input_data, decoder_input_data],
    y=decoder_output_data,
    batch_size=50,
    epochs=800,
)
model.save('model.h5')

Epoch 1/800
Epoch 2/800
Epoch 3/800
Epoch 4/800
Epoch 5/800
Epoch 6/800
Epoch 7/800
Epoch 8/800
Epoch 9/800
Epoch 10/800
Epoch 11/800
Epoch 12/800
Epoch 13/800
Epoch 14/800
Epoch 15/800
Epoch 16/800
Epoch 17/800
Epoch 18/800
Epoch 19/800
Epoch 20/800
Epoch 21/800
Epoch 22/800
Epoch 23/800
Epoch 24/800
Epoch 25/800
Epoch 26/800
Epoch 27/800
Epoch 28/800
Epoch 29/800
Epoch 30/800
Epoch 31/800
Epoch 32/800
Epoch 33/800
Epoch 34/800
Epoch 35/800
Epoch 36/800
Epoch 37/800
Epoch 38/800
Epoch 39/800
Epoch 40/800
Epoch 41/800
Epoch 42/800
Epoch 43/800
Epoch 44/800
Epoch 45/800
Epoch 46/800
Epoch 47/800
Epoch 48/800
Epoch 49/800
Epoch 50/800
Epoch 51/800
Epoch 52/800
Epoch 53/800
Epoch 54/800
Epoch 55/800
Epoch 56/800
Epoch 57/800
Epoch 58/800
Epoch 59/800
Epoch 60/800
Epoch 61/800
Epoch 62/800
Epoch 63/800
Epoch 64/800
Epoch 65/800
Epoch 66/800
Epoch 67/800
Epoch 68/800
Epoch 69/800
Epoch 70/800
Epoch 71/800
Epoch 72/800
Epoch 73/800
Epoch 74/800
Epoch 75/800
Epoch 76/800
Epoch 77/800
Epoch 78

In [None]:
def make_inference_models():
    """Build the stand-alone encoder and decoder models used for decoding.

    Both models reuse the notebook-level tensors and layers of the training
    graph (``encoder_inputs``, ``encoder_states``, ``decoder_inputs``,
    ``decoder_embedding``, ``decoder_lstm``, ``decoder_dense``).

    Returns:
        tuple: ``(encoder_model, decoder_model)``. The encoder maps
        ``encoder_inputs`` to ``encoder_states``. The decoder takes
        ``decoder_inputs`` plus two state inputs and returns the dense output
        followed by the two LSTM states.
    """
    encoder_model = tf.keras.models.Model(encoder_inputs, encoder_states)

    # Inputs through which the caller feeds in the previous step's states;
    # 200 is the width the decoder LSTM was created with.
    prev_h = tf.keras.layers.Input(shape=(200,))
    prev_c = tf.keras.layers.Input(shape=(200,))

    lstm_out, next_h, next_c = decoder_lstm(
        decoder_embedding, initial_state=[prev_h, prev_c])
    token_scores = decoder_dense(lstm_out)

    decoder_model = tf.keras.models.Model(
        [decoder_inputs, prev_h, prev_c],
        [token_scores, next_h, next_c])

    return encoder_model, decoder_model

In [None]:
def str_to_tokens( sentence : str ):
    """Encode a sentence as a padded row of encoder token ids.

    The sentence is lower-cased and tokenised with the same regular expression
    that was used to build the encoder vocabulary. Tokens missing from
    ``input_features_dict`` are skipped instead of raising ``KeyError``.

    Args:
        sentence: Raw user text.

    Returns:
        The result of ``pad_sequences`` for the single id list, post-padded
        to ``max_encoder_seq_length``.
    """
    tokens_list = [
        input_features_dict[token]
        for token in re.findall(r"[\w']+|[^\s\w]", sentence.lower())
        if token in input_features_dict  # out-of-vocabulary words are ignored
    ]
    return preprocessing.sequence.pad_sequences(
        [tokens_list], maxlen=max_encoder_seq_length, padding='post')

In [None]:
# (Stray cell emptied: it held only the bare name `l`, which is not defined
# anywhere in this notebook and raised NameError on Restart & Run All.)

In [None]:
enc_model, dec_model = make_inference_models()

# Answer ten prompts typed by the user, picking the arg-max token at each step.
for _ in range(10):
    states_values = enc_model.predict(str_to_tokens(input('Enter question : ')))

    # Decoding starts from a length-1 sequence holding the <START> id.
    empty_target_seq = np.zeros((1, 1))
    empty_target_seq[0, 0] = target_features_dict['<START>']

    decoded_translation = ''
    stop_condition = False
    while not stop_condition:
        dec_outputs, h, c = dec_model.predict([empty_target_seq] + states_values)
        sampled_word_index = np.argmax(dec_outputs[0, -1, :])
        sampled_word = reverse_target_features_dict[sampled_word_index]
        decoded_translation = decoded_translation + " " + sampled_word

        # Feed the chosen token and the new LSTM states into the next step.
        empty_target_seq = np.zeros((1, 1))
        empty_target_seq[0, 0] = sampled_word_index
        states_values = [h, c]

        # Finish on the end marker or once the reply exceeds the longest training reply.
        stop_condition = (
            sampled_word == '<END>'
            or len(decoded_translation.split()) > max_decoder_seq_length)

    print(decoded_translation)


Enter question : do you like google
 i am a little how most interesting i do not have one dogs and elephants are the only animals that understand pointing <END>
Enter question : do you like amazon
 i love them and i also like others like cats i am going to keep my eyes peeled for his rocket that will colonize mars haha <END>
Enter question : lol
 are you a science fan that the time guy i find it interesting that is what canadians call kickball <END>
Enter question : i am a science fan
 i am a star wars fan lol seems <END>
Enter question : haha
 do you watch the show fixer upper <END>
Enter question : yes
 hi there <END>
Enter question : how is your cat
 do you know why baseball managers wear a bunch of awards <END>
Enter question : how is your dog
 i am a little dog especially made past few days and i cannot believe that it is immortal <END>
Enter question : are you smart
 hello <END>


In [None]:
from keras.models import load_model

# Reload the network written by model.save('model.h5').
# NOTE(review): the encoder/decoder models below are wired from the in-memory
# layers (encoder_inputs, decoder_lstm, ...), not from this loaded object.
training_model = load_model('model.h5')

# Inference encoder: prompt ids -> final LSTM states.
encoder_model = tf.keras.models.Model(inputs=encoder_inputs, outputs=encoder_states)

# Inputs carrying the decoder's previous hidden and cell state (width 200).
decoder_state_input_h = tf.keras.layers.Input(shape=(200,))
decoder_state_input_c = tf.keras.layers.Input(shape=(200,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

# Inference decoder: token ids + previous states -> token distribution + new states.
decoder_outputs, state_h, state_c = decoder_lstm(
    decoder_embedding,
    initial_state=decoder_states_inputs,
)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = tf.keras.models.Model(
    inputs=[decoder_inputs] + decoder_states_inputs,
    outputs=[decoder_outputs] + decoder_states,
)

def decode_response(test_input):
  """Decode a reply for one encoded prompt, choosing the arg-max token each step.

  Args:
    test_input: Encoded prompt passed to ``encoder_model.predict`` (for
      example the output of ``str_to_tokens``).

  Returns:
    The decoded tokens joined by spaces, each preceded by a space. The string
    ends with ``<END>`` unless the length limit stopped decoding first.
  """
  states_values = encoder_model.predict(test_input)

  # Decoding starts from a length-1 sequence holding the <START> id.
  target_seq = np.zeros((1, 1))
  target_seq[0, 0] = target_features_dict['<START>']

  decoded_translation = ""
  stop_condition = False
  while not stop_condition:
    # Use the step-wise decoder: it accepts the previous states and returns
    # the new ones, which the two-input training model does not.
    dec_outputs, h, c = decoder_model.predict([target_seq] + states_values)

    sampled_word_index = np.argmax(dec_outputs[0, -1, :])
    sampled_word = reverse_target_features_dict[sampled_word_index]
    decoded_translation += " " + sampled_word

    if sampled_word == '<END>' or len(decoded_translation.split()) > max_decoder_seq_length:
      stop_condition = True

    # Feed the token just produced back in as the next decoder input;
    # otherwise every step would be conditioned on <START> again.
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = sampled_word_index
    states_values = [h, c]

  return decoded_translation


In [None]:
def generate_response(input):
  """Return the bot's reply to a raw user utterance.

  The text is normalised with ``clean_text``, encoded with ``str_to_tokens``
  and decoded with ``decode_response``. Every ``<START>`` and ``<END>`` marker
  is removed from the decoded string before it is returned.
  """
  encoded_prompt = str_to_tokens(clean_text(input))
  reply_text = decode_response(encoded_prompt)
  for marker in ("<START>", "<END>"):
    reply_text = reply_text.replace(marker, "")
  return reply_text


# Console loop: read a line, answer it, and repeat until the user types exactly STOP.
while True:
  reply = input()
  if reply == "STOP":
    break
  print(generate_response(reply))

In [None]:
def make_inference_models():
    """Build the stand-alone encoder and decoder models used for decoding.

    NOTE(review): this cell re-declares ``make_inference_models``; an earlier
    cell of this notebook already defines a function with this name.

    Both models reuse the notebook-level tensors and layers of the training
    graph (``encoder_inputs``, ``encoder_states``, ``decoder_inputs``,
    ``decoder_embedding``, ``decoder_lstm``, ``decoder_dense``).

    Returns:
        tuple: ``(encoder_model, decoder_model)``. The encoder maps
        ``encoder_inputs`` to ``encoder_states``. The decoder takes
        ``decoder_inputs`` plus two state inputs and returns the dense output
        followed by the two LSTM states.
    """
    encoder_model = tf.keras.models.Model(encoder_inputs, encoder_states)

    # Inputs through which the caller feeds in the previous step's states;
    # 200 is the width the decoder LSTM was created with.
    prev_h = tf.keras.layers.Input(shape=(200,))
    prev_c = tf.keras.layers.Input(shape=(200,))

    lstm_out, next_h, next_c = decoder_lstm(
        decoder_embedding, initial_state=[prev_h, prev_c])
    token_scores = decoder_dense(lstm_out)

    decoder_model = tf.keras.models.Model(
        [decoder_inputs, prev_h, prev_c],
        [token_scores, next_h, next_c])

    return encoder_model, decoder_model

### Legacy code

In [None]:
from tensorflow import keras
from keras.layers import Input, LSTM, Dense
from keras.models import Model

# Width of the LSTM hidden/cell state
dimensionality = 256

# Training schedule
batch_size = 10
epochs = 10

# Encoder: reads sequences of num_encoder_tokens-wide vectors and keeps only its final states.
encoder_inputs = Input(shape=(None, num_encoder_tokens))
encoder_lstm = LSTM(units=dimensionality, return_state=True)
encoder_outputs, state_hidden, state_cell = encoder_lstm(encoder_inputs)
encoder_states = [state_hidden, state_cell]

# Decoder: starts from the encoder states and emits a softmax over the target
# vocabulary at every timestep.
decoder_inputs = Input(shape=(None, num_decoder_tokens))
decoder_lstm = LSTM(units=dimensionality, return_sequences=True, return_state=True)
decoder_outputs, decoder_state_hidden, decoder_state_cell = decoder_lstm(
    decoder_inputs, initial_state=encoder_states)
decoder_dense = Dense(units=num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Full training model
training_model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=decoder_outputs)

training_model.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
    sample_weight_mode='temporal',
)

# NOTE(review): the Inputs above expect 3-D one-hot tensors, but when the
# notebook runs top to bottom encoder_input_data / decoder_input_data have
# been rebound to 2-D integer arrays by the integer-encoding cell - verify
# which arrays are in scope before running this cell.
training_model.fit(
    x=[encoder_input_data, decoder_input_data],
    y=decoder_target_data,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.2,
)
training_model.save('training_model.h5')
     

In [None]:
from keras.models import load_model
# Rebuild the inference encoder from the saved legacy model.
training_model = load_model('training_model.h5')
encoder_inputs = training_model.input[0]
# NOTE(review): layers[2] is assumed to be the encoder LSTM, whose output
# unpacks into (outputs, hidden state, cell state) - confirm the layer order.
encoder_outputs, state_h_enc, state_c_enc = training_model.layers[2].output
encoder_states = [state_h_enc, state_c_enc]
encoder_model = Model(inputs=encoder_inputs, outputs=encoder_states)

# Inference decoder: the in-memory decoder_lstm / decoder_dense layers are
# re-applied with explicit state inputs, so states can be fed back step by step.
latent_dim = 256
decoder_state_input_hidden = Input(shape=(latent_dim,))
decoder_state_input_cell = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_hidden, decoder_state_input_cell]

decoder_outputs, state_hidden, state_cell = decoder_lstm(
    decoder_inputs,
    initial_state=decoder_states_inputs,
)
decoder_states = [state_hidden, state_cell]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model(
    inputs=[decoder_inputs] + decoder_states_inputs,
    outputs=[decoder_outputs] + decoder_states,
)

def decode_response(test_input):
  """Decode a reply from a one-hot encoded prompt, choosing the arg-max token each step.

  Args:
    test_input: One-hot prompt tensor passed to ``encoder_model.predict``.

  Returns:
    The decoded tokens joined by spaces, each preceded by a space. The string
    ends with ``<END>`` unless the length limit stopped decoding first.
  """
  #Getting the output states to pass into the decoder
  states_value = encoder_model.predict(test_input)

  #Length-1 one-hot target sequence holding the <START> token
  target_seq = np.zeros((1, 1, num_decoder_tokens))
  target_seq[0, 0, target_features_dict['<START>']] = 1.

  #The response, built up word by word
  decoded_sentence = ''

  stop_condition = False

  while not stop_condition:
    #Predicting output tokens with probabilities and states
    output_tokens, hidden_state, cell_state = decoder_model.predict([target_seq] + states_value)

    #Choosing the one with highest probability
    sampled_token_index = np.argmax(output_tokens[0, -1, :])
    sampled_token = reverse_target_features_dict[sampled_token_index]
    decoded_sentence += " " + sampled_token

    #Stop on the <END> marker, or once the reply holds more tokens than the
    #longest training reply (the limit is a token count, not a character count)
    if (sampled_token == '<END>' or len(decoded_sentence.split()) > max_decoder_seq_length):
      stop_condition = True

    #Update the target sequence with the token just produced
    target_seq = np.zeros((1, 1, num_decoder_tokens))
    target_seq[0, 0, sampled_token_index] = 1.

    #Update states
    states_value = [hidden_state, cell_state]
  return decoded_sentence
     


In [None]:
class ChatBot:
  """Console front-end for the one-hot seq2seq model.

  Relies on notebook-level names: ``clean_text``, ``decode_response``,
  ``input_features_dict``, ``max_encoder_seq_length`` and ``num_encoder_tokens``.
  """
  # Answers to the opening question that mean "do not start a chat".
  negative_responses = ("no", "nope", "nah", "naw", "not a chance", "sorry")
  # Words or phrases that end a running conversation.
  exit_commands = ("quit", "pause", "exit", "goodbye", "bye", "later", "stop")

  #Method to start the conversation
  def start_chat(self):
    """Ask whether the user wants to chat and, unless they decline, start the loop."""
    user_response = input("Hi, I'm a chatbot trained on random dialogs. Would you like to chat with me?\n")

    # Ignore case and surrounding whitespace so that "No " also declines.
    if user_response.strip().lower() in self.negative_responses:
      print("Ok, have a great day!")
      return
    self.chat(user_response)

  #Method to handle the conversation
  def chat(self, reply):
    """Keep answering until ``make_exit`` recognises an exit command."""
    while not self.make_exit(reply):
      reply = input(self.generate_response(reply)+"\n")

  #Method to convert user input into a matrix
  def string_to_matrix(self, user_input):
    """One-hot encode ``user_input`` as a (1, max_encoder_seq_length, num_encoder_tokens) array.

    The text is normalised with ``clean_text`` first, as the training data
    was. Unknown tokens leave their timestep all-zero.
    """
    tokens = re.findall(r"[\w']+|[^\s\w]", clean_text(user_input))
    user_input_matrix = np.zeros(
      (1, max_encoder_seq_length, num_encoder_tokens),
      dtype='float32')
    # Tokens past the encoder length have no row in the matrix; drop them
    # instead of raising IndexError on long inputs.
    for timestep, token in enumerate(tokens[:max_encoder_seq_length]):
      if token in input_features_dict:
        user_input_matrix[0, timestep, input_features_dict[token]] = 1.
    return user_input_matrix

  #Method that will create a response using seq2seq model we built
  def generate_response(self, user_input):
    """Encode ``user_input``, decode a reply and remove the sequence markers."""
    input_matrix = self.string_to_matrix(user_input)
    chatbot_response = decode_response(input_matrix)
    #Remove <START> and <END> tokens from chatbot_response
    chatbot_response = chatbot_response.replace("<START>",'')
    chatbot_response = chatbot_response.replace("<END>",'')
    return chatbot_response

  #Method to check for exit commands
  def make_exit(self, reply):
    """Return True, after printing a farewell, if ``reply`` contains an exit command.

    Commands are matched case-insensitively as whole words, so "quite" or
    "unstoppable" no longer end the chat.
    """
    for exit_command in self.exit_commands:
      if re.search(r"\b" + re.escape(exit_command) + r"\b", reply, flags=re.IGNORECASE):
        print("Ok, have a great day!")
        return True
    return False
  
# Start an interactive session; start_chat() blocks on input() until the user
# declines or enters an exit command.
chatbot = ChatBot()
chatbot.start_chat()

In [None]:
def generate_response(input):
  """Return the one-hot model's reply to a raw user utterance.

  The text is normalised with ``clean_text``, one-hot encoded into a
  (1, max_encoder_seq_length, num_encoder_tokens) array and decoded with
  ``decode_response``. Every ``<START>`` and ``<END>`` marker is removed from
  the result.
  """
  tokens = re.findall(r"[\w']+|[^\s\w]", clean_text(input))
  input_matrix = np.zeros(
      (1, max_encoder_seq_length, num_encoder_tokens),
      dtype='float32')
  # Tokens past the encoder length have no row in the matrix; drop them
  # instead of raising IndexError on long inputs.
  for timestep, token in enumerate(tokens[:max_encoder_seq_length]):
    if token in input_features_dict:
      input_matrix[0, timestep, input_features_dict[token]] = 1
  output = decode_response(input_matrix)
  return output.replace("<START>","").replace("<END>","")


# Console loop: read a line, print the reply followed by a blank line, and
# repeat until the user types exactly STOP.
while True:
  reply = input()
  if reply == "STOP":
    break
  print(generate_response(reply)+'\n')

In [None]:
def save_model(model, path='model.h5'):
  """Persist ``model`` by calling its ``save`` method and return the path used.

  Args:
    model: Any object exposing ``save(path)``, such as a Keras model.
    path: Destination file. Defaults to ``'model.h5'``, the file name used
      when the trained network is saved elsewhere in this notebook.

  Returns:
    The ``path`` the model was written to.
  """
  model.save(path)
  return path

In [None]:
def load_model():
  """Return the notebook-level ``model`` object; nothing is read from disk.

  NOTE(review): this definition rebinds the name ``load_model`` that earlier
  cells import from ``keras.models`` - confirm the shadowing is intended.
  """
  return model