### Install packages

### Import libraries

In [7]:
import string
import matplotlib
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from nltk import word_tokenize

from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


### Get data from GitHub

In [22]:
csv_url = 'https://raw.githubusercontent.com/davisjrule/SQuAD-bot/main/topical_chat.csv'
df = pd.read_csv(csv_url)
df.drop('conversation_id', axis=1, inplace=True)
df.drop('sentiment', axis=1, inplace=True)

questions = df[0:5000:2]
responses = df[1:5000:2]

questions.head()


Unnamed: 0,message
0,Are you a fan of Google or Microsoft?
2,"I'm not a huge fan of Google, but I use it a..."
4,"Yeah, their services are good. I'm just not a..."
6,Did you know Google had hundreds of live goat...
8,I like Google Chrome. Do you use it as well f...


### Process data

In [23]:
def clean_text(text):
  text = str(text).lower()
  text = text.replace("can't", "cannot")
  text = text.replace("won't", "will not")
  text = text.replace("i'm", "i am")
  text = text.replace("'ve", " have")
  text = text.replace("'ll", " will")
  text = text.replace("n't", " not")
  text = text.replace("'d", " would")
  text = text.replace("'s", " is")
  text = text.replace("'re", " are")
  for char in string.punctuation:
    if char != '-':
      text = text.replace(char, '')
  return text

In [31]:
pairs = list(zip(questions['message'],responses['message']))

pairs


[(' Are you a fan of Google or Microsoft?',
  ' Both are excellent technology they are helpful in many ways. For the security purpose both are super.'),
 (" I'm not  a huge fan of Google, but I use it a lot because I have to. I think they are a monopoly in some sense. ",
  ' Google provides online related services and products, which includes online ads, search engine and cloud computing.'),
 (" Yeah, their services are good. I'm just not a fan of intrusive they can be on our personal lives. ",
  ' Google is leading the alphabet subsidiary and will continue to be the Umbrella company for Alphabet internet interest.'),
 (' Did you know Google had hundreds of live goats to cut the grass in the past? ',
  ' It is very interesting. Google provide "Chrome OS" which is a light weight OS. Google provided a lot of hardware mainly in 2010 to 2015. '),
 (' I like Google Chrome. Do you use it as well for your browser? ',
  ' Yes.Google is the biggest search engine and Google service figure out to

In [40]:
import numpy as np
import re

input_docs = []
target_docs = []
input_tokens = set()
target_tokens = set()

for line in pairs[:1000]:
  input_doc, target_doc = line[0], line[1]

  input_doc = clean_text(input_doc)
  target_doc = clean_text(target_doc)

  # Appending each input sentence to input_docs
  input_docs.append(input_doc)
  
  # Splitting words from punctuation  
  target_doc = " ".join(re.findall(r"[\w']+|[^\s\w]", target_doc))

  # Redefine target_doc below and append it to target_docs
  target_doc = '<START> ' + target_doc + ' <END>'
  target_docs.append(target_doc)

  
  # Now we split up each sentence into words and add each unique word to our vocabulary set
  for token in re.findall(r"[\w']+|[^\s\w]", input_doc):
    if token not in input_tokens:
      input_tokens.add(token)
       
  for token in target_doc.split():
    if token not in target_tokens:
      target_tokens.add(token)


input_tokens = sorted(list(input_tokens))
target_tokens = sorted(list(target_tokens))
num_encoder_tokens = len(input_tokens)
num_decoder_tokens = len(target_tokens)

# target_tokens

In [41]:
print(input_tokens)

['-', '001', '1', '10', '100', '10000', '11', '1110', '12', '15', '15000000', '16', '165', '17', '175', '18', '180', '1853', '1895', '19', '1900', '1909', '1910', '1912', '1918', '1928', '1930', '1934', '1936', '1938', '1940', '1940s', '1949', '1950s', '1958', '196', '1963', '1968', '1969', '1970s', '1977', '1980', '1991', '1993', '1994', '1998', '1999', '1mb', '2', '20', '2000', '2002', '2005', '2006', '2013', '2016', '2018', '2020', '20th', '224', '23', '24', '2430', '25', '270', '2795', '2nd', '3', '30', '3000', '300000', '309', '30s', '31', '32000', '356', '38', '39', '3rd', '4', '40', '4000', '45', '4500', '49', '4th', '5', '50', '500k', '51', '55', '6', '60', '60s', '622', '628', '63', '6331', '64', '65', '69', '6year', '7', '70k', '70s', '71', '75', '7500', '8', '80', '84', '85', '86', '9', '93', 'a', 'abc', 'abilities', 'able', 'about', 'above', 'absentee', 'absolutely', 'absorbs', 'abstract', 'abusive', 'academy', 'accent', 'accept', 'according', 'accountable', 'achieve', 'aco

In [42]:
input_features_dict = dict(
    [(token, i) for i, token in enumerate(input_tokens)])
target_features_dict = dict(
    [(token, i) for i, token in enumerate(target_tokens)])

reverse_input_features_dict = dict(
    (i, token) for token, i in input_features_dict.items())
reverse_target_features_dict = dict(
    (i, token) for token, i in target_features_dict.items())


In [43]:
max_encoder_seq_length = max([len(re.findall(r"[\w']+|[^\s\w]", input_doc)) for input_doc in input_docs])
max_decoder_seq_length = max([len(re.findall(r"[\w']+|[^\s\w]", target_doc)) for target_doc in target_docs])

encoder_input_data = np.zeros(
    (len(input_docs), max_encoder_seq_length, num_encoder_tokens),
    dtype='float32')
decoder_input_data = np.zeros(
    (len(input_docs), max_decoder_seq_length, num_decoder_tokens),
    dtype='float32')
decoder_target_data = np.zeros(
    (len(input_docs), max_decoder_seq_length, num_decoder_tokens),
    dtype='float32')

for line, (input_doc, target_doc) in enumerate(zip(input_docs, target_docs)):
    for timestep, token in enumerate(re.findall(r"[\w']+|[^\s\w]", input_doc)):
        #Assign 1. for the current line, timestep, & word in encoder_input_data
        encoder_input_data[line, timestep, input_features_dict[token]] = 1.
    
    for timestep, token in enumerate(target_doc.split()):
        decoder_input_data[line, timestep, target_features_dict[token]] = 1.
        if timestep > 0:
            decoder_target_data[line, timestep - 1, target_features_dict[token]] = 1.



In [44]:
max_encoder_seq_length = max([len(re.findall(r"[\w']+|[^\s\w]", input_doc)) for input_doc in input_docs])
max_decoder_seq_length = max([len(re.findall(r"[\w']+|[^\s\w]", target_doc)) for target_doc in target_docs])

encoder_input_data = np.zeros(
    (len(input_docs), max_encoder_seq_length), dtype='int32'
)
decoder_input_data = np.zeros(
    (len(input_docs), max_decoder_seq_length), dtype='int32'
)
decoder_output_data = np.zeros(
    (len(input_docs), max_decoder_seq_length, num_decoder_tokens),
    dtype='int32'
  )
for line, (input_doc, target_doc) in enumerate(zip(input_docs, target_docs)):
  for timestep, token in enumerate(re.findall(r"[\w']+|[^\s\w]", input_doc)):
    encoder_input_data[line, timestep] = input_features_dict[token]
  for timestep, token in enumerate(target_doc.split()):
    decoder_input_data[line, timestep] = target_features_dict[token]
    if timestep > 0:
        decoder_output_data[line, timestep - 1, target_features_dict[token]] = 1.


In [45]:
from tensorflow.keras import layers , activations , models , preprocessing

In [46]:
import tensorflow as tf

encoder_inputs = tf.keras.layers.Input(shape=( max_encoder_seq_length, ))
encoder_embedding = tf.keras.layers.Embedding( num_encoder_tokens, 200 , mask_zero=True) (encoder_inputs)
encoder_outputs , state_h , state_c = tf.keras.layers.LSTM( 200 , return_state=True )( encoder_embedding )
encoder_states = [ state_h , state_c ]

decoder_inputs = tf.keras.layers.Input(shape=( None ,  ))
decoder_embedding = tf.keras.layers.Embedding( num_decoder_tokens, 200 , mask_zero=True) (decoder_inputs)
decoder_lstm = tf.keras.layers.LSTM( 200 , return_state=True , return_sequences=True )
decoder_outputs , _ , _ = decoder_lstm ( decoder_embedding , initial_state=encoder_states )
decoder_dense = tf.keras.layers.Dense( num_decoder_tokens , activation=tf.keras.activations.softmax ) 
output = decoder_dense ( decoder_outputs )

model = tf.keras.models.Model([encoder_inputs, decoder_inputs], output )
model.compile(optimizer=tf.keras.optimizers.RMSprop(), loss='categorical_crossentropy')

model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_3 (InputLayer)           [(None, 62)]         0           []                               
                                                                                                  
 input_4 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 embedding_2 (Embedding)        (None, 62, 200)      601800      ['input_3[0][0]']                
                                                                                                  
 embedding_3 (Embedding)        (None, None, 200)    595000      ['input_4[0][0]']                
                                                                                            

In [47]:
model.fit([encoder_input_data , decoder_input_data], decoder_output_data, batch_size=50, epochs=400 ) 
model.save( 'model.h5' ) 

Epoch 1/400
Epoch 2/400
Epoch 3/400
Epoch 4/400
Epoch 5/400
Epoch 6/400
Epoch 7/400
Epoch 8/400
Epoch 9/400
Epoch 10/400
Epoch 11/400
Epoch 12/400
Epoch 13/400
Epoch 14/400
Epoch 15/400
Epoch 16/400
Epoch 17/400
Epoch 18/400
Epoch 19/400
Epoch 20/400
Epoch 21/400
Epoch 22/400
Epoch 23/400
Epoch 24/400
Epoch 25/400
Epoch 26/400
Epoch 27/400
Epoch 28/400
Epoch 29/400
Epoch 30/400
Epoch 31/400
Epoch 32/400
Epoch 33/400
Epoch 34/400
Epoch 35/400
Epoch 36/400
Epoch 37/400
Epoch 38/400
Epoch 39/400
Epoch 40/400
Epoch 41/400
Epoch 42/400
Epoch 43/400
Epoch 44/400
Epoch 45/400
Epoch 46/400
Epoch 47/400
Epoch 48/400
Epoch 49/400
Epoch 50/400
Epoch 51/400
Epoch 52/400
Epoch 53/400
Epoch 54/400
Epoch 55/400
Epoch 56/400
Epoch 57/400
Epoch 58/400
Epoch 59/400
Epoch 60/400
Epoch 61/400
Epoch 62/400
Epoch 63/400
Epoch 64/400
Epoch 65/400
Epoch 66/400
Epoch 67/400
Epoch 68/400
Epoch 69/400
Epoch 70/400
Epoch 71/400
Epoch 72/400
Epoch 73/400
Epoch 74/400
Epoch 75/400
Epoch 76/400
Epoch 77/400
Epoch 78

In [2]:
def make_inference_models():
    
    encoder_model = tf.keras.models.Model(encoder_inputs, encoder_states)
    
    decoder_state_input_h = tf.keras.layers.Input(shape=( 200 ,))
    decoder_state_input_c = tf.keras.layers.Input(shape=( 200 ,))
    
    decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
    
    decoder_outputs, state_h, state_c = decoder_lstm(
        decoder_embedding , initial_state=decoder_states_inputs)
    decoder_states = [state_h, state_c]
    decoder_outputs = decoder_dense(decoder_outputs)
    decoder_model = tf.keras.models.Model(
        [decoder_inputs] + decoder_states_inputs,
        [decoder_outputs] + decoder_states)
    
    return encoder_model , decoder_model

In [3]:
def str_to_tokens( sentence : str ):
    words = sentence.lower().split()
    tokens_list = list()
    for word in words:
        tokens_list.append( input_features_dict[word] ) 
    return preprocessing.sequence.pad_sequences( [tokens_list] , maxlen=max_encoder_seq_length , padding='post')

In [8]:
enc_model , dec_model = make_inference_models()

for _ in range(10):
    states_values = enc_model.predict( str_to_tokens( input( 'Enter question : ' ) ) )
    empty_target_seq = np.zeros( ( 1 , 1 ) )
    empty_target_seq[0, 0] = target_features_dict['<START>']
    stop_condition = False
    decoded_translation = ''
    while not stop_condition :
        dec_outputs , h , c = dec_model.predict([ empty_target_seq ] + states_values )
        sampled_word_index = np.argmax( dec_outputs[0, -1, :] )
        sampled_word = reverse_target_features_dict[sampled_word_index]
        decoded_translation += " " + sampled_word
        
        if sampled_word == '<END>' or len(decoded_translation.split()) > max_decoder_seq_length:
            stop_condition = True
            
        empty_target_seq = np.zeros( ( 1 , 1 ) )  
        empty_target_seq[ 0 , 0 ] = sampled_word_index
        states_values = [ h , c ] 

    print( decoded_translation )


NameError: ignored

In [None]:
from keras.models import load_model
training_model = load_model('model.h5')

encoder_model = tf.keras.models.Model(encoder_inputs, encoder_states)
    
decoder_state_input_h = tf.keras.layers.Input(shape=( 200 ,))
decoder_state_input_c = tf.keras.layers.Input(shape=( 200 ,))

decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

decoder_outputs, state_h, state_c = decoder_lstm(
    decoder_embedding , initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = tf.keras.models.Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs] + decoder_states)

def decode_response(test_input):
  states_values = encoder_model.predict(test_input)

  target_seq = np.zeros( ( 1 , 1 ) )
  target_seq[0, 0] = target_features_dict['<START>']
  stop_condition = False
  decoded_translation = ""
  while not stop_condition :
      dec_outputs , h , c = training_model.predict([ target_seq ] + states_values )

      sampled_word_index = np.argmax( dec_outputs[0, -1, :] )
      sampled_word = reverse_target_features_dict[sampled_word_index]
      decoded_translation += " " + sampled_word
      
      if sampled_word == '<END>' or len(decoded_translation.split()) > max_decoder_seq_length:
          stop_condition = True
          
      empty_target_seq = np.zeros( ( 1 , 1 ) )  
      empty_target_seq[ 0 , 0 ] = sampled_word_index
      states_values = [ h , c ] 

  return decoded_translation


In [None]:
def generate_response(input):
  input = clean_text(input)
  #tokens = re.findall(r"[\w']+|[^\s\w]", input)
  #input_matrix = np.zeros(
  #    (1, max_encoder_seq_length),
  #    dtype='int32')
  #for timestep, token in enumerate(tokens):
  #  if token in input_features_dict:
  #    input_matrix[0, timestep] = input_features_dict[token]
  #output = decode_response(input_matrix)
  output = decode_response(str_to_tokens(input))
  return output.replace("<START>","").replace("<END>","")


while True:
  reply = input()
  if reply == "STOP":
    break
  print(generate_response(reply))

In [None]:
def make_inference_models():
    
    encoder_model = tf.keras.models.Model(encoder_inputs, encoder_states)
    
    decoder_state_input_h = tf.keras.layers.Input(shape=( 200 ,))
    decoder_state_input_c = tf.keras.layers.Input(shape=( 200 ,))
    
    decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
    
    decoder_outputs, state_h, state_c = decoder_lstm(
        decoder_embedding , initial_state=decoder_states_inputs)
    decoder_states = [state_h, state_c]
    decoder_outputs = decoder_dense(decoder_outputs)
    decoder_model = tf.keras.models.Model(
        [decoder_inputs] + decoder_states_inputs,
        [decoder_outputs] + decoder_states)
    
    return encoder_model , decoder_model

Legacy Code:

In [None]:
from tensorflow import keras
from keras.layers import Input, LSTM, Dense
from keras.models import Model

#Dimensionality
dimensionality = 256

#The batch size and number of epochs
batch_size = 10
epochs = 10

#Encoder
encoder_inputs = Input(shape=(None, num_encoder_tokens))
encoder_lstm = LSTM(dimensionality, return_state=True)
encoder_outputs, state_hidden, state_cell = encoder_lstm(encoder_inputs)
encoder_states = [state_hidden, state_cell]

#Decoder
decoder_inputs = Input(shape=(None, num_decoder_tokens))
decoder_lstm = LSTM(dimensionality, return_sequences=True, return_state=True)
decoder_outputs, decoder_state_hidden, decoder_state_cell = decoder_lstm(decoder_inputs, initial_state=encoder_states)
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

#Model
training_model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

#Compiling
training_model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'], sample_weight_mode='temporal')

#Training
training_model.fit([encoder_input_data, decoder_input_data], decoder_target_data, batch_size = batch_size, epochs = epochs, validation_split = 0.2)
training_model.save('training_model.h5')
     

In [None]:
from keras.models import load_model
training_model = load_model('training_model.h5')
encoder_inputs = training_model.input[0]
encoder_outputs, state_h_enc, state_c_enc = training_model.layers[2].output
encoder_states = [state_h_enc, state_c_enc]
encoder_model = Model(encoder_inputs, encoder_states)

latent_dim = 256
decoder_state_input_hidden = Input(shape=(latent_dim,))
decoder_state_input_cell = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_hidden, decoder_state_input_cell]
decoder_outputs, state_hidden, state_cell = decoder_lstm(decoder_inputs, initial_state=decoder_states_inputs)
decoder_states = [state_hidden, state_cell]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model([decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states)

def decode_response(test_input):
  #Getting the output states to pass into the decoder
  states_value = encoder_model.predict(test_input)
  
  #Generating empty target sequence of length 1
  target_seq = np.zeros((1, 1, num_decoder_tokens))
  
  #Setting the first token of target sequence with the start token
  target_seq[0, 0, target_features_dict['<START>']] = 1.
  
  #A variable to store our response word by word
  decoded_sentence = ''
    
  stop_condition = False

  while not stop_condition:
    #Predicting output tokens with probabilities and states
    output_tokens, hidden_state, cell_state = decoder_model.predict([target_seq] + states_value)
    
    #Choosing the one with highest probability
    sampled_token_index = np.argmax(output_tokens[0, -1, :])
    sampled_token = reverse_target_features_dict[sampled_token_index]
    decoded_sentence += " " + sampled_token
    
    #Stop if hit max length or found the stop token
    if (sampled_token == '' or len(decoded_sentence) > max_decoder_seq_length):
      stop_condition = True
    
    #Update the target sequence
    target_seq = np.zeros((1, 1, num_decoder_tokens))
    target_seq[0, 0, sampled_token_index] = 1.
    
    #Update states
    states_value = [hidden_state, cell_state]
  return decoded_sentence
     


In [None]:
class ChatBot:
  negative_responses = ("no", "nope", "nah", "naw", "not a chance", "sorry")
  exit_commands = ("quit", "pause", "exit", "goodbye", "bye", "later", "stop")
  
  #Method to start the conversation
  def start_chat(self):
    user_response = input("Hi, I'm a chatbot trained on random dialogs. Would you like to chat with me?\n")
    
    if user_response in self.negative_responses:
      print("Ok, have a great day!")
      return
    self.chat(user_response)
  
  #Method to handle the conversation
  def chat(self, reply):
    while not self.make_exit(reply):
      reply = input(self.generate_response(reply)+"\n")
    
  #Method to convert user input into a matrix
  def string_to_matrix(self, user_input):
    tokens = re.findall(r"[\w']+|[^\s\w]", user_input)
    user_input_matrix = np.zeros(
      (1, max_encoder_seq_length, num_encoder_tokens),
      dtype='float32')
    for timestep, token in enumerate(tokens):
      if token in input_features_dict:
        user_input_matrix[0, timestep, input_features_dict[token]] = 1.
    return user_input_matrix

  #Method that will create a response using seq2seq model we built
  def generate_response(self, user_input):
    input_matrix = self.string_to_matrix(user_input)  
    chatbot_response = decode_response(input_matrix)
    #Remove  and  tokens from chatbot_response
    chatbot_response = chatbot_response.replace("<START>",'')
    chatbot_response = chatbot_response.replace("<END>",'')
    return chatbot_response
  
  #Method to check for exit commands
  def make_exit(self, reply):
    for exit_command in self.exit_commands:
      if exit_command in reply:
        print("Ok, have a great day!")
        return True
    return False
  
chatbot = ChatBot()
chatbot.start_chat()

In [None]:
def generate_response(input):
  input = clean_text(input)
  tokens = re.findall(r"[\w']+|[^\s\w]", input)
  input_matrix = np.zeros(
      (1, max_encoder_seq_length, num_encoder_tokens),
      dtype='float32')
  for timestep, token in enumerate(tokens):
    if token in input_features_dict:
      input_matrix[0, timestep, input_features_dict[token]] = 1
  output = decode_response(input_matrix)
  return output.replace("<START>","").replace("<END>","")


while True:
  reply = input()
  if reply == "STOP":
    break
  print(generate_response(reply)+'\n')

In [None]:
def save_model(model):
  model.save('')
  return pick_model

In [None]:
def load_model():
  return model