BUILDING A RULE-BASED CHATBOT

In [71]:
import numpy as np
import nltk
import string
import random

In [72]:
# Load the corpus text. The `with` block guarantees the file handle is closed
# even if reading fails (the original left the handle open for the whole session).
with open('chatbot.txt', 'r', errors='ignore') as f:
    raw_doc = f.read()
raw_doc = raw_doc.lower()  # Convert text to lowercase so matching is case-insensitive

nltk.download('punkt')    # tokenizer models used by sent_tokenize / word_tokenize
nltk.download('wordnet')  # WordNet data used by the lemmatizer

sent_tokens = nltk.sent_tokenize(raw_doc)  # converts doc to list of sentences
word_tokens = nltk.word_tokenize(raw_doc)  # converts doc to list of words

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dhruv\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\dhruv\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [73]:
# Preview the first two sentence tokens to sanity-check the sentence split.
sent_tokens[:2]

['data science is an interdisciplinary field that uses scientific methods, processes, algorithms and systems to extract knowledge and insights from noisy, structured and unstructured data,[1][2] and apply knowledge from data across a broad range of application domains.',
 'data science is related to data mining, machine learning and big data.']

In [74]:
# Preview the first two word tokens to sanity-check the word split.
word_tokens[:2]

['data', 'science']

In [75]:
#TEXT PREPROCESSING
# WordNet is a semantically-oriented dictionary of English included in NLTK;
# a single shared lemmatizer instance is reused by the helpers below.
lemmer = nltk.stem.WordNetLemmatizer()


def LemTokens(tokens):
    """Return a list holding the lemmatized form of each token in `tokens`."""
    return list(map(lemmer.lemmatize, tokens))
# Translation table that maps every character in string.punctuation to None,
# so str.translate() deletes those characters.
remove_punct_dict = str.maketrans('', '', string.punctuation)


def LemNormalize(text):
    """Lowercase `text`, drop punctuation, tokenize it and lemmatize the tokens.

    Returns the list produced by LemTokens for the cleaned-up text.
    """
    lowered = text.lower()
    without_punct = lowered.translate(remove_punct_dict)
    words = nltk.word_tokenize(without_punct)
    return LemTokens(words)
    

In [76]:
# defining a greeting function
greet_inputs = ("hello","hi","greetings","sup","what's up","hey")
greet_responses = ["hi","hey","*nods*","hi there","I am glad you are talking to me!"]

def greet(sentence):
    """Return a random greeting if `sentence` contains a known greeting, else None.

    The sentence is lowercased and each whitespace-separated word has its
    leading/trailing punctuation stripped, so "Hello!" and "hi," are recognised.
    Matching is done on whole words against the space-joined sentence, which
    also lets multi-word entries such as "what's up" match (a per-word lookup
    can never equal a phrase that contains a space).

    Parameters:
        sentence: the user's message as a string.

    Returns:
        One element of `greet_responses` chosen at random, or None when no
        greeting from `greet_inputs` is present.
    """
    words = [word.strip(string.punctuation) for word in sentence.lower().split()]
    # Pad with spaces so that `" hi " in padded` only matches the whole word
    # "hi", never a fragment of a longer word such as "shipping".
    padded = " " + " ".join(word for word in words if word) + " "
    for phrase in greet_inputs:
        if " " + phrase + " " in padded:
            return random.choice(greet_responses)
    return None

In [77]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [78]:
def response(user_response):
    """Return the corpus sentence most similar to `user_response`.

    The query is vectorized together with the sentences in the global
    `sent_tokens` using TF-IDF (tokenized by `LemNormalize`, English stop words
    removed), and the sentence with the highest cosine similarity to the query
    is returned.

    Parameters:
        user_response: the user's message as a string.

    Returns:
        The best-matching sentence, or the fixed apology string when there is
        nothing to compare against or no sentence has a non-zero similarity.
    """
    fallback = "I am sorry, I don't understand you!"

    # The chat loop appends the query to `sent_tokens` before calling this
    # function; accept that layout, but also work when the caller has not
    # done so instead of silently treating the last corpus sentence as the query.
    if sent_tokens and sent_tokens[-1] == user_response:
        corpus = sent_tokens
    else:
        corpus = sent_tokens + [user_response]

    # With only the query present there is no candidate sentence to return.
    if len(corpus) < 2:
        return fallback

    TfidfVec = TfidfVectorizer(tokenizer=LemNormalize, stop_words='english')
    tfidf = TfidfVec.fit_transform(corpus)  # term weights for every sentence plus the query (last row)
    vals = cosine_similarity(tfidf[-1], tfidf)  # similarity of the query to every row

    # Rank only the real corpus rows. Excluding the query's own column
    # explicitly avoids echoing the query back when another sentence ties with
    # (or, through rounding, exceeds) the query's self-similarity.
    candidate_scores = vals[0][:-1]
    idx = candidate_scores.argmax()
    if candidate_scores[idx] == 0:
        return fallback
    return corpus[idx]
    

# Interactive chat loop: read a line, answer it, and stop on "bye" or a
# thank-you message.
flag = True
print("BOT: My name is Stark. Let's have a conversation! Also, if wanna exit, just type Bye!")
while flag:
    print("YOU: ")
    # Strip surrounding whitespace so that e.g. "bye " still ends the chat.
    user_response = input().lower().strip()
    if user_response == 'bye':
        flag = False
        print("BOT: Goodbye! take care <3")
    elif user_response in ('thanks', 'thank you'):
        flag = False
        print("BOT: You are welcome..")
    else:
        # Call greet() once: it picks a random reply, so calling it a second
        # time for printing would draw a different reply than the one tested.
        greeting = greet(user_response)
        if greeting is not None:
            print("BOT: " + greeting)
        else:
            # Kept from the original cell; neither name is read again in this cell.
            word_tokens = word_tokens + nltk.word_tokenize(user_response)
            final_words = list(set(word_tokens))
            # response() expects the query to be the last entry of sent_tokens.
            sent_tokens.append(user_response)
            try:
                print("BOT: ", end="")
                print(response(user_response))
            finally:
                # Always drop the temporary query again, and drop exactly the
                # appended entry: list.remove() would delete the first equal
                # sentence, which may be a genuine corpus sentence.
                sent_tokens.pop()