In [1]:
#Basic pre-processing and a very basic chatbot

import numpy as np
import nltk
import random
import string
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity



In [2]:
# Read the whole corpus into memory. The context manager guarantees the file
# handle is closed as soon as the read finishes (or fails), instead of leaking
# it for the lifetime of the kernel. `f` stays bound (closed) afterwards.
# errors='ignore' silently drops bytes that cannot be decoded.
with open('data.txt', 'r', errors='ignore') as f:
    raw = f.read()

# first-time use only
nltk.download('punkt')

# first-time use only
nltk.download('wordnet')

sent_tokens = nltk.sent_tokenize(raw)  # converts to list of sentences
word_tokens = nltk.word_tokenize(raw)  # converts to list of words

# Bare peeks at the data: these are not the last expression of the cell, so
# nothing is displayed, but indexing [0] still fails fast on an empty corpus.
sent_tokens[0]
word_tokens[0:10]

# Shared WordNet lemmatizer instance, reused by LemTokens below.
WNL = nltk.stem.WordNetLemmatizer()

def LemTokens(tokens):
    """Return a list holding the WordNet lemma of every item in ``tokens``.

    Each token is passed to ``WNL.lemmatize`` on its own, in input order.
    """
    return list(map(WNL.lemmatize, tokens))

# Translation table for str.translate: every ASCII punctuation code point
# maps to None, i.e. the character is deleted.
remove_punct_dict = {ord(symbol): None for symbol in string.punctuation}

def LemNormalize(text):
    """Lower-case ``text``, delete punctuation, tokenize and lemmatize it.

    Returns the list of lemmas produced by ``LemTokens`` for the word tokens
    of the cleaned text.
    """
    cleaned = text.lower().translate(remove_punct_dict)
    return LemTokens(nltk.word_tokenize(cleaned))

#test the preprocessing function
# (bare expression in the middle of the cell: it runs, but its value is not
# displayed)
LemNormalize(sent_tokens[0])

        
# preprocess the sentences in data, remove stop words, and create a tf-idf
# vector
# LemNormalize is supplied as the tokenizer, so the vectorizer works on
# lower-cased, punctuation-free, lemmatized tokens.
# NOTE(review): the built-in 'english' stop list is not lemmatized while the
# tokenizer output is; the recorded cell output contains the tail of a
# stop_words warning -- confirm whether lemmatized stop words leak into the
# vocabulary.
TfidfVec = TfidfVectorizer(tokenizer=LemNormalize, stop_words='english')
# Fit on the corpus sentences; `tfidf` and `TfidfVec` are used later by
# response() to score user input against sent_tokens.
tfidf = TfidfVec.fit_transform(sent_tokens)
tfidf.shape




[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Sundar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Sundar\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
  'stop_words.' % sorted(inconsistent))


(126, 949)

In [3]:
#prepare some greeting words
GREETING_INPUTS = ("hello", "hi", "greetings", "sup", "what's up", "hey",)
GREETING_RESPONSES = ["hi", "hey", "*nods*", "hi there", "hello", "I am glad! You are talking to me"]

def greeting(sentence):
    """Return a random greeting if ``sentence`` contains a known greeting.

    Matching is case-insensitive and works on whole words/phrases, so
    multi-word entries such as "what's up" are recognised and words with
    adjacent punctuation ("hi!", "hello,") still count.

    Parameters
    ----------
    sentence : str
        Raw user input.

    Returns
    -------
    str or None
        A random element of ``GREETING_RESPONSES`` when a greeting is found,
        otherwise ``None``.
    """
    # Strip leading/trailing punctuation per word; inner apostrophes
    # (e.g. "what's") are preserved.
    words = [word.strip(string.punctuation) for word in sentence.lower().split()]
    # Pad with spaces so a phrase only matches on word boundaries
    # ("hi" must not match inside "this" or "high").
    padded = " " + " ".join(words) + " "
    for phrase in GREETING_INPUTS:
        if " " + phrase + " " in padded:
            return random.choice(GREETING_RESPONSES)
    return None
        



In [4]:
# function to match input to the preprocessed sentences
def response(user_response):
    """Return the corpus sentence closest to ``user_response``.

    The input is vectorized with the fitted ``TfidfVec`` and compared with
    every row of ``tfidf`` by cosine similarity. The sentence from
    ``sent_tokens`` with the highest score is returned; when the highest
    score is exactly 0 an apology string is returned instead.
    """
    query_vec = TfidfVec.transform([user_response])
    scores = cosine_similarity(query_vec[0], tfidf)
    # Index of the top-scoring sentence (last position of the ascending argsort).
    best_idx = scores.argsort()[0][-1]
    best_score = scores.max()
    if best_score != 0:
        return sent_tokens[best_idx]
    # No overlap at all between the input and the corpus vocabulary.
    return "I am sorry! I don't understand you"




In [5]:
#starting the bot
# Interactive loop: reads one line per turn until the user says bye or thanks.
flag = True
print("CHATTY: My name is CHATTY. I will answer your queries about Chatbots. If you want to exit, type Bye!")
while flag:
    # Strip surrounding whitespace so inputs like "bye " or " thanks" are
    # still recognised as exit commands instead of being sent to the matcher.
    user_response = input().strip().lower()
    if user_response == 'bye':
        flag = False
        print("CHATTY: Bye! take care...")
    elif user_response in ('thanks', 'thank you'):
        flag = False
        print("CHATTY: You are welcome..")
    else:
        # Call greeting() once and reuse the result; it picks a random reply,
        # so a second call would just discard the first choice.
        greeting_reply = greeting(user_response)
        if greeting_reply is not None:
            print("CHATTY: " + greeting_reply)
        else:
            print("CHATTY: ", end="")
            print(response(user_response))
        
#now chat with bot...




CHATTY: My name is CHATTY. I will answer your queries about Chatbots. If you want to exit, type Bye!
hi
CHATTY: I am glad! You are talking to me
what is chatbot
CHATTY: [42]

Hello Barbie is an Internet-connected version of the doll that uses a chatbot provided by the company ToyTalk,[43] which previously used the chatbot for a range of smartphone-based characters for children.
go
CHATTY: I am sorry! I don't understand you
bye
CHATTY: Bye! take care...


In [6]:
import spacy
from spacy import displacy

#load the required model
# `nlp` is the pipeline object the following cells call to process eg1, eg2 and eg3.
nlp = spacy.load("en_core_web_sm")



In [7]:
#process a sentence
eg1 = u"What is the weather today in Seattle?"
doc1 = nlp(eg1)

# Visualize the dependency parse.
# NOTE(review): the original comment pointed at http://localhost:5000, which
# looks like the displacy.serve workflow; displacy.render does not appear to
# start a web server and should show the markup inline in the notebook -- confirm.
displacy.render(doc1, style="dep")
#displacy.render(doc1, style="ent")

#detailed results
# One line per token: text, lemma, coarse POS, fine-grained tag, dependency
# label, head token, shape, is-alphabetic flag, is-stop-word flag.
for token in doc1:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_, token.head,
            token.shape_, token.is_alpha, token.is_stop)

# One line per entity: text, start/end character offsets, entity label.
for ent in doc1.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)



What what PRON WP attr is Xxxx True True
is be AUX VBZ ROOT is xx True True
the the DET DT det weather xxx True True
weather weather NOUN NN nsubj is xxxx True False
today today NOUN NN npadvmod is xxxx True False
in in ADP IN prep is xx True True
Seattle Seattle PROPN NNP pobj in Xxxxx True False
? ? PUNCT . punct is ? False False
today 20 25 DATE
Seattle 29 36 GPE


In [8]:
# Two more example sentences, processed with the same `nlp` pipeline as eg1.
eg2 = u"Apple is looking at buying U.K. startup for $1 billion"
eg3 = u"What's the time now in Singapore?"
doc2 = nlp(eg2)
doc3 = nlp(eg3)
# Document-level similarity of each new sentence against doc1.
print(doc2.similarity(doc1))
print(doc3.similarity(doc1))

# Load the model with word vectors, which enables more accurate semantic similarity comparison.
nlpd = spacy.load('en_core_web_md')
# Four space-separated words processed as one doc; later cells compare its tokens.
tokens = nlpd(u'king queen man woman')



  """
  


0.46682035441300973
0.7563714658478452


In [13]:
# Print one line per token: vector information first, then the linguistic
# annotations, then surface-form flags.
for token in tokens:
    vector_info = (token.text, token.has_vector, token.vector_norm, token.is_oov)
    tag_info = (token.lemma_, token.pos_, token.tag_, token.dep_)
    surface_info = (token.shape_, token.is_alpha, token.is_stop)
    print(*vector_info, *tag_info, *surface_info)
    


king True 7.1417456 False king PROPN NNP compound xxxx True False
queen True 6.8297405 False queen PROPN NNP compound xxxx True False
man True 6.352939 False man PROPN NNP compound xxx True False
woman True 6.8987513 False woman NOUN NN ROOT xxxx True False


In [14]:
# Print the similarity of every ordered pair of tokens (including each token
# with itself), one pair per line.
for token1 in tokens:
    for token2 in tokens:
        pair_score = token1.similarity(token2)
        print(token1.text, token2.text, pair_score)
        


king king 1.0
king queen 0.72526103
king man 0.40884617
king woman 0.26556596
queen king 0.72526103
queen queen 1.0
queen man 0.27109137
queen woman 0.4066065
man king 0.40884617
man queen 0.27109137
man man 1.0
man woman 0.7401744
woman king 0.26556596
woman queen 0.4066065
woman man 0.7401744
woman woman 1.0


In [None]:
# Re-process the three example sentences with the `nlpd` pipeline, in order.
doc1_md, doc2_md, doc3_md = (nlpd(sentence) for sentence in (eg1, eg2, eg3))

In [11]:
# Similarity of eg2 vs eg1, using the docs produced by the `nlpd` pipeline.
print(doc2_md.similarity(doc1_md))


0.6995620076752264


In [12]:
# Similarity of eg3 vs eg1, using the docs produced by the `nlpd` pipeline.
print(doc3_md.similarity(doc1_md))


0.9084785787989824
