<a href="https://colab.research.google.com/github/ericzhou16/NLP/blob/master/Projects/BasicChatbot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

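A simple retrieval-based chatbot built with NLTK and scikit-learn (TF-IDF vectors compared by cosine similarity), created based on [this article](https://medium.com/analytics-vidhya/a-simple-chatbot-using-python-and-nltk-c413b40e9441).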

In [None]:
# Connecting to Google Drive
# Colab-only step: mounts the Drive under /content/gdrive so the data-loading
# cell further down can read /content/gdrive/MyDrive/WikiQA.tsv.
from google.colab import drive
drive.mount('/content/gdrive')  

Mounted at /content/gdrive


In [None]:
# Imports
import pandas as pd
import random 
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
# Fetch the NLTK resources used by this notebook (each call prints its progress).
# 'stopwords': the English stop-word list handed to TfidfVectorizer for the WikiQA corpus.
nltk.download('stopwords')
# 'wordnet': presumably for the WordNetLemmatizer imported above, which this
# notebook only references in commented-out code.
nltk.download('wordnet')
# 'omw-1.4': presumably a companion resource for wordnet — TODO confirm it is still needed.
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [None]:
# Greetings: maps any recognised user greeting to a random output greeting
greet_in = ('hey', 'sup', 'waddup', 'wassup', 'hi', 'hello', 'good day','heya', 'hiya', 'howdy',
'greetings', 'yo', 'ahoy', 'ello')
greet_out = ['hey', 'hello', 'hi there', 'hi', 'heya', 'hiya', 'howdy', 'greetings', '*nods*', '\'ello']

def greeting(sent):
    """Return a random greeting if `sent` contains a known greeting, else None.

    Matching is case-insensitive and works on whole words, so 'this' does not
    trigger on 'hi'. Multi-word entries of `greet_in` such as 'good day' are
    matched as phrases, and punctuation attached to a word ('hello.') is
    ignored.

    Parameters
    ----------
    sent : str
        The user's message.

    Returns
    -------
    str or None
        One entry of `greet_out`, or None when no greeting was found.
    """
    # Normalise every token: drop surrounding punctuation and lower-case it.
    tokens = [tok.strip('.,!?;:\'"()').lower() for tok in sent.split()]
    # Pad with spaces so that phrases only match on word boundaries.
    padded = ' ' + ' '.join(tok for tok in tokens if tok) + ' '
    for phrase in greet_in:
        if ' ' + phrase + ' ' in padded:
            return random.choice(greet_out)
    return None

In [None]:
# Small talk: canned prompts mapped to canned replies.
# The insertion order matters: the reply list below is index-aligned with it.
small_talk_responses = {
    # "how are you" variants
    'how are you': 'I am fine. Thank you for asking',
    'how are you doing': 'I am fine. Thank you for asking',
    'how do you do': 'I am good. Thanks for asking',
    'how are you holding up': 'I am fine. Thank you for asking',
    'how is it going': 'It is going great. Thank you for asking',
    # time-of-day greetings
    'good morning': 'Good Morning',
    'good afternoon': 'Good Afternoon',
    'good evening': 'Good Evening',
    'good day': 'Good day to you too',
    # casual openers
    'whats up': 'The sky',
    'sup': 'The sky',
    # thanks
    'thanks': 'Don\'t mention it. You are welcome',
    'thank you': 'Don\'t mention it. You are welcome',
}

# All replies as a list of strings, in the same order as the prompts above
small_talk = list(map(str, small_talk_responses.values()))

# Find the closest small talk
def tfidf_cosim_smalltalk(doc, query, threshold=0.9):
    """Return the small-talk reply whose prompt is most similar to `query`.

    A TF-IDF vectorizer is fitted on the prompts, the query is projected into
    the same space, and the prompt with the highest cosine similarity wins.

    Parameters
    ----------
    doc : dict or iterable of str
        The small-talk prompts. When a dict is given, its keys are the prompts
        and its values are the replies, so prompts and replies cannot get out
        of step. For any other iterable the replies are read from the
        module-level `small_talk` list, which must be index-aligned with it.
    query : str
        The user's message.
    threshold : float, optional
        Minimum cosine similarity (exclusive) required to answer. Defaults to
        0.9, the value previously hard-coded.

    Returns
    -------
    str or None
        The matching reply, or None when nothing is similar enough.
    """
    if isinstance(doc, dict):
        prompts = list(doc.keys())
        replies = [str(value) for value in doc.values()]
    else:
        prompts = list(doc)
        replies = small_talk

    vectorizer = TfidfVectorizer(use_idf=True, sublinear_tf=True)
    # Converts words to vectors
    prompt_vectors = vectorizer.fit_transform(prompts)
    query_vector = vectorizer.transform([query])

    # One similarity score per prompt
    scores = cosine_similarity(prompt_vectors, query_vector).flatten()
    # Last position of the ascending argsort == index of the best score
    best = int(scores.argsort()[-1])
    if scores[best] > threshold:
        return replies[best]
    return None

In [None]:
# Managing name
def naming(name):
    """Extract the user's name from an introduction or rename request.

    Recognised trigger phrases (matched case-insensitively, on whole words):
    'my name is', 'call me', 'name is', 'change my name to', 'change name to'.
    The word that directly follows the first recognised phrase is the name.

    Parameters
    ----------
    name : str
        The raw text typed by the user.

    Returns
    -------
    str or None
        * the word following the trigger phrase, with its original casing;
        * None when a trigger phrase is present but nothing follows it;
        * `name` unchanged when no trigger phrase is present (the input is
          assumed to be the name itself).
    """
    # Checked in this order, which is the priority of the original if/elif chain.
    trigger_phrases = (
        'my name is',
        'call me',
        'name is',
        'change my name to',
        'change name to',
    )

    words = name.split()
    lowered = [word.lower() for word in words]

    for phrase in trigger_phrases:
        key = phrase.split()
        # Slide over the message looking for the phrase as a run of whole words.
        for start in range(len(lowered) - len(key) + 1):
            end = start + len(key)
            if lowered[start:end] == key:
                following = words[end:]
                return following[0] if following else None
    return name

In [None]:
# https://www.microsoft.com/en-us/download/details.aspx?id=a333c41c-9704-4412-9fbc-15bb1fb7f5c3
# Read the WikiQA table and reduce it to the Question / Sentence text columns
data = (
    pd.read_csv("/content/gdrive/MyDrive/WikiQA.tsv", sep='\t')
    # keep rows whose Label is 1 (presumably the sentences marked as answers — confirm against the dataset docs)
    .loc[lambda frame: frame['Label'] == 1]
    # renumber rows from 0; the old row labels land in an 'index' column, dropped below
    .reset_index()
    .drop(columns=['QuestionID', 'DocumentID', 'DocumentTitle',
                   'SentenceID', 'Label', 'index'])
)
data

Unnamed: 0,Question,Sentence
0,HOW AFRICAN AMERICANS WERE IMMIGRATED TO THE US,"As such, African immigrants are to be distingu..."
1,how are glacier caves formed?,A glacier cave is a cave formed within the ice...
2,how a water pump works,Pumps operate by some mechanism (typically rec...
3,"how big is bmc software in houston, tx","Employing over 6,000, BMC is often credited wi..."
4,"how big is bmc software in houston, tx","For 2011, the company recorded an annual reven..."
...,...,...
1464,What is an economic feature?,"At the turn of the 21st century, the expanding..."
1465,what is the average american income,"U.S. median household income fell from $51,144..."
1466,When was Apple Computer founded,"The company was founded on April 1, 1976, and ..."
1467,what is section eight housing,"Section 8 of the Housing Act of 1937 (), often..."


In [None]:
# Building the searchable corpus: one normalised (symbol-free, lower-cased,
# word-by-word stemmed) string per distinct question
symbs = '!@#$%^&*()<>,;?\''
ps = PorterStemmer()

# stemmed question -> original question text (used to look the answer up in `data`)
og_text = {}
for _, row in data.iterrows():
    question = ''.join(ch for ch in row['Question'] if ch not in symbs)
    question = ' '.join(ps.stem(word.lower()) for word in question.split())
    og_text[question] = row['Question']

# Distinct stemmed questions, in first-seen order; row i of `tf_doc` is all_text[i]
all_text = list(og_text)
# Show a small sample instead of dumping the whole corpus
print(all_text[:5])

# The questions are already stemmed word by word above. Passing a whole
# question string to ps.stem() would treat it as one long "word" and only
# mangle its ending, so the strings are used as they are.
stemmed_doc = all_text
# NOTE(review): the stop-word list is unstemmed while the corpus is stemmed, so
# stemmed forms such as 'wa' (from 'was') are not filtered — confirm whether
# that is intended.
tf = TfidfVectorizer(use_idf=True, sublinear_tf=True, stop_words=stopwords.words('english'))
tf_doc = tf.fit_transform(stemmed_doc)

# Stemming and figuring out importance of terms using tfidf
def stem_tfidf(query):
    """Stem `query` word by word and project it into the corpus TF-IDF space.

    Each whitespace-separated word is stemmed on its own with the module-level
    Porter stemmer `ps`, mirroring how the corpus questions are stemmed.
    Stemming the whole string in one call would treat it as a single word and
    leave every word but the ending untouched.

    Parameters
    ----------
    query : str
        The user's question.

    Returns
    -------
    scipy sparse matrix
        The single-row TF-IDF vector produced by the fitted module-level
        vectorizer `tf`.
    """
    stemmed_query = ' '.join(ps.stem(word) for word in query.split())
    return tf.transform([stemmed_query])
  
# Getting cosine similarity between the query and every corpus question, then picking the answer
def cos_sim(x, threshold=0.5):
    """Return the stored answer for the corpus question most similar to `x`.

    Parameters
    ----------
    x : sparse matrix
        TF-IDF vector of the query, as returned by `stem_tfidf`.
    threshold : float, optional
        Minimum cosine similarity (exclusive) required to answer. Defaults to
        0.5, the value previously hard-coded.

    Returns
    -------
    str
        The 'Sentence' of the first row of `data` whose 'Question' equals the
        best-matching question, or an apology when no question is similar
        enough.
    """
    scores = cosine_similarity(tf_doc, x).flatten()
    # Last position of the ascending argsort == index of the best score
    best = int(scores.argsort()[-1])
    if scores[best] > threshold:
        # stemmed question -> original question text -> its answer sentence
        original_question = og_text[all_text[best]]
        answers = data.loc[data['Question'] == original_question, 'Sentence']
        return answers.iloc[0]
    return 'I am sorry, I cannot help you with this one. Hope to in the future. Cheers :)'

['how african american were immigr to the us', 'how are glacier cave form', 'how a water pump work', 'how big is bmc softwar in houston tx', 'how much is 1 tablespoon of water', 'how much are the harri potter movi worth', 'how a rocket engin work', 'how old wa sue lyon when she made lolita', 'how are cholera and typhu transmit and prevent', 'how did ann frank die', 'how are aircraft radial engin built', 'how deep can be drill for deep underwat', 'how old wa monica lewinski dure the affair', 'how long wa frank sinatra famou', 'how are antibodi use in', 'how much is jk rowl worth', 'how big is auburndal florida', 'how old is kirk dougla the actor', 'how old is the singer bob seger', 'how long wa i love luci on the air', 'how long wa richard nixon a presid', 'how much is centavo in mexico', 'how long wa micki jame with wwe', 'how did armando christian perez becom famou', 'how old were the twin tower when destroy', 'how are the of electron in each shell determin', 'how mani presid of the u

In [None]:
# Main loop
# Phrases that signal the user is telling Bob their name
name_triggers = ('my name is', 'call me', 'name is', 'change my name to', 'change name to')

n = input('\nHello, my name is Bob. What is your name?:\t')
# naming() can return None (trigger phrase with nothing usable after it);
# fall back to the raw input rather than greeting the user as "None".
name = naming(n) or n

while True:
    # User query, normalised the same way as the corpus: lower-case, symbols removed
    query = input(f'\nHi {name}, I am Bob. How can I help you? If you want to exit, type Bye. : \t')
    query = ''.join(ch for ch in query.lower() if ch not in symbs).strip()

    # Exit
    if query == 'bye':
        print(f'\nBob: This is Bob signing off. Bye, take care {name}')
        break

    # Changing names
    if any(phrase in query for phrase in name_triggers):
        new_name = naming(query)
        if new_name:  # keep the current name when none could be extracted
            name = new_name
        print(f'\nBob: Your name is {name}')
        continue

    # Asking for the current name
    if 'what' in query and 'my' in query and 'name' in query:
        print(f'\nBob: Your name is {name}')
        continue

    # Greeting (evaluated once, so the reply printed is the one that was tested)
    reply = greeting(query)
    if reply is not None:
        print(f'\nBob: {reply} {name}')
        continue

    # Small talk (evaluated once; every call refits a TF-IDF vectorizer)
    reply = tfidf_cosim_smalltalk(small_talk_responses, query)
    if reply is not None:
        print(f'\nBob: {reply}')
        continue

    # Searching through corpus for response
    answer = cos_sim(stem_tfidf(query))
    print(f'\nBob: {answer}')


Hello, my name is Bob. What is your name?:	joe

Hi joe, I am Bob. How can I help you? If you want to exit, type Bye. : 	my name is eric

Bob: Your name is eric

Hi eric, I am Bob. How can I help you? If you want to exit, type Bye. : 	what's my name
eric

Hi eric, I am Bob. How can I help you? If you want to exit, type Bye. : 	what's up

Bob: The sky

Hi eric, I am Bob. How can I help you? If you want to exit, type Bye. : 	thanks

Bob: Don't mention it. You are welcome

Hi eric, I am Bob. How can I help you? If you want to exit, type Bye. : 	what are glacier caves

Bob: A glacier cave is a cave formed within the ice of a glacier .

Hi eric, I am Bob. How can I help you? If you want to exit, type Bye. : 	when did john adam become president?

Bob: John Adams (October 30, 1735 ( O.S. October 19, 1735) – July 4, 1826) was the second president of the United States (1797–1801), having earlier served as the first vice president of the United States .

Hi eric, I am Bob. How can I help you? If