##First we make our imports.

In [1]:
import pandas as pd
import numpy as np
import nltk
import random
from nltk.corpus import wordnet
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Fetch the NLTK data packages this notebook relies on.
nltk.download('punkt')                       # tokenizer data (tokenizers/punkt)
nltk.download('averaged_perceptron_tagger')  # part-of-speech tagger (taggers/averaged_perceptron_tagger)
nltk.download('stopwords')                   # stop-word corpus (corpora/stopwords)
nltk.download('wordnet')                     # WordNet corpus (corpora/wordnet)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

##Read in our data.

In [0]:
# Load the question/answer bank from the project's GitHub repository.
# The columns used later in the notebook are 'questions' and 'answers'.
df = pd.read_csv('https://raw.githubusercontent.com/danielmoore19/COVID-Chat-Bot/master/covid_data/covid_df.csv')

##Make sure our data looks accurate.

In [3]:
df.head()

Unnamed: 0,questions,answers
0,What is a novel coronavirus?,A novel coronavirus is a new coronavirus that ...
1,Why is the disease being called coronavirus di...,"On February 11, 2020 the World Health Organiza..."
2,Why might someone blame or avoid individuals a...,People in the U.S. may be worried or anxious a...
3,How can people help stop stigma related to COV...,People can fight stigma by providing social su...
4,Why do some state’s COVID-19 case numbers some...,CDC’s overall case numbers are validated throu...


##Everything looks right as rain.

##Define our tagging function for the lemmatizer.

In [0]:
def get_wordnet_pos(word):
  """Return the WordNet part-of-speech constant to use when lemmatizing `word`.

  The word is tagged on its own with nltk.pos_tag and the first letter of the
  resulting tag selects the WordNet category:
  J -> adjective, N -> noun, V -> verb, R -> adverb.
  Any other tag falls back to noun.
  """
  # pos_tag returns a list of (token, tag) pairs; only one token was passed in.
  _, tag = nltk.pos_tag([word])[0]
  initial = tag[0].upper()

  if initial == "J":
    return wordnet.ADJ
  if initial == "V":
    return wordnet.VERB
  if initial == "R":
    return wordnet.ADV
  # "N" and every unrecognised tag are both treated as nouns.
  return wordnet.NOUN

##Define our preprocessing function to tokenize and lemmatize the words in our question bank. After several rounds of testing, I decided to use a ranked answering system, which functions better if we do not remove stop words. Thus, that line has been commented out.

In [0]:
def preprocess(docs):
  """Tokenize and lemmatize each document, returning one cleaned string per document.

  For every document in `docs`:
    * split it into tokens with word_tokenize,
    * drop tokens that are not purely alphabetic (punctuation, numbers, ...),
    * lowercase and lemmatize the rest, using the part of speech that
      get_wordnet_pos reports for the original token,
    * join the lemmas back together with single spaces.

  Returns a list of strings in the same order as `docs`.
  """
  lemmatizer = WordNetLemmatizer()
  processed = []

  for text in docs:
    lemmas = []
    for tok in word_tokenize(text):
      # Stop-word filtering is deliberately disabled; the former condition was:
      #   if token.lower() not in stopwords.words('english')
      if not tok.isalpha():
        continue
      lemmas.append(lemmatizer.lemmatize(tok.lower(), get_wordnet_pos(tok)))

    processed.append(' '.join(lemmas))

  return processed

##Process our questions.

In [0]:
# One cleaned, lemmatized string per question, in the same order as df's rows.
lemm = preprocess(df['questions'])

##Now we instantiate our model and fit/transform our data and make it a numpy array for our cosine similarity feature.

In [0]:
# Fit the vectorizer on the lemmatized questions; `model` is reused later to
# vectorize user queries against the same vocabulary.
model = TfidfVectorizer()
# Dense array with one row per question, compared against queries in COVID2bot.
tfidf = model.fit_transform(lemm).toarray()

##The heavy lifting: here we design the engine of the bot. We will transform the user question to tfidf and convert it to an array, then create a new df column called similarity that is populated with the similarity scores of the user question against our question bank. Next, we rank by descending values and, unless the questions are nearly identical, offer the top three matches.

In [0]:
def COVID2bot(user_response, threshold=0.8):
  """Answer a question by matching it against the question bank.

  The query is scored against every stored question with TF-IDF cosine
  similarity. If the best match scores at least `threshold`, its answer is
  returned directly. Otherwise the user is shown the closest questions (up to
  three) and asked to pick one by number.

  Parameters
  ----------
  user_response : str
      The question typed by the user.
  threshold : float, default 0.8
      Minimum similarity at which the top match is answered without asking
      the user to confirm.

  Returns
  -------
  str
      The selected answer, or 'Please ask another question.' when the user
      does not pick one of the offered matches.

  Relies on the notebook-level names `model`, `tfidf`, `df` and `preprocess`.
  """
  # The vectorizer was fitted on lemmatized questions, so the query has to go
  # through the same preprocessing; otherwise inflected words ("spreading",
  # "masks") never line up with their lemmatized vocabulary entries.
  query = model.transform(preprocess([user_response])).toarray()

  # cosine_similarity yields one column of scores; flatten it to one value per
  # row. assign() ranks a copy so the shared `df` is left untouched.
  ranked = (df.assign(similarity=cosine_similarity(tfidf, query).ravel())
              .sort_values('similarity', ascending=False))

  best = ranked.iloc[0]
  if best['similarity'] >= threshold:
    return best['answers']

  # head(3) copes with a question bank that has fewer than three entries.
  candidates = ranked.head(3)
  options = '\n'.join(f'{rank}. "{question}"'
                      for rank, question in enumerate(candidates['questions'], start=1))
  user_input = input(
      f'These are the top {len(candidates)} matches to your question:\n{options}\nPlease type the number that matches your question, or hit return to ask a different question.\n')

  # Ignore stray whitespace around the typed number.
  choice = user_input.strip()
  for position in range(len(candidates)):
    if choice == str(position + 1):
      return candidates.iloc[position]['answers']
  return 'Please ask another question.'

##Create our greeting.

In [0]:
# Greeting phrases the bot recognises; an entry may span several words.
welcome_input = ("hello", "hi", "greetings", "sup", "what's up","hey",)
welcome_response = ["hi", "hey", "*nods*", "hi there", "hello", "I am glad you are talking to me!"]
# Punctuation trimmed from the edges of each word before matching.
_EDGE_PUNCTUATION = ".,!?;:\"'()"

def welcome(user_response):
    """Return a random greeting if `user_response` contains a greeting, else None.

    Matching is case-insensitive, ignores punctuation attached to words
    ("hello!", "hey,") and works on whole words only, so "this" does not match
    "hi". Multi-word entries in `welcome_input` such as "what's up" are matched
    as phrases.
    """
    # Treat the typographic apostrophe like a plain one so "what’s up" matches.
    text = user_response.lower().replace("’", "'")
    words = (word.strip(_EDGE_PUNCTUATION) for word in text.split())
    # Pad with spaces so each phrase can be tested on whole-word boundaries.
    padded = " " + " ".join(word for word in words if word) + " "

    if any(f" {phrase} " in padded for phrase in welcome_input):
        return random.choice(welcome_response)
    return None

##Lastly, we initiate the chatbot. We greet the user and let them know they are speaking to a bot. We tell them what type of bot it is and how to end the chat session. Voilà!

In [10]:
# Interactive chat loop: read a line, then either end the session, greet the
# user back, or hand the question to COVID2bot.
print("Greetings! I am a chatbot, and I will try to answer your questions about COVID-19. If you want to exit, type Bye!")
while True:
    # Trim whitespace so inputs such as "bye " or " thanks" are still recognised.
    user_response = input().strip().lower()

    if not user_response:
        # Nothing was typed; wait for a real question instead of querying the
        # matcher with an empty string.
        continue

    if user_response in ['bye','shutdown','exit', 'quit']:
        print("Chatbot: Stay safe, and wash your hands!!! ")
        break

    if user_response in ('thanks', 'thank you'):
        # Thanking the bot also ends the session.
        print("Chatbot : You are welcome..")
        break

    # Call welcome() once and reuse the result for both the check and the reply.
    greeting = welcome(user_response)
    if greeting is not None:
        print("Chatbot : "+greeting)
    else:
        print("Chatbot : ",end="")
        print(COVID2bot(user_response))

Greetings! I am a chatbot, and I will try to answer your questions about COVID-19. If you want to exit, type Bye!
what is the rate of spread?
Chatbot : These are the top 3 matches to your question:
1. "What is community spread?"
2. "What can I do to protect myself and prevent the spread of disease?"
3. "How does the virus spread?"
Please type the number that matches your question, or hit return to ask a different question.
3
The virus that causes COVID-19 is thought to spread mainly from person to person, mainly through respiratory droplets produced when an infected person coughs or sneezes. These droplets can land in the mouths or noses of people who are nearby or possibly be inhaled into the lungs. Spread is more likely when people are in close contact with one another (within about 6 feet).
COVID-19 seems to be spreading easily and sustainably in the community (“community spread”) in many affected geographic areas. Community spread means people have been infected with the virus in a