# Importing libraries

In [1]:
import bs4 as bs                  #beautifulsoup4 library for parsing the webpage
import urllib.request             #the urllib library for connection to a remote webpage
import re                         #the re  library for performing regex operation

import nltk              #the nltk  library for natural language processing
import numpy as np       #the numpy  library for basic array operations
import random            #the random  library is used for random number generation.
import string            #the string  library is used for string manipulation.

In [2]:
# Scraping the article from the wiki page

# Open the page inside a context manager so the HTTP response is always
# closed, even if reading it fails.
with urllib.request.urlopen('https://en.wikipedia.org/wiki/Global_warming') as response:
    rawdata = response.read()

html_data = bs.BeautifulSoup(rawdata, 'html.parser')

# Only the text of <p> elements is used as article content.
all_paragraphs = html_data.find_all('p')

# Concatenate the paragraph texts in one pass instead of growing a string
# inside a loop (no separator is inserted, as before).
article_content = "".join(p.text for p in all_paragraphs)

article_content = article_content.lower()  # turning all words to lowercase

In [3]:
# Replace bracketed numeric markers such as "[12]" with a space (a bare "[]"
# matches too, because the digit run may be empty), then collapse every run
# of whitespace into a single space.
# (This step is optional, you can skip it)

article_content = re.sub(
    r'\s+',
    ' ',
    re.sub(r'\[[0-9]*\]', ' ', article_content),
)

In [4]:
# Tokenize the cleaned article twice: once into word tokens and once into
# sentences. NOTE(review): both NLTK tokenizers need the Punkt tokenizer data
# to be installed locally (e.g. via nltk.download) -- confirm on a fresh kernel.

article_words = nltk.word_tokenize(article_content)
sentence_list = nltk.sent_tokenize(article_content)

# Lemmatization and Punctuation Removal

Lemmatization refers to reducing a word to its root form, as found in the dictionary. For instance, the lemmatized version of the word eating is eat, better is good, media is medium, and so on.

Lemmatization helps find similarity between words, since the same word can appear in different tenses and degrees. Lemmatizing words makes them uniform.

Similarly, we will remove punctuation from our text, because punctuation marks do not convey any meaning here and, if we do not remove them, they will also be treated as tokens.

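We will use NLTK’s punkt and wordnet resources for tokenizing and lemmatizing the text; the punctuation itself is removed with a translation table built from string.punctuation.
We can then use the WordNetLemmatizer object from the nltk.stem module for lemmatizing the words.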

In [5]:
# Single WordNet lemmatizer instance shared by the helpers in this notebook.
lemmatizer = nltk.stem.WordNetLemmatizer()


def LemmatizeWords(words):
    """Return a list holding the lemma of every item in ``words``.

    Each item is passed to ``lemmatizer.lemmatize`` with its default
    arguments; the output keeps the order of the input.
    """
    return list(map(lemmatizer.lemmatize, words))
 
# Translation table for str.translate: maps the code point of every character
# in string.punctuation to None, so that those characters are deleted.
remove_punctuation = str.maketrans('', '', string.punctuation)
 
def RemovePunctuations(text):
    """Normalize ``text`` into a list of lemmatized tokens.

    The text is lower-cased, the characters listed in the
    ``remove_punctuation`` table are deleted, the result is split with
    ``nltk.word_tokenize`` and the tokens are passed to ``LemmatizeWords``.
    """
    without_punctuation = text.lower().translate(remove_punctuation)
    tokens = nltk.word_tokenize(without_punctuation)
    return LemmatizeWords(tokens)

In [6]:
# Creating Greetings

# Words that mark a user message as a greeting (compared in lower case).
greeting_input_texts = ("hey", "hi", "hello", "morning", "evening","greetings","afternoon",)
# Canned replies; one of them is picked at random for every greeting.
greeting_replie_texts = ["hey", "hey, how are you?", "👋 how may i help you?", "hello there", "hello", "Welcome, how are you"]


def reply_greeting(text):
    """Return a random greeting reply if ``text`` contains a greeting word.

    ``text`` is split on whitespace. Each word is stripped of leading and
    trailing punctuation and lower-cased before being compared with
    ``greeting_input_texts``, so inputs such as "Hello!" or "hi," are
    recognised as greetings.

    Returns:
        One entry of ``greeting_replie_texts`` chosen with ``random.choice``,
        or ``None`` when no word of ``text`` is a greeting.
    """
    for word in text.split():
        # Strip surrounding punctuation so "hello!" matches "hello".
        if word.strip(string.punctuation).lower() in greeting_input_texts:
            return random.choice(greeting_replie_texts)
    return None

# Generating Response:
Refer to the README file now.

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [8]:
def give_reply(user_input):
    """Return the article sentence most similar to ``user_input``.

    The query is appended to the module-level ``sentence_list`` and the whole
    list is vectorized with TF-IDF (tokenized by ``RemovePunctuations``,
    English stop words removed). The cosine similarity between the query and
    every other sentence is then computed, and the best-scoring sentence is
    returned.

    Side effect:
        ``user_input`` is left as the last element of ``sentence_list``; the
        caller is responsible for removing it afterwards.

    Returns:
        The best matching sentence, or an apology message when no sentence
        has a non-zero similarity with the query.
    """
    sentence_list.append(user_input)
    word_vectors = TfidfVectorizer(tokenizer=RemovePunctuations, stop_words='english')
    vectorized_sentences = word_vectors.fit_transform(sentence_list)

    # Row 0 is the similarity of the query (last row) to every sentence.
    # The final column is the query compared with itself, so drop it. Picking
    # the second-highest score overall instead could return the query itself
    # whenever another sentence ties with it.
    corpus_scores = cosine_similarity(vectorized_sentences[-1], vectorized_sentences)[0][:-1]

    # No article sentences at all, or nothing shares a term with the query.
    if corpus_scores.size == 0 or corpus_scores.max() == 0:
        return "I am sorry! I don't understand you. ask me something else"

    return sentence_list[int(corpus_scores.argmax())]

# Interacting with User

We will set a flag "continue_discussion" to True.
Next, we execute a while loop inside which we ask the user to input his/her questions regarding global warming.
The loop keeps executing as long as the "continue_discussion" flag is True.
If the user input is equal to the string ‘bye’, the loop terminates by setting  "continue_discussion"  flag to False .
Else if the user input is ‘thanks’, ‘thank you very much’ or ‘thank you’, the response generated will be ‘Prince: Most welcome’ and the loop also terminates.
If the user input contains a greeting, the response generated will contain greeting. 
Finally, if the user input doesn’t contain ‘bye’ or ‘thank you’ words or greetings, the user input is sent to give_reply function that we created in the last section, the function returns an appropriate response based on cosine similarity.

If you run the script below, you should see a text box asking you for a question regarding global warming; based on the question, a response will be generated.

In [None]:
# Interactive chat loop: keeps reading user input until the user says "bye"
# or thanks the bot.
continue_discussion = True
print("Hello, my name is Prince, I will answer your questions about global warming: say bye to end conversation.")
while continue_discussion:
    # Normalize the input so " Bye " and "BYE" are handled like "bye".
    user_input = input().strip().lower()

    if user_input == 'bye':
        continue_discussion = False
        print("Prince: Take care, bye ..")
    elif user_input in ('thanks', 'thank you very much', 'thank you'):
        # Thanking the bot also ends the conversation.
        continue_discussion = False
        print("Prince: Most welcome")
    else:
        # Call reply_greeting once: it picks a random reply on every call.
        greeting = reply_greeting(user_input)
        if greeting is not None:
            print("Prince: " + greeting)
        else:
            # give_reply appends the query to sentence_list. Restore the list
            # to its previous length afterwards, even if give_reply raises, so
            # the query never leaks into the corpus used for later questions.
            corpus_size = len(sentence_list)
            print("Prince: ", end="")
            try:
                print(give_reply(user_input))
            finally:
                del sentence_list[corpus_size:]

Hello, my name is Prince, I will answer your questions about global warming: say bye to end conversation.
