# SearchPageBot answers queries using the contents of a webpage.

 This is another example of a chatbot that takes its data from a webpage and replies to queries about the contents of that page. You need to enter the URL of the webpage you want to ask about.
 
# Import necessary libraries

In [None]:
import nltk
import numpy as np
import random
import string

The Natural Language Toolkit (NLTK) is an open source Python library for Natural Language Processing.  

In [None]:
from bs4 import BeautifulSoup

Beautiful Soup uses a pluggable XML or HTML parser to parse a (possibly invalid) document into a tree representation. Beautiful Soup provides methods and Pythonic idioms that make it easy to navigate, search, and modify the parse tree. 
There are mainly two ways to extract data from a website:

- Use the API of the website (if it exists). For example, Facebook has the Facebook Graph API which allows retrieval of data posted on Facebook.
- Access the HTML of the webpage and extract useful information/data from it. This technique is called web scraping or web harvesting or web data extraction.

The easiest way to install external libraries in Python is to use pip. pip is a package management system used to install and manage software packages written in Python.
All you need to do is:

In [None]:
pip install requests
pip install html5lib
pip install bs4

# Accessing the HTML content from webpage

In [None]:
import requests

# Page fetched for the scraping demonstration in the following cells.
URL = "https://www.wikipedia.org/data-structures/"
# Pass an explicit timeout: without one, requests waits indefinitely when the
# server never answers and the cell would hang.
r = requests.get(URL, timeout=10)
# r.content holds the raw response body.
print(r.content)

In [None]:
# Parse the downloaded response body into a BeautifulSoup tree with the
# html5lib parser.
# If this line causes an error, run 'pip install html5lib' or install html5lib.
soup = BeautifulSoup(r.content, 'html5lib')
# Print the parse tree as formatted by prettify().
print(soup.prettify())

# Searching and navigating through the parse tree

Now, we would like to extract some useful data from the HTML content. The soup object contains all the data in a nested structure from which it can be extracted programmatically. In our example, we are scraping a webpage that contains text about some topic, so we would like to create a program that saves the sentences of that page (and all relevant information about them).

In [None]:
import bs4 as bs
import urllib.request
import re

print("Hello, I am GoogleSearch. Give me the URL of a webpage you want answers from here ->  ")
# Trim stray whitespace that commonly comes along with a pasted URL.
page_url = input().strip()
# The context manager closes the HTTP response once its body has been read,
# instead of leaving the connection open for the rest of the session.
with urllib.request.urlopen(page_url) as response:
    raw_html = response.read()

# NOTE: the 'lxml' parser requires the lxml package to be installed.
article_html = bs.BeautifulSoup(raw_html, 'lxml')

The urllib.request module defines functions and classes which help in opening URLs (mostly HTTP) in a complex world — basic and digest authentication, redirections, cookies and more. 
The urllib.request module defines the following functions:

urllib.request.urlopen(url, data=None, [timeout, ]*, cafile=None, capath=None, cadefault=False, context=None)
Open the URL url, which can be either a string or a Request object.

In [None]:
# Collect every <p> element of the parsed page.
article_paragraphs = article_html.find_all('p')

# Join with a space so the last word of one paragraph is not fused with the
# first word of the next one, which would corrupt sentence tokenisation.
# Extra whitespace introduced here is collapsed again below.
article_text = ' '.join(para.text for para in article_paragraphs)

article_text = article_text.lower()
# Replace bracketed numeric markers such as "[12]" with a space.
article_text = re.sub(r'\[[0-9]*\]', ' ', article_text)
# Collapse every run of whitespace (including newlines) into a single space.
article_text = re.sub(r'\s+', ' ', article_text)


# Tokenisation

In [None]:
# Split the cleaned article text into sentences and into word tokens.
# NOTE(review): these NLTK calls need tokenizer/WordNet data to be available;
# nothing visible in this notebook downloads it — confirm it is installed.
article_sentences = nltk.sent_tokenize(article_text)
article_words = nltk.word_tokenize(article_text)
# Lemmatizer instance used by perform_lemmatization below.
wnlemmatizer = nltk.stem.WordNetLemmatizer()
def perform_lemmatization(tokens):
    """Lemmatize every token with the module-level WordNet lemmatizer.

    Returns a new list holding ``wnlemmatizer.lemmatize(token)`` for each item
    of ``tokens``, in the original order.
    """
    return list(map(wnlemmatizer.lemmatize, tokens))

The cell above converts the text into sentences and words. It also defines a function called perform_lemmatization, which takes the tokens as input and returns normalized tokens.

In [None]:
# Translation table for str.translate: each character of string.punctuation
# is mapped to None, which deletes it from the translated text.
punctuation_removal = {ord(symbol): None for symbol in string.punctuation}

def get_processed_text(document):
    """Normalise ``document`` into a list of lemmatized word tokens.

    The text is lower-cased, stripped of punctuation through the
    ``punctuation_removal`` table, word-tokenised with NLTK and finally passed
    through ``perform_lemmatization``.
    """
    cleaned = document.lower().translate(punctuation_removal)
    words = nltk.word_tokenize(cleaned)
    return perform_lemmatization(words)
# Words/phrases treated as a greeting from the user.
greeting_inputs = ("hey", "good morning", "good evening", "morning", "evening", "hi", "whatsup")
# Canned replies; one is picked at random for each recognised greeting.
greeting_responses = ["hey", "hey hows you?", "*nods*", "hello, how you doing", "hello", "Welcome, I am good and you"]


def generate_greeting_response(greeting):
    """Return a random canned reply if ``greeting`` contains a greeting word.

    The text is split on whitespace and each word is compared,
    case-insensitively, with ``greeting_inputs``.

    Returns:
        One entry of ``greeting_responses``, or ``None`` when no word of the
        input is a known greeting.
    """
    for token in greeting.split():
        # Strip surrounding punctuation so "hi!" or "hey," still count as
        # greetings instead of falling through to the article search.
        if token.strip(string.punctuation).lower() in greeting_inputs:
            return random.choice(greeting_responses)
    return None

# Machine learning library for TF-IDF vectorisation with stop-word filtering

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity  

# Matching the keywords in user's responses.

In [None]:
def generate_response(user_input):
    """Return the article sentence that best matches ``user_input``.

    The query is appended to the global ``article_sentences`` list so that it
    is vectorised together with the article. It is deliberately left in the
    list: the conversation loop removes it again after printing the reply.

    Returns:
        The most similar article sentence, or an apology string when the best
        candidate has zero similarity with the query.
    """
    bot_response = ''
    article_sentences.append(user_input)

    # With only the query in the list there is nothing to compare against.
    if len(article_sentences) < 2:
        return bot_response + "I am sorry, I could not understand you!"

    word_vectorizer = TfidfVectorizer(tokenizer=get_processed_text, stop_words='english')
    all_word_vectors = word_vectorizer.fit_transform(article_sentences)
    # Row -1 is the query just appended; compare it with every sentence.
    similar_vector_values = cosine_similarity(all_word_vectors[-1], all_word_vectors)
    # The top-ranked entry is normally the query itself, so the second-highest
    # score identifies the best-matching article sentence.
    similar_sentence_number = similar_vector_values.argsort()[0][-2]
    vector_matched = similar_vector_values[0][similar_sentence_number]

    if vector_matched == 0:
        return bot_response + "I am sorry, I could not understand you!"
    return bot_response + article_sentences[similar_sentence_number]

# NOTE(review): the next four statements run the similarity computation at
# module level, using the last article sentence as the "query"; the
# conversation loop below does not read these values — confirm whether they
# are still needed.
word_vectorizer = TfidfVectorizer(tokenizer=get_processed_text, stop_words='english')
all_word_vectors = word_vectorizer.fit_transform(article_sentences)
similar_vector_values = cosine_similarity(all_word_vectors[-1], all_word_vectors)
similar_sentence_number = similar_vector_values.argsort()[0][-2]
# Flag that keeps the conversation loop running until the user ends the chat.
continue_dialogue = True

# Building the conversation 

In [None]:
# Main chat loop: read a question, answer it, and stop on "bye" or a thank-you.
while continue_dialogue:
    print(" Ask your question:  ")
    # Normalise the input so "Bye" or "bye " are recognised as well.
    user_text = input().strip().lower()

    if user_text == 'bye':
        continue_dialogue = False
        print("Google Search: Good bye and take care. Come back again if you want to know something...")
    elif user_text in ('thanks', 'thank you very much', 'thank you'):
        continue_dialogue = False
        print("Google Search: Most welcome")
    else:
        # Call once and reuse the result: the reply is chosen at random, so a
        # second call would pick a different answer than the one checked.
        greeting_reply = generate_greeting_response(user_text)
        if greeting_reply is not None:
            print("Google Search: " + greeting_reply)
        else:
            print("Google Search: ", end="")
            print(generate_response(user_text))
            # generate_response appended the query to the sentence list;
            # remove it so it cannot be returned as an answer later.
            article_sentences.remove(user_text)

