# Building a retrieval-based chatbot!

In [26]:
import nltk
import numpy as np
import random
import string

import bs4 as bs
import requests
import re

import warnings
# Silence library warnings for the rest of the notebook. filterwarnings is a
# function and has to be called; assigning a value to it would replace the
# function and suppress nothing.
warnings.filterwarnings('ignore')


In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [28]:
# Fetch the NLTK data packages this notebook relies on: 'punkt' (tokenizer
# models) and 'wordnet' (lexical database used by the WordNet lemmatizer).
# Per the log below, packages that are already installed are left untouched.
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /home/harshit/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/harshit/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Gathering Data from Wikipedia

In [29]:
# Download the Wikipedia article that serves as the chatbot's knowledge base.
# The timeout keeps the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() stops the notebook on an HTTP error instead of letting
# an error page be parsed as the corpus.
r = requests.get('https://en.wikipedia.org/wiki/Cuisine', timeout=30)
r.raise_for_status()
raw_html = r.text

In [30]:
##parsing the html; the parser is named explicitly so the result does not
##depend on which optional parser libraries happen to be installed
corpus_html = bs.BeautifulSoup(raw_html, 'html.parser')

##extracting paragraphs from the html
corpus_paras = corpus_html.find_all('p')

##concatenating all the paragraphs in one pass and lowering the text
corpus_text = ''.join(para.text for para in corpus_paras).lower()

In [31]:
corpus_text

'\na cuisine  is a style of cooking characterized by distinctive ingredients, techniques and dishes, and usually associated with a specific culture or geographic region. regional food preparation traditions, customs and ingredients often combine to create dishes unique to a particular region.[1]\na cuisine is primarily influenced by the ingredients that are available locally or through trade, they can even be made into distinct ingredients themselves when they become popular within a region, take for example japanese rice in japanese cuisine and new mexico chile in new mexican cuisine.\nreligious food laws can also exercise a strong influence on cuisine, such as hinduism in indian cuisine, sikhism in punjabi cuisine, buddhism in east asian cuisine, islam in middle eastern cuisine, and judaism in israeli cuisine.\nsome factors that have an influence on a region\'s cuisine include the area\'s climate, the trade among different countries, religiousness or sumptuary laws and culinary cultu

In [32]:
##removing bracketed numeric citation markers such as [1] (inner substitution),
##then collapsing every run of whitespace into a single space (outer substitution)
corpus_text = re.sub(r'\s+', ' ', re.sub(r'\[[0-9]*\]', ' ', corpus_text))

In [33]:
##word-level tokens of the whole corpus
corpus_words = nltk.word_tokenize(corpus_text)
##sentence-level split of the same text; respond() picks its answers from this list
corpus_sentences = nltk.sent_tokenize(corpus_text)

In [34]:
corpus_sentences

[' a cuisine is a style of cooking characterized by distinctive ingredients, techniques and dishes, and usually associated with a specific culture or geographic region.',
 'regional food preparation traditions, customs and ingredients often combine to create dishes unique to a particular region.',
 'a cuisine is primarily influenced by the ingredients that are available locally or through trade, they can even be made into distinct ingredients themselves when they become popular within a region, take for example japanese rice in japanese cuisine and new mexico chile in new mexican cuisine.',
 'religious food laws can also exercise a strong influence on cuisine, such as hinduism in indian cuisine, sikhism in punjabi cuisine, buddhism in east asian cuisine, islam in middle eastern cuisine, and judaism in israeli cuisine.',
 "some factors that have an influence on a region's cuisine include the area's climate, the trade among different countries, religiousness or sumptuary laws and culinar

In [35]:
corpus_words

['a',
 'cuisine',
 'is',
 'a',
 'style',
 'of',
 'cooking',
 'characterized',
 'by',
 'distinctive',
 'ingredients',
 ',',
 'techniques',
 'and',
 'dishes',
 ',',
 'and',
 'usually',
 'associated',
 'with',
 'a',
 'specific',
 'culture',
 'or',
 'geographic',
 'region',
 '.',
 'regional',
 'food',
 'preparation',
 'traditions',
 ',',
 'customs',
 'and',
 'ingredients',
 'often',
 'combine',
 'to',
 'create',
 'dishes',
 'unique',
 'to',
 'a',
 'particular',
 'region',
 '.',
 'a',
 'cuisine',
 'is',
 'primarily',
 'influenced',
 'by',
 'the',
 'ingredients',
 'that',
 'are',
 'available',
 'locally',
 'or',
 'through',
 'trade',
 ',',
 'they',
 'can',
 'even',
 'be',
 'made',
 'into',
 'distinct',
 'ingredients',
 'themselves',
 'when',
 'they',
 'become',
 'popular',
 'within',
 'a',
 'region',
 ',',
 'take',
 'for',
 'example',
 'japanese',
 'rice',
 'in',
 'japanese',
 'cuisine',
 'and',
 'new',
 'mexico',
 'chile',
 'in',
 'new',
 'mexican',
 'cuisine',
 '.',
 'religious',
 'food',


## Generating greeting responses for a predefined set of inputs

In [36]:
greeting_inputs = ("hey", "good morning", "good evening", "morning", "evening", "hi", "whatsup")
greeting_responses = ["hey", "hey hows you?", "*nods*", "hello, how you doing", "hello", "Welcome, I am good and you"]

def greet_response(greeting):
    '''
    Return a canned greeting if the text contains a known greeting word.

    The text is lower-cased and split on whitespace; each word is compared
    against ``greeting_inputs`` after punctuation attached to it is stripped,
    so "Hi!" and "hey," are recognised as well as "hi" and "hey".

    Parameters
    ----------
    greeting : str
        Raw text typed by the user.

    Returns
    -------
    str or None
        A random entry of ``greeting_responses`` when a greeting word is
        found, otherwise ``None``.
    '''
    for token in greeting.lower().split():
        # Matching is per word, so the two-word entries in greeting_inputs are
        # only ever reached through their "morning" / "evening" halves.
        if token.strip(string.punctuation) in greeting_inputs:
            return random.choice(greeting_responses)
    return None

## Preprocessing with Punctuation Removal and Lemmatizing

In [37]:
wn_lemmatizer = nltk.stem.WordNetLemmatizer()

def perform_lemmatization(tokens):
    '''
    Lemmatize every token with the shared WordNet lemmatizer.

    Returns a new list holding one lemma per input token, in the same order.
    '''
    return list(map(wn_lemmatizer.lemmatize, tokens))

##translation table mapping every character of string.punctuation to None,
##i.e. str.translate() deletes those characters
punctuation_removal = str.maketrans('', '', string.punctuation)

def get_processed_text(document):
    '''
    Turn a raw document into a list of lemmatized word tokens.

    Steps, in order: lower-case the text, delete punctuation characters,
    tokenize with nltk.word_tokenize, lemmatize with perform_lemmatization.
    '''
    without_punctuation = document.lower().translate(punctuation_removal)
    word_tokens = nltk.word_tokenize(without_punctuation)
    return perform_lemmatization(word_tokens)

## Language Modeling with tf-idf

In [40]:
def respond(user_input):
    '''
    Generate a reply by retrieving the corpus sentence most similar to the query.

    The query is appended to the global ``corpus_sentences`` list, tf-idf
    vectors are fitted over the whole list, and the query vector (last row)
    is compared with every row by cosine similarity. The runner-up in that
    ranking is returned, because the top score is normally the query matched
    against itself.

    Side effect: ``user_input`` is left at the end of ``corpus_sentences``;
    the caller is expected to remove it after the call.

    Parameters
    ----------
    user_input : str
        The user's question.

    Returns
    -------
    str
        The best matching corpus sentence, or a fallback message when the
        query shares no vocabulary with any corpus sentence.
    '''
    corpus_sentences.append(user_input)

    ##vectorizing the processed text
    word_vectorizer = TfidfVectorizer(tokenizer=get_processed_text, stop_words='english')
    corpus_word_vectors = word_vectorizer.fit_transform(corpus_sentences)

    ##similarity of the query (last row) against every sentence, query included
    cos_sim_vectors = cosine_similarity(corpus_word_vectors[-1], corpus_word_vectors)
    similarity_scores = cos_sim_vectors.flatten()

    ##second-best ranked entry; its score is the second-largest similarity
    similar_response_idx = similarity_scores.argsort()[-2]
    vector_matched = similarity_scores[similar_response_idx]

    if vector_matched == 0:
        return "I am sorry, what is it, again?"
    return corpus_sentences[similar_response_idx]

In [38]:
##fitting tf-idf over the corpus sentences; tokens come from get_processed_text
##and sklearn's built-in english stop-word list is applied
word_vectorizer = TfidfVectorizer(
    stop_words='english',
    tokenizer=get_processed_text,
)
word_vectors = word_vectorizer.fit_transform(corpus_sentences)

In [46]:
##cosine similarity of the last corpus sentence against every sentence (itself included)
cos_sim_vectors = cosine_similarity(word_vectors[-1], word_vectors)
##second-to-last position of the ascending ranking, i.e. the runner-up match
similar_response_idx = np.argsort(cos_sim_vectors)[0, -2]
similar_response_idx

40

In [41]:
# Interactive console loop: reads a line, answers greetings with a canned
# reply, answers everything else through respond(), and stops on 'quit',
# 'thanks' or 'thank you'.
chat = True
print("Hello, What do you want to learn about Cuisines today?")
while chat:
    user_query = input().lower()
    if user_query == 'quit':
        chat = False
        print("CuisineBot: Good bye!")
    elif user_query in ('thanks', 'thank you'):
        chat = False
        print("CuisineBot: You are welcome!")
    else:
        # Call greet_response once and reuse the result: the reply is picked
        # at random, so a second call could return a different greeting.
        greeting = greet_response(user_query)
        if greeting is not None:
            print("CuisineBot: " + greeting)
        else:
            print("CuisineBot: ", end="")
            try:
                print(respond(user_query))
            finally:
                # respond() appends the query to corpus_sentences. Drop that
                # trailing entry even if respond() raised, and drop it by
                # position so a genuine corpus sentence that happens to equal
                # the query is never deleted.
                if corpus_sentences and corpus_sentences[-1] == user_query:
                    corpus_sentences.pop()

Hello, What do you want to learn about Cuisines today?
bye
CuisineBot: I am sorry, what is it, again?
quit
CuisineBot: Good bye!
