# Importing Libraries

In [45]:
import nltk
import random
import string
import bs4 as bs
import requests
import re
import warnings
warnings.filterwarnings('ignore')

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Data Extraction

In [46]:
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [47]:
r = requests.get('https://en.wikipedia.org/wiki/Cuisine')
raw_html = r.text


# Data Preparation

In [48]:
corpus_html = bs.BeautifulSoup(raw_html)

corpus_paras = corpus_html.find_all('p')
corpus_text = ''

for para in corpus_paras:
  corpus_text += para.text

corpus_text = corpus_text.lower()

# Data Preprocessing

In [49]:
corpus_text = re.sub(r'\[[0-9]*\]', ' ', corpus_text)
corpus_text = re.sub(r'\s+', ' ', corpus_text)

In [50]:
corpus_sentences = nltk.sent_tokenize(corpus_text)
corpus_words = nltk.word_tokenize(corpus_text)

# Defining Greet Function

In [51]:
greeting_inputs = ['hey', 'hello', 'good morning', 'ssup?']
greeting_responses = ['hey', 'How are you', 'Hi', 'Whatsup']

def greet_response(greeting):
  for token in greeting.split():
    if token.lower() in greeting_inputs:
      return random.choice(greeting_responses)


# Lemmatizing

In [52]:
wn_lemmatizer = nltk.stem.WordNetLemmatizer()

def lemmatize_corpus(tokens):
  return [wn_lemmatizer.lemmatize(token) for token in tokens]

punct_removal_dict = dict((ord(punctuation), None) for punctuation in string.punctuation)

def get_processed_text(document):
  return lemmatize_corpus(nltk.word_tokenize(document.lower().translate(punct_removal_dict)))

# TF-IDF Vectorizer

In [53]:
def respond(user_input):
  bot_response = ''
  corpus_sentences.append(user_input)

  word_vectorizer = TfidfVectorizer(tokenizer = get_processed_text, stop_words = 'english')
  corpus_word_vectors = word_vectorizer.fit_transform(corpus_sentences)

  cos_sim_vectors = cosine_similarity(corpus_word_vectors[-1], corpus_word_vectors)
  similar_response_idx = cos_sim_vectors.argsort()[0][-2]

  matched_vector = cos_sim_vectors.flatten()
  matched_vector.sort()
  vector_matched = matched_vector[-2]

  if vector_matched == 0:
    bot_response = bot_response + 'Sorry! I dont understand'
    return bot_response

  else:
    bot_response = bot_response + corpus_sentences[similar_response_idx]
    return bot_response

# Preparing ChatBox

In [None]:
chat = True
print('Hello! What do you want to learn about cuisines today?')

while(chat ==  True):
  user_query = input()
  #user_query = user_query.lower()
  if user_query != 'quit':
    if user_query == 'thanks' or user_query == 'thank you':
      chat = False
      print('You are welcome!')
    else:
      greet_response(user_query)
      if greet_response(user_query) != None:
        print('True!')
        print('CuisineBost: ' + greet_response(user_query))
      else:
        print('Found none input')
        print('CuisineBot: ', end='')
        print(respond(user_query))
        corpus_sentences.remove(user_query)
  else:
    chat = False
    print('CuisineBot: Good Bye')

Hello! What do you want to learn about cuisines today?
Hi
Found none input
CuisineBot: Sorry! I dont understand
Hi
Found none input
CuisineBot: Sorry! I dont understand
hey
True!
CuisineBost: hey
hey
True!
CuisineBost: Whatsup
food
Found none input
CuisineBot: market stalls selling food are found across southeast asia.
