https://www.coursera.org/learn/codio-select-topics-python-natural-language-processing/

# 1 NLP basic workflow
## 1.1 Getting started: corpora, lexicon, and tokens

- Corpus & Corpora
    - Corpus is one collection of texts
    - Corpora in NLTK: Brown, Gutenberg, Web Text, Inaugural, ...
- Lexicon
    - a collection of words / phrases marked with allied information
    - lexicon is a type of corpus
    - Lexicons in NLTK: stopwords, names, CMU pronouncing dictionary, ...
- wordnet
    - wordnet is an English dictionary used to define single words and phrases
    - wordnet is a lexical corpus in NLTK

In [None]:
# install nltk
python3 -m pip install nltk==3.6.7

# check package install status
python3 -m pip list

In [None]:
# download nltk data
import nltk
nltk.download()

In [None]:
# access corpora
from nltk.corpus import brown
from nltk.corpus import gutenberg
from nltk.corpus import webtext
from nltk.corpus import nps_chat
from nltk.corpus import reuters
from nltk.corpus import inaugural

# a list of words in corpus
brown.words()

# a list of sentences in corpus, each sentence is a list of words
brown.sents()

# a list of paragraph in corpus, each paragraph is a list of sentences
brown.paras()

# a list of tuples, each tuple is (word, tag)
brown.tagged_words()

# tagged sentences and paragraphs
brown.tagged_sents()
brown.tagged_paras()

In [None]:
# lexicons
from nltk.corpus import stopwords
from nltk.corpus import names
from nltk.corpus import cmudict

# English stopwords
stopwords.words('english')

# names
male_names = names.words('male.txt')
female_names = names.words('female.txt')

In [None]:
# wordnet
from nltk.corpus import wordnet

# find synonym set of a word
hello_synset = wordnet.synsets('hello')
print(f"\nThe Synonym Set for the word 'hello' is: {hello_synset}")

# find lemma names of synonym sets
# a lemma is the base (dictionary) form shared by a group of word forms that carry the same meaning, e.g. "walk" for "walk" and "walked"
hello_lemma_names = wordnet.synset('hello.n.01').lemma_names()
print(f"\nThe Lemma Names in the Synonym Set 'hello.n.01' are: {hello_lemma_names}")


# find lemmas of a word
hello_lemmas =  wordnet.lemmas('hello') 
print(f"\nThe Lemmas in the Word 'hello' are: {hello_lemmas}")

# find lemmas of a synonym set
hello_n_01_synset_lemmas = wordnet.synset('hello.n.01').lemmas()
print(f"\nThe Lemmas in the Synonym Set 'hello.n.01' are: {hello_n_01_synset_lemmas}")


# find synset and synset name of a lemma
hello_n_01_hello_lemma_synset = wordnet.lemma('hello.n.01.hello').synset()
print(f"\nThe Synset for the Lemma 'hello.n.01.hello' is: {hello_n_01_hello_lemma_synset}")

hello_n_01_hello_lemma_synset_name = wordnet.lemma('hello.n.01.hello').name()
print(f"\nThe Word / Synset Name for the Lemma 'hello.n.01.hello' is: {hello_n_01_hello_lemma_synset_name}")


# find definitions and examples of a synonym
hello_def = wordnet.synset('hello.n.01').definition()
print(f"\nThe Definition for the Synonym Set 'hello.n.01' from the word 'hello' is: {hello_def}")

hello_examples = wordnet.synset('hello.n.01').examples()
print(f"\nExamples for the Synonym Set 'hello.n.01' from the word 'hello' is: {hello_examples}")

## 1.2 Tokenize Text

In [None]:
# tokenize by word
from nltk.tokenize import word_tokenize

lorenzo_paragraph = "Lorenzo di Piero de'Medici was an Italian statesman, banker, de facto ruler of the Florentine Republic and the most powerful and enthusiastic patron of Renaissance culture in Italy. Also known as Lorenzo the Magnificent (Lorenzo il Magnifico) by contemporary Florentines, he was a magnate, diplomat, politician and patron of scholars, artists, and poets. As a patron, he's best known for his sponsorship of artists such as Botticelli and Michelangelo. He held the balance of power within the Italic League, an alliance of states that stabilized political conditions on the Italian peninsula for decades, and his life coincided with the mature phase of the Italian Renaissance and the Golden Age of Florence."
word_tokenize(lorenzo_paragraph)

In [None]:
# tokenize by sentence
from nltk.tokenize import sent_tokenize

lorenzo_paragraph = "Lorenzo di Piero de'Medici was an Italian statesman, banker, de facto ruler of the Florentine Republic and the most powerful and enthusiastic patron of Renaissance culture in Italy. Also known as Lorenzo the Magnificent (Lorenzo il Magnifico) by contemporary Florentines, he was a magnate, diplomat, politician and patron of scholars, artists, and poets. As a patron, he's best known for his sponsorship of artists such as Botticelli and Michelangelo. He held the balance of power within the Italic League, an alliance of states that stabilized political conditions on the Italian peninsula for decades, and his life coincided with the mature phase of the Italian Renaissance and the Golden Age of Florence."
sent_tokenize(lorenzo_paragraph)

## 1.3 Access Text from different resources

In [None]:
# Extract text from the web
from urllib.request import urlopen
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize

# Download a news article and print the tokens of each <p> element.
extinction_url = 'https://www.bbc.com/news/science-environment-61242789'

# Use the HTTP response as a context manager so the connection is closed as
# soon as the page has been parsed instead of being left open.
with urlopen(extinction_url) as extinction_html:
    extinction_html_parse = BeautifulSoup(extinction_html, 'html.parser')

# Paragraphs are numbered from 1 in the printed output.
for index, element in enumerate(extinction_html_parse.find_all('p'), start=1):
    words = element.get_text()
    print(f'\nTokens in Paragraph {index}: {word_tokenize(words)}')

In [None]:
# Extract text from local files
from nltk.tokenize import word_tokenize

# Read the whole article, then tokenize it once the file has been closed.
with open('exercises/local_article.txt') as local_text_file:
    raw_local_text_file = local_text_file.read()

local_article_tokens = word_tokenize(raw_local_text_file)
print(f"\nTokens from the Local Article Called 'local_article.txt': {local_article_tokens}")

In [68]:
# use text from user input
from nltk.tokenize import word_tokenize

user_input = input()
print(f'You typed {len(word_tokenize(user_input))} words.')

You typed 5 words.


# 2 Methods for analyzing natural language
## 2.1 letter frequency

In [None]:
# Helper Methods from https://www.nostarch.com/crackingcodes (BSD Licensed)

# English letters from most to least frequent, and the plain alphabet.
ETAOIN = 'ETAOINSHRDLCUMWFGYPBVKJXQZ'
LETTERS = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'


def getLetterCount(message):
    """Count how often each letter A-Z occurs in ``message``.

    The message is upper-cased first, so the count is case-insensitive.
    Characters outside A-Z are ignored.

    Returns:
        A dict with one key per letter of LETTERS (in alphabetical order)
        mapped to its number of occurrences; absent letters map to 0.
    """
    letterCount = dict.fromkeys(LETTERS, 0)

    for symbol in message.upper():
        if symbol in letterCount:
            letterCount[symbol] += 1

    return letterCount


def getItemAtIndexZero(items):
    """Return the first element of ``items`` (used as a sort key)."""
    return items[0]


def getFrequencyOrder(message):
    """Return the 26 letters ordered from most to least frequent in ``message``.

    Letters that occur equally often are placed in reverse ETAOIN order
    within their group.
    """
    counts = getLetterCount(message)

    # Group the letters by how many times they occur.
    lettersByCount = {}
    for letter in LETTERS:
        lettersByCount.setdefault(counts[letter], []).append(letter)

    # Turn every group into a string in reverse "ETAOIN" order.
    groupedPairs = [
        (count, ''.join(sorted(group, key=ETAOIN.find, reverse=True)))
        for count, group in lettersByCount.items()
    ]

    # Highest count first; counts are unique, so there are no ties to break.
    groupedPairs.sort(key=getItemAtIndexZero, reverse=True)

    return ''.join(letters for _, letters in groupedPairs)

def main():
    """Print the per-letter counts of a sample passage about Alan Turing."""
    message = """
    Alan Mathison Turing was an English mathematician, computer scientist, logician, 
    cryptanalyst, philosopher, and theoretical biologist. Turing was highly influential 
    in the development of theoretical computer science, providing a formalisation of the 
    concepts of algorithm and computation with the Turing machine, which can be considered 
    a model of a general-purpose computer. He is widely considered to be the father of 
    theoretical computer science and artificial intelligence.
    
    Born in Maida Vale, London, Turing was raised in southern England. He graduated at 
    King's College, Cambridge, with a degree in mathematics. Whilst he was a fellow at 
    Cambridge, he published a proof demonstrating that some purely mathematical yes–no questions 
    can never be answered by computation and defined a Turing machine, and went on to prove 
    that the halting problem for Turing machines is undecidable. In 1938, he obtained his PhD 
    from the Department of Mathematics at Princeton University. During the Second World War, 
    Turing worked for the Government Code and Cypher School (GC&CS) at Bletchley Park, 
    Britain's codebreaking centre that produced Ultra intelligence. For a time he led Hut 8, 
    the section that was responsible for German naval cryptanalysis. Here, he devised a number 
    of techniques for speeding the breaking of German ciphers, including improvements to the 
    pre-war Polish bombe method, an electromechanical machine that could find settings for the 
    Enigma machine. Turing played a crucial role in cracking intercepted coded messages that 
    enabled the Allies to defeat the Axis powers in many crucial engagements, including the 
    Battle of the Atlantic.

    After the war, Turing worked at the National Physical Laboratory, where he designed the 
    Automatic Computing Engine (ACE), one of the first designs for a stored-program computer. 
    In 1948, Turing joined Max Newman's Computing Machine Laboratory, at the Victoria 
    University of Manchester, where he helped develop the Manchester computers and became 
    interested in mathematical biology. He wrote a paper on the chemical basis of morphogenesis
    and predicted oscillating chemical reactions such as the Belousov–Zhabotinsky reaction, 
    first observed in the 1960s. Despite these accomplishments, Turing was never fully recognised
    in Britain during his lifetime because much of his work was covered by the Official Secrets Act.
    """
    ## sample text taken from https://en.wikipedia.org/wiki/Alan_Turing

    # Only the raw letter counts are printed here; getFrequencyOrder is not called.
    print(getLetterCount(message))

# Run main function
if __name__ == "__main__":
    main()

## 2.2 Word Frequency

In [None]:
from nltk.corpus import webtext
from nltk.probability import FreqDist
from nltk.tokenize import word_tokenize

wt_words = webtext.words()
data_analysis = FreqDist(wt_words)
 
# Keep only the words that are longer than 3 characters (this filters on word length, not on frequency).
filter_words = dict([(m, n) for m, n in data_analysis.items() if len(m) > 3])
data_analysis = FreqDist(filter_words)

data_analysis.plot(25, cumulative=False)

In [None]:
# print top 25 frequent words
most_common = data_analysis.most_common(25)
print("\nIn Order of Most Common Frequencies: ")
for word in most_common:
  print(word)

# print most frequent word
print("\nMost common word: ")
print(data_analysis.max())

# print words show up more than 1000 times
most_common = data_analysis.most_common()
print("\nIn Order of Most Common Frequencies: ")
for word in most_common:
    if word[1] > 1000:
        print(word)

# hapaxes are words that show up exactly once
hapaxes = data_analysis.hapaxes()
print("\nHapaxes or Words occuring only once: ")
for word in hapaxes:
    print(word)

## 2.3 Bag of words
- similar to word cloud, a bag of words
    - ignore the order of words
    - represented as a set of unique words in text
    - keep track of the count / frequency of each word
- implement bag of word
    - documents is a list holding the documents as strings in a list
    - build a vocabulary:
        - tokenize the documents and store all tokens in all_words
        - remove stopwords
        - remove punctuation
        - ignore case
        - do stemming or other lemmatization to ignore tense/plural
        - remove duplicate words
    - how to score the presence of known words
        - generate vectors by counting instances of each word
        - generate vectors using 0s and 1s representing a word was present or not
        - frequency = count_word / count_total
- limitations
    - too sparse
    - context loss since discarding word order

In [69]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import numpy as np
import string

documents = ["John likes to watch movies. Mary likes Movies too.",
             "Mary also likes to watch football games."]

## Create Tokens from the set of documents
all_words = []
for document in documents:
    all_words.extend(word_tokenize(document))

## ignore case and remove duplicates so only UNIQUE words remain
all_words = [word.lower() for word in all_words] ## ADDED
unique_words = sorted(list(set(all_words)))

## Process out stop words and punctuation to create VOCABULARY
stop_words = set(stopwords.words('english'))
unique_words = filter(lambda word: word not in string.punctuation, unique_words)
vocabulary = [element for element in unique_words if element.casefold() not in stop_words]

print(vocabulary)


## Generate bag of word vectors by counting instances of each word
for document in documents:
        words = word_tokenize(document)
        bag_vector = np.zeros(len(vocabulary))
        for w in words:
            for i,word in enumerate(vocabulary):
                if word.lower() == w.lower():
                    bag_vector[i] += 1
                    
        print("{0} \n{1}\n".format(document,np.array(bag_vector)))

['also', 'football', 'games', 'john', 'likes', 'mary', 'movies', 'watch']
John likes to watch movies. Mary likes Movies too. 
[0. 0. 0. 1. 2. 1. 2. 1.]

Mary also likes to watch football games. 
[1. 1. 1. 0. 1. 1. 0. 1.]



In [71]:
# use Scikit-Learn CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

documents = ["John likes to watch movies. Mary likes Movies too.",
             "Mary also likes to watch football games."]

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(documents)

df_bow_sklearn = pd.DataFrame(X.toarray(),columns=vectorizer.get_feature_names_out())
print(df_bow_sklearn.head())

   also  football  games  john  likes  mary  movies  to  too  watch
0     0         0      0     1      2     1       2   1    1      1
1     1         1      1     0      1     1       0   1    0      1


In [72]:
# if remove stopwords
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(documents)
print(X.toarray())

[[0 0 1 2 1 2 1]
 [1 1 0 1 1 0 1]]


In [None]:
# if use binary score
vectorizer = CountVectorizer(binary=True)
X = vectorizer.fit_transform(documents)
print(X.toarray())

In [None]:
# n-gram
vectorizer = CountVectorizer(analyzer='word', ngram_range=(2, 2))
X = vectorizer.fit_transform(documents)

print(vectorizer.get_feature_names_out())
print(X.toarray())

## 2.4 Term Frequency-Inverse Document Frequency (TF-IDF)
- sometimes referred to as a "weighted" bag of words
- penalizes words that are frequent in all documents, e.g. "the"
- TF-IDF scores have the effect of highlighting words that are distinct (contain useful information) in a given document.
- algorithm:
    - TF(word, document) = count_word_in_document / total_word_in_document
    - IDF(word) = log(count_documents / count_documents_contain_word)
    - TF-IDF(word, document) = TF(word, document) * IDF(word)


In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import numpy as np
import string
 
documents = ['The dog plays fetch', 'The cat hunts bugs']
 
# Create Tokens from the set of documents
all_words = []
for document in documents:
    all_words.extend(word_tokenize(document))

## remove duplicates so only UNIQUE words remain
all_words_lower_case = [word.lower() for word in all_words]
unique_words = sorted(list(set(all_words_lower_case)))

## Process out stop words to create VOCABULARY
stop_words = set(stopwords.words('english'))
unique_words = filter(lambda word: word not in string.punctuation, unique_words)
vocabulary = [element for element in unique_words if element.casefold() not in stop_words]

total_documents = len(documents)

# Tokenize every document once, exactly as the vocabulary was built
# (word_tokenize + lower-casing), so TF and IDF agree with the vocabulary
# even when a word is followed by punctuation.
tokenized_documents = [
    [token.lower() for token in word_tokenize(document)]
    for document in documents
]

# Document frequency: how many documents contain each vocabulary word.
# Whole tokens are compared rather than substrings, so "cat" is not counted
# for a document that only contains "category".
document_frequency = {
    word: sum(1 for tokens in tokenized_documents if word in tokens)
    for word in vocabulary
}

vectors = []

for tokens in tokenized_documents:
    doc_vec = np.zeros((len(vocabulary),)) # one TF-IDF score per vocabulary word
    words_in_doc = len(tokens)

    for index, word in enumerate(vocabulary):

        # TF: share of the document's tokens equal to this word
        # (0.0 for an empty document instead of dividing by zero)
        tf = tokens.count(word) / words_in_doc if words_in_doc else 0.0

        # IDF: log of (number of documents / documents containing the word)
        idf = np.log10(total_documents / document_frequency[word])

        # store the TF-IDF score into the vector
        doc_vec[index] = tf * idf

    # one vector per document
    vectors.append(doc_vec)

# print the vocabulary followed by the TF-IDF matrix (one row per document)
print("{0} \n{1}\n".format(vocabulary,np.array(vectors)))

In [None]:
# use Scikit-Learn TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

documents = ['The dog plays with the ball', 'The cat plays with the ball']

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(documents)

print(vectorizer.get_feature_names_out()) # prints vocabulary
print(X.toarray()) # prints TF-IDF scores

In [None]:
# remove stopwords
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(documents)

print(vectorizer.get_feature_names_out()) # prints vocabulary
print(X.toarray()) # prints TF-IDF scores. it's different than previous cell since sklearn uses some improved algorithm, i.e. idf smooth

In [None]:
# turn off smooth_idf
vectorizer = TfidfVectorizer(smooth_idf=False)

X = vectorizer.fit_transform(documents)

print(vectorizer.get_feature_names_out()) # prints vocabulary
print(X.toarray()) # prints TF-IDF scores

In [None]:
# n-gram
vectorizer = TfidfVectorizer(stop_words='english', analyzer='word', ngram_range=(2, 2))
X = vectorizer.fit_transform(documents)

print(vectorizer.get_feature_names_out())
print(X.toarray()) # prints TF-IDF scores

## 2.5 Text similarity metrics
- Jaccard Similarity Coefficient Score: J(A,B) = |A ∩ B| / |A ∪ B|
- Euclidean distance: d(p,q)^2 = (q1 - p1)^2 + (q2 - p2)^2 + ... + (qn - pn)^2
- Cosine similarity: cos(theta) = (A·B) / (|A| * |B|) = (a1b1 + ... + anbn) / [(a1^2 + ... + an^2)^0.5 * (b1^2 + ... + bn^2)^0.5]

In [None]:
# Jaccard Similarity
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import jaccard_score

documents = ["Bunnies like to eat lettuce more than carrots.", 
             "Fish like to play with bubbles while swimming."]

vectorizer = CountVectorizer(stop_words='english', binary=True)
X = vectorizer.fit_transform(documents)

print(jaccard_score(X.toarray()[0], X.toarray()[1]))

In [None]:
# Euclidean distance
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import euclidean_distances

documents = ["Bunnies like to eat lettuce more than carrots.", 
             "Fish like to play with bubbles while swimming."]

vectorizer = CountVectorizer(stop_words='english', binary=True)
X = vectorizer.fit_transform(documents)

print( euclidean_distances(X) )

In [None]:
# Cosine similarity
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

documents = ["Bunnies like to eat lettuce more than carrots.", 
             "Fish like to play with bubbles while swimming."]

vectorizer = CountVectorizer(stop_words='english', binary=True)
X = vectorizer.fit_transform(documents)

print( cosine_similarity(X) )

## 2.6 Language Models
- A language model is a model which understands language – more precisely how words occur together in natural language. A language model is used to predict what word comes next.
- probabilistic language models and machine learning language models
- use pre-trained language models:
    - What task are you using it for?
    - What are the technical requirements to use the model?
- Popular Pre-Trained Models
    - BERT from Google
    - GPT-3 from OpenAI

In [None]:
# Example Using DialoGPT
import os
import transformers # need to install this package

# Build the conversational pipeline backed by DialoGPT-small.
model = transformers.pipeline("conversational", model="microsoft/DialoGPT-small")

## conversation runner code
# Every turn is wrapped in a brand-new Conversation, so the model gets no
# history from earlier turns.
user = input("Enter text or type 'bye' to end: ").strip().lower()
while user != 'bye':

    # generate and print response using model
    # NOTE(review): 50256 is presumably the model's end-of-text token id,
    # passed so generation has a pad token — confirm against the model config.
    response = str(model(transformers.Conversation(user), pad_token_id=50256))
    # keep only the text that follows the "bot >> " marker
    print(response[response.find("bot >> ")+6:].strip())

    # get next user input
    user = input("Enter text or type 'bye' to end: ").strip().lower()

# end of chat; no further model call is made once the user has typed 'bye'
print("Good bye!")

In [None]:
# Example Using Blenderbot
import os
import transformers

#initialize pretrained model
# Build the conversational pipeline backed by Blenderbot-small.
model = transformers.pipeline("conversational", model="facebook/blenderbot_small-90M")

## conversation runner code
# Keep prompting until the user types 'bye' (input is trimmed and lower-cased).
while True:
    user = input("Enter text or type 'bye' to end: ").strip().lower()
    if user == 'bye':
        break

    # run the model on this single turn and show what follows "bot >> "
    response = str(model(transformers.Conversation(user)))
    marker_position = response.find("bot >> ")
    print(response[marker_position+6:].strip())

# end of chat
print("Good bye!")

# 3 Lab: create a chatbot

In [None]:
# create a simple hard-coded chatbot
import tkinter.scrolledtext as tks #creates a scrollable text window
from datetime import datetime
from tkinter import *


# Generating response
def get_bot_response(user_input):
    """Return the scripted reply for an exact, already lower-cased message.

    Any message that is not one of the known phrases gets the
    "I don't understand" fallback.
    """
    # (accepted phrases, reply) pairs, checked in order
    scripted_replies = (
        (("hello",), "Hi!"),
        (("hi", "hii", "hiiii"), "Hello there! How are you?"),
        (("how are you",), "Oh, I'm great! How about you?"),
        (("fine", "i am good", "i am doing good"),
         "That's excellent! How can I help you today?"),
    )

    for accepted_phrases, reply in scripted_replies:
        if user_input in accepted_phrases:
            return reply

    return "I'm sorry, I don't understand..."


def create_and_insert_user_frame(user_input):
    """Append a bubble with the user's message and the current time to chatWindow.

    The bubble is a Frame holding the message text above an HH:MM timestamp;
    it is embedded after a new line tagged 'tag-right'.
    """
    bubble_colour = "#d0ffff"
    userFrame = Frame(chatWindow, bg=bubble_colour)

    # message text on the first row
    message_label = Label(userFrame, text=user_input, font=("Arial", 11), bg=bubble_colour)
    message_label.grid(row=0, column=0, sticky="w", padx=5, pady=5)

    # timestamp on the second row
    sent_at = datetime.now().strftime("%H:%M")
    time_label = Label(userFrame, text=sent_at, font=("Arial", 7), bg=bubble_colour)
    time_label.grid(row=1, column=0, sticky="w")

    chatWindow.insert('end', '\n ', 'tag-right')
    chatWindow.window_create('end', window=userFrame)


def create_and_insert_bot_frame(bot_response):
    """Append a bubble with the bot's reply and the current time to chatWindow.

    The bubble is a Frame holding the reply (wrapped at 400 pixels) above an
    HH:MM timestamp; it is embedded after a new line tagged 'tag-left' and
    followed by two line breaks.
    """
    bubble_colour = "#ffffd0"
    botFrame = Frame(chatWindow, bg=bubble_colour)

    # reply text on the first row
    reply_label = Label(
        botFrame,
        text=bot_response,
        font=("Arial", 11),
        bg=bubble_colour,
        wraplength=400,
        justify='left',
    )
    reply_label.grid(row=0, column=0, sticky="w", padx=5, pady=5)

    # timestamp on the second row
    sent_at = datetime.now().strftime("%H:%M")
    time_label = Label(botFrame, text=sent_at, font=("Arial", 7), bg=bubble_colour)
    time_label.grid(row=1, column=0, sticky="w")

    chatWindow.insert('end', '\n ', 'tag-left')
    chatWindow.window_create('end', window=botFrame)
    chatWindow.insert(END, "\n\n")


def send(event=None):
    """Post the typed message and the bot's reply to the chat window.

    Args:
        event: The Tk event when called from a key or mouse binding. It is
            not used, and it is optional because Tk calls a Button's
            ``command`` callback without any argument.
    """
    # "end-1c" leaves out only the newline Tk always keeps at the end of a
    # Text widget; strip() then removes the newline typed when the message is
    # sent with Return. ("end-2c" dropped the last real character whenever the
    # message was sent by clicking the button.)
    user_input = userEntryBox.get("1.0", "end-1c").strip()
    userEntryBox.delete("1.0", "end")

    # Nothing to send, e.g. a second callback fired for the same click.
    if not user_input:
        return

    # The bot matches on lower-case text; the chat shows the original text.
    bot_response = get_bot_response(user_input.lower())

    # The chat window is read-only except while the bubbles are inserted.
    chatWindow.config(state=NORMAL)
    create_and_insert_user_frame(user_input)
    create_and_insert_bot_frame(bot_response)
    chatWindow.config(state=DISABLED)

    chatWindow.see('end')


# Main window
baseWindow = Tk()
baseWindow.title("The Simple Bot")
baseWindow.geometry("500x250")

# Scrollable chat history; read-only until send() temporarily unlocks it.
chatWindow = tks.ScrolledText(baseWindow, font="Arial")
chatWindow.tag_configure('tag-left', justify='left')
chatWindow.tag_configure('tag-right', justify='right')
chatWindow.config(state=DISABLED)

# A click is delivered through the button's command hook only. Tk calls
# command callbacks without arguments, so the event slot is filled with None.
# Wiring the button through command= and a <Button-1> binding at the same
# time made every click invoke send twice.
sendButton = Button(
    baseWindow,
    font=("Verdana", 12, 'bold'),
    text="Send",
    bg="#fd94b4",
    activebackground="#ff467e",
    fg='#ffffff',
    command=lambda: send(None))

# Pressing Return anywhere in the window also sends the message.
baseWindow.bind('<Return>', send)

# Box where the user types the message
userEntryBox = Text(baseWindow, bd=1, bg="white", width=38, font="Arial")

# Layout: history on top, entry box and button underneath
chatWindow.place(x=1, y=1, height=200, width=500)
userEntryBox.place(x=3, y=202, height=27)
sendButton.place(x=430, y=200)

baseWindow.mainloop()

In [None]:
# create chatbot with nltk.chat
import tkinter.scrolledtext as tks #creates a scrollable text window

from datetime import datetime
from tkinter import *
from nltk.chat.util import Chat


# Generating response
def get_bot_response(user_input):
    """Return the reply of a small pattern-based NLTK chatbot.

    Args:
        user_input: The user's message. Trailing "!" and "." characters are
            ignored before matching.

    Returns:
        One of the replies configured for the first matching pattern, or a
        fallback sentence when no pattern matches.
    """
    pairs = [
        # "%1" is replaced by the text captured by the first group; the
        # placeholder must be written without a space after the "%".
        ('my name is (.*)', ['Hello %1 !']),
        ('(hi|hello|hey|holla|hola)', ['Hey there !', 'Hi there !', 'Hey !']),
        ('(.*) your name ?', ['My name is Geeky']),
        ('(.*) do you do ?', ['We provide a platform for tech enthusiasts, a wide range of options !']),
        ('(.*) created you ?', ['Geeksforgeeks created me using python and NLTK'])
    ]

    chat = Chat(pairs)

    # rstrip never indexes into the string, so empty input or input made only
    # of "!" / "." no longer raises IndexError.
    user_input = user_input.rstrip("!.")

    bot_response = chat.respond(user_input)
    if bot_response is None:
        # No pattern matched: answer with a sentence instead of None.
        bot_response = "I'm sorry, I don't understand..."
    return bot_response


def create_and_insert_user_frame(user_input):
    """Append a bubble with the user's message and the current time to chatWindow.

    The bubble is a Frame holding the message text above an HH:MM timestamp;
    it is embedded after a new line tagged 'tag-right'.
    """
    bubble_colour = "#d0ffff"
    userFrame = Frame(chatWindow, bg=bubble_colour)

    # message text on the first row
    message_label = Label(userFrame, text=user_input, font=("Arial", 11), bg=bubble_colour)
    message_label.grid(row=0, column=0, sticky="w", padx=5, pady=5)

    # timestamp on the second row
    sent_at = datetime.now().strftime("%H:%M")
    time_label = Label(userFrame, text=sent_at, font=("Arial", 7), bg=bubble_colour)
    time_label.grid(row=1, column=0, sticky="w")

    chatWindow.insert('end', '\n ', 'tag-right')
    chatWindow.window_create('end', window=userFrame)


def create_and_insert_bot_frame(bot_response):
    """Append a bubble with the bot's reply and the current time to chatWindow.

    The bubble is a Frame holding the reply (wrapped at 400 pixels) above an
    HH:MM timestamp; it is embedded after a new line tagged 'tag-left' and
    followed by two line breaks.
    """
    bubble_colour = "#ffffd0"
    botFrame = Frame(chatWindow, bg=bubble_colour)

    # reply text on the first row
    reply_label = Label(
        botFrame,
        text=bot_response,
        font=("Arial", 11),
        bg=bubble_colour,
        wraplength=400,
        justify='left',
    )
    reply_label.grid(row=0, column=0, sticky="w", padx=5, pady=5)

    # timestamp on the second row
    sent_at = datetime.now().strftime("%H:%M")
    time_label = Label(botFrame, text=sent_at, font=("Arial", 7), bg=bubble_colour)
    time_label.grid(row=1, column=0, sticky="w")

    chatWindow.insert('end', '\n ', 'tag-left')
    chatWindow.window_create('end', window=botFrame)
    chatWindow.insert(END, "\n\n")


def send(event=None):
    """Post the typed message and the bot's reply to the chat window.

    Args:
        event: The Tk event when called from a key or mouse binding. It is
            not used, and it is optional because Tk calls a Button's
            ``command`` callback without any argument.
    """
    # "end-1c" leaves out only the newline Tk always keeps at the end of a
    # Text widget; strip() then removes the newline typed when the message is
    # sent with Return. ("end-2c" dropped the last real character whenever the
    # message was sent by clicking the button.)
    user_input = userEntryBox.get("1.0", "end-1c").strip()
    userEntryBox.delete("1.0", "end")

    # Nothing to send, e.g. a second callback fired for the same click.
    if not user_input:
        return

    # The bot matches on lower-case text; the chat shows the original text.
    bot_response = get_bot_response(user_input.lower())

    # The chat window is read-only except while the bubbles are inserted.
    chatWindow.config(state=NORMAL)
    create_and_insert_user_frame(user_input)
    create_and_insert_bot_frame(bot_response)
    chatWindow.config(state=DISABLED)

    chatWindow.see('end')


# Main window
baseWindow = Tk()
baseWindow.title("The Simple Bot")
baseWindow.geometry("500x250")

# Scrollable chat history; read-only until send() temporarily unlocks it.
chatWindow = tks.ScrolledText(baseWindow, font="Arial")
chatWindow.tag_configure('tag-left', justify='left')
chatWindow.tag_configure('tag-right', justify='right')
chatWindow.config(state=DISABLED)

# A click is delivered through the button's command hook only. Tk calls
# command callbacks without arguments, so the event slot is filled with None.
# Wiring the button through command= and a <Button-1> binding at the same
# time made every click invoke send twice.
sendButton = Button(
    baseWindow,
    font=("Verdana", 12, 'bold'),
    text="Send",
    bg="#fd94b4",
    activebackground="#ff467e",
    fg='#ffffff',
    command=lambda: send(None))

# Pressing Return anywhere in the window also sends the message.
baseWindow.bind('<Return>', send)

# Box where the user types the message
userEntryBox = Text(baseWindow, bd=1, bg="white", width=38, font="Arial")

# Layout: history on top, entry box and button underneath
chatWindow.place(x=1, y=1, height=200, width=500)
userEntryBox.place(x=3, y=202, height=27)
sendButton.place(x=430, y=200)

baseWindow.mainloop()

In [None]:
# create chatbot with google search
import string
import nltk
import requests

from googlesearch import search
from lxml import html
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem import WordNetLemmatizer


# helper function to generate text corpus from html elements
def generate_corpus(all_p_elements):
    """Concatenate the text of the given <p> elements into one string.

    Each element contributes a newline followed by all of its text pieces
    joined together; an empty input produces an empty string.
    """
    return ''.join(
        '\n' + ''.join(p_element.findAll(text = True))
        for p_element in all_p_elements
    )


# preprocessing (lemmatization, removing punctuation)
def LemTokens(tokens):
    """Return the WordNet lemma of every token, in the original order."""
    lemmatize = WordNetLemmatizer().lemmatize
    return list(map(lemmatize, tokens))


def LemNormalize(text):
    """Lower-case ``text``, drop punctuation, tokenize and lemmatize it.

    Returns the list of lemmatized tokens produced by LemTokens.
    """
    # translation table that deletes every ASCII punctuation character
    remove_punct_dict = {ord(punct): None for punct in string.punctuation}
    cleaned_text = text.lower().translate(remove_punct_dict)
    words = nltk.word_tokenize(cleaned_text)
    return LemTokens(words)


def calculate_cosine_similarity(tfidf):
    """Find the row most similar to the last row of ``tfidf``.

    The last row is compared against every row. The best score belongs to
    the last row itself, so the second-best one is reported.

    Returns:
        (idx, req_tfidf): the index of the second-highest similarity and
        that similarity value.
    """
    # similarity of the last row to all rows, as a flat vector
    scores = cosine_similarity(tfidf[-1], tfidf).flatten()

    # positions of the scores in ascending order; the runner-up is second from the end
    ascending_positions = scores.argsort()
    idx = ascending_positions[-2]
    req_tfidf = scores[idx]

    return idx, req_tfidf


def get_bot_response(user_input):
    """Answer ``user_input`` with the best-matching sentence of the top Google hit.

    The first search result is downloaded, the text of its <p> elements is
    split into sentences, and the sentence whose TF-IDF vector is most
    similar to the user's input is returned.

    Args:
        user_input: The user's question.

    Returns:
        The most similar sentence, or an apology when no sentence is similar
        or when any step (search, download, parsing, vectorizing) fails.
    """
    # default bot response
    bot_response = "I'm sorry, I don't think I can help you with that :("
    try:
        # use the google search api to fetch top 3 search results
        google_search_results = list(search(user_input, stop=3, pause=1))

        # fetch the top result webpage; the timeout keeps the bot from
        # hanging forever on an unresponsive site
        webpage = requests.get(google_search_results[0], timeout=10)
        webpage_soup = BeautifulSoup(webpage.content, "lxml")

        # build the corpus from all <p> elements and split it into sentences
        all_p_list = webpage_soup.findAll('p')
        google_search_corpus = generate_corpus(all_p_list)
        sentence_tokens = nltk.sent_tokenize(google_search_corpus)

        # The user input goes last so that it is the row every sentence is
        # compared against.
        sentence_tokens.append(user_input)
        tfidf_vectorizer = TfidfVectorizer(tokenizer=LemNormalize, stop_words='english')
        tfidf = tfidf_vectorizer.fit_transform(sentence_tokens)

        idx, req_tfidf = calculate_cosine_similarity(tfidf)
    except Exception:
        # Best-effort boundary: no results, network errors, an empty corpus
        # or an empty vocabulary all end in the default reply. The handler
        # must not read variables that may not have been assigned yet.
        return bot_response

    if req_tfidf == 0:
        # if value of cosine_similarity == 0, similar sentence not found
        return "I am sorry! I don't think I can help you with that at the moment..."

    return sentence_tokens[idx]

In [None]:
# generate chatbot with a pre-trained model
import os
import transformers


def initialize_model():
  """Create and return the Blenderbot "conversational" pipeline.

  Also sets the TOKENIZERS_PARALLELISM environment variable to "true".
  """

  model = transformers.pipeline("conversational", model="facebook/blenderbot_small-90M")
  # NOTE(review): the variable is set after the pipeline has been created;
  # presumably it is meant to configure the tokenizers library — confirm
  # whether it has to be set before the pipeline is built to take effect.
  os.environ["TOKENIZERS_PARALLELISM"] = "true"

  return model


def get_bot_response(model, user_input):
    """Run one single-turn conversation through ``model`` and return the reply.

    The reply is the text that follows the "bot >> " marker in the string
    form of the returned conversation, with surrounding whitespace removed.
    """
    conversation = transformers.Conversation(user_input)
    transcript = str(model(conversation))

    marker_position = transcript.find("bot >> ")
    return transcript[marker_position+6:].strip()