In [5]:
import pandas as pd
import numpy as np
import json
import random
import re
import string

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords


In [6]:
nltk.download('stopwords',quiet=True)
nltk.download('wordnet',quiet=True)
nltk.download('punkt',quiet=True)
nltk.download('omw-1.4',quiet=True)

True

In [7]:
f = open('zelda.json','r')

In [8]:
raw=f.read()

In [9]:
raw = raw.lower()

In [10]:
sent_tokens = nltk.sent_tokenize(raw)

In [11]:
word_tokens = nltk.word_tokenize(raw)

In [12]:
df = pd.read_json('zelda.json', orient = 'index')

In [13]:
df.head()

Unnamed: 0,0
The Legend of Zelda: Breath of the Wild,The Legend of Zelda: Breath of the Wild is a 2...
Untitled The Legend of Zelda: Breath of the Wild sequel,An untitled sequel to the 2017 action-adventur...
The Legend of Zelda,The Legend of Zelda is a high fantasy action-a...
Universe of The Legend of Zelda,The Legend of Zelda is a video game franchise ...
Hyrule Warriors: Age of Calamity,Hyrule Warriors: Age of Calamity is a hack-and...


In [14]:
df.reset_index(inplace = True)

In [15]:
df.rename(columns = {'index':'doc_title', 0:'doc_text'}, inplace = True)

In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   doc_title  30 non-null     object
 1   doc_text   30 non-null     object
dtypes: object(2)
memory usage: 608.0+ bytes


In [17]:
df['doc_title'].duplicated().sum()

0

In [18]:
df['doc_text'].duplicated().sum()

0

In [19]:
df['doc_title']

0               The Legend of Zelda: Breath of the Wild
1     Untitled The Legend of Zelda: Breath of the Wi...
2                                   The Legend of Zelda
3                       Universe of The Legend of Zelda
4                      Hyrule Warriors: Age of Calamity
5                                                Urbosa
6                       List of Game of the Year awards
7                                        Princess Zelda
8                                                 Mipha
9                            Link (The Legend of Zelda)
10                                         Prince Sidon
11                                      Nintendo Switch
12                                    Horizon Zero Dawn
13                                       Genshin Impact
14                                 The Game Awards 2017
15                                 Hidemaro Fujibayashi
16                                      Sonic Frontiers
17                                              

Here we see there are a few docs that don't actually talk about Zelda: Breath of the Wild, so I'm going to drop those real quick

In [20]:
df.drop([12, 13, 16, 24, 29])

Unnamed: 0,doc_title,doc_text
0,The Legend of Zelda: Breath of the Wild,The Legend of Zelda: Breath of the Wild is a 2...
1,Untitled The Legend of Zelda: Breath of the Wi...,An untitled sequel to the 2017 action-adventur...
2,The Legend of Zelda,The Legend of Zelda is a high fantasy action-a...
3,Universe of The Legend of Zelda,The Legend of Zelda is a video game franchise ...
4,Hyrule Warriors: Age of Calamity,Hyrule Warriors: Age of Calamity is a hack-and...
5,Urbosa,Urbosa is a fictional character from Nintendo'...
6,List of Game of the Year awards,Game of the Year (GotY) is an award given by v...
7,Princess Zelda,Princess Zelda is the titular character in Nin...
8,Mipha,Mipha is a fictional character in Nintendo's T...
9,Link (The Legend of Zelda),Link is a fictional character and the protagon...


In [21]:
def clean_doc(doc): 
    #split document into individual words
    tokens=doc.split()
    re_punc = re.compile('[%s]' % re.escape(string.punctuation))
    # remove punctuation from each word
    tokens = [re_punc.sub('', w) for w in tokens]
    # remove remaining tokens that are not alphabetic
    tokens = [word for word in tokens if word.isalpha()]
    # filter out short tokens
    tokens = [word for word in tokens if len(word) > 4]
    #lowercase all words
    tokens = [word.lower() for word in tokens]
    # filter out stop words
    stop_words = stopwords.words('english')
    stop_words = set(stop_words)
    tokens = [w for w in tokens if not w in stop_words]         
    # lemmatize
    wnl = WordNetLemmatizer()
    tokens=[wnl.lemmatize(word.lower()) for word in tokens]
    return tokens

In [22]:
df['tokens'] = df['doc_text'].apply(lambda x: clean_doc(x))

In [23]:
df.head()

Unnamed: 0,doc_title,doc_text,tokens
0,The Legend of Zelda: Breath of the Wild,The Legend of Zelda: Breath of the Wild is a 2...,"[legend, zelda, breath, actionadventure, devel..."
1,Untitled The Legend of Zelda: Breath of the Wi...,An untitled sequel to the 2017 action-adventur...,"[untitled, sequel, actionadventure, legend, ze..."
2,The Legend of Zelda,The Legend of Zelda is a high fantasy action-a...,"[legend, zelda, fantasy, actionadventure, vide..."
3,Universe of The Legend of Zelda,The Legend of Zelda is a video game franchise ...,"[legend, zelda, video, franchise, created, jap..."
4,Hyrule Warriors: Age of Calamity,Hyrule Warriors: Age of Calamity is a hack-and...,"[hyrule, warrior, calamity, hackandslash, vide..."


In [24]:
GREETING_INPUTS = ("hello", "hi", "greetings", "sup", "what's up","hey",)
GREETING_RESPONSES = ["hi", "hey", "*nods*", "hi there", "hello", "I am glad! You are talking to me"]
def greeting(sentence):
 
    for word in sentence.split():
        if word.lower() in GREETING_INPUTS:
            return random.choice(GREETING_RESPONSES)

In [25]:
def response(user_response):
    robo_response=''

In [26]:
def tfidf(corpus, titles, ngram_range):
    #this is a function to create the tfidf matrix
    Tfidf=TfidfVectorizer(ngram_range=(1,1))

    #fit the vectorizer using final processed documents.  The vectorizer requires the 
    #stiched back together document.

    TFIDF_matrix=Tfidf.fit_transform(corpus)
    vals = cosine_similarity(TFIDF_matrix[-1], TFIDF_matrix)

    #creating dataframe from TFIDF Matrix
    words = Tfidf.get_feature_names_out()
    matrix=pd.DataFrame(TFIDF_matrix.toarray(), columns=Tfidf.get_feature_names(), index=titles)
    return matrix ,words, vals

In [27]:
final_processed_text = [' '.join(x) for x in df['tokens'].tolist()]

In [28]:
titles = df['doc_title'].tolist()

In [29]:
tfidf_matrix, tfidf_words, tfidf_vals = tfidf(final_processed_text, titles, ngram_range = (1,1))



In [30]:
tfidf_matrix.head()

Unnamed: 0,abandon,abandoned,abandonment,abducting,abgnac,abilitiesin,abilitiesthe,ability,aboard,aboda,...,zombie,zombielike,zombiesin,zone,zoras,ztargeting,zynga,ōhara,ōkami,マスターソード
The Legend of Zelda: Breath of the Wild,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.029572,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Untitled The Legend of Zelda: Breath of the Wild sequel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
The Legend of Zelda,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.013603,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.008845,0.0
Universe of The Legend of Zelda,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.017015,0.0,0.0,...,0.017702,0.008851,0.0,0.0,0.012503,0.008851,0.0,0.0,0.0,0.0
Hyrule Warriors: Age of Calamity,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [31]:
idx=tfidf_vals.argsort()[0][-2]
flat = tfidf_vals.flatten()
flat.sort()
req_tfidf = flat[-2]

In [32]:
def response(user_response):
    robo_response=''

In [33]:
def response(user_response):
    robo_response=''
    if req_tfidf==0:
        robo_response=robo_response+"I am sorry! I don't understand you"
        return robo_response
    else:
        robo_response = robo_response+sent_tokens[idx]
        return robo_response

In [34]:
flag=True
print("ROBO: My name is Robo. I will answer your queries about Chatbots. If you want to exit, type Bye!")
while(flag==True):
    user_response = input()
    user_response=user_response.lower()
    if(user_response!='bye'):
        if(user_response=='thanks' or user_response=='thank you' ):
            flag=False
            print("ROBO: You are welcome..")
        else:
            if(greeting(user_response)!=None):
                print("ROBO: "+greeting(user_response))
            else:
                sent_tokens.append(user_response)
                word_tokens=word_tokens+nltk.word_tokenize(user_response)
                final_words=list(set(word_tokens))
                print("ROBO: ",end="")
                print(response(user_response))
                sent_tokens.remove(user_response)
    else:
        flag=False
        print("ROBO: Bye! take care..")

ROBO: My name is Robo. I will answer your queries about Chatbots. If you want to exit, type Bye!
hello
ROBO: *nods*
who is link?
ROBO: the player controls an amnesiac link, who awakens from a hundred-year slumber, and attempts to regain his memories and prevent the destruction of hyrule by calamity ganon.\nsimilar to the original 1986 the legend of zelda game, players are given little instruction and can explore the open world freely.
what is breath of the wild?
ROBO: the player controls an amnesiac link, who awakens from a hundred-year slumber, and attempts to regain his memories and prevent the destruction of hyrule by calamity ganon.\nsimilar to the original 1986 the legend of zelda game, players are given little instruction and can explore the open world freely.
impa?
ROBO: the player controls an amnesiac link, who awakens from a hundred-year slumber, and attempts to regain his memories and prevent the destruction of hyrule by calamity ganon.\nsimilar to the original 1986 the legen