<h1><center>ChatBot</center></h1>

In [1]:
# importing required libraries
import pandas as pd
import numpy as np
import re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import pairwise_distances
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Read the chatbot corpus from the Excel workbook into `df`.
# Later cells read its 'Context' and 'Text Response' columns.
df = pd.read_excel('./data/chat_bot.xlsx')
print("Reading Data")

Reading Data


In [3]:
# Discard every row that has a missing value in any column.
# The surviving rows keep their original index labels.
df = df.dropna()
print("Dropping NaN")

Dropping NaN


In [4]:
# initialize the lemmatizer once; it is shared by every text_normalize call
lemmatizer = WordNetLemmatizer()

# First letter of the POS tag -> POS code passed to the lemmatizer.
# Any tag not listed here is lemmatized as a noun ('n').
_POS_PREFIX_TO_WORDNET = {'V': 'v', 'J': 'a', 'R': 'r'}

# text_normalization
def text_normalize(text):
    """
    Pre-process a piece of text for vectorization.

    Steps:
      1. lower-case the text;
      2. turn every run of whitespace (tabs, newlines, ...) into one space
         so that words on separate lines are not glued together;
      3. delete every character that is not a-z, 0-9 or a space;
      4. tokenize, POS-tag and lemmatize each token, treating tags that
         start with 'V' as verbs, 'J' as adjectives, 'R' as adverbs and
         everything else as nouns.

    Parameters
    ----------
    text : str
        Raw text. ``None``/NaN yields an empty string; any other
        non-string value is converted with ``str()`` first.

    Returns
    -------
    str
        The lemmatized tokens joined by single spaces.

    Raises
    ------
    Exception
        Errors raised by the tokenizer, tagger or lemmatizer are no
        longer swallowed: a silent ``None`` result only postpones the
        failure to the vectorizer and hides its real cause.
    """
    # Missing values normalize to an empty document instead of None.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)

    pre_text = re.sub(r'\s+', ' ', text.lower())
    words = re.sub(r'[^a-z0-9 ]', '', pre_text)

    lema_sent = []
    for token, pos_token in pos_tag(nltk.word_tokenize(words)):
        pos_val = _POS_PREFIX_TO_WORDNET.get(pos_token[:1], 'n')
        lema_sent.append(lemmatizer.lemmatize(token, pos_val))

    return " ".join(lema_sent)

print("In the Halfway....")

# Normalize every 'Context' entry and store the result in a new
# 'lemmatized' column.
df['lemmatized'] = df['Context'].map(text_normalize)

In the Halfway....


In [5]:
# initialize the count vectorizer and learn the vocabulary of the corpus
cv = CountVectorizer()
cv.fit(df['lemmatized'])

# bag of words: dense document-term count matrix, one row per corpus entry
X = cv.transform(df['lemmatized']).toarray()

In [6]:
# Shared TF-IDF vectorizer: fitted inside we_tfidf and reused by query()
# to encode user input with the same vocabulary.
tfidf = TfidfVectorizer()

def we_tfidf(lemmatized):
    """
    Fit the module-level ``tfidf`` vectorizer on ``lemmatized`` and
    return the TF-IDF weights of those documents as a dense array.

    TF-IDF: Term Frequency-Inverse Document Frequency. Stop words are
    not removed; they are kept and weighted like every other term.

    More: https://en.wikipedia.org/wiki/Tf-idf
    """
    weights = tfidf.fit_transform(lemmatized)
    return weights.toarray()

# Fit TF-IDF on the lemmatized corpus; x_tfid is the matrix that user
# queries are compared against.
x_tfid = we_tfidf(df['lemmatized'])

print("Preprocessing done")

Preprocessing done


In [7]:
# query filtering
def query():
    """
    Prompt the user for a query and return its TF-IDF encoding.

    The typed text is cleaned with ``text_normalize`` and then encoded
    with the already fitted module-level ``tfidf`` vectorizer, so the
    result has one row and the same columns as the corpus matrix.
    """
    raw_query = input("Enter Query:")
    normalized = text_normalize(raw_query)
    return tfidf.transform([normalized]).toarray()

# Ask for one query up front; its vector is kept in `query_ask` and is
# inspected and answered in the following cells.
print("Test Query")
query_ask = query()

Test Query
Enter Query:Hello


In [8]:
# Display the dense TF-IDF vector produced for the test query.
query_ask

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 

In [9]:
# similarity lookup and reply
def validation(x_tfid, query_ask, min_similarity=0.2, fallback_label=51):
    """
    Print the stored response whose context is most similar to the query.

    Cosine similarity is computed between every row of ``x_tfid`` and the
    query vector. If the best similarity is above ``min_similarity`` the
    'Text Response' of that row is printed; otherwise the response stored
    under index label ``fallback_label`` is printed.

    Parameters
    ----------
    x_tfid : array-like
        TF-IDF matrix of the corpus, one row per row of ``df``.
    query_ask : array-like
        TF-IDF vector of the query as returned by ``query()``. Only its
        first row is scored.
    min_similarity : float, default 0.2
        Similarity the best match has to exceed to be used as the reply.
    fallback_label : default 51
        Index label in ``df`` of the reply used when nothing matches
        well enough.

    Returns
    -------
    None
        The reply is printed, not returned.
    """
    similarity = 1 - pairwise_distances(x_tfid, query_ask, metric='cosine')
    # Reduce to a 1-D array of scalar scores so the threshold test
    # compares a number, not a one-element array.
    scores = similarity[:, 0]
    ind = int(scores.argmax())
    if scores[ind] > min_similarity:
        # argmax is a row position in x_tfid, so pick the reply by
        # position; after dropna the index labels of df can have gaps
        # and a label lookup could return the wrong row or fail.
        result = df['Text Response'].iloc[ind]
    else:
        result = df['Text Response'].loc[fallback_label]
    print(result)

In [10]:
# Print the reply for the test query vector held in `query_ask`.
validation(x_tfid,query_ask)

hello world


In [11]:
# Define the quit token that ends the chat.
# The vector returned by query() for the typed token is stored in QUITE
# and checked against every query in the conversation loop.
print("Define Quite Letter Like Q in to order to avoid infinite loop")
QUITE = query()

Define Quite Letter Like Q in to order to avoid infinite loop
Enter Query:Q


In [12]:
# conversation
# Keep answering queries until the user enters the quit token stored in QUITE.
while True:
    query_ask = query()
    # Compare the complete vectors. Comparing `.any()` results only checks
    # whether both vectors contain a non-zero entry, so an in-vocabulary
    # quit word would end the chat on every in-vocabulary query.
    # NOTE: a quit token with no in-vocabulary term encodes to an all-zero
    # vector, so any other query without a known term also ends the chat.
    if np.array_equal(query_ask, QUITE):
        print("Thank You Have A Nice Day")
        break
    validation(x_tfid, query_ask)

Enter Query:Hello
hello world
Enter Query:how are you doing?
Well
Enter Query:Where are you from?
I wish I knew where.
Enter Query:I am in love with you
Thanks! The feeling is mutual.
Enter Query:Q
Thank You Have A Nice Day


In [13]:
# import pickle and serialize text_normalize to bytes
# NOTE(review): pickle stores a module-level function as a reference to
# its qualified name, not its code, so these bytes presumably load only
# where text_normalize is already defined - confirm this is the intent.
import pickle
text_norm = pickle.dumps(text_normalize)

In [14]:
# load for test: turn the pickled bytes back into a callable
text = pickle.loads(text_norm)

In [15]:
# Call the restored function on a mixed-case sample sentence.
text("A Whole nEw World")

'a whole new world'