In [10]:
import re
import string

import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer


from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

import matplotlib.pyplot as plt
import seaborn as sns 

import pickle as pkl

import warnings
warnings.filterwarnings("ignore")
pd.options.display.max_columns = None

In [2]:
corpus = pd.read_json('./data/HDFC_faq.txt')
corpus.head(5)

Unnamed: 0,question,answer,found_duplicate
0,How do I change my password?,"After you have logged in, you can change your ...",False
1,When will I receive my changed ATM PIN?,You will receive your new ATM PIN by post with...,False
2,Can I get my newly generated PIN online?,"No, for security reasons we send you your ATM ...",False
3,How can I register for Autopay?,To register for Autopay: Step 1: Click on the ...,False
4,Can Chip Credit cards be used anywhere?,"Yes, your HDFC Bank Chip Credit card can be us...",False


In [3]:
corpus.shape

(2236, 3)

In [4]:
corpus= corpus[['question', 'answer']]
corpus.head(5)

Unnamed: 0,question,answer
0,How do I change my password?,"After you have logged in, you can change your ..."
1,When will I receive my changed ATM PIN?,You will receive your new ATM PIN by post with...
2,Can I get my newly generated PIN online?,"No, for security reasons we send you your ATM ..."
3,How can I register for Autopay?,To register for Autopay: Step 1: Click on the ...
4,Can Chip Credit cards be used anywhere?,"Yes, your HDFC Bank Chip Credit card can be us..."


In [5]:
corpus.shape

(2236, 2)

In [6]:
# Lets check and drop dupicate questions
corpus.drop_duplicates(subset='question', keep='first', inplace=True)
corpus.reset_index(drop=True, inplace=True)
corpus.shape

(2233, 2)

In [7]:
# lets check and drop Nans
corpus[corpus.isna().any(axis=1)]

Unnamed: 0,question,answer


In [8]:
corpus['question']

0                            How do I change my password?
1                 When will I receive my changed ATM PIN?
2                Can I get my newly generated PIN online?
3                         How can I register for Autopay?
4                 Can Chip Credit cards be used anywhere?
                              ...                        
2228    How to make payment for Insta Loan / Insta Jum...
2229    What is the disbursement time for Insta Loan /...
2230             How to check the available credit limit?
2231    What is the promo code to be entered in the lo...
2232    After loan disbursal, How to check the active ...
Name: question, Length: 2233, dtype: object

### we can seee there are some slashes which are mostly representing ***or*** , so I will go ahead and re place with 'or'

In [11]:
# Pickle corpus to models
with open("models/corpus.df", "wb") as f:
    pkl.dump(corpus, f)

In [16]:
def clean_text1(text):
    ''' Make texts lower case, remove text in square bracket, remove punctuation'''
    text = text.lower()
    text = re.sub(r"""[\/]""", ' or ', text)
    
    # Removes quotation marks.
    text = text.replace('"', "")
    
    # Remove numeric characters.
    text = re.sub('\w*\d\w*', ' ', text)
    
    # Remove puncuation.
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
    
    return text
round1 = lambda x: clean_text1(x)

In [None]:
corpus['question'].apply(round1)

In [17]:
# Load in my data

stemmer = SnowballStemmer("english")
lemmer = WordNetLemmatizer()

STOP_WORDS = [stemmer.stem(stopword) for stopword in stopwords.words("english")]
L_STOP_WORDS = [lemmer.lemmatize(stopword) for stopword in stopwords.words("english")]

def clean_text1(text):
    ''' Make texts lower case, remove text in square bracket, remove punctuation'''
    text = text.lower()
    text = re.sub(r"""[\/]""", ' or ', text)
    
    # Removes quotation marks.
    text = text.replace('"', "")
    
    # Remove numeric characters.
    text = re.sub('\w*\d\w*', ' ', text)
    
    # Remove punctuation.
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
    
    return text

def tokenizer(text):
    
    tokens = word_tokenize(text)
    tokens = [stemmer.stem(token) for token in tokens]
    return tokens


In [None]:
cv1 = CountVectorizer(
    stop_words=STOP_WORDS,
    preprocessor=clean_text1,
    tokenizer=tokenizer,
    min_df=2,
#     max_df=.80
)

document = corpus['question']
doc_term_mtx = cv1.fit_transform(document)
vocab = cv1.get_feature_names()
doc_term_df = pd.DataFrame(doc_term_mtx.toarray(), columns=vocab)
print(doc_term_df.shape)

doc_term_df

# Do modeling here with the count vectors!

In [None]:
# TFIDF
tfidf1 = TfidfVectorizer(
    stop_words=STOP_WORDS,
    preprocessor=clean_text1,
    tokenizer=tokenizer,
    min_df=2,
#     max_df=.80
)

document = corpus['question']
doc_term_mtx1 = tfidf1.fit_transform(document)
vocab = tfidf1.get_feature_names()
doc_term_df1 = pd.DataFrame(doc_term_mtx1.toarray(), columns=vocab)
print(doc_term_df1.shape)

doc_term_df1


## Dimensionality reduction and Topic modeling
1. #### CountVectorizer()

In [None]:
from sklearn.decomposition import TruncatedSVD, NMF, PCA
from numpy.linalg import svd
from optht import optht

* ##### Truncated SVD

In [None]:
tsvd = TruncatedSVD(879)
X_svd= tsvd.fit_transform(doc_term_mtx)

In [None]:
tsvd.explained_variance_ratio_.sum()

In [None]:
def show_variance_explained_plots(algo):
    
    var_exp_array = algo.explained_variance_ratio_
    n_comps = var_exp_array.shape[0] 
    
    fig, ax = plt.subplots(1,2,figsize=(10,6))
    
    ax[0].fill_between(range(n_comps), var_exp_array)
    ax[0].set_title('Variance Explained by Nth Component')
    
    ax[1].fill_between(range(n_comps), np.cumsum(var_exp_array))
    ax[1].set_title('Cumulative Variance Explained by N Components')
    
#     plt.savefig('./pca.jpg', dpi=100)
    
    plt.show()


In [None]:
show_variance_explained_plots(tsvd)

* #### Above will give us an idea on how many components/ topics we can reduce our dimensions to 

### PCA

In [None]:
pca = PCA(879)
X_pca= pca.fit_transform(doc_term_mtx.toarray())

In [None]:
pca.explained_variance_ratio_.sum()

In [None]:
show_variance_explained_plots(pca)

### Optimal Thresholding

In [None]:
from numpy.linalg import svd
from optht import optht

In [None]:
u,s,vt = svd(doc_term_mtx.toarray(), full_matrices=False)
u.shape, s.shape, vt.shape

In [None]:
k = optht(doc_term_mtx1.toarray(), sv=s, sigma=None)
k

This tells that 169 components will be the optimal number of components that will grab the needend infomation without capturiing the noise associated with the data.

This is proven by a published Research paper on $ IEEE $ by Gavin and Donoho, 2014. https://ieeexplore.ieee.org/document/6846297 


In [None]:
def svd_n(doc_t_mtx,n):
    tsvd = TruncatedSVD(n)
    question_topic= tsvd.fit_transform(doc_t_mtx)
    print(f'exp variance sum: {tsvd.explained_variance_ratio_.sum()}')
    return tsvd, question_topic
def nmf_n(doc_t_mtx,n):
    nmf = NMF(n)
    question_topic= nmf.fit_transform(doc_t_mtx)
    return nmf, question_topic

In [None]:
type(svd_n(doc_term_mtx,169))
nmf_n(doc_term_mtx,169)

* about 76% of our docs explained with the 169 components while truncating the noise in our data

In [None]:
question_topic_df = pd.DataFrame(question_topic).add_prefix('topic_')
question_topic_df

question_topic_df[['question', 'answer']] = corpus[['question', 'answer']]
question_topic_df

In [None]:
vocab = cv1.get_feature_names()
word_topic_df = pd.DataFrame(tsvd.components_, columns=vocab).T.add_prefix('topic_')
word_topic_df

In [None]:
for i, question in enumerate(question_topic_df.sort_values(by='topic_16', ascending=False).head(10)['question'].values):
    print(question)
print(i)



In [None]:
word_topic_df.reset_index().sort_values(by='topic_16', ascending=False)

In [None]:
def top_questions(question_topic_df, topic, n_questions):
    return (question_topic_df
            .sort_values(by=topic, ascending=False)
            .head(n_questions)['question']
            .values)

def top_words(word_topic_df, topic, n_words):
    return (word_topic_df
            .reset_index()
            .sort_values(by=topic, ascending=False)
            .head(n_words))['index']

In [None]:
def show_n_questions_and_words(q_topic_df, w_topic_df, n):
    for topic in q_topic_df.columns[:-2]:
        print(f'\n{topic}')
        print(f'Top {n} questions:')
        for q in top_questions(q_topic_df, topic, n):
            print(q)
        print()
        print(f'Top {n} words:')
        for word in top_words(w_topic_df, topic, n):
            print(word)
# show_n_questions_and_words(2)   

In [None]:
print(sorted(STOP_WORDS))

In [None]:
mask = document.str.lower().str.contains('cancel')
document[mask].sample(10).tolist()

In [None]:
for word in top_words(word_topic_df, 'topic_16', 15):
    print(word)

In [None]:
new_q = 'How do I cancel my account'
embeded_querry = cv1.transform([new_q])
embeded_q_topic = tsvd.transform(embeded_querry)

res = []
for index in range(corpus.shape[0]):
    question, embedding = corpus['answer'][index], question_topic[index]
    cos_sim = round(cosine_similarity([embedding], embeded_q_topic)[0][0],3)
    res.append(cos_sim)
#     print(idx)
#     print(sim, sent)

n = 5
idx_array = np.array(res).argsort()[-n:][::-1]
answer_idx = idx_array[0]
print(f'Top {n} clossest questions:\n')
for i in idx_array:
    print(res[i],'---', corpus['question'][i])

print(f'\nQ: {new_q}?')
print(f"Matched: {corpus['question'][answer_idx]}?\n")
print(f"Ans: {corpus['answer'][answer_idx]}?")
embeded_querry

* I will limit to 10 topics now!

In [None]:
tsvd_10 = TruncatedSVD(10)
question_topic_10= tsvd_10.fit_transform(doc_term_mtx)
tsvd_10.explained_variance_ratio_.sum()
question_topic_df_10 = pd.DataFrame(question_topic_10).add_prefix('topic_')

question_topic_df_10[['question', 'answer']] = corpus[['question', 'answer']]
question_topic_df_10


In [None]:
vocab = cv1.get_feature_names()
word_topic_df_10 = pd.DataFrame(tsvd_10.components_ , columns=vocab).T.add_prefix('topic_')
word_topic_df_10

In [None]:
show_n_questions_and_words(question_topic_df_10, word_topic_df_10, 5)

## NMF

### -CountVectorizer

In [None]:
nmf = NMF(10)
nmf_doc_topic = nmf.fit_transform(doc_term_mtx)

nmf_question_topic_df = pd.DataFrame(nmf_doc_topic).add_prefix('topic_')

nmf_question_topic_df[['question', 'answer']] = corpus[['question', 'answer']]
nmf_question_topic_df

In [None]:
vocab = cv1.get_feature_names()
nmf_word_topic_df = pd.DataFrame(nmf.components_ , columns=vocab).T.add_prefix('topic_')
nmf_word_topic_df

In [None]:
show_n_questions_and_words(nmf_question_topic_df, nmf_word_topic_df, 5)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# def clean_vectorize_answer(new_querry, vectorizer, reduction_func):
# clean_sent = re.sub(r"""[\/]""", ' or ', new_querry)
# clean_sent =  re.sub(r'''[^A-Za-z]+''', ' ', clean_sent)
# print(clean_sent)
# embeded_querry = vectorizer.transform([clean_sent])
# embeded_q_topic = reduction_func.transform(embeded_querry)

new_q = 'How do I cancel my account'
embeded_querry = cv1.transform([new_q])
embeded_q_topic = nmf.transform(embeded_querry)

res = []
for index in range(corpus.shape[0]):
    question, embedding = corpus['answer'][index], nmf_doc_topic[index]
    cos_sim = round(cosine_similarity([embedding], embeded_q_topic)[0][0],3)
    res.append(cos_sim)
#     print(idx)
#     print(sim, sent)

n = 5
idx_array = np.array(res).argsort()[-n:][::-1]
answer_idx = idx_array[0]
print(f'Top {n} clossest questions:\n')
for i in idx_array:
    print(res[i],'---', corpus['question'][i])

print(f'\nQ: {new_q}?')
print(f"Matched: {corpus['question'][answer_idx]}?\n")
print(f"Ans: {corpus['answer'][answer_idx]}?")
embeded_querry


## Not satisfied with answer

#### Hence I am goig to try TFI-DF


In [None]:
# TFIDF
tfidf1 = TfidfVectorizer(
    stop_words=STOP_WORDS,
    preprocessor=clean_text1,
    tokenizer=tokenizer,
    min_df=2,
#     max_df=.80
)

document = corpus['question']
doc_term_mtx_tfidf = tfidf1.fit_transform(document)
vocab_tfidf = tfidf1.get_feature_names()
doc_term_df1 = pd.DataFrame(doc_term_mtx_tfidf.toarray(), columns=vocab)
doc_term_df1

In [None]:
## finding optimal thresh
u,s,vt = svd(doc_term_mtx.toarray(), full_matrices=False)
u.shape, s.shape, vt.shape
k = optht(doc_term_mtx_tfidf.toarray(), sv=s, sigma=None)
k

### $LSA$($TrucatedSVD)-169$ 

In [None]:
svd169, question_topic = svd_n(doc_term_mtx_tfidf,169)

In [None]:
q = 'how can i cancel my account'
def predict(model, ques_topic,new_q): 
#     model, ques_topic = func
    embeded_querry = tfidf1.transform([new_q])
    embeded_q_topic = model.transform(embeded_querry)

    res = []
    for index in range(corpus.shape[0]):
        question, embedding = corpus['answer'][index], ques_topic[index]
        cos_sim = round(cosine_similarity([embedding], embeded_q_topic)[0][0],3)
        res.append(cos_sim)
    #     print(idx)
    #     print(sim, sent)

    n = 5
    idx_array = np.array(res).argsort()[-n:][::-1]
    answer_idx = idx_array[0]
    print(f'Top {n} clossest questions:\n')
    for i in idx_array:
        print(res[i],'---', corpus['question'][i])

    print(f'\nQ: {new_q}?')
    print(f"Matched: {corpus['question'][answer_idx]}?\n")
    print(f"Ans: {corpus['answer'][answer_idx]}?")
    return embeded_querry

predict(svd169, question_topic, q)

### $NMF-169$ 

In [None]:
nmf169, question_topic_nmf169 = nmf_n(doc_term_mtx,169)

In [None]:
predict(nmf169, question_topic_nmf169, q)

## I am pretty happy with both results and can be seen the prediction underfits and thinks almost evrything is similar to the question when when we go lower than 169 components and overfits when we go higher because we are adding more noise to the data. So this proves the Gavin and Donoho paper on $IEEE$ 




# * Bonus
## Standard scaling my data to see before dimensionatilty reduction

In [None]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
scaled_doc_term_mtx_tfidf = sc.fit_transform(doc_term_mtx_tfidf.toarray())

* ### finding optimal n components

In [None]:
u,s,vt = svd(scaled_doc_term_mtx_tfidf, full_matrices=False)
u.shape, s.shape, vt.shape
k = optht(scaled_doc_term_mtx_tfidf, sv=s, sigma=None)
k

In [None]:
q1 = 'how do i cancel my insurance'

In [None]:
svd8, question_topic8 = svd_n(scaled_doc_term_mtx_tfidf,169)
predict(svd8, question_topic8, q1)

# Let's put everything together - using functions and finaly a Chatbotclass

In [55]:
from sklearn.decomposition import TruncatedSVD, NMF, PCA
from sklearn.metrics.pairwise import cosine_similarity

stemmer = SnowballStemmer("english")
lemmer = WordNetLemmatizer()

STOP_WORDS = [stemmer.stem(stopword) for stopword in stopwords.words("english")]
L_STOP_WORDS = [lemmer.lemmatize(stopword) for stopword in stopwords.words("english")]

def clean_text1(text):
    ''' Make texts lower case, remove text in square bracket, remove punctuation'''
    text = text.lower()
    text = re.sub(r"""[\/]""", ' or ', text)
    # Removes quotation marks.
    text = text.replace('"', "")

    # Remove numeric characters.
    text = re.sub('\w*\d\w*', ' ', text)

    # Remove punctuation.
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)

    return text

def tokenizer(text):

    tokens = word_tokenize(text)
    tokens = [stemmer.stem(token) for token in tokens]
    return tokens

# TFIDF
def mods(self):
    tfidf = TfidfVectorizer(
        stop_words=STOP_WORDS,
        preprocessor=clean_text1,
        tokenizer=tokenizer,
        min_df=2,
    #     max_df=.80
    )

    model = TruncatedSVD(169)
    return tfidf, model

document = corpus['question']

def train():
    tfidf, model  = mods()
    doc_term_mtx = tfidf.fit_transform(document)
    vocab = tfidf.get_feature_names()
    model.fit(doc_term_mtx)
    question_topic = model.transform( doc_term_mtx)
    
    return model, question_topic

    
def predict(new_q):
    model, ques_topic = train()
    embeded_querry = tfidf1.transform([new_q])
    embeded_q_topic = model.transform(embeded_querry)

    res = []
    for index in range(corpus.shape[0]):
        question, embedding = corpus['answer'][index], ques_topic[index]
        cos_sim = round(cosine_similarity([embedding], embeded_q_topic)[0][0],3)
        res.append(cos_sim)
    #     print(idx)
    #     print(sim, sent)

    n = 5
    idx_array = np.array(res).argsort()[-n:][::-1]
    answer_idx = idx_array[0]
    print(f'Top {n} clossest questions:\n')
    for i in idx_array:
        print(res[i],'---', corpus['question'][i])

    print(f'\nQ: {new_q}?')
    print(f"Matched: {corpus['question'][answer_idx]}?\n")
    print(f"Ans: {corpus['answer'][answer_idx]}?")
    ans = f"Ans: {corpus['answer'][answer_idx]}?"
    return ans
    
    
train()
predict('how do i cancel my account?')

In [56]:
train()
predict('how do i cancel my account?')

Top 5 clossest questions:

0.77 --- How do I cancel my registration?
0.751 --- Can the policy be cancelled?
0.75 --- Can I cancel a transaction?
0.709 --- Can DRFs be rejected? What are the reasons for rejection?
0.704 --- What is the cancellation procedure?

Q: how do i cancel my account??
Matched: How do I cancel my registration??

Ans: There are 2 ways to de-register 1) You can request a de-registration of your biller online by logging into NetBanking ---> Bill Payment Tab ---> View/Delete Billers OR 2) You can visit your HDFC Bank branch and submit an application to de-register the selected biller from the ATM-BillPay service.?


'Ans: There are 2 ways to de-register 1) You can request a de-registration of your biller online by logging into NetBanking ---> Bill Payment Tab ---> View/Delete Billers OR 2) You can visit your HDFC Bank branch and submit an application to de-register the selected biller from the ATM-BillPay service.?'

In [28]:
def mods():
    tfidf = TfidfVectorizer(
        stop_words=STOP_WORDS,
        preprocessor=clean_text1,
        tokenizer=tokenizer,
        min_df=2,
    #     max_df=.80
    )

    model = TruncatedSVD(169)
    return tfidf, model

In [33]:
from sklearn.decomposition import TruncatedSVD, NMF, PCA
from sklearn.metrics.pairwise import cosine_similarity
import pickle as pkl

class Chatbot():
    
    def __init__(self, corpus):
        self.corpus = corpus
        self.stemmer = stemmer
        self.document = document
        
    stemmer = SnowballStemmer("english")
    STOP_WORDS = [stemmer.stem(stopword) for stopword in stopwords.words("english")]

    def clean_text1(self, text):
        ''' Make texts lower case, remove text in square bracket, remove punctuation'''
        text = text.lower()
        text = re.sub(r"""[\/]""", ' or ', text)
        # Removes quotation marks.
        text = text.replace('"', "")

        # Remove numeric characters.
        text = re.sub('\w*\d\w*', ' ', text)

        # Remove punctuation.
        text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)

        return text

    def tokenizer(text):

        tokens = word_tokenize(text)
        tokens = [stemmer.stem(token) for token in tokens]
        return tokens

    # TFIDF and svd
    def mods(self):
        tfidf = TfidfVectorizer(
            stop_words=STOP_WORDS,
            preprocessor=clean_text1,
            tokenizer=tokenizer,
            min_df=2,
        #     max_df=.80
        )

        model = TruncatedSVD(169)
        return tfidf, model

    document = corpus['question']

    def train(self):
        tfidf, model = mods()
        doc_term_mtx = tfidf.fit_transform(document)
        vocab = tfidf.get_feature_names()
#         model = TruncatedSVD(169)
        model.fit(doc_term_mtx)
        question_topic = model.transform( doc_term_mtx)

        return model, question_topic

    def predict(self,new_q):
        model, ques_topic = train(self)
        embeded_querry = tfidf1.transform([new_q])
        embeded_q_topic = model.transform(embeded_querry)

        res = []
        for index in range(corpus.shape[0]):
            question, embedding = corpus['answer'][index], ques_topic[index]
            cos_sim = round(cosine_similarity([embedding], embeded_q_topic)[0][0],3)
            res.append(cos_sim)
        #     print(idx)
        #     print(sim, sent)

        n = 5
        idx_array = np.array(res).argsort()[-n:][::-1]
        answer_idx = idx_array[0]
        print(f'Top {n} clossest questions:\n')
        for i in idx_array:
            print(res[i],'---', corpus['question'][i])

        print(f'\nQ: {new_q}?')
        print(f"Matched: {corpus['question'][answer_idx]}?\n")
        print(f"Ans: {corpus['answer'][answer_idx]}?")
        ans = f"Ans: {corpus['answer'][answer_idx]}?"
        return ans

    


In [34]:
chatbot_model = Chatbot(corpus)
chatbot_model.train()

NameError: name 'document' is not defined

In [67]:
chatbot.predict('how do i create a new account?')

TypeError: train() takes 0 positional arguments but 1 was given

In [32]:
mask =document.str.lower().str.contains('creat')
document[mask]

621     When does the source account get debited - at ...
1878    What all information is required for User id &...
Name: question, dtype: object

In [34]:
document.iloc[1878]

'What all information is required for User id & password creation?'

In [35]:
chatbot.predict('what do i need to create a new user id and password')

Top 5 clossest questions:

0.638 --- What all information is required for User id & password creation?
0.609 --- If there is no Login Id/Password, is it secure?
0.58 --- Do I need to keep a separate User ID and Password for payment of GVAT and Commercial Tax online?
0.558 --- If I have registered two email IDs, on which email ID will I receive the IVR Password ?
0.553 --- How to set Prepaid NetBanking Login User ID / Password for the first time?

Q: what do i need to create a new user id and password?
Matched: What all information is required for User id & password creation??

Ans: Following information is required to Create User id and Password: Loan Account NumberLast EMI PaidDate of Birth?


<1x880 sparse matrix of type '<class 'numpy.float64'>'
	with 5 stored elements in Compressed Sparse Row format>

In [38]:

with open("models/chatbot.mdl", "wb") as f:
    pkl.dump(chatbot_model, f)

In [36]:
with open("models/chatbot.mdl", "rb") as f:
    chatbot_model2 = pkl.load(f)

In [37]:
 chatbot_model2.predict('how can I cancel my account')

NameError: name 'train' is not defined

In [41]:
nlp.pipe()

NameError: name 'nlp' is not defined