In [6]:
import re
import string

import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

from optht import optht

In [7]:
corpus = pd.read_json('./data/HDFC_faq.txt')
corpus.head(5)

Unnamed: 0,question,answer,found_duplicate
0,How do I change my password?,"After you have logged in, you can change your ...",False
1,When will I receive my changed ATM PIN?,You will receive your new ATM PIN by post with...,False
2,Can I get my newly generated PIN online?,"No, for security reasons we send you your ATM ...",False
3,How can I register for Autopay?,To register for Autopay: Step 1: Click on the ...,False
4,Can Chip Credit cards be used anywhere?,"Yes, your HDFC Bank Chip Credit card can be us...",False


In [8]:
corpus= corpus[['question', 'answer']]
corpus.head(5)

Unnamed: 0,question,answer
0,How do I change my password?,"After you have logged in, you can change your ..."
1,When will I receive my changed ATM PIN?,You will receive your new ATM PIN by post with...
2,Can I get my newly generated PIN online?,"No, for security reasons we send you your ATM ..."
3,How can I register for Autopay?,To register for Autopay: Step 1: Click on the ...
4,Can Chip Credit cards be used anywhere?,"Yes, your HDFC Bank Chip Credit card can be us..."


In [9]:
corpus.shape

(2236, 2)

In [10]:
# Lets check and drop dupicate questions
corpus.drop_duplicates(subset='question', keep='first', inplace=True)
corpus.reset_index(drop=True, inplace=True)
corpus.shape

(2233, 2)

In [11]:
# lets check and drop Nans
corpus[corpus.isna().any(axis=1)]

Unnamed: 0,question,answer


In [12]:
corpus['question']

0                            How do I change my password?
1                 When will I receive my changed ATM PIN?
2                Can I get my newly generated PIN online?
3                         How can I register for Autopay?
4                 Can Chip Credit cards be used anywhere?
                              ...                        
2228    How to make payment for Insta Loan / Insta Jum...
2229    What is the disbursement time for Insta Loan /...
2230             How to check the available credit limit?
2231    What is the promo code to be entered in the lo...
2232    After loan disbursal, How to check the active ...
Name: question, Length: 2233, dtype: object

### we can seee there are some slashes which are mostly representing ***or*** , so I will go ahead and re place with 'or'

In [5]:
def clean_text1(text):
    ''' Make texts lower case, remove text in square bracket, remove punctuation'''
    text = text.lower()
    text = re.sub(r"""[\/]""", ' or ', text)
    
    # Removes quotation marks.
    text = text.replace('"', "")
    
    # Remove numeric characters.
    text = re.sub('\w*\d\w*', ' ', text)
    
    # Remove puncuation.
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
    
    return text
round1 = lambda x: clean_text1(x)

In [14]:
corpus['question'] = corpus['question'].apply(round1)

In [22]:
# Replace "/" with "or"
corpus['question'] = corpus['question'].str.replace(' / ', ' or ')
# Ignore numbers and punctuations
corpus['question'] = corpus['question'].str.replace('[^A-Za-z\s]+', '')
corpus['question']

  corpus['question'] = corpus['question'].str.replace('[^A-Za-z\s]+', '')


0                             How do I change my password
1                  When will I receive my changed ATM PIN
2                 Can I get my newly generated PIN online
3                          How can I register for Autopay
4                  Can Chip Credit cards be used anywhere
                              ...                        
2228    How to make payment for Insta Loan or Insta Ju...
2229    What is the disbursement time for Insta Loan o...
2230              How to check the available credit limit
2231    What is the promo code to be entered in the lo...
2232    After loan disbursal How to check the active l...
Name: question, Length: 2233, dtype: object

179

In [4]:
stemmer = SnowballStemmer('english')
lemmer = WordNetLemmatizer()


<nltk.stem.snowball.SnowballStemmer at 0x13b837730>

In [None]:
def xform_and_train(doc, vectorizer= CountVectorizer(stop_words=None, ngram_range=(1,1))):
    doc_word = vectorizer.fit_transform(document)
    vocab = vectorizer.get_feature_names()
    return vectorizer, doc_word, vocab