# Chatbot - example

Example adapted from Chatbots Magazine: https://chatbotsmagazine.com/contextual-chat-bots-with-tensorflow-4391749d0077

In [1]:
import nltk
from nltk.stem.lancaster import LancasterStemmer
stemmer = LancasterStemmer()

In [2]:
import pandas as pd
import numpy as np
import tflearn
import tensorflow as tf
import random

  from ._conv import register_converters as _register_converters


## Step 1: Create a json file for intents (intents.json)
- tags are the names of the intents  
- patterns are example questions for each intent; the NN classifier learns from these sentence patterns which intent a query belongs to  
- responses are the answers returned for a question in that intent

In [4]:
# Import intents file
import json

# intents.json holds a top-level "intents" list; the cells below read each
# entry's 'tag', 'patterns' and 'responses' keys.
with open('intents.json') as intents_file:
    intents = json.load(intents_file)

## Step 2: Preprocess data - words, intents classes and documents

In [8]:
# to use nltk word_tokenize, download the 'punkt' tokenizer once:
# nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/hmoon/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [5]:
# Corpus containers; re-created on every run so the cell is safe to re-execute.
words = []        # every token seen in any pattern (stemmed and de-duplicated below)
classes = []      # intent tags
documents = []    # (token list, intent tag) pairs, one per pattern
ignore_words = ['?']

# Walk every example sentence of every intent.
for intent in intents['intents']:
    for pattern in intent['patterns']:
        tokens = nltk.word_tokenize(pattern)
        # raw tokens feed the vocabulary ...
        words.extend(tokens)
        # ... and the (tokens, tag) pair becomes one training document
        tag = intent['tag']
        documents.append((tokens, tag))
        if tag not in classes:
            classes.append(tag)

# Vocabulary: lower-case + stem every token that is not ignored, then
# de-duplicate and sort so that word indices are stable between runs.
words = sorted({stemmer.stem(token.lower()) for token in words if token not in ignore_words})

# Intent tags, de-duplicated and sorted for the same reason.
classes = sorted(set(classes))

print(len(documents), "documents")
print(len(classes), "classes", classes)
print(len(words), "unique stemmed words", words)

40 documents
6 classes ['affirm', 'goodbye', 'greeting', 'taxcode', 'taxdescription', 'thanks']
48 unique stemmed words ["'s", 'amount', 'anyon', 'ar', 'bye', 'cas', 'circumst', 'cod', 'correct', 'day', 'deduc', 'exceiv', 'good', 'goodby', 'hello', 'help', 'hi', 'how', 'in', 'indee', 'is', 'lat', 'law', 'liabl', 'limit', 'maxim', 'minim', 'of', 'ok', 'part', 'right', 'rul', 'sect', 'see', 'tax', 'thank', 'that', 'the', 'ther', 'und', 'what', 'wher', 'which', 'withdraw', 'ye', 'yeah', 'you', 'yup']


## Step 3: Train with BOW

### 3a) Transformation for TensorFlow: from documents of words into tensors of numbers

In [6]:
# create our training data
# Build the training set: one [bag-of-words, one-hot intent] pair per document.
training = []
output = []
# template label row: one zero per intent class
output_empty = [0] * len(classes)

for doc in documents:
    # doc is a (token list, intent tag) pair; stem the tokens the same way the
    # vocabulary in `words` was stemmed so that they can be compared
    pattern_words = [stemmer.stem(word.lower()) for word in doc[0]]
    # bag of words: 1 where the vocabulary word occurs in this pattern, else 0
    bag = [1 if w in pattern_words else 0 for w in words]

    # output is a '0' for each tag and '1' for current tag
    output_row = list(output_empty)
    output_row[classes.index(doc[1])] = 1

    training.append([bag, output_row])

# shuffle the samples (unseeded, so the order differs from run to run)
random.shuffle(training)

# Split into features and labels straight from the Python list.  Each row is
# ragged (len(words) features next to len(classes) labels), so slicing a
# plain np.array(training) is not reliable: NumPy >= 1.24 refuses to build
# such an array without dtype=object.
train_x = [features for features, _ in training]   # first column = BOW rep
train_y = [labels for _, labels in training]       # second column = intent class rep

# keep `training` available as an array, as before; dtype=object is required
# because of the ragged rows
training = np.array(training, dtype=object)

### 3b) Build model

In [7]:
# reset underlying graph data
# reset underlying graph data
# NOTE(review): tf.reset_default_graph() is a TensorFlow 1.x call — confirm the
# pinned TF version before running this on a newer install.
tf.reset_default_graph()
# Build neural network:
#   input  - one feature per vocabulary word (width of a bag-of-words row)
#   hidden - two fully connected layers with 8 units each
#   output - one unit per intent class, softmax-activated
net = tflearn.input_data(shape=[None, len(train_x[0])])
net = tflearn.fully_connected(net, 8)
net = tflearn.fully_connected(net, 8)
net = tflearn.fully_connected(net, len(train_y[0]), activation='softmax')
# regression layer with tflearn's default settings (the training log below
# reports the Adam optimizer)
net = tflearn.regression(net)

# Define model and setup tensorboard (logs are written to ./tflearn_logs)
model = tflearn.DNN(net, tensorboard_dir='tflearn_logs')
# Start training (apply gradient descent algorithm): 1000 epochs in
# mini-batches of 8 samples, printing the accuracy metric as it goes
model.fit(train_x, train_y, n_epoch=1000, batch_size=8, show_metric=True)
# persist the trained weights; the path is relative to the working directory
model.save('model.tflearn')

Training Step: 4999  | total loss: [1m[32m0.02297[0m[0m | time: 0.010s
| Adam | epoch: 1000 | loss: 0.02297 - acc: 1.0000 -- iter: 32/40
Training Step: 5000  | total loss: [1m[32m0.02234[0m[0m | time: 0.013s
| Adam | epoch: 1000 | loss: 0.02234 - acc: 1.0000 -- iter: 40/40
--
INFO:tensorflow:/Users/yuhzhao/Desktop/hackathon/project-python-flask-webapp/Bin/model.tflearn is not in all_model_checkpoint_paths. Manually adding it.


In [10]:
# save all of our data structures
import pickle

# Persist the vocabulary, the intent classes and the training matrices so the
# chatbot cells can restore them later.  The context manager guarantees the
# file is flushed and closed even if pickling fails part-way.
with open("training_data", "wb") as training_file:
    pickle.dump(
        {'words': words, 'classes': classes, 'train_x': train_x, 'train_y': train_y},
        training_file,
    )

## Step 4: Build Chatbot Framework

In [8]:
# restore all of our data structures
import pickle

# NOTE: pickle.load can execute arbitrary code from the file it reads; only
# load a "training_data" file that this notebook wrote itself.
# The context manager closes the file handle as soon as loading is done.
with open("training_data", "rb") as training_file:
    data = pickle.load(training_file)
words = data['words']
classes = data['classes']
train_x = data['train_x']
train_y = data['train_y']

# import our chat-bot intents file
import json
with open('intents.json') as json_data:
    intents = json.load(json_data)

In [9]:
# load our saved model
# NOTE(review): `model` is only created in the "Build model" cell above, so on
# a fresh kernel that cell has to be run before this one.
model.load('./model.tflearn')

INFO:tensorflow:Restoring parameters from /Users/yuhzhao/Desktop/hackathon/project-python-flask-webapp/Bin/model.tflearn


In [10]:
# Functions for tokenizing and BOW vector rep of the query

def clean_up_sentence(sentence):
    """Tokenize ``sentence`` and return the list of its lower-cased, stemmed tokens."""
    return [stemmer.stem(token.lower()) for token in nltk.word_tokenize(sentence)]

# bag-of-words encoding of a sentence against a given vocabulary
def bow(sentence, words, show_details=False):
    """Return a 0/1 NumPy vector with one entry per item of ``words``.

    Entry ``i`` is 1 when ``words[i]`` equals one of the stemmed tokens of
    ``sentence`` and 0 otherwise.  With ``show_details=True`` every match is
    also printed as it is found.
    """
    tokens = clean_up_sentence(sentence)
    hits = [0 for _ in words]
    for token in tokens:
        for position, vocab_word in enumerate(words):
            if vocab_word != token:
                continue
            hits[position] = 1
            if show_details:
                print("found in bag: %s" % vocab_word)

    return np.array(hits)

In [11]:
# Encode a sample query. Query words that are not in the vocabulary are simply
# dropped, which is why the vector printed below has only six 1s.
p = bow("what is the tax code for qualified contribution limits?", words)
print (p)

[0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0
 1 0 0 1 0 0 0 0 0 0 0]


In [100]:
# Functions to classify intent and output responses based on the intent

# predictions whose probability is not above this value are discarded
ERROR_THRESHOLD = 0.25

def classify(sentence):
    """Return the intents predicted for ``sentence``, most probable first.

    The result is a list of ``(intent_tag, probability)`` tuples containing
    only the classes whose probability exceeds ``ERROR_THRESHOLD``; it is
    empty when no class clears the threshold.
    """
    # the model takes a batch, so wrap the single BOW vector and unwrap the result
    probabilities = model.predict([bow(sentence, words)])[0]
    ranked = sorted(
        ([index, prob] for index, prob in enumerate(probabilities) if prob > ERROR_THRESHOLD),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return [(classes[index], prob) for index, prob in ranked]

def response(sentence, userID='123', show_details=False):
    """Return a random canned response for the best-matching intent.

    The classified intents are tried in order of decreasing probability; the
    first one whose tag appears in ``intents['intents']`` supplies the
    response.  Returns ``None`` when nothing was classified or no tag matches.
    ``userID`` and ``show_details`` are accepted but not used by this function.
    """
    for tag, _probability in classify(sentence):
        for intent in intents['intents']:
            if intent['tag'] == tag:
                # a random response from the intent
                return random.choice(intent['responses'])
    return None

In [101]:
# Example 1: classify a tax-code question; the result is a list of
# (intent, probability) tuples
classify('what is the tax code for qualified contribution limits?')

[('taxcode', 0.90338826)]

In [102]:
# Example 1 (continued): the canned response chosen for the predicted intent
response('what is the tax code for qualified contribution limits?')

'Tax code is'

In [98]:
# NOTE(review): the text recorded below this cell appears to come from an
# earlier version of response() that printed its answer; the current version
# only returns it, so this assignment produces no output on a fresh run.
r = response('what is the tax code for qualified contribution limits?')

Tax code is


In [103]:
# Example 2: a descriptive tax question, classified as 'taxdescription'
classify('How does increased compensation from previous year taxed?')

[('taxdescription', 0.79001725)]

In [104]:
# Example 3: a greeting is answered with one of the intent's canned responses
response('hello')

'Hello, thanks for visiting'

In [126]:
# Import script to search the right tax code and description from tax corpus
from taxcode_tfidf_search_script import *

def full_response(sentence, top_n, cosine_sim_threshold=0.2):
    """Answer ``sentence``, enriching tax intents with tax-corpus search results.

    The canned answer from ``response(sentence)`` decides what happens:

    * ``"Tax code is"`` - the corpus is searched and the answer is prefixed to
      the title of every match.
    * ``"Here is what we found in the tax code:"`` - as above, but each entry
      carries the match's title followed by its text.
    * anything else (including ``None``) - the answer is returned unchanged.

    Parameters
    ----------
    sentence : str
        The user's question.
    top_n : int
        Passed to ``query_wrapper`` as ``top``.
    cosine_sim_threshold : float, optional
        Passed to ``query_wrapper`` as ``cosine_sim_threshold``.  Defaults to
        0.2, the value that used to be hard-coded in both branches.

    Returns
    -------
    list of str, str or None
        A list with one string per match for the two tax intents, a fixed
        "not found" message when the search returns nothing, otherwise the
        plain answer.
    """
    answer = response(sentence)

    if answer == "Tax code is":
        not_found = "No tax code found"
        include_text = False
    elif answer == "Here is what we found in the tax code:":
        not_found = "We have not found any section in tax code related to your question"
        include_text = True
    else:
        # not a tax intent (or no intent at all): nothing to look up
        return answer

    # presumably a pandas DataFrame with 'title' and 'text' columns - verify
    # against taxcode_tfidf_search_script.query_wrapper
    matches = query_wrapper(sentence, cosine_sim_threshold=cosine_sim_threshold, top=top_n)
    if len(matches) == 0:
        return not_found

    details = matches['title'].values
    if include_text:
        details = details + " " + matches['text'].values
    # element-wise string concatenation over the matches, then a plain list
    return (answer + " " + details).tolist()

In [119]:
# Example: tax codes top 5
# NOTE(review): the recorded output lists only four entries and shows § 414
# twice — check whether query_wrapper should de-duplicate its matches.
full_response("what is the tax code for qualified contribution limits?", 5)

['Tax code is 26 U.S. Code § 414 - Definitions and special rules',
 'Tax code is 26 U.S. Code § 408A - Roth IRAs',
 'Tax code is 26 U.S. Code § 402A - Optional treatment of elective deferrals as Roth contributions',
 'Tax code is 26 U.S. Code § 414 - Definitions and special rules']

In [127]:
# Example: tax descriptions top 3 (each entry is the section title followed by
# the section text)
full_response("For a benefit plan to be considered as a qualified plan, what are the minimum plan participation criteria?",3)

['Here is what we found in the tax code: 26 U.S. Code § 411 - Minimum vesting standards (D) Accrual for service before effective dateSubparagraphs (A)  (B)  and (C) shall not apply with respect to years of participation before the first plan year to which this section applies  but a defined benefit plan satisfies the requirements of this subparagraph with respect to such years of participation only if the accrued benefit of any participant with respect to such years of participation is not less than the greater of—(i)his accrued benefit determined under the plan  as in effect from time to time prior to September 2  1974  or(ii)an accrued benefit which is not less than one-half of the accrued benefit to which such participant would have been entitled if subparagraph (A)  (B)  or (C) applied with respect to such years of participation.',
 'Here is what we found in the tax code: 26 U.S. Code § 410 - Minimum participation standards §\u202f410.Minimum participation standards(a) Participatio