# TRAINING A CHATBOT INTENT CLASSIFIER

# 1. Importing the libraries

In [42]:
import nltk
import numpy as np
import random

from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Dropout
from tensorflow.keras.optimizers import SGD


# 2. Downloading the NLTK packages

In [43]:
# download the wordnet and stopwords corpus
nltk.download('wordnet') # wordnet is a lexical database for the English language
nltk.download('stopwords') # stopwords means words like 'the', 'a', 'an', 'is', 'are', etc.
nltk.download('punkt') # punkt means punctuations

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\dagbo_b40tnyc\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dagbo_b40tnyc\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dagbo_b40tnyc\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# 3. Importing the intents.json file

In [44]:
# import intents file
import json

def load_intents():
    with open('intents.json') as file:
        intents = json.load(file)
    return intents

intents = load_intents()
    

# 4. using nltk techniques to preprocess the data for training

In [45]:
# initialize lemmatizer to get stem of words
lemmatizer = WordNetLemmatizer() # lemma is the root form of the word and it is very accurate than stemming


# loop through each sentence in the intent's patterns
def preprocess_intents(intents):
    # create empty lists for documents, classes and words
    documents = [] # documents means patterns
    classes = [] # classes means tags
    words = [] # words means vocabulary
    ignore_letters = ['!', '?', ',', '.'] # ignore these letters
    stop_words = set(stopwords.words('english')) # stop words are words like 'the', 'a', 'an', 'is', 'are', etc.


    for intent in intents['intents']:
        
        # debug for keyerror @ 'patterns'
        # print(intent['patterns'])
        for pattern in intent['patterns']:
            # tokenize each and every word in the sentence
            word = nltk.word_tokenize(pattern) 
            # lemmatize each word and convert into lowercase
            word = [lemmatizer.lemmatize(w.lower()) for w in word if w not in stop_words and w not in ignore_letters]
            # add word to the word list
            words.extend(word) # extend means add to the list and append means add to the end of the list
            # add word(s) to documents
            documents.append((word, intent['tag'])) #
            # add tags to our classes list
            if intent['tag'] not in classes: # if tag is not in classes list
                classes.append(intent['tag']) # then add it to the classes list
                
    # sort words and remove duplicates
    words = sorted(list(set(words)))
    # sort classes
    classes = sorted(list(set(classes)))
    return documents, classes, words

documents, classes, words = preprocess_intents(intents)


# 5. Creating training and testing data using bag of words technique or term frequency

In [57]:
import numpy
from sklearn.model_selection import train_test_split
# create training and testing data and also convert the words to numbers using bag of words technique
def create_training_testing_data(documents, classes, words):
    training = [] 
    # using one hot encoding for our training data with a bag of words
    output_empty = [0] * len(classes) # output is a '0' for each tag and '1' for the current tag (for each pattern)
    # create a training set, bag of words for each sentence
    for doc in documents:
        # initialize a bag of words
        bag = []
        # list of tokenized words for the pattern
        word_patterns = doc[0]
        # lemmatize each word - create base word, in an attempt to represent related words
        word_patterns = [lemmatizer.lemmatize(word.lower()) for word in word_patterns]
        # create our bag of words arrays with 1 if word matches found in a current pattern
        for word in words:
            bag.append(1) if word in word_patterns else bag.append(0)
            
        # output is a '0' for each tag and '1' for the current tag (for each pattern)
        output_row = list(output_empty)
        output_row[classes.index(doc[1])] = 1
        
        training.append([bag, output_row])
        
    # shuffle the features and make numpy array
    random.shuffle(training)
    training = np.array(training, dtype=object)
    
    
    # Splitting features and target (label) for training and testing using sklearn
    train_x, test_x, train_y, test_y = train_test_split(training[:,0], training[:,1], test_size=0.25)
    return numpy.array(train_x), numpy.array(train_y), numpy.array(test_x), numpy.array(test_y)

# test the function
train_x, train_y, test_x, test_y = create_training_testing_data(documents, classes, words)
print(train_x.shape, train_y.shape, test_x.shape, test_y.shape)


(232,) (232,) (78,) (78,)
