# TRAINING A CHATBOT INTENT CLASSIFIER

# 1. Importing the libraries

In [1]:
import nltk
import numpy as np
import random

from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Dropout
from tensorflow.keras.optimizers import SGD




# 2. Downloading the NLTK packages

In [2]:
# download the wordnet and stopwords corpus
nltk.download('wordnet') # wordnet is a lexical database for the English language
nltk.download('stopwords') # stopwords means words like 'the', 'a', 'an', 'is', 'are', etc.
nltk.download('punkt') # punkt means punctuations

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\dagbo_b40tnyc\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dagbo_b40tnyc\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dagbo_b40tnyc\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# 3. Importing the intents.json file

In [3]:
# import intents file
import json

def load_intents():
    with open('intents.json') as file:
        intents = json.load(file)
    return intents

intents = load_intents()
    

# 4. using nltk techniques to preprocess the data for training

In [4]:
# initialize lemmatizer to get stem of words
lemmatizer = WordNetLemmatizer() # lemma is the root form of the word and it is very accurate than stemming


# loop through each sentence in the intent's patterns
def preprocess_intents(intents):
    # create empty lists for documents, classes and words
    documents = [] # documents means patterns
    classes = [] # classes means tags
    words = [] # words means vocabulary
    ignore_letters = ['!', '?', ',', '.'] # ignore these letters
    stop_words = set(stopwords.words('english')) # stop words are words like 'the', 'a', 'an', 'is', 'are', etc.


    for intent in intents['intents']:
        
        # debug for keyerror @ 'patterns'
        # print(intent['patterns'])
        for pattern in intent['patterns']:
            # tokenize each and every word in the sentence
            word = nltk.word_tokenize(pattern) 
            # lemmatize each word and convert into lowercase
            word = [lemmatizer.lemmatize(w.lower()) for w in word if w not in stop_words and w not in ignore_letters]
            # add word to the word list
            words.extend(word) # extend means add to the list and append means add to the end of the list
            # add word(s) to documents
            documents.append((word, intent['tag'])) #
            # add tags to our classes list
            if intent['tag'] not in classes: # if tag is not in classes list
                classes.append(intent['tag']) # then add it to the classes list
                
    # sort words and remove duplicates
    words = sorted(list(set(words)))
    # sort classes
    classes = sorted(list(set(classes)))
    return documents, classes, words

documents, classes, words = preprocess_intents(intents)


In [5]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split

def preprocess_data_for_lstm(documents, words, classes, max_sequence_len=None):
    tokenizer = Tokenizer(num_words=len(words))
    tokenizer.fit_on_texts(words)

    X = tokenizer.texts_to_sequences([' '.join(doc[0]) for doc in documents])
    
    if not max_sequence_len:
        max_sequence_len = max([len(x) for x in X])
    
    X_padded = pad_sequences(X, maxlen=max_sequence_len, padding='post')

    encoder = LabelEncoder()
    Y = encoder.fit_transform([doc[1] for doc in documents])
    Y_categorical = to_categorical(Y)

    train_x, test_x, train_y, test_y = train_test_split(X_padded, Y_categorical, test_size=0.25, random_state=42)

    return train_x, test_x, train_y, test_y, max_sequence_len, tokenizer, encoder


In [6]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, Dropout

def create_and_train_lstm(train_x, train_y, test_x, test_y, max_sequence_len, num_words, num_classes):
    model = Sequential()
    model.add(Embedding(input_dim=num_words, output_dim=64, input_length=max_sequence_len))
    model.add(LSTM(128, return_sequences=True))
    model.add(Dropout(0.5))
    model.add(LSTM(64))
    model.add(Dropout(0.5))
    model.add(Dense(num_classes, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    model.fit(train_x, train_y, epochs=50, batch_size=5, verbose=1)

    loss, accuracy = model.evaluate(test_x, test_y, verbose=0)
    print("Test Loss:", loss)
    print("Test Accuracy:", accuracy)

    return model


In [7]:
# Assume 'documents', 'words', and 'classes' are already defined
train_x, test_x, train_y, test_y, max_len, tokenizer, encoder = preprocess_data_for_lstm(documents, words, classes)

# The number of words and classes
num_words = len(tokenizer.word_index) + 1
num_classes = len(classes)

model = create_and_train_lstm(train_x, train_y, test_x, test_y, max_len, num_words, num_classes)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Test Loss: 4.327894687652588
Test Accuracy: 0.19230769574642181


In [10]:
def evaluate_model(model, tokenizer, encoder, max_len):
    while True:
        message = input("Enter a message (type 'quit' to exit): ")
        if message.lower() == "quit":
            break

        processed_message = nltk.word_tokenize(message)
        processed_message = [lemmatizer.lemmatize(word.lower()) for word in processed_message if word.lower() not in stopwords.words('english')]

        if not processed_message:
            print("Couldn't process the input. Try again.")
            continue

        message_sequence = tokenizer.texts_to_sequences([processed_message])
        message_padded = pad_sequences(message_sequence, maxlen=max_len, padding='post')

        predictions = model.predict(message_padded)
        class_index = np.argmax(predictions, axis=1)[0]
        class_name = encoder.classes_[class_index]

        print(f"Predicted Class: {class_name} (Index: {class_index}, Confidence: {predictions[0][class_index]:.2f})")

evaluate_model(model, tokenizer, encoder, max_len)


Predicted Class: greeting (Index: 47, Confidence: 0.91)
