<a href="https://colab.research.google.com/github/Dark-Sied/Intent_Classification/blob/master/Intent_classification_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Intent Recognition Dolores V01
Using a new vocabulary and training set.  
Intent Recognition is based on Intent_classification_final
Created by Christoph Windheuser, April 2020

In [1]:
import numpy as np
import pandas as pd
import csv
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.lancaster import LancasterStemmer
import nltk
import re
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential, load_model
from keras.layers import Dense, LSTM, Bidirectional, Embedding, Dropout
from keras.callbacks import ModelCheckpoint


Using TensorFlow backend.


In [2]:
# DEFINE GLOBAL VARIABLES:
NUM_SENT = 0
NUM_INTENTS = 0
NUM_INTENTS_UNIQUE = 0
VOCABULARY_SIZE = 0
MAX_SENT_LENGTH = 0

In [12]:
# SHOW GLOBAL VARIABLES
print ("NUM_SENT:           ", NUM_SENT)
print ("NUM_INTENTS:        ", NUM_INTENTS)
print ("NUM_INTENTS_UNIQUE: ", NUM_INTENTS_UNIQUE)
print ("VOCABULARY_SIZE:    ", VOCABULARY_SIZE)
print ("MAX_SENT_LENGTH:    ", MAX_SENT_LENGTH)

NUM_SENT:            346
NUM_INTENTS:         346
NUM_INTENTS_UNIQUE:  14
VOCABULARY_SIZE:     284
MAX_SENT_LENGTH:     12


# Read the data

In [4]:
# df = pd.read_csv("Dolores_Dataset_v01.csv", encoding = "latin1", names = ["Sentence", "Intent"])
df = pd.read_csv("mowgli_train_new.csv", encoding = "latin1", names = ["Sentence", "Intent"])
print(df.head())
intents            = df["Intent"]
NUM_INTENTS        = len(list(df["Intent"]))
intents_unique     = list(set(df["Intent"]))
NUM_INTENTS_UNIQUE = len(intents_unique)
sentences          = list(df["Sentence"])
NUM_SENT           = len(sentences)


                    Sentence           Intent
0         are you a machine?  are_you_a_robot
1  how are the things going?      how_are_you
2             nah not for me             deny
3           What's going on?      how_are_you
4             are you a bot?  are_you_a_robot


In [6]:
print ("Unique Intents: ")
print (intents_unique)
print ("Num of unique Intents: ", len(intents_unique))


Unique Intents: 
['greet', 'are_you_a_robot', 'conversation_restart', 'goodbye', 'insult', 'skills', 'personal_question', 'leave_budget', 'sorry', 'how_are_you', 'deny', 'what_is_your_name', 'confirm', 'thanks']
Num of unique Intents:  14


In [7]:
df.head()

Unnamed: 0,Sentence,Intent
0,are you a machine?,are_you_a_robot
1,how are the things going?,how_are_you
2,nah not for me,deny
3,What's going on?,how_are_you
4,are you a bot?,are_you_a_robot


In [8]:
df.shape

(346, 2)

In [9]:
print(sentences[:5])

['are you a machine?', 'how are the things going?', 'nah not for me', "What's going on?", 'are you a bot?']


# Word Cleaning
re.sub is a routine from the "Regular Expression" Library.     
r'string' means that this is a "raw string", where backslashes are treated as charachters.    
re.sub(r'[^ a-z A-Z 0-9]', " ", s) means that all characters exept a-z, A-Z and 0-9 will be replaced by space.

In [10]:
clean_sent = []
for s in sentences:
    clean = re.sub(r'[^ a-z A-Z 0-9]', " ", s)
    w = word_tokenize(clean)
    clean_sent.append([i.lower() for i in w])

print (clean_sent[:5])
print("Len of clean_sent: ", len(clean_sent))


[['are', 'you', 'a', 'machine'], ['how', 'are', 'the', 'things', 'going'], ['nah', 'not', 'for', 'me'], ['what', 's', 'going', 'on'], ['are', 'you', 'a', 'bot']]
Len of clean_sent:  346


### Documentation for Tokenizer:
https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/text/Tokenizer

In [11]:
token = Tokenizer(filters = '!"#$%&()*+,-./:;<=>?@[\]^_`{|}~')
token.fit_on_texts(clean_sent)
VOCABULARY_SIZE = len(token.word_index) + 1
MAX_SENT_LENGTH = len(max(clean_sent, key = len))

#print("Vocab Size = %d. Maximum sent length = %d" % (vocab_size, max_sent_length))
print ("Max sent length: ", MAX_SENT_LENGTH)
encoded_sent = token.texts_to_sequences(clean_sent)
padded_sent = pad_sequences(encoded_sent, maxlen = MAX_SENT_LENGTH, padding = "post")


Max sent length:  12


In [53]:
print (encoded_sent)
print ("Num of Sentences: ", len(encoded_sent))

[[4, 1, 23, 129], [2, 4, 60, 61, 24], [89, 31, 21, 15], [7, 10, 24, 73], [4, 1, 23, 46], [11, 13, 51, 130], [2, 9, 8, 3, 6, 47, 52], [131], [16, 16], [32, 12], [132], [48, 25, 1, 36, 17, 49, 15], [3, 133, 25, 5, 33], [1, 4, 23, 90], [32, 3, 36, 17, 62, 37, 51, 18, 8], [12, 21, 27, 49], [134], [19, 3, 20, 25], [91], [34, 63], [135], [7, 4, 1, 136, 17, 5], [3, 92, 25], [93, 38], [2, 13, 28, 24], [3, 94, 19], [7, 95, 39, 8, 3, 47, 137, 138], [139], [140], [74, 141, 142, 143], [75, 4, 1], [12], [26], [19], [144], [7, 10, 50, 96], [7, 10, 27, 64], [29, 65], [26, 13, 145, 53], [97], [34], [2, 20, 74, 66, 11, 24, 67], [98, 34, 146, 23, 147], [2, 9, 54, 5, 3, 6, 148], [40], [76], [2, 22, 14, 5, 3, 6, 52, 11, 30], [55, 99], [68, 10, 32], [2, 6, 1, 69], [149, 100], [41, 42], [2, 4, 1, 101], [150], [4, 1, 77], [56, 1], [151], [12, 46], [102], [29, 41], [152], [1, 103, 153], [154], [34], [155], [29, 156], [38, 12], [157, 104], [26, 42], [41, 21, 70], [7, 50], [19, 71, 19, 67], [26, 53], [13, 68, 3

In [54]:
padded_sent[:5]

array([[  4,   1,  23, 129,   0,   0,   0,   0,   0,   0,   0,   0],
       [  2,   4,  60,  61,  24,   0,   0,   0,   0,   0,   0,   0],
       [ 89,  31,  21,  15,   0,   0,   0,   0,   0,   0,   0,   0],
       [  7,  10,  24,  73,   0,   0,   0,   0,   0,   0,   0,   0],
       [  4,   1,  23,  46,   0,   0,   0,   0,   0,   0,   0,   0]],
      dtype=int32)

In [55]:
print("Shape of padded sent = ",padded_sent.shape)

Shape of padded sent =  (346, 12)


# Tokenizing the intents

In [56]:
#tokenizer for the intents
token_intents = Tokenizer(filters = '!"#$%&()*+,-/:;<=>?@[\]^`{|}~')
token_intents.fit_on_texts(intents_unique)


In [57]:
token_intents.word_index

{'conversation_restart': 1,
 'greet': 2,
 'sorry': 3,
 'insult': 4,
 'are_you_a_robot': 5,
 'deny': 6,
 'skills': 7,
 'what_is_your_name': 8,
 'goodbye': 9,
 'leave_budget': 10,
 'how_are_you': 11,
 'personal_question': 12,
 'thanks': 13,
 'confirm': 14}

In [58]:
encoded_output = token_intents.texts_to_sequences(intents)


In [59]:
print (encoded_output)


[[5], [11], [6], [11], [5], [1], [10], [13], [2], [13], [14], [7], [6], [5], [10], [13], [14], [6], [2], [14], [14], [7], [6], [14], [11], [6], [10], [2], [2], [9], [8], [13], [2], [6], [14], [11], [8], [2], [2], [1], [14], [1], [14], [10], [14], [4], [10], [14], [11], [11], [6], [9], [8], [6], [11], [13], [2], [13], [14], [9], [14], [4], [13], [14], [4], [2], [13], [9], [2], [9], [2], [6], [2], [11], [11], [13], [1], [6], [7], [11], [14], [11], [11], [2], [10], [10], [2], [7], [14], [7], [13], [2], [8], [11], [11], [13], [6], [10], [2], [8], [1], [14], [6], [14], [11], [6], [2], [13], [2], [2], [14], [13], [2], [10], [9], [10], [9], [11], [14], [14], [2], [2], [2], [2], [13], [2], [11], [14], [14], [3], [12], [2], [14], [14], [3], [10], [9], [6], [4], [6], [11], [1], [9], [9], [9], [6], [10], [1], [14], [13], [10], [11], [5], [10], [8], [2], [1], [14], [13], [8], [2], [10], [10], [13], [10], [11], [6], [2], [13], [11], [11], [14], [2], [12], [2], [14], [9], [10], [2], [4], [11], [14],

In [60]:
encoded_output = np.array(encoded_output).reshape(len(encoded_output), 1)

In [61]:
# print (encoded_output)
type (encoded_output)

numpy.ndarray

In [62]:
encoded_output.shape

(346, 1)

In [63]:
one_hot = OneHotEncoder(sparse = False)
output_one_hot = one_hot.fit_transform(encoded_output)

In [64]:
output_one_hot.shape

(346, 14)

# Define Training- and Testset

In [65]:
from sklearn.model_selection import train_test_split

In [66]:
# train_X, val_X, train_Y, val_Y = train_test_split(padded_sent, output_one_hot, shuffle = True, test_size = 0.1)
train_X = padded_sent
train_Y = output_one_hot
val_X   = train_X
val_Y   = train_Y


In [67]:
print("Shape of train_X = %s and train_Y = %s" % (train_X.shape, train_Y.shape))
print("Shape of val_X = %s and val_Y = %s" % (val_X.shape, val_Y.shape))

Shape of train_X = (346, 12) and train_Y = (346, 14)
Shape of val_X = (346, 12) and val_Y = (346, 14)


# Defining the Model

In [68]:
model = Sequential()
model.add(Embedding(VOCABULARY_SIZE, 128, input_length = MAX_SENT_LENGTH, trainable = False))
model.add(Bidirectional(LSTM(128)))
model.add(Dense(32, activation = "relu"))
model.add(Dropout(0.5))
model.add(Dense(NUM_INTENTS_UNIQUE, activation = "softmax"))

model.compile(loss = "categorical_crossentropy", optimizer = "adam", metrics = ["accuracy"])
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 12, 128)           36352     
_________________________________________________________________
bidirectional_2 (Bidirection (None, 256)               263168    
_________________________________________________________________
dense_3 (Dense)              (None, 32)                8224      
_________________________________________________________________
dropout_2 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 14)                462       
Total params: 308,206
Trainable params: 271,854
Non-trainable params: 36,352
_________________________________________________________________


# Training the Model

In [75]:
import time

filename = 'dir_01.h5'
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')

start = time.time()

hist = model.fit(train_X, train_Y, epochs = 200, batch_size = 32, validation_data = (val_X, val_Y), callbacks = [checkpoint])

print("Elapsed time in seconds: ", time.time() - start)


Train on 346 samples, validate on 346 samples
Epoch 1/200

Epoch 00001: val_loss improved from inf to 0.29706, saving model to dir_01.h5
Epoch 2/200

Epoch 00002: val_loss improved from 0.29706 to 0.27128, saving model to dir_01.h5
Epoch 3/200

Epoch 00003: val_loss improved from 0.27128 to 0.23660, saving model to dir_01.h5
Epoch 4/200

Epoch 00004: val_loss did not improve from 0.23660
Epoch 5/200

Epoch 00005: val_loss did not improve from 0.23660
Epoch 6/200

Epoch 00006: val_loss did not improve from 0.23660
Epoch 7/200

Epoch 00007: val_loss improved from 0.23660 to 0.23645, saving model to dir_01.h5
Epoch 8/200

Epoch 00008: val_loss improved from 0.23645 to 0.22896, saving model to dir_01.h5
Epoch 9/200

Epoch 00009: val_loss improved from 0.22896 to 0.20001, saving model to dir_01.h5
Epoch 10/200

Epoch 00010: val_loss did not improve from 0.20001
Epoch 11/200

Epoch 00011: val_loss did not improve from 0.20001
Epoch 12/200

Epoch 00012: val_loss did not improve from 0.20001
E


Epoch 00083: val_loss did not improve from 0.02272
Epoch 84/200

Epoch 00084: val_loss did not improve from 0.02272
Epoch 85/200

Epoch 00085: val_loss did not improve from 0.02272
Epoch 86/200

Epoch 00086: val_loss did not improve from 0.02272
Epoch 87/200

Epoch 00087: val_loss improved from 0.02272 to 0.02037, saving model to dir_01.h5
Epoch 88/200

Epoch 00088: val_loss improved from 0.02037 to 0.01863, saving model to dir_01.h5
Epoch 89/200

Epoch 00089: val_loss did not improve from 0.01863
Epoch 90/200

Epoch 00090: val_loss improved from 0.01863 to 0.01836, saving model to dir_01.h5
Epoch 91/200

Epoch 00091: val_loss did not improve from 0.01836
Epoch 92/200

Epoch 00092: val_loss did not improve from 0.01836
Epoch 93/200

Epoch 00093: val_loss did not improve from 0.01836
Epoch 94/200

Epoch 00094: val_loss did not improve from 0.01836
Epoch 95/200

Epoch 00095: val_loss did not improve from 0.01836
Epoch 96/200

Epoch 00096: val_loss did not improve from 0.01836
Epoch 97/2


Epoch 00167: val_loss did not improve from 0.00799
Epoch 168/200

Epoch 00168: val_loss did not improve from 0.00799
Epoch 169/200

Epoch 00169: val_loss did not improve from 0.00799
Epoch 170/200

Epoch 00170: val_loss did not improve from 0.00799
Epoch 171/200

Epoch 00171: val_loss did not improve from 0.00799
Epoch 172/200

Epoch 00172: val_loss did not improve from 0.00799
Epoch 173/200

Epoch 00173: val_loss did not improve from 0.00799
Epoch 174/200

Epoch 00174: val_loss did not improve from 0.00799
Epoch 175/200

Epoch 00175: val_loss did not improve from 0.00799
Epoch 176/200

Epoch 00176: val_loss did not improve from 0.00799
Epoch 177/200

Epoch 00177: val_loss did not improve from 0.00799
Epoch 178/200

Epoch 00178: val_loss did not improve from 0.00799
Epoch 179/200

Epoch 00179: val_loss did not improve from 0.00799
Epoch 180/200

Epoch 00180: val_loss did not improve from 0.00799
Epoch 181/200

Epoch 00181: val_loss did not improve from 0.00799
Epoch 182/200

Epoch 001

In [70]:
 model = load_model("dir_01.h5")

In [71]:
def predictions(text):
    clean = re.sub(r'[^ a-z A-Z 0-9]', " ", text)
    test_word = word_tokenize(clean)
    test_word = [w.lower() for w in test_word]
    test_ls = token.texts_to_sequences(test_word)
    #print(test_word)
    #Check for unknown words
    if [] in test_ls:
        test_ls = list(filter(None, test_ls))
    
    test_ls = np.array(test_ls).reshape(1, len(test_ls))

    #print("test_ls: ", test_ls)

    x = pad_sequences(test_ls, maxlen = MAX_SENT_LENGTH, padding = "post")
    
    # print ("x: ", x)
    
    pred = model.predict_proba(x)
  
    return pred


In [72]:
def get_final_output(pred, classes):
    #print (type (pred))
    #print (pred)
    
    predictions = pred[0]
 
    classes = np.array(classes)
    ids = np.argsort(-predictions)
    classes = classes[ids]
#    predictions = -np.sort(-predictions)

#    for i in range(pred.shape[1]):
#        print("%s has confidence = %s" % (classes[i], (predictions[i])))
    
    return classes[0]


In [73]:
text = "Good morning"
pred = predictions(text)
out = get_final_output(pred, intents_unique)
print (out)

greet


In [77]:
with open('mowgli_test_new.csv', newline='') as csvfile:
    testfilelist = list(csv.reader(csvfile))

total   = 0
correct = 0

for s in testfilelist:
    right_label = s[1]
    message     = s[0]

    pred   = predictions(message)
    intent = get_final_output(pred, intents_unique)

    if intent == right_label:
        correct += 1
    else:
        print ("%s recognized as %s, correct is %s" % (message, intent, right_label))
    total += 1

print ("Results: %d from %d correct = %4.2f percent" % (correct, total, (correct/total)*100.0))


thanks this is great news recognized as goodbye, correct is thanks
merci recognized as confirm, correct is thanks
how is your evening recognized as what_is_your_name, correct is how_are_you
how nice! recognized as goodbye, correct is confirm
definitely recognized as greet, correct is confirm
never recognized as confirm, correct is deny
yep if i have to recognized as skills, correct is confirm
no you did it wrong recognized as goodbye, correct is deny
worthless recognized as confirm, correct is insult
greetings recognized as confirm, correct is greet
cool! recognized as thanks, correct is confirm
where am i recognized as greet, correct is skills
have a nice day recognized as how_are_you, correct is goodbye
nope recognized as confirm, correct is deny
i'm off recognized as sorry, correct is goodbye
planned leaves recognized as greet, correct is leave_budget
What's new? recognized as goodbye, correct is how_are_you
correct recognized as greet, correct is confirm
i apologize recognized as g