<a href="https://colab.research.google.com/github/Dark-Sied/Intent_Classification/blob/master/Intent_classification_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Intent Recognition Dolores V01
Using a new vocabulary and training set.  
Intent Recognition is based on Intent_classification_final
Created by Christoph Windheuser, April 2020

In [1]:
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.lancaster import LancasterStemmer
import nltk
import re
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential, load_model
from keras.layers import Dense, LSTM, Bidirectional, Embedding, Dropout
from keras.callbacks import ModelCheckpoint


Using TensorFlow backend.


In [2]:
# DEFINE GLOBAL VARIABLES:
NUM_SENT = 0
NUM_INTENTS = 0
NUM_INTENTS_UNIQUE = 0
VOCABULARY_SIZE = 0
MAX_SENT_LENGTH = 0

In [3]:
# SHOW GLOBAL VARIABLES
print ("NUM_SENT:           ", NUM_SENT)
print ("NUM_INTENTS:        ", NUM_INTENTS)
print ("NUM_INTENTS_UNIQUE: ", NUM_INTENTS_UNIQUE)
print ("VOCABULARY_SIZE:    ", VOCABULARY_SIZE)
print ("MAX_SENT_LENGTH:    ", MAX_SENT_LENGTH)

NUM_SENT:            0
NUM_INTENTS:         0
NUM_INTENTS_UNIQUE:  0
VOCABULARY_SIZE:     0
MAX_SENT_LENGTH:     0


# Read the data

In [5]:
df = pd.read_csv("Dolores_Dataset_v01.csv", encoding = "latin1", names = ["Sentence", "Intent"])
print(df.head())
intents            = df["Intent"]
NUM_INTENTS        = len(list(df["Intent"]))
intents_unique     = list(set(df["Intent"]))
NUM_INTENTS_UNIQUE = len(intents_unique)
sentences          = list(df["Sentence"])
NUM_SENT           = len(sentences)



               Sentence      Intent
0          Good Morning   greetings
1                    Hi   greetings
2  Good morning Dolores   greetings
3           Hey Dolores   greetings
4                 Hello   greetings


In [6]:
print ("Unique Intents: ")
print (intents_unique)
print ("Num of unique Intents: ", len(intents_unique))


Unique Intents: 
[' greetings', ' saveAppoint', ' getDate', ' getTime', ' saveLink', ' goodBye', ' toSetDownSomeOfThisNaturalSplendor', ' haveViolentEnds']
Num of unique Intents:  8


In [7]:
df.head()

Unnamed: 0,Sentence,Intent
0,Good Morning,greetings
1,Hi,greetings
2,Good morning Dolores,greetings
3,Hey Dolores,greetings
4,Hello,greetings


In [8]:
df.shape

(32, 2)

In [9]:
print(sentences[:5])

['Good Morning', 'Hi', 'Good morning Dolores', 'Hey Dolores', 'Hello']


# Word Cleaning
re.sub is a routine from the "Regular Expression" Library.     
r'string' means that this is a "raw string", where backslashes are treated as charachters.    
re.sub(r'[^ a-z A-Z 0-9]', " ", s) means that all characters exept a-z, A-Z and 0-9 will be replaced by space.

In [10]:
clean_sent = []
for s in sentences:
    clean = re.sub(r'[^ a-z A-Z 0-9]', " ", s)
    w = word_tokenize(clean)
    clean_sent.append([i.lower() for i in w])

print (clean_sent[:5])
print("Len of clean_sent: ", len(clean_sent))


[['good', 'morning'], ['hi'], ['good', 'morning', 'dolores'], ['hey', 'dolores'], ['hello']]
Len of clean_sent:  32


### Documentation for Tokenizer:
https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/text/Tokenizer

In [15]:
token = Tokenizer(filters = '!"#$%&()*+,-./:;<=>?@[\]^_`{|}~')
token.fit_on_texts(clean_sent)
VOCABULARY_SIZE = len(token.word_index) + 1
MAX_SENT_LENGTH = len(max(clean_sent, key = len))

#print("Vocab Size = %d. Maximum sent length = %d" % (vocab_size, max_sent_length))
print ("Max sent length: ", MAX_SENT_LENGTH)
encoded_sent = token.texts_to_sequences(clean_sent)
padded_sent = pad_sequences(encoded_sent, maxlen = MAX_SENT_LENGTH, padding = "post")


Max sent length:  5


In [16]:
print (encoded_sent)
print ("Num of Sentences: ", len(encoded_sent))

[[10, 14], [24], [10, 14, 4], [25, 4], [15], [15, 4], [5], [5, 5], [26], [10, 5], [16, 11], [16, 11, 4], [5, 4], [6, 17, 3, 9], [6, 9, 18, 27], [9], [28, 19, 3, 9], [6, 18, 3, 7, 29], [6, 7, 20, 21, 22], [30, 7, 20, 21, 22], [6, 17, 3, 7], [31, 32, 33], [11, 34, 35], [1, 2, 12], [13, 1, 2, 12], [1, 3, 36, 12], [1, 3, 37, 7], [1, 2, 8, 38, 19], [1, 2, 8], [13, 1, 2, 8], [1, 2, 23, 8, 13], [1, 2, 23, 8]]
Num of Sentences:  32


In [17]:
padded_sent[:5]

array([[10, 14,  0,  0,  0],
       [24,  0,  0,  0,  0],
       [10, 14,  4,  0,  0],
       [25,  4,  0,  0,  0],
       [15,  0,  0,  0,  0]], dtype=int32)

In [18]:
print("Shape of padded sent = ",padded_sent.shape)

Shape of padded sent =  (32, 5)


# Tokenizing the intents

In [19]:
#tokenizer for the intents
token_intents = Tokenizer(filters = '!"#$%&()*+,-/:;<=>?@[\]^`{|}~')
token_intents.fit_on_texts(intents_unique)


In [20]:
token_intents.word_index

{'greetings': 1,
 'saveappoint': 2,
 'getdate': 3,
 'gettime': 4,
 'savelink': 5,
 'goodbye': 6,
 'tosetdownsomeofthisnaturalsplendor': 7,
 'haveviolentends': 8}

In [21]:
encoded_output = token_intents.texts_to_sequences(intents)


In [22]:
print (encoded_output)


[[1], [1], [1], [1], [1], [1], [6], [6], [6], [6], [6], [6], [6], [4], [4], [4], [4], [3], [3], [3], [3], [8], [7], [2], [2], [2], [2], [5], [5], [5], [5], [5]]


In [23]:
encoded_output = np.array(encoded_output).reshape(len(encoded_output), 1)

In [24]:
# print (encoded_output)
type (encoded_output)

numpy.ndarray

In [25]:
encoded_output.shape

(32, 1)

In [26]:
one_hot = OneHotEncoder(sparse = False)
output_one_hot = one_hot.fit_transform(encoded_output)

In [27]:
output_one_hot.shape

(32, 8)

# Define Training- and Testset

In [28]:
from sklearn.model_selection import train_test_split

In [30]:
# train_X, val_X, train_Y, val_Y = train_test_split(padded_sent, output_one_hot, shuffle = True, test_size = 0.1)
train_X = padded_sent
train_Y = output_one_hot
val_X   = train_X
val_Y   = train_Y


In [31]:
print("Shape of train_X = %s and train_Y = %s" % (train_X.shape, train_Y.shape))
print("Shape of val_X = %s and val_Y = %s" % (val_X.shape, val_Y.shape))

Shape of train_X = (32, 5) and train_Y = (32, 8)
Shape of val_X = (32, 5) and val_Y = (32, 8)


# Defining the Model

In [32]:
model = Sequential()
model.add(Embedding(VOCABULARY_SIZE, 128, input_length = MAX_SENT_LENGTH, trainable = False))
model.add(Bidirectional(LSTM(128)))
model.add(Dense(32, activation = "relu"))
model.add(Dropout(0.5))
model.add(Dense(NUM_INTENTS_UNIQUE, activation = "softmax"))

model.compile(loss = "categorical_crossentropy", optimizer = "adam", metrics = ["accuracy"])
model.summary()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 5, 128)            4992      
_________________________________________________________________
bidirectional_1 (Bidirection (None, 256)               263168    
_________________________________________________________________
dense_1 (Dense)              (None, 32)                8224      
_________________________________________________________________
dropout_1 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 8)                 264       
Total params: 276,648
Trainable params: 271,656
Non-trainable par

# Training the Model

In [33]:
import time

filename = 'dir_01.h5'
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')

start = time.time()

hist = model.fit(train_X, train_Y, epochs = 100, batch_size = 32, validation_data = (val_X, val_Y), callbacks = [checkpoint])

print("Elapsed time in seconds: ", time.time() - start)


Instructions for updating:
Use tf.cast instead.
Train on 32 samples, validate on 32 samples
Epoch 1/100

Epoch 00001: val_loss improved from inf to 2.07198, saving model to dir_01.h5
Epoch 2/100

Epoch 00002: val_loss improved from 2.07198 to 2.06405, saving model to dir_01.h5
Epoch 3/100

Epoch 00003: val_loss improved from 2.06405 to 2.05644, saving model to dir_01.h5
Epoch 4/100

Epoch 00004: val_loss improved from 2.05644 to 2.04865, saving model to dir_01.h5
Epoch 5/100

Epoch 00005: val_loss improved from 2.04865 to 2.04035, saving model to dir_01.h5
Epoch 6/100

Epoch 00006: val_loss improved from 2.04035 to 2.03178, saving model to dir_01.h5
Epoch 7/100

Epoch 00007: val_loss improved from 2.03178 to 2.02280, saving model to dir_01.h5
Epoch 8/100

Epoch 00008: val_loss improved from 2.02280 to 2.01315, saving model to dir_01.h5
Epoch 9/100

Epoch 00009: val_loss improved from 2.01315 to 2.00237, saving model to dir_01.h5
Epoch 10/100

Epoch 00010: val_loss improved from 2.00237


Epoch 00037: val_loss improved from 1.46012 to 1.43210, saving model to dir_01.h5
Epoch 38/100

Epoch 00038: val_loss improved from 1.43210 to 1.40526, saving model to dir_01.h5
Epoch 39/100

Epoch 00039: val_loss improved from 1.40526 to 1.37799, saving model to dir_01.h5
Epoch 40/100

Epoch 00040: val_loss improved from 1.37799 to 1.34988, saving model to dir_01.h5
Epoch 41/100

Epoch 00041: val_loss improved from 1.34988 to 1.32017, saving model to dir_01.h5
Epoch 42/100

Epoch 00042: val_loss improved from 1.32017 to 1.28965, saving model to dir_01.h5
Epoch 43/100

Epoch 00043: val_loss improved from 1.28965 to 1.26243, saving model to dir_01.h5
Epoch 44/100

Epoch 00044: val_loss improved from 1.26243 to 1.23793, saving model to dir_01.h5
Epoch 45/100

Epoch 00045: val_loss improved from 1.23793 to 1.21287, saving model to dir_01.h5
Epoch 46/100

Epoch 00046: val_loss improved from 1.21287 to 1.18851, saving model to dir_01.h5
Epoch 47/100

Epoch 00047: val_loss improved from 1.1


Epoch 00075: val_loss improved from 0.43571 to 0.41850, saving model to dir_01.h5
Epoch 76/100

Epoch 00076: val_loss improved from 0.41850 to 0.39986, saving model to dir_01.h5
Epoch 77/100

Epoch 00077: val_loss improved from 0.39986 to 0.38300, saving model to dir_01.h5
Epoch 78/100

Epoch 00078: val_loss improved from 0.38300 to 0.36955, saving model to dir_01.h5
Epoch 79/100

Epoch 00079: val_loss improved from 0.36955 to 0.35489, saving model to dir_01.h5
Epoch 80/100

Epoch 00080: val_loss improved from 0.35489 to 0.33653, saving model to dir_01.h5
Epoch 81/100

Epoch 00081: val_loss improved from 0.33653 to 0.31481, saving model to dir_01.h5
Epoch 82/100

Epoch 00082: val_loss improved from 0.31481 to 0.29726, saving model to dir_01.h5
Epoch 83/100

Epoch 00083: val_loss improved from 0.29726 to 0.28289, saving model to dir_01.h5
Epoch 84/100

Epoch 00084: val_loss improved from 0.28289 to 0.27419, saving model to dir_01.h5
Epoch 85/100

Epoch 00085: val_loss improved from 0.2

In [34]:
 model = load_model("dir_01.h5")

In [35]:
def predictions(text):
    clean = re.sub(r'[^ a-z A-Z 0-9]', " ", text)
    test_word = word_tokenize(clean)
    test_word = [w.lower() for w in test_word]
    test_ls = token.texts_to_sequences(test_word)
    #print(test_word)
    #Check for unknown words
    if [] in test_ls:
        test_ls = list(filter(None, test_ls))
    
    test_ls = np.array(test_ls).reshape(1, len(test_ls))

    #print("test_ls: ", test_ls)

    x = pad_sequences(test_ls, maxlen = MAX_SENT_LENGTH, padding = "post")
    
    #print ("x: ", x)
    
    pred = model.predict_proba(x)
  
    return pred


In [36]:
def get_final_output(pred, classes):
    #print (type (pred))
    #print (pred)
    
    predictions = pred[0]
 
    classes = np.array(classes)
    ids = np.argsort(-predictions)
    classes = classes[ids]
    predictions = -np.sort(-predictions)

    for i in range(pred.shape[1]):
        print("%s has confidence = %s" % (classes[i], (predictions[i])))


In [50]:
text = "Can you save the appointment"
pred = predictions(text)
get_final_output(pred, intents_unique)

 saveAppoint has confidence = 0.7557694
 getDate has confidence = 0.124784194
 getTime has confidence = 0.064198196
 greetings has confidence = 0.017944703
 saveLink has confidence = 0.017852085
 goodBye has confidence = 0.012756296
 toSetDownSomeOfThisNaturalSplendor has confidence = 0.003466972
 haveViolentEnds has confidence = 0.0032281294
