<a href="https://colab.research.google.com/github/Dark-Sied/Intent_Classification/blob/master/Intent_classification_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Intent Recognition Dolores V01
Using a new vocabulary and training set.  
Intent Recognition is based on Intent_classification_final
Created by Christoph Windheuser, April 2020

In [1]:
import numpy as np
import pandas as pd
import csv
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.lancaster import LancasterStemmer
import nltk
import re
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential, load_model
from keras.layers import Dense, LSTM, Bidirectional, Embedding, Dropout
from keras.callbacks import ModelCheckpoint


Using TensorFlow backend.


In [2]:
# DEFINE GLOBAL VARIABLES:
NUM_SENT = 0
NUM_INTENTS = 0
NUM_INTENTS_UNIQUE = 0
VOCABULARY_SIZE = 0
MAX_SENT_LENGTH = 0

In [23]:
# SHOW GLOBAL VARIABLES
print ("NUM_SENT:           ", NUM_SENT)
print ("NUM_INTENTS:        ", NUM_INTENTS)
print ("NUM_INTENTS_UNIQUE: ", NUM_INTENTS_UNIQUE)
print ("VOCABULARY_SIZE:    ", VOCABULARY_SIZE)
print ("MAX_SENT_LENGTH:    ", MAX_SENT_LENGTH)

NUM_SENT:            346
NUM_INTENTS:         346
NUM_INTENTS_UNIQUE:  14
VOCABULARY_SIZE:     284
MAX_SENT_LENGTH:     12


# Read the data

In [4]:
# df = pd.read_csv("Dolores_Dataset_v01.csv", encoding = "latin1", names = ["Sentence", "Intent"])
df = pd.read_csv("mowgli_train_new.csv", encoding = "latin1", names = ["Sentence", "Intent"])
print(df.head())
intents            = df["Intent"]
NUM_INTENTS        = len(list(df["Intent"]))
intents_unique     = list(set(df["Intent"]))
NUM_INTENTS_UNIQUE = len(intents_unique)
sentences          = list(df["Sentence"])
NUM_SENT           = len(sentences)


                    Sentence           Intent
0         are you a machine?  are_you_a_robot
1  how are the things going?      how_are_you
2             nah not for me             deny
3           What's going on?      how_are_you
4             are you a bot?  are_you_a_robot


In [5]:
print ("Unique Intents: ")
print (intents_unique)
print ("Num of unique Intents: ", len(intents_unique))


Unique Intents: 
['leave_budget', 'deny', 'goodbye', 'thanks', 'confirm', 'sorry', 'what_is_your_name', 'are_you_a_robot', 'insult', 'how_are_you', 'greet', 'conversation_restart', 'skills', 'personal_question']
Num of unique Intents:  14


In [6]:
df.head()

Unnamed: 0,Sentence,Intent
0,are you a machine?,are_you_a_robot
1,how are the things going?,how_are_you
2,nah not for me,deny
3,What's going on?,how_are_you
4,are you a bot?,are_you_a_robot


In [7]:
df.shape

(346, 2)

In [8]:
print(sentences[:5])

['are you a machine?', 'how are the things going?', 'nah not for me', "What's going on?", 'are you a bot?']


# Word Cleaning
re.sub is a routine from the "Regular Expression" Library.     
r'string' means that this is a "raw string", where backslashes are treated as charachters.    
re.sub(r'[^ a-z A-Z 0-9]', " ", s) means that all characters exept a-z, A-Z and 0-9 will be replaced by space.

In [9]:
clean_sent = []
for s in sentences:
    clean = re.sub(r'[^ a-z A-Z 0-9]', " ", s)
    w = word_tokenize(clean)
    clean_sent.append([i.lower() for i in w])

print (clean_sent[:5])
print("Len of clean_sent: ", len(clean_sent))


[['are', 'you', 'a', 'machine'], ['how', 'are', 'the', 'things', 'going'], ['nah', 'not', 'for', 'me'], ['what', 's', 'going', 'on'], ['are', 'you', 'a', 'bot']]
Len of clean_sent:  346


### Documentation for Tokenizer:
https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/text/Tokenizer

In [92]:
token = Tokenizer(filters = '!"#$%&()*+,-./:;<=>?@[\]^_`{|}~')
token.fit_on_texts(clean_sent)
word_index = token.word_index
VOCABULARY_SIZE = len(word_index) + 1
MAX_SENT_LENGTH = len(max(clean_sent, key = len))

#print("Vocab Size = %d. Maximum sent length = %d" % (vocab_size, max_sent_length))
print ("Max sent length: ", MAX_SENT_LENGTH)
encoded_sent = token.texts_to_sequences(clean_sent)
padded_sent = pad_sequences(encoded_sent, maxlen = MAX_SENT_LENGTH, padding = "post")


Max sent length:  12


In [11]:
# print (encoded_sent)
# print ("Num of Sentences: ", len(encoded_sent))

In [12]:
padded_sent[:5]

array([[  4,   1,  23, 129,   0,   0,   0,   0,   0,   0,   0,   0],
       [  2,   4,  60,  61,  24,   0,   0,   0,   0,   0,   0,   0],
       [ 89,  31,  21,  15,   0,   0,   0,   0,   0,   0,   0,   0],
       [  7,  10,  24,  73,   0,   0,   0,   0,   0,   0,   0,   0],
       [  4,   1,  23,  46,   0,   0,   0,   0,   0,   0,   0,   0]],
      dtype=int32)

In [13]:
print("Shape of padded sent = ",padded_sent.shape)

Shape of padded sent =  (346, 12)


# Tokenizing the intents

In [14]:
#tokenizer for the intents
token_intents = Tokenizer(filters = '!"#$%&()*+,-/:;<=>?@[\]^`{|}~')
token_intents.fit_on_texts(intents_unique)


In [15]:
token_intents.word_index

{'leave_budget': 1,
 'deny': 2,
 'goodbye': 3,
 'thanks': 4,
 'confirm': 5,
 'sorry': 6,
 'what_is_your_name': 7,
 'are_you_a_robot': 8,
 'insult': 9,
 'how_are_you': 10,
 'greet': 11,
 'conversation_restart': 12,
 'skills': 13,
 'personal_question': 14}

In [16]:
encoded_output = token_intents.texts_to_sequences(intents)


In [17]:
# print (encoded_output)


In [18]:
encoded_output = np.array(encoded_output).reshape(len(encoded_output), 1)

In [19]:
# print (encoded_output)
type (encoded_output)

numpy.ndarray

In [20]:
encoded_output.shape

(346, 1)

In [21]:
one_hot = OneHotEncoder(sparse = False)
output_one_hot = one_hot.fit_transform(encoded_output)

In [22]:
output_one_hot.shape

(346, 14)

# Define Training- and Testset

In [24]:
from sklearn.model_selection import train_test_split

In [47]:
# train_X, val_X, train_Y, val_Y = train_test_split(padded_sent, output_one_hot, shuffle = True, test_size = 0.1)
train_X = padded_sent
train_Y = output_one_hot
val_X   = train_X
val_Y   = train_Y


In [48]:
print("Shape of train_X = %s and train_Y = %s" % (train_X.shape, train_Y.shape))
print("Shape of val_X = %s and val_Y = %s" % (val_X.shape, val_Y.shape))

Shape of train_X = (346, 12) and train_Y = (346, 14)
Shape of val_X = (346, 12) and val_Y = (346, 14)


# Define Embeddings (from glove)

In [90]:
import os

GLOVE_DIR  = "/Users/cwindheu/gensim-data/glove-wiki-gigaword-200/"
GLOVE_FILE = "glove-wiki-gigaword-200.txt"
EMBEDDING_DIM = 200

embeddings_index = {}

f = open(os.path.join(GLOVE_DIR, GLOVE_FILE))
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))


Found 400001 word vectors.


In [93]:
EMBEDDING_DIM = 200

embedding_matrix = np.zeros((VOCABULARY_SIZE, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector
    else:
        print ("Not in vacabulary: ", word)
                    

Not in vacabulary:  chatbot
Not in vacabulary:  amayzing
Not in vacabulary:  heeey
Not in vacabulary:  helloooo
Not in vacabulary:  jojojo
Not in vacabulary:  thanx
Not in vacabulary:  hellllooooooo
Not in vacabulary:  hellooo
Not in vacabulary:  hiihihi
Not in vacabulary:  thnx
Not in vacabulary:  heyho
Not in vacabulary:  hiii
Not in vacabulary:  sweatheart
Not in vacabulary:  heyo
Not in vacabulary:  ayyyy
Not in vacabulary:  whaddup


# Defining the Model

In [102]:
model = Sequential()
model.add(Embedding(VOCABULARY_SIZE, EMBEDDING_DIM, weights=[embedding_matrix], input_length = MAX_SENT_LENGTH, trainable = True))
model.add(Bidirectional(LSTM(128)))
model.add(Dense(32, activation = "relu"))
model.add(Dropout(0.5))
model.add(Dense(NUM_INTENTS_UNIQUE, activation = "softmax"))

model.compile(loss = "categorical_crossentropy", optimizer = "adam", metrics = ["accuracy"])
model.summary()

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 12, 200)           56800     
_________________________________________________________________
bidirectional_7 (Bidirection (None, 256)               336896    
_________________________________________________________________
dense_13 (Dense)             (None, 32)                8224      
_________________________________________________________________
dropout_7 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_14 (Dense)             (None, 14)                462       
Total params: 402,382
Trainable params: 402,382
Non-trainable params: 0
_________________________________________________________________


# Training the Model

In [103]:
import time

filename = 'dir_01.h5'
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')

start = time.time()

hist = model.fit(train_X, train_Y, epochs = 200, batch_size = 32, validation_data = (val_X, val_Y), callbacks = [checkpoint])

print("Elapsed time in seconds: ", time.time() - start)


Train on 346 samples, validate on 346 samples
Epoch 1/200

Epoch 00001: val_loss improved from inf to 2.31797, saving model to dir_01.h5
Epoch 2/200

Epoch 00002: val_loss improved from 2.31797 to 2.02916, saving model to dir_01.h5
Epoch 3/200

Epoch 00003: val_loss improved from 2.02916 to 1.66250, saving model to dir_01.h5
Epoch 4/200

Epoch 00004: val_loss improved from 1.66250 to 1.34680, saving model to dir_01.h5
Epoch 5/200

Epoch 00005: val_loss improved from 1.34680 to 1.13456, saving model to dir_01.h5
Epoch 6/200

Epoch 00006: val_loss improved from 1.13456 to 0.94963, saving model to dir_01.h5
Epoch 7/200

Epoch 00007: val_loss improved from 0.94963 to 0.73597, saving model to dir_01.h5
Epoch 8/200

Epoch 00008: val_loss improved from 0.73597 to 0.61654, saving model to dir_01.h5
Epoch 9/200

Epoch 00009: val_loss improved from 0.61654 to 0.56770, saving model to dir_01.h5
Epoch 10/200

Epoch 00010: val_loss improved from 0.56770 to 0.38167, saving model to dir_01.h5
Epoch 1


Epoch 00040: val_loss did not improve from 0.00889
Epoch 41/200

Epoch 00041: val_loss did not improve from 0.00889
Epoch 42/200

Epoch 00042: val_loss did not improve from 0.00889
Epoch 43/200

Epoch 00043: val_loss did not improve from 0.00889
Epoch 44/200

Epoch 00044: val_loss did not improve from 0.00889
Epoch 45/200

Epoch 00045: val_loss did not improve from 0.00889
Epoch 46/200

Epoch 00046: val_loss improved from 0.00889 to 0.00852, saving model to dir_01.h5
Epoch 47/200

Epoch 00047: val_loss improved from 0.00852 to 0.00719, saving model to dir_01.h5
Epoch 48/200

Epoch 00048: val_loss did not improve from 0.00719
Epoch 49/200

Epoch 00049: val_loss improved from 0.00719 to 0.00716, saving model to dir_01.h5
Epoch 50/200

Epoch 00050: val_loss improved from 0.00716 to 0.00659, saving model to dir_01.h5
Epoch 51/200

Epoch 00051: val_loss improved from 0.00659 to 0.00585, saving model to dir_01.h5
Epoch 52/200

Epoch 00052: val_loss did not improve from 0.00585
Epoch 53/200



Epoch 00082: val_loss did not improve from 0.00430
Epoch 83/200

Epoch 00083: val_loss did not improve from 0.00430
Epoch 84/200

Epoch 00084: val_loss did not improve from 0.00430
Epoch 85/200

Epoch 00085: val_loss did not improve from 0.00430
Epoch 86/200

Epoch 00086: val_loss did not improve from 0.00430
Epoch 87/200

Epoch 00087: val_loss did not improve from 0.00430
Epoch 88/200

Epoch 00088: val_loss did not improve from 0.00430
Epoch 89/200

Epoch 00089: val_loss improved from 0.00430 to 0.00425, saving model to dir_01.h5
Epoch 90/200

Epoch 00090: val_loss improved from 0.00425 to 0.00421, saving model to dir_01.h5
Epoch 91/200

Epoch 00091: val_loss improved from 0.00421 to 0.00415, saving model to dir_01.h5
Epoch 92/200

Epoch 00092: val_loss improved from 0.00415 to 0.00413, saving model to dir_01.h5
Epoch 93/200

Epoch 00093: val_loss did not improve from 0.00413
Epoch 94/200

Epoch 00094: val_loss did not improve from 0.00413
Epoch 95/200

Epoch 00095: val_loss did not 


Epoch 00125: val_loss did not improve from 0.00410
Epoch 126/200

Epoch 00126: val_loss did not improve from 0.00410
Epoch 127/200

Epoch 00127: val_loss did not improve from 0.00410
Epoch 128/200

Epoch 00128: val_loss did not improve from 0.00410
Epoch 129/200

Epoch 00129: val_loss did not improve from 0.00410
Epoch 130/200

Epoch 00130: val_loss did not improve from 0.00410
Epoch 131/200

Epoch 00131: val_loss did not improve from 0.00410
Epoch 132/200

Epoch 00132: val_loss did not improve from 0.00410
Epoch 133/200

Epoch 00133: val_loss did not improve from 0.00410
Epoch 134/200

Epoch 00134: val_loss did not improve from 0.00410
Epoch 135/200

Epoch 00135: val_loss improved from 0.00410 to 0.00410, saving model to dir_01.h5
Epoch 136/200

Epoch 00136: val_loss did not improve from 0.00410
Epoch 137/200

Epoch 00137: val_loss did not improve from 0.00410
Epoch 138/200

Epoch 00138: val_loss did not improve from 0.00410
Epoch 139/200

Epoch 00139: val_loss did not improve from 0


Epoch 00168: val_loss did not improve from 0.00405
Epoch 169/200

Epoch 00169: val_loss did not improve from 0.00405
Epoch 170/200

Epoch 00170: val_loss did not improve from 0.00405
Epoch 171/200

Epoch 00171: val_loss did not improve from 0.00405
Epoch 172/200

Epoch 00172: val_loss did not improve from 0.00405
Epoch 173/200

Epoch 00173: val_loss did not improve from 0.00405
Epoch 174/200

Epoch 00174: val_loss did not improve from 0.00405
Epoch 175/200

Epoch 00175: val_loss did not improve from 0.00405
Epoch 176/200

Epoch 00176: val_loss did not improve from 0.00405
Epoch 177/200

Epoch 00177: val_loss did not improve from 0.00405
Epoch 178/200

Epoch 00178: val_loss did not improve from 0.00405
Epoch 179/200

Epoch 00179: val_loss did not improve from 0.00405
Epoch 180/200

Epoch 00180: val_loss did not improve from 0.00405
Epoch 181/200

Epoch 00181: val_loss did not improve from 0.00405
Epoch 182/200

Epoch 00182: val_loss did not improve from 0.00405
Epoch 183/200

Epoch 001

In [104]:
 model = load_model("dir_01.h5")

In [105]:
def predictions(text):
    clean = re.sub(r'[^ a-z A-Z 0-9]', " ", text)
    test_word = word_tokenize(clean)
    test_word = [w.lower() for w in test_word]
    test_ls = token.texts_to_sequences(test_word)
    #print(test_word)
    #Check for unknown words
    if [] in test_ls:
        test_ls = list(filter(None, test_ls))
    
    test_ls = np.array(test_ls).reshape(1, len(test_ls))

    #print("test_ls: ", test_ls)

    x = pad_sequences(test_ls, maxlen = MAX_SENT_LENGTH, padding = "post")
    
    # print ("x: ", x)
    
    pred = model.predict_proba(x)
  
    return pred


In [106]:
def get_final_output(pred, classes):
    #print (type (pred))
    #print (pred)
    
    predictions = pred[0]
 
    classes = np.array(classes)
    ids = np.argsort(-predictions)
    classes = classes[ids]
#    predictions = -np.sort(-predictions)

#    for i in range(pred.shape[1]):
#        print("%s has confidence = %s" % (classes[i], (predictions[i])))
    
    return classes[0]


In [107]:
text = "Good morning"
pred = predictions(text)
out = get_final_output(pred, intents_unique)
print (out)

greet


In [109]:
# with open('mowgli_test_new.csv', newline='') as csvfile:
#    testfilelist = list(csv.reader(csvfile))

with open('mowgli_train_new.csv', newline='') as csvfile:
    testfilelist = list(csv.reader(csvfile))

total   = 0
correct = 0

for s in testfilelist:
    right_label = s[1]
    message     = s[0]

    pred   = predictions(message)
    intent = get_final_output(pred, intents_unique)

    if intent == right_label:
        correct += 1
    else:
        print ("%s recognized as %s, correct is %s" % (message, intent, right_label))
    total += 1

print ("Results: %d from %d correct = %4.2f percent" % (correct, total, (correct/total)*100.0))


cool recognized as confirm, correct is thanks
Results: 345 from 346 correct = 99.71 percent
