<a href="https://colab.research.google.com/github/Dark-Sied/Intent_Classification/blob/master/Intent_classification_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Intent Recognition
Article: https://towardsdatascience.com/a-brief-introduction-to-intent-classification-96fda6b1f557    
Changed by Christoph Windheuser, April 2020

In [1]:
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.lancaster import LancasterStemmer
import nltk
import re
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential, load_model
from keras.layers import Dense, LSTM, Bidirectional, Embedding, Dropout
from keras.callbacks import ModelCheckpoint


Using TensorFlow backend.


In [2]:
df = pd.read_csv("Dataset.csv", encoding = "latin1", names = ["Sentence", "Intent"])
print(df.head())
intent = df["Intent"]
unique_intent = list(set(intent))
sentences = list(df["Sentence"])


                Sentence          Intent
0       Need help pleese  commonQ.assist
1              Need help  commonQ.assist
2       I need some info  commonQ.assist
3      Will you help me?  commonQ.assist
4  What else can you do?  commonQ.assist


In [3]:
# print (type (intent))
intent_l = list(set(df["Intent"]))
print (type (intent_l))

# print (sentences)
# print (intent_l)

<class 'list'>


In [4]:
df.head()

Unnamed: 0,Sentence,Intent
0,Need help pleese,commonQ.assist
1,Need help,commonQ.assist
2,I need some info,commonQ.assist
3,Will you help me?,commonQ.assist
4,What else can you do?,commonQ.assist


In [5]:
df.shape

(1113, 2)

In [6]:
print ("No of intents: ", len(unique_intent))
print (unique_intent)


No of intents:  21
['faq.address_proof', 'commonQ.assist', 'commonQ.name', 'faq.biz_new', 'commonQ.bot', 'commonQ.just_details', 'faq.apply_register', 'faq.banking_option_missing', 'faq.biz_simpler', 'faq.borrow_limit', 'commonQ.not_giving', 'commonQ.wait', 'faq.aadhaar_missing', 'commonQ.query', 'faq.biz_category_missing', 'faq.bad_service', 'faq.approval_time', 'faq.application_process', 'faq.borrow_use', 'commonQ.how', 'contact.contact']


In [7]:
print(sentences[:5])

['Need help pleese', 'Need help', 'I need some info', 'Will you help me?', 'What else can you do?']


In [8]:
nltk.download("stopwords")
nltk.download("punkt")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/cwindheu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/cwindheu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [9]:
#define stemmer
#stemmer = LancasterStemmer()

# Word Cleaning
re.sub is a routine from the "Regular Expression" Library.     
r'string' means that this is a "raw string", where backslashes are treated as charachters.    
re.sub(r'[^ a-z A-Z 0-9]', " ", s) means that all characters exept a-z, A-Z and 0-9 will be replaced by space.

In [10]:
def cleaning(sentences):
  words = []
  for s in sentences:
    clean = re.sub(r'[^ a-z A-Z 0-9]', " ", s)
    w = word_tokenize(clean)
    #stemming
    words.append([i.lower() for i in w])
    
  return words  

In [11]:
cleaned_words = cleaning(sentences)
print(len(cleaned_words))
print(cleaned_words[:3])  


1113
[['need', 'help', 'pleese'], ['need', 'help'], ['i', 'need', 'some', 'info']]


### Documentation for Tokenizer:
https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/text/Tokenizer

In [12]:
def create_tokenizer(words, filters = '!"#$%&()*+,-./:;<=>?@[\]^_`{|}~'):
  token = Tokenizer(filters = filters)
  token.fit_on_texts(words)
  return token

In [13]:
def max_length(words):
  return(len(max(words, key = len)))
  

In [14]:
word_tokenizer = create_tokenizer(cleaned_words)
vocab_size = len(word_tokenizer.word_index) + 1
max_length = max_length(cleaned_words)

print("Vocab Size = %d and Maximum length = %d" % (vocab_size, max_length))

Vocab Size = 492 and Maximum length = 28


In [15]:
def encoding_doc(token, words):
  return(token.texts_to_sequences(words))

In [16]:
encoded_doc = encoding_doc(word_tokenizer, cleaned_words)

In [17]:
print (encoded_doc)

[[25, 77, 332], [25, 77], [1, 25, 198, 181], [51, 10, 77, 16], [8, 268, 4, 10, 30], [8, 30, 10, 30], [8, 4, 10, 77, 16, 60], [8, 4, 10, 30], [8, 39, 10, 182, 134], [25, 77, 333], [11, 4, 10, 77, 16], [4, 10, 77, 16], [11, 4, 10, 269], [11, 4, 10, 77, 16], [11, 30, 10, 77], [11, 6, 77, 10], [11, 51, 10, 269, 16], [1, 25, 77], [4, 10, 27, 198, 181], [77, 16, 35], [7, 23, 21, 143], [7, 89, 21, 143, 199, 6, 16], [7, 83, 270, 271], [228, 272, 334], [143, 144, 145], [143, 7, 199, 60, 16], [39, 10, 229, 273, 271], [39, 10, 274], [39, 10, 200], [39, 10, 183, 228], [39, 10, 21, 274], [39, 10, 21, 160], [39, 10, 21, 200], [39, 10, 21, 275], [39, 10, 21, 143], [39, 335, 270, 336], [161, 1, 337, 6, 21, 143], [10, 39, 21, 338, 39, 339], [10, 39, 21, 143, 144, 145], [23, 7, 21, 200], [23, 7, 21, 275], [229, 273, 340], [230, 10, 229, 341], [7, 23, 228, 272, 184], [7, 23, 21, 200], [39, 10, 21, 145], [342, 343], [11, 344, 31, 345], [11, 10, 346], [276, 109], [8, 78, 347], [348, 349, 350], [351], [11, 

In [18]:
def padding_doc(encoded_doc, max_length):
  return(pad_sequences(encoded_doc, maxlen = max_length, padding = "post"))

In [19]:
padded_doc = padding_doc(encoded_doc, max_length)

In [20]:
padded_doc[:5]

array([[ 25,  77, 332,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0],
       [ 25,  77,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0],
       [  1,  25, 198, 181,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0],
       [ 51,  10,  77,  16,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0],
       [  8, 268,   4,  10,  30,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0]], dtype=int32)

In [21]:
print("Shape of padded docs = ",padded_doc.shape)

Shape of padded docs =  (1113, 28)


# Tokenizing the intents

In [22]:
#tokenizer wfor the intents
output_tokenizer = create_tokenizer(unique_intent, filters = '!"#$%&()*+,-/:;<=>?@[\]^`{|}~')


In [23]:
output_tokenizer.word_index

{'faq.address_proof': 1,
 'commonq.assist': 2,
 'commonq.name': 3,
 'faq.biz_new': 4,
 'commonq.bot': 5,
 'commonq.just_details': 6,
 'faq.apply_register': 7,
 'faq.banking_option_missing': 8,
 'faq.biz_simpler': 9,
 'faq.borrow_limit': 10,
 'commonq.not_giving': 11,
 'commonq.wait': 12,
 'faq.aadhaar_missing': 13,
 'commonq.query': 14,
 'faq.biz_category_missing': 15,
 'faq.bad_service': 16,
 'faq.approval_time': 17,
 'faq.application_process': 18,
 'faq.borrow_use': 19,
 'commonq.how': 20,
 'contact.contact': 21}

In [24]:
type (intent)

pandas.core.series.Series

In [25]:
encoded_output = encoding_doc(output_tokenizer, intent)

In [26]:
print (encoded_output)
type (encoded_output)

[[2], [2], [2], [2], [2], [2], [2], [2], [2], [2], [2], [2], [2], [2], [2], [2], [2], [2], [2], [2], [5], [5], [5], [5], [5], [5], [5], [5], [5], [5], [5], [5], [5], [5], [5], [5], [5], [5], [5], [5], [5], [5], [5], [5], [5], [5], [20], [20], [20], [20], [20], [20], [20], [20], [20], [20], [20], [20], [6], [6], [6], [6], [6], [6], [3], [3], [3], [3], [3], [3], [3], [3], [3], [3], [3], [3], [3], [3], [3], [3], [3], [3], [11], [11], [11], [11], [11], [11], [11], [11], [11], [11], [11], [14], [14], [14], [14], [14], [14], [14], [12], [12], [12], [12], [12], [12], [12], [21], [21], [21], [21], [21], [21], [21], [21], [21], [21], [21], [21], [21], [21], [21], [21], [21], [21], [21], [21], [21], [21], [21], [21], [21], [21], [21], [21], [21], [21], [21], [21], [21], [21], [21], [21], [21], [21], [21], [21], [21], [21], [21], [21], [21], [21], [21], [21], [21], [21], [21], [21], [21], [21], [21], [21], [21], [21], [21], [21], [21], [21], [21], [21], [21], [21], [21], [21], [21], [21], [21], [

list

In [27]:
encoded_output = np.array(encoded_output).reshape(len(encoded_output), 1)

In [28]:
print (encoded_output)
type (encoded_output)

[[ 2]
 [ 2]
 [ 2]
 ...
 [19]
 [19]
 [19]]


numpy.ndarray

In [29]:
encoded_output.shape

(1113, 1)

In [30]:
def one_hot(encode):
  o = OneHotEncoder(sparse = False)
  return(o.fit_transform(encode))

In [31]:
output_one_hot = one_hot(encoded_output)

In [32]:
output_one_hot.shape

(1113, 21)

In [33]:
type (output_one_hot)

numpy.ndarray

# Define Training- and Testset

In [34]:
from sklearn.model_selection import train_test_split

In [35]:
train_X, val_X, train_Y, val_Y = train_test_split(padded_doc, output_one_hot, shuffle = True, test_size = 0.2)


In [36]:
print("Shape of train_X = %s and train_Y = %s" % (train_X.shape, train_Y.shape))
print("Shape of val_X = %s and val_Y = %s" % (val_X.shape, val_Y.shape))

Shape of train_X = (890, 28) and train_Y = (890, 21)
Shape of val_X = (223, 28) and val_Y = (223, 21)


# Defining the Model

In [37]:
def create_model(vocab_size, max_length):
  model = Sequential()
  model.add(Embedding(vocab_size, 128, input_length = max_length, trainable = False))
  model.add(Bidirectional(LSTM(128)))
#   model.add(LSTM(128))
  model.add(Dense(32, activation = "relu"))
  model.add(Dropout(0.5))
  model.add(Dense(21, activation = "softmax"))
  
  return model

In [38]:
print ("vocab_size: ", vocab_size)
print ("max_length: ", max_length)


vocab_size:  492
max_length:  28


In [39]:
model = create_model(vocab_size, max_length)

model.compile(loss = "categorical_crossentropy", optimizer = "adam", metrics = ["accuracy"])
model.summary()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 28, 128)           62976     
_________________________________________________________________
bidirectional_1 (Bidirection (None, 256)               263168    
_________________________________________________________________
dense_1 (Dense)              (None, 32)                8224      
_________________________________________________________________
dropout_1 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 21)                693       
Total params: 335,061
Trainable params: 272,085
Non-trainable par

# Training the Model

In [40]:
import time

filename = 'model.h5'
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')

start = time.time()

hist = model.fit(train_X, train_Y, epochs = 100, batch_size = 32, validation_data = (val_X, val_Y), callbacks = [checkpoint])

print("Elapsed time in seconds: ", time.time() - start)


Instructions for updating:
Use tf.cast instead.
Train on 890 samples, validate on 223 samples
Epoch 1/100

Epoch 00001: val_loss improved from inf to 2.78590, saving model to model.h5
Epoch 2/100

Epoch 00002: val_loss improved from 2.78590 to 2.72851, saving model to model.h5
Epoch 3/100

Epoch 00003: val_loss improved from 2.72851 to 2.60632, saving model to model.h5
Epoch 4/100

Epoch 00004: val_loss improved from 2.60632 to 2.58040, saving model to model.h5
Epoch 5/100

Epoch 00005: val_loss improved from 2.58040 to 2.46334, saving model to model.h5
Epoch 6/100

Epoch 00006: val_loss improved from 2.46334 to 2.45804, saving model to model.h5
Epoch 7/100

Epoch 00007: val_loss improved from 2.45804 to 2.31538, saving model to model.h5
Epoch 8/100

Epoch 00008: val_loss improved from 2.31538 to 2.20061, saving model to model.h5
Epoch 9/100

Epoch 00009: val_loss improved from 2.20061 to 2.19842, saving model to model.h5
Epoch 10/100

Epoch 00010: val_loss improved from 2.19842 to 2.1


Epoch 00039: val_loss improved from 1.03098 to 1.00428, saving model to model.h5
Epoch 40/100

Epoch 00040: val_loss did not improve from 1.00428
Epoch 41/100

Epoch 00041: val_loss improved from 1.00428 to 0.96339, saving model to model.h5
Epoch 42/100

Epoch 00042: val_loss did not improve from 0.96339
Epoch 43/100

Epoch 00043: val_loss did not improve from 0.96339
Epoch 44/100

Epoch 00044: val_loss did not improve from 0.96339
Epoch 45/100

Epoch 00045: val_loss improved from 0.96339 to 0.91056, saving model to model.h5
Epoch 46/100

Epoch 00046: val_loss did not improve from 0.91056
Epoch 47/100

Epoch 00047: val_loss did not improve from 0.91056
Epoch 48/100

Epoch 00048: val_loss did not improve from 0.91056
Epoch 49/100

Epoch 00049: val_loss did not improve from 0.91056
Epoch 50/100

Epoch 00050: val_loss improved from 0.91056 to 0.89228, saving model to model.h5
Epoch 51/100

Epoch 00051: val_loss did not improve from 0.89228
Epoch 52/100

Epoch 00052: val_loss did not impr


Epoch 00081: val_loss did not improve from 0.66499
Epoch 82/100

Epoch 00082: val_loss did not improve from 0.66499
Epoch 83/100

Epoch 00083: val_loss did not improve from 0.66499
Epoch 84/100

Epoch 00084: val_loss did not improve from 0.66499
Epoch 85/100

Epoch 00085: val_loss did not improve from 0.66499
Epoch 86/100

Epoch 00086: val_loss did not improve from 0.66499
Epoch 87/100

Epoch 00087: val_loss did not improve from 0.66499
Epoch 88/100

Epoch 00088: val_loss did not improve from 0.66499
Epoch 89/100

Epoch 00089: val_loss did not improve from 0.66499
Epoch 90/100

Epoch 00090: val_loss did not improve from 0.66499
Epoch 91/100

Epoch 00091: val_loss did not improve from 0.66499
Epoch 92/100

Epoch 00092: val_loss improved from 0.66499 to 0.65580, saving model to model.h5
Epoch 93/100

Epoch 00093: val_loss did not improve from 0.65580
Epoch 94/100

Epoch 00094: val_loss did not improve from 0.65580
Epoch 95/100

Epoch 00095: val_loss did not improve from 0.65580
Epoch 96

In [41]:
 model = load_model("model.h5")

In [65]:
def predictions(text):
    clean = re.sub(r'[^ a-z A-Z 0-9]', " ", text)
    test_word = word_tokenize(clean)
    test_word = [w.lower() for w in test_word]
    test_ls = word_tokenizer.texts_to_sequences(test_word)
    print(test_word)
    #Check for unknown words
    if [] in test_ls:
        test_ls = list(filter(None, test_ls))
    
    test_ls = np.array(test_ls).reshape(1, len(test_ls))

    print("test_ls: ", test_ls)

    x = padding_doc(test_ls, max_length)

    #print ("x: ", x)
    
    pred = model.predict_proba(x)
  
    return pred


In [66]:
def get_final_output(pred, classes):
    #print (type (pred))
    #print (pred)
    
    predictions = pred[0]
 
    classes = np.array(classes)
    ids = np.argsort(-predictions)
    classes = classes[ids]
    predictions = -np.sort(-predictions)

    for i in range(pred.shape[1]):
        print("%s has confidence = %s" % (classes[i], (predictions[i])))


In [70]:
text = "orrow money given can used by me for what reasons?"
pred = predictions(text)
get_final_output(pred, unique_intent)

['orrow', 'money', 'given', 'can', 'used', 'by', 'me', 'for', 'what', 'reasons']
test_ls:  [[ 42 133   4  48  68  16   5   8 108]]
faq.borrow_use has confidence = 0.9997315
faq.biz_simpler has confidence = 0.00022668822
faq.application_process has confidence = 3.967376e-05
faq.borrow_limit has confidence = 2.1740723e-06
faq.biz_new has confidence = 1.1228516e-08
faq.approval_time has confidence = 5.149337e-09
commonQ.name has confidence = 2.9953149e-09
faq.aadhaar_missing has confidence = 1.1742087e-09
commonQ.just_details has confidence = 6.154682e-12
commonQ.wait has confidence = 4.4551466e-13
faq.apply_register has confidence = 3.9353574e-14
contact.contact has confidence = 7.382199e-15
commonQ.bot has confidence = 4.6436553e-15
commonQ.assist has confidence = 1.8390822e-15
faq.address_proof has confidence = 9.125884e-16
commonQ.query has confidence = 1.1856214e-16
faq.biz_category_missing has confidence = 1.3492874e-18
commonQ.how has confidence = 1.1865749e-18
faq.banking_option_m