In this notebook we will demonstrate various CNN and RNN models for the task of intent detection on the ATIS dataset. 

In [None]:

#making the necessary imports
import os
import sys
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding, LSTM
from keras.models import Model, Sequential
from keras.initializers import Constant

from sklearn.preprocessing import LabelEncoder

import random
random.seed(0)#for reproducability of results

import pandas as pd
import numpy as np

Using TensorFlow backend.


Loading the training data

In [4]:
import os
os.getcwd()

'/content'

In [12]:
import pandas as pd
import numpy as np

def get_data(filename):
    df = pd.read_csv(filename,delim_whitespace=True,names=['word','label'])
    beg_indices = list(df[df['word'] == 'BOS'].index)+[df.shape[0]]
    sents,labels,intents = [],[],[]
    for i in range(len(beg_indices[:-1])):
        sents.append(df[beg_indices[i]+1:beg_indices[i+1]-1]['word'].values)
        labels.append(df[beg_indices[i]+1:beg_indices[i+1]-1]['label'].values)
        intents.append(df.loc[beg_indices[i+1]-1]['label'])    
    return np.array(sents, dtype=object), np.array(labels, dtype=object), np.array(intents, dtype=object)

def get_data2(filename):
    with open(filename) as f:
        contents = f.read()
    sents,labels,intents = [],[],[]
    for line in contents.strip().split('\n'):
        words,labs = [i.split(' ') for i in line.split('\t')]
        sents.append(words[1:-1])
        labels.append(labs[1:-1])
        intents.append(labs[-1])
    return np.array(sents, dtype=object), np.array(labels, dtype=object), np.array(intents, dtype=object)

read_method = {'atis-2.dev.w-intent.iob':get_data,
               'atis.train.w-intent.iob':get_data2,
               'atis.test.w-intent.iob':get_data,
              'atis-2.train.w-intent.iob':get_data2}

def fetch_data(fname):
    func = read_method[fname]
    return func(fname)

In [19]:
#utils is included in this repository'c Ch6 folder under folder name "Data"
#from Data.utils import fetch_data, read_method
sents,labels,intents = fetch_data('atis.train.w-intent.iob')
print(len(sents),len(labels),len(intents))

4978 4978 4978


In [18]:
sents

array([list(['i', 'want', 'to', 'fly', 'from', 'boston', 'at', '838', 'am', 'and', 'arrive', 'in', 'denver', 'at', '1110', 'in', 'the', 'morning']),
       list(['what', 'flights', 'are', 'available', 'from', 'pittsburgh', 'to', 'baltimore', 'on', 'thursday', 'morning']),
       list(['what', 'is', 'the', 'arrival', 'time', 'in', 'san', 'francisco', 'for', 'the', '755', 'am', 'flight', 'leaving', 'washington']),
       ...,
       list(['which', 'airlines', 'fly', 'into', 'and', 'out', 'of', 'denver']),
       list(['does', 'continental', 'fly', 'from', 'boston', 'to', 'san', 'francisco', 'with', 'a', 'stop', 'in', 'denver']),
       list(['is', 'there', 'a', 'delta', 'flight', 'from', 'denver', 'to', 'san', 'francisco'])],
      dtype=object)

In [16]:
labels

array([list(['O', 'O', 'O', 'O', 'O', 'O', 'B-fromloc.city_name', 'O', 'B-depart_time.time', 'I-depart_time.time', 'O', 'O', 'O', 'B-toloc.city_name', 'O', 'B-arrive_time.time', 'O', 'O', 'B-arrive_time.period_of_day']),
       list(['O', 'O', 'O', 'O', 'O', 'B-fromloc.city_name', 'O', 'B-toloc.city_name', 'O', 'B-depart_date.day_name', 'B-depart_time.period_of_day']),
       list(['O', 'O', 'O', 'B-flight_time', 'I-flight_time', 'O', 'B-fromloc.city_name', 'I-fromloc.city_name', 'O', 'O', 'B-depart_time.time', 'I-depart_time.time', 'O', 'O', 'B-fromloc.city_name']),
       ..., list(['O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-city_name']),
       list(['O', 'B-airline_name', 'O', 'O', 'B-fromloc.city_name', 'O', 'B-toloc.city_name', 'I-toloc.city_name', 'O', 'O', 'O', 'O', 'B-stoploc.city_name']),
       list(['O', 'O', 'O', 'B-airline_name', 'O', 'O', 'B-fromloc.city_name', 'O', 'B-toloc.city_name', 'I-toloc.city_name'])],
      dtype=object)

In [17]:
intents

array(['atis_flight', 'atis_flight', 'atis_flight_time', ...,
       'atis_airline', 'atis_flight', 'atis_flight'], dtype=object)

In [24]:
train_sentences = [" ".join(i) for i in sents]
train_sentences[:10]

['i want to fly from boston at 838 am and arrive in denver at 1110 in the morning',
 'what flights are available from pittsburgh to baltimore on thursday morning',
 'what is the arrival time in san francisco for the 755 am flight leaving washington',
 'cheapest airfare from tacoma to orlando',
 'round trip fares from pittsburgh to philadelphia under 1000 dollars',
 'i need a flight tomorrow from columbus to minneapolis',
 'what kind of aircraft is used on a flight from cleveland to dallas',
 'show me the flights from pittsburgh to los angeles on thursday',
 'all flights from boston to washington',
 'what kind of ground transportation is available in denver']

In [26]:
train_texts = train_sentences
train_labels= intents.tolist()
train_labels[:5]

['atis_flight',
 'atis_flight',
 'atis_flight_time',
 'atis_airfare',
 'atis_airfare']

In [28]:
set(train_labels)

{'atis_abbreviation',
 'atis_aircraft',
 'atis_aircraft#atis_flight#atis_flight_no',
 'atis_airfare',
 'atis_airfare#atis_flight_time',
 'atis_airline',
 'atis_airline#atis_flight_no',
 'atis_airport',
 'atis_capacity',
 'atis_cheapest',
 'atis_city',
 'atis_distance',
 'atis_flight',
 'atis_flight#atis_airfare',
 'atis_flight_no',
 'atis_flight_time',
 'atis_ground_fare',
 'atis_ground_service',
 'atis_ground_service#atis_ground_fare',
 'atis_meal',
 'atis_quantity',
 'atis_restriction'}

In [29]:
vals = []

for i in range(len(train_labels)):
    if "#" in train_labels[i]:
        vals.append(i)
        
for i in vals[::-1]:
    train_labels.pop(i)
    train_texts.pop(i)

print ("Number of training sentences :",len(train_texts))
print ("Number of unique intents :",len(set(train_labels)))

for i in zip(train_texts[:5], train_labels[:5]):
    print(i)

Number of training sentences : 4952
Number of unique intents : 17
('i want to fly from boston at 838 am and arrive in denver at 1110 in the morning', 'atis_flight')
('what flights are available from pittsburgh to baltimore on thursday morning', 'atis_flight')
('what is the arrival time in san francisco for the 755 am flight leaving washington', 'atis_flight_time')
('cheapest airfare from tacoma to orlando', 'atis_airfare')
('round trip fares from pittsburgh to philadelphia under 1000 dollars', 'atis_airfare')


Loading the test data

In [56]:
#from Data.utils import fetch_data, read_method

sents,labels,intents = fetch_data('atis.test.w-intent.iob')

test_sentences = [" ".join(i) for i in sents]

test_texts = test_sentences
test_labels = intents.tolist()

new_labels = set(test_labels) - set(train_labels)

vals = []

for i in range(len(test_labels)):
    if "#" in test_labels[i]:
        vals.append(i)
    elif test_labels[i] in new_labels:
        print(test_labels[i])
        vals.append(i)
        
for i in vals[::-1]:
    test_labels.pop(i)
    test_texts.pop(i)

print ("Number of testing sentences :",len(test_texts))
print ("Number of unique intents :",len(set(test_labels)))

for i in zip(test_texts[:5], test_labels[:5]):
    print(i)

atis_flight
atis_airfare
atis_flight
atis_flight
atis_flight
atis_flight
atis_flight
atis_flight
atis_flight
atis_flight
atis_flight
atis_flight
atis_flight
atis_flight
atis_flight
atis_flight
atis_flight
atis_flight
atis_flight
atis_flight
atis_flight
atis_flight
atis_flight
atis_flight
atis_flight
atis_flight
atis_flight
atis_flight
atis_flight
atis_flight
atis_flight
atis_ground_service
atis_flight
atis_day_name
atis_flight
atis_day_name
atis_flight
atis_flight
atis_flight
atis_flight
atis_flight
atis_flight
atis_flight
atis_flight
atis_flight
atis_flight
atis_flight
atis_flight
atis_flight
atis_meal
atis_meal
atis_flight
atis_flight
atis_flight
atis_flight
atis_flight
atis_flight
atis_flight
atis_flight
atis_flight
atis_flight
atis_flight
atis_flight
atis_flight
atis_flight
atis_flight
atis_flight
atis_flight
atis_flight
atis_flight
atis_flight
atis_flight
atis_flight
atis_flight
atis_flight
atis_flight
atis_flight
atis_flight
atis_flight
atis_flight
atis_flight
atis_flight
atis_fl

In [57]:
set(test_labels)

set()

Pre-Processing

In [58]:
MAX_SEQUENCE_LENGTH = 300
MAX_NUM_WORDS = 20000 
EMBEDDING_DIM = 100 
VALIDATION_SPLIT = 0.3

Su dung Toeknizer (max 20,000 tu) de hoc toan bo tu trong train_texts, sau do se ma hoa tung tu trong cau theo word index (sap xep theo thu tu pho bien nhat)

In [59]:
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(train_texts)
train_sequences = tokenizer.texts_to_sequences(train_texts) #Converting text to a vector of word indexes
test_sequences = tokenizer.texts_to_sequences(test_texts)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 897 unique tokens.


In [60]:
word_index

{'0900': 846,
 '1': 308,
 '10': 148,
 '100': 539,
 '1000': 227,
 '1017': 844,
 '1020': 805,
 '1024': 638,
 '1026': 682,
 '1030': 656,
 '1039': 694,
 '1045': 758,
 '1055': 771,
 '1059': 731,
 '106': 664,
 '1083': 845,
 '11': 379,
 '110': 722,
 '1100': 672,
 '1110': 530,
 '1115': 446,
 '1130': 657,
 '1133': 851,
 '1145': 582,
 '1158': 739,
 '12': 176,
 '1200': 833,
 '1205': 780,
 '1209': 871,
 '1220': 715,
 '1222': 572,
 '1230': 836,
 '124': 683,
 '1245': 447,
 '1288': 645,
 '1291': 501,
 '130': 747,
 '1300': 873,
 '137338': 828,
 '139': 670,
 '150': 721,
 '1500': 868,
 '1505': 813,
 '1600': 706,
 '163': 890,
 '1700': 548,
 '1765': 553,
 '1850': 812,
 '19': 668,
 '1940': 881,
 '1991': 258,
 '1992': 450,
 '1993': 809,
 '2': 228,
 '200': 697,
 '201': 567,
 '21': 502,
 '210': 578,
 '2100': 461,
 '212': 855,
 '2134': 550,
 '2153': 659,
 '217': 590,
 '225': 738,
 '229': 841,
 '230': 434,
 '257': 646,
 '269': 612,
 '270': 523,
 '271': 732,
 '279': 827,
 '281': 397,
 '296': 513,
 '297': 700,
 '

In [61]:
train_sequences[:3]

[[18, 70, 1, 38, 2, 9, 67, 402, 86, 16, 79, 15, 12, 67, 530, 15, 4, 35],
 [6, 3, 26, 58, 2, 19, 1, 22, 5, 77, 35],
 [6, 20, 4, 403, 180, 15, 11, 14, 37, 4, 531, 86, 8, 33, 32]]

In [62]:
le = LabelEncoder()
le.fit(train_labels)
train_labels = le.transform(train_labels)
test_labels = le.transform(test_labels)
print(train_labels.shape)
print(np.unique(train_labels))
train_labels

(4952,)
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16]


array([ 9,  9, 11, ...,  3,  9,  9])

In [63]:
#Converting this to sequences to be fed into neural network. Max seq. len is 1000 as set earlier
 #initial padding of 0s, until vector is of size MAX_SEQUENCE_LENGTH
trainvalid_data = pad_sequences(train_sequences, maxlen=MAX_SEQUENCE_LENGTH)
test_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)
trainvalid_labels = to_categorical(train_labels)
print(trainvalid_data.shape)
trainvalid_data

(4952, 300)


array([[  0,   0,   0, ...,  15,   4,  35],
       [  0,   0,   0, ...,   5,  77,  35],
       [  0,   0,   0, ...,   8,  33,  32],
       ...,
       [  0,   0,   0, ..., 230,  29,  12],
       [  0,   0,   0, ..., 109,  15,  12],
       [  0,   0,   0, ...,   1,  11,  14]], dtype=int32)

In [64]:
print(np.unique(test_labels))
test_labels

[]


array([], dtype=float64)

In [49]:
test_labels = to_categorical(np.asarray(test_labels), num_classes= trainvalid_labels.shape[1])
print(test_labels.shape)
test_labels

(876, 17, 17)


array([[[1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        ...,
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.]],

       [[1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [0., 1., 0., ..., 0., 0., 0.],
        ...,
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.]],

       [[1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        ...,
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.]],

       ...,

       [[1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        ...,
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0.

In [None]:
# split the training data into a training set and a validation set
indices = np.arange(trainvalid_data.shape[0])
np.random.shuffle(indices)
trainvalid_data = trainvalid_data[indices]
trainvalid_labels = trainvalid_labels[indices]
num_validation_samples = int(VALIDATION_SPLIT * trainvalid_data.shape[0])
x_train = trainvalid_data[:-num_validation_samples]
y_train = trainvalid_labels[:-num_validation_samples]
x_val = trainvalid_data[-num_validation_samples:]
y_val = trainvalid_labels[-num_validation_samples:]
#This is the data we will use for CNN and RNN training
print('Splitting the train data into train and valid is done')

Splitting the train data into train and valid is done


In [None]:
print('Preparing embedding matrix.')

# first, build index mapping words in the embeddings set
# to their embedding vector

# Download GloVe 6B from here: https://nlp.stanford.edu/projects/glove/
BASE_DIR = 'Data'
GLOVE_DIR = os.path.join(BASE_DIR, 'glove.6B')

embeddings_index = {}
with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'), encoding="utf-8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors in Glove embeddings.' % len(embeddings_index))
#print(embeddings_index["google"])

# prepare embedding matrix - rows are the words from word_index, columns are the embeddings of that word from glove.
num_words = min(MAX_NUM_WORDS, len(word_index)) + 1
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if i > MAX_NUM_WORDS:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

# load these pre-trained word embeddings into an Embedding layer
# note that we set trainable = False so as to keep the embeddings fixed
embedding_layer = Embedding(num_words,
                            EMBEDDING_DIM,
                            embeddings_initializer=Constant(embedding_matrix),
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)
print("Preparing of embedding matrix is done")

Preparing embedding matrix.
Found 400001 word vectors in Glove embeddings.
Preparing of embedding matrix is done


In [None]:
print('Define a 1D CNN model.')

cnnmodel = Sequential()
cnnmodel.add(embedding_layer)
cnnmodel.add(Conv1D(128, 5, activation='relu'))
cnnmodel.add(MaxPooling1D(5))
cnnmodel.add(Conv1D(128, 5, activation='relu'))
cnnmodel.add(MaxPooling1D(5))
cnnmodel.add(Conv1D(128, 5, activation='relu'))
cnnmodel.add(GlobalMaxPooling1D())
cnnmodel.add(Dense(128, activation='relu'))
cnnmodel.add(Dense(len(trainvalid_labels[0]), activation='softmax'))

cnnmodel.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])

cnnmodel.summary()

#Train the model. Tune to validation set. 
cnnmodel.fit(x_train, y_train,
          batch_size=128,
          epochs=1, validation_data=(x_val, y_val))
#Evaluate on test set:
score, acc = cnnmodel.evaluate(test_data, test_labels)
print('Test accuracy with CNN:', acc)

Define a 1D CNN model.
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 300, 100)          89800     
_________________________________________________________________
conv1d (Conv1D)              (None, 296, 128)          64128     
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 59, 128)           0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 55, 128)           82048     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 11, 128)           0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 7, 128)            82048     
_________________________________________________________________
global_max_pooling1d (Global (Non

In [None]:
print("Defining and training a CNN model, training embedding layer on the fly instead of using pre-trained embeddings")
cnnmodel = Sequential()
cnnmodel.add(Embedding(MAX_NUM_WORDS, 128))
cnnmodel.add(Conv1D(128, 5, activation='relu'))
cnnmodel.add(MaxPooling1D(5))
cnnmodel.add(Conv1D(128, 5, activation='relu'))
cnnmodel.add(MaxPooling1D(5))
cnnmodel.add(Conv1D(128, 5, activation='relu'))
cnnmodel.add(GlobalMaxPooling1D())
cnnmodel.add(Dense(128, activation='relu'))
cnnmodel.add(Dense(len(trainvalid_labels[0]), activation='softmax'))

cnnmodel.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])

cnnmodel.summary()

#Train the model. Tune to validation set. 
cnnmodel.fit(x_train, y_train,
          batch_size=128,
          epochs=1, validation_data=(x_val, y_val))
#Evaluate on test set:
score, acc = cnnmodel.evaluate(test_data, test_labels)
print('Test accuracy with CNN:', acc)


Defining and training a CNN model, training embedding layer on the fly instead of using pre-trained embeddings
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 128)         2560000   
_________________________________________________________________
conv1d_3 (Conv1D)            (None, None, 128)         82048     
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, None, 128)         0         
_________________________________________________________________
conv1d_4 (Conv1D)            (None, None, 128)         82048     
_________________________________________________________________
max_pooling1d_3 (MaxPooling1 (None, None, 128)         0         
_________________________________________________________________
conv1d_5 (Conv1D)            (None, None, 128)         82048     
_________

In [None]:
print("Defining and training an LSTM model, training embedding layer on the fly")

#modified from: 

rnnmodel = Sequential()
rnnmodel.add(Embedding(MAX_NUM_WORDS, 128))
rnnmodel.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
rnnmodel.add(Dense(len(trainvalid_labels[0]), activation='sigmoid'))
rnnmodel.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

rnnmodel.summary()

print('Training the RNN')
rnnmodel.fit(x_train, y_train,
          batch_size=32,
          epochs=1,
          validation_data=(x_val, y_val))
score, acc = rnnmodel.evaluate(test_data, test_labels,
                            batch_size=32)
print('Test accuracy with RNN:', acc)


Defining and training an LSTM model, training embedding layer on the fly
Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, None, 128)         2560000   
_________________________________________________________________
lstm (LSTM)                  (None, 128)               131584    
_________________________________________________________________
dense_4 (Dense)              (None, 17)                2193      
Total params: 2,693,777
Trainable params: 2,693,777
Non-trainable params: 0
_________________________________________________________________
Training the RNN
Test accuracy with RNN: 0.7214611768722534


In [None]:
print("Defining and training an LSTM model, using pre-trained embedding layer")

#modified from: 

rnnmodel2 = Sequential()
rnnmodel2.add(embedding_layer)
rnnmodel2.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
rnnmodel2.add(Dense(len(trainvalid_labels[0]), activation='sigmoid'))
rnnmodel2.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

rnnmodel2.summary()

print('Training the RNN')
rnnmodel2.fit(x_train, y_train,
          batch_size=32,
          epochs=1,
          validation_data=(x_val, y_val))
score, acc = rnnmodel2.evaluate(test_data, test_labels,
                            batch_size=32)
print('Test accuracy with RNN:', acc)


Defining and training an LSTM model, using pre-trained embedding layer
Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 300, 100)          89800     
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               117248    
_________________________________________________________________
dense_5 (Dense)              (None, 17)                2193      
Total params: 209,241
Trainable params: 119,441
Non-trainable params: 89,800
_________________________________________________________________
Training the RNN
Test accuracy with RNN: 0.7214611768722534
