This is a straightforward approach to map natural language to intents in the form of labels. So the whole effort really is in first converting the NL input into vectors and training an appropriate network which understands arbitrary variations.

Mapping input to intends is a common approach to chat-bots in order to present the appropriate response. For example, there are many ways to request a train ticket but it all maps to the same purchase procedure. So, this kinda learning attempts to handle all ways.

In [2]:
# Import necessary libraries
import keras
from keras.models import Sequential
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import SimpleRNN
from keras.layers.core import Dense, Dropout, Flatten
from keras.layers.wrappers import TimeDistributed
from keras.layers import Convolution1D, LSTM
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.utils import np_utils
from keras.preprocessing.text import Tokenizer
import itertools
import numpy as np
from keras.utils.np_utils import to_categorical

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


The data we’ll use in the limited case is as follows:



In [3]:
train = ["What would it cost to travel to the city on Monday?",
         "Need to travel this afternoon",
         "I want to buy a ticket",
         "Can I order a trip?", 
         "I would like to buy a ticket to Brussels", 
 
         "What will be the weather tomorrow?",
         "Will it rain this afternoon?",
         "The sunshine feels great",
         "Can you predict rain?",
         "Guess I should wear a jacket hey!",
 
        "Dit is geheel iets anders",
         "Kan ik dit goed vinden",
         "Wat is dit soms goed",
        "Maar anders is soms goed"]
 
T = "Buy a train ticket"
W = "Asking about the weather"
F = "Babble in 't Vlaamsch"

labelsTrain = [T,
               T,
               T,
               T,
               T,
 
               W,
               W,
               W,
               W,
               W,
 
               F,
               F,
               F,
               F]
 
test = [
        "Do you think it will be sunny tomorrow?",
        "What a wonderful feeling in the sun!",
        "How can I travel to Leuven?",
        "Can I buy it from you?",
        "Anders is heel goed"
       ]
labelsTest = [W, W, T, T, F]
 

This data is constrained in the sense that the testing questions use only words which were trained. In this case we can use the Keras Tokenizer class:

In [10]:
tokenizer = Tokenizer()
all_texts = train + test # Combines two lists
tokenizer.fit_on_texts(all_texts) 
print(tokenizer.word_index)
print("\nLength of all text:",len(all_texts))

{'to': 1, 'i': 2, 'a': 3, 'it': 4, 'the': 5, 'can': 6, 'is': 7, 'goed': 8, 'what': 9, 'travel': 10, 'buy': 11, 'will': 12, 'you': 13, 'dit': 14, 'anders': 15, 'would': 16, 'this': 17, 'afternoon': 18, 'ticket': 19, 'be': 20, 'tomorrow': 21, 'rain': 22, 'soms': 23, 'cost': 24, 'city': 25, 'on': 26, 'monday': 27, 'need': 28, 'want': 29, 'order': 30, 'trip': 31, 'like': 32, 'brussels': 33, 'weather': 34, 'sunshine': 35, 'feels': 36, 'great': 37, 'predict': 38, 'guess': 39, 'should': 40, 'wear': 41, 'jacket': 42, 'hey': 43, 'geheel': 44, 'iets': 45, 'kan': 46, 'ik': 47, 'vinden': 48, 'wat': 49, 'maar': 50, 'do': 51, 'think': 52, 'sunny': 53, 'wonderful': 54, 'feeling': 55, 'in': 56, 'sun': 57, 'how': 58, 'leuven': 59, 'from': 60, 'heel': 61}

Length of all text: 19


texts_to_matrix method -> to convert the sentences directly to equal size arrays:

In [13]:
X_train = tokenizer.texts_to_matrix(train) # Convert the sentences directly to equal size arrays
X_test = tokenizer.texts_to_matrix(test)   # Convert the sentences directly to equal size arrays

In [23]:
print(train)
print("\nShape of X_train:",X_train.shape)
print("Shpae of X_test:",X_test.shape)

['What would it cost to travel to the city on Monday?', 'Need to travel this afternoon', 'I want to buy a ticket', 'Can I order a trip?', 'I would like to buy a ticket to Brussels', 'What will be the weather tomorrow?', 'Will it rain this afternoon?', 'The sunshine feels great', 'Can you predict rain?', 'Guess I should wear a jacket hey!', 'Dit is geheel iets anders', 'Kan ik dit goed vinden', 'Wat is dit soms goed', 'Maar anders is soms goed']

Shape of X_train: (14, 62)
Shpae of X_test: (5, 62)


In [21]:
# Displaying first 2 rows of X_train:
for i in range(0,2):
    print(X_train[i])

[0. 1. 0. 0. 1. 1. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.
 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


### The labels are converted to index vectors in order to use categorical crossentropy.

In [24]:
all_labels = labelsTest + labelsTrain
labels = set(all_labels) # set -> function avoid duplicates
idx2labels = list(labels)
print(idx2labels)

['Buy a train ticket', 'Asking about the weather', "Babble in 't Vlaamsch"]


In [26]:
# The enumerate() function adds a counter to an iterable.
label2idx = dict((v, i) for i, v in enumerate(labels)) # dictionary holding key and value
print(label2idx)

{'Buy a train ticket': 0, 'Asking about the weather': 1, "Babble in 't Vlaamsch": 2}


In [27]:
y_train = to_categorical([label2idx[w] for w in labelsTrain])
y_test = to_categorical([label2idx[w] for w in labelsTest])

In [72]:
print(y_test)

[[0. 1. 0.]
 [0. 1. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [0. 0. 1.]]


In [76]:
for i in labelsTest:
    print(label2idx[i],end=" ")

1 1 0 0 2 

In [51]:
vocab_size = len(tokenizer.word_index) + 1
 
model = Sequential()
model.add(Embedding(2, 45, input_length= X_train.shape[1], dropout=0.2 ))
model.add(Flatten())
model.add(Dense(50, name='middle'))
model.add(Dropout(0.2))
model.add(Dense(3, activation='softmax', name='output')) 
 
model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
 
model.fit(X_train, y=y_train, epochs=1500, verbose=0, validation_split=0.2, shuffle=True)
 
scores = model.evaluate(X_test, y_test, verbose=0)
print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))

  after removing the cwd from sys.path.


acc: 80.00%


Even without sophisticated RNN you get all the accuracy you can wish. Note that despite the lack of non-learned words you can still get results:

In [52]:
model.predict(tokenizer.texts_to_matrix(["Welke dag is het vandaag?"])).round()

array([[0., 1., 0.]], dtype=float32)

### Using word2vec and LSTM

Below you can find networks on the basis of word2vec and LSTM.
There are a few factors:

- the training data is measure zero
- the pretrained word embedding is in English, so the Flemish language is not embedded and hence not learned
- there is no temporal correlation and the learning is not used towards predicting the next word or something like this, so LSTM is not very meaningful here

Still, an accuracy of 20% while the plain dense network gives 100% is surprising. Definitely a proof that ‘more’ is not always automatically better in this field.

In [55]:
embeddings_index = {}
# see here to download the pretrained model
# http://nlp.stanford.edu/projects/glove/
glove_data = './data/glove.6B/glove.6B.50d.txt'
f = open(glove_data,encoding='UTF-8')
for line in f:
    values = line.split()
    word = values[0]
    value = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = value
f.close()
 
print('Loaded %s word vectors.' % len(embeddings_index))
 

Loaded 400000 word vectors.


In [56]:
embedding_dimension = 50 # Setting embedding dimension to 50
word_index = tokenizer.word_index 
print(word_index)

{'to': 1, 'i': 2, 'a': 3, 'it': 4, 'the': 5, 'can': 6, 'is': 7, 'goed': 8, 'what': 9, 'travel': 10, 'buy': 11, 'will': 12, 'you': 13, 'dit': 14, 'anders': 15, 'would': 16, 'this': 17, 'afternoon': 18, 'ticket': 19, 'be': 20, 'tomorrow': 21, 'rain': 22, 'soms': 23, 'cost': 24, 'city': 25, 'on': 26, 'monday': 27, 'need': 28, 'want': 29, 'order': 30, 'trip': 31, 'like': 32, 'brussels': 33, 'weather': 34, 'sunshine': 35, 'feels': 36, 'great': 37, 'predict': 38, 'guess': 39, 'should': 40, 'wear': 41, 'jacket': 42, 'hey': 43, 'geheel': 44, 'iets': 45, 'kan': 46, 'ik': 47, 'vinden': 48, 'wat': 49, 'maar': 50, 'do': 51, 'think': 52, 'sunny': 53, 'wonderful': 54, 'feeling': 55, 'in': 56, 'sun': 57, 'how': 58, 'leuven': 59, 'from': 60, 'heel': 61}


In [57]:
embedding_matrix = np.zeros((len(word_index) + 1, embedding_dimension)) # Initiallizing the embedding matrix with 0

In [None]:
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector[:embedding_dimension]
 

In [67]:
embedding_layer = Embedding(embedding_matrix.shape[0],
                            embedding_matrix.shape[1],
                            weights=[embedding_matrix],
                            input_length=len(word_index) + 1)


In [59]:
from keras.preprocessing.sequence import pad_sequences
X_train = tokenizer.texts_to_sequences(train) # Which Turns The input into numerical arrays 
print(X_train)

[[9, 16, 4, 24, 1, 10, 1, 5, 25, 26, 27], [28, 1, 10, 17, 18], [2, 29, 1, 11, 3, 19], [6, 2, 30, 3, 31], [2, 16, 32, 1, 11, 3, 19, 1, 33], [9, 12, 20, 5, 34, 21], [12, 4, 22, 17, 18], [5, 35, 36, 37], [6, 13, 38, 22], [39, 2, 40, 41, 3, 42, 43], [14, 7, 44, 45, 15], [46, 47, 14, 8, 48], [49, 7, 14, 23, 8], [50, 15, 7, 23, 8]]


In [61]:
print("Length of word_index:",len(word_index))

Length of word_index: 61


In [63]:
X_train = pad_sequences(X_train, maxlen=len(word_index) + 1) 
# pad_sequence takes a LIST of sequences as an input (list of list) and will return a list of padded sequences.

# First 2 rows of X_train
for i in range(0,2):
    print(X_train[i])

[ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  9 16  4 24  1 10  1  5 25 26 27]
[ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0 28  1 10 17 18]


In [70]:
model = Sequential()
model.add(embedding_layer)
model.add(Flatten())
model.add(Dense(50, activation='sigmoid'))
model.layers[0].trainable=False # bug in Keras or Theano
model.add(Dropout(0.2))
model.add(Dense(3, activation='softmax', name='output')) 
 
model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
 
model.fit(X_train, y=y_train, epochs=2500, verbose=0, validation_split=0.2, shuffle=True)
 
scores = model.evaluate(X_test, y_test, verbose=0)
print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))

acc: 40.00%


In [71]:
model = Sequential()
model.add(embedding_layer)
model.add(LSTM(128, dropout_W=0.2, dropout_U=0.2)) 
model.add(Dense(3, activation='softmax', name='output')) 
 
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
 
model.fit(X_train, y=y_train, nb_epoch=1000, verbose=0, validation_split=0.2, shuffle=True)
 
scores = model.evaluate(X_test, y_test, verbose=0)
print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))
 

  This is separate from the ipykernel package so we can avoid doing imports until


acc: 40.00%
