In [1]:
import nltk
#import gensim
import pandas as pd
import numpy as np
import snowballstemmer
from sklearn.model_selection import train_test_split

## Loading Intent DataSet

In [2]:
# Load the intent dataset; this notebook reads its 'text' and 'intents' columns.
intent_dataset = pd.read_csv('./snipsdataset.csv')

# Compute the distinct intent labels once and reuse them for both printouts.
unique_intents = intent_dataset['intents'].unique()
print(unique_intents)
print(len(unique_intents))

# Independent text-only frame. Selecting and copying avoids relying on
# list-key assignment into an empty DataFrame and keeps lm_dataset decoupled
# from intent_dataset.
lm_dataset = intent_dataset[['text']].copy()

FileNotFoundError: File b'./snipsdataset.csv' does not exist

## Split the dataset into a training set (80%) and a test set (20%) (the ratio can be changed)

In [11]:
# Hold out 20% of the rows for testing; random_state is fixed so the split is
# repeatable. Other cells add columns ('cleaned', 'result_*') to these frames,
# so take explicit copies instead of keeping slices of intent_dataset.
intent_train_dataset, intent_test_dataset = (
    split.copy()
    for split in train_test_split(intent_dataset, test_size=0.2, random_state=1)
)


In [13]:
# Report the size of each split and the number of distinct training labels.
split_summary = (
    ("Test Dataset : ", len(intent_test_dataset)),
    ("Train Dataset : ", len(intent_train_dataset)),
    ("Labels : ", len(intent_train_dataset['intents'].unique())),
)
for caption, count in split_summary:
    print(caption, count)

Test Dataset :  3177
Train Dataset :  12707
Labels :  7


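# Custom RNN for Text Classification (with TF-IDF features)

Note: despite the "RNN" name, the model built in this section is a feed-forward network of Dense layers trained on TF-IDF vectors.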

Load Keras Library

In [14]:
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Activation, Dense, Dropout, LSTM, Embedding
from sklearn.preprocessing import LabelBinarizer

Using TensorFlow backend.


Check if Keras uses GPU

In [15]:
from keras import backend as K
# Display the GPU devices visible to the Keras TensorFlow backend.
# NOTE(review): _get_available_gpus is a private (underscore-prefixed) helper;
# it may not exist in other Keras versions - confirm before upgrading.
K.tensorflow_backend._get_available_gpus()

['/job:localhost/replica:0/task:0/device:GPU:0']

Load Sentences from pandas

In [16]:
# Pull the raw text and intent label arrays out of each split.
train_sentences, train_labels = (
    intent_train_dataset[column].values for column in ('text', 'intents')
)
test_sentences, test_labels = (
    intent_test_dataset[column].values for column in ('text', 'intents')
)

Parameters for the model

In [21]:
# Hyper-parameters for the TF-IDF model.
# vocab_size feeds Tokenizer(num_words=...) and the first Dense layer's input
# width; batch_size is shared by fit() and evaluate().
num_labels, batch_size = 7, 1000
vocab_size = top_words = 15000
embedding_vector_length = 300

Tokenize and vectorize the sentences using TF-IDF

In [18]:
# Fit the vocabulary on the training sentences only, then turn both splits
# into TF-IDF matrices with that same tokenizer.
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(train_sentences)
x_train, x_test = (
    tokenizer.texts_to_matrix(sentences, mode='tfidf')
    for sentences in (train_sentences, test_sentences)
)

Encode the Labels

In [19]:
# Binarize the intent labels; the encoder is fitted on the training labels
# and then applied to both splits.
encoder = LabelBinarizer().fit(train_labels)
y_train, y_test = (
    encoder.transform(labels) for labels in (train_labels, test_labels)
)

Build the Model

In [24]:
# Feed-forward classifier over the TF-IDF vectors: one hidden Dense layer
# with dropout, followed by a softmax over the intent labels.
model = Sequential([
    Dense(50, input_shape=(vocab_size,)),
    Activation('relu'),
    Dropout(0.2),
    Dense(num_labels),
    Activation('softmax'),
])
model.summary()

compile_options = {
    'loss': 'categorical_crossentropy',
    'optimizer': 'adam',
    'metrics': ['accuracy'],
}
model.compile(**compile_options)

# 10% of the training data is held out by Keras for validation.
fit_options = {
    'batch_size': batch_size,
    'epochs': 10,
    'verbose': 1,
    'validation_split': 0.1,
}
history = model.fit(x_train, y_train, **fit_options)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_7 (Dense)              (None, 50)                750050    
_________________________________________________________________
activation_7 (Activation)    (None, 50)                0         
_________________________________________________________________
dropout_4 (Dropout)          (None, 50)                0         
_________________________________________________________________
dense_8 (Dense)              (None, 7)                 357       
_________________________________________________________________
activation_8 (Activation)    (None, 7)                 0         
Total params: 750,407
Trainable params: 750,407
Non-trainable params: 0
_________________________________________________________________
Train on 11436 samples, validate on 1271 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10

Test the model

In [25]:
# Evaluate on the held-out split; score[1] is the accuracy metric requested
# in model.compile(metrics=['accuracy']).
score = model.evaluate(x_test, y_test,
                       batch_size=batch_size, verbose=1)

print('Test accuracy:', score[1])

text_labels = encoder.classes_

# Predict the whole test matrix in one batched call instead of invoking
# model.predict once per row, then map each row's arg-max index back to its
# label text.
predictions = model.predict(x_test, batch_size=batch_size)
Result = list(text_labels[np.argmax(predictions, axis=1)])

# Append the list of results to the test dataframe
pd.options.mode.chained_assignment = None  # default='warn'
intent_test_dataset['result_CustomRNN'] = Result


Test accuracy: 0.9757632947696017


Confusion Matrix

In [28]:
from pandas_ml import ConfusionMatrix

# Compare the true intents with the predictions stored by the evaluation cell.
actual_intents = intent_test_dataset['intents'].values
predicted_intents = intent_test_dataset['result_CustomRNN'].values
cm = ConfusionMatrix(actual_intents, predicted_intents)
cm.print_stats()

  num = df[df > 1].dropna(axis=[0, 1], thresh=1).applymap(lambda n: choose(n, 2)).sum().sum() - np.float64(nis2 * njs2) / n2
  return(np.float64(self.LRP) / self.LRN)


Confusion Matrix:

Predicted             AddToPlaylist  BookRestaurant  GetWeather  PlayMusic  \
Actual                                                                       
AddToPlaylist                   444               0           0          3   
BookRestaurant                    0             461           0          0   
GetWeather                        0               3         459          0   
PlayMusic                        19               0           1        381   
RateBook                          0               1           0          0   
SearchCreativeWork                3               0           1          6   
SearchScreeningEvent              0               0           1          0   
__all__                         466             465         462        390   

Predicted             RateBook  SearchCreativeWork  SearchScreeningEvent  \
Actual                                                                     
AddToPlaylist                0                  

# Custom CNN for Text Classification (with Learned embeddings)

Load Keras Library

In [29]:
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential, Model
from keras.layers import Activation, Concatenate, Dense, Conv2D, Reshape, MaxPool2D, Dropout, LSTM, Embedding, Bidirectional, GlobalAveragePooling1D, CuDNNLSTM, Input, Multiply, TimeDistributed, multiply, Flatten, RepeatVector, Permute, Lambda
from keras.optimizers import Adamax, Adam
from keras.preprocessing import sequence
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split

Check if Keras uses GPU

In [30]:
from keras import backend as K
# Display the GPU devices visible to the Keras TensorFlow backend.
# NOTE(review): _get_available_gpus is a private (underscore-prefixed) helper;
# it may not exist in other Keras versions - confirm before upgrading.
K.tensorflow_backend._get_available_gpus()

['/job:localhost/replica:0/task:0/device:GPU:0']

Parameters for the model

In [45]:
# Hyper-parameters shared by the tokenizer, the embedding and training.
num_labels, batch_size = 7, 1000
vocab_size = top_words = 15000
embedding_vector_length, maxlen = 300, 30

## for CNN
# One convolution branch is built per entry in filter_sizes; drop is the
# dropout rate applied before the final Dense layer.
filter_sizes = [2, 4, 6]
num_filters, drop = 20, 0.5


Tokenize

In [36]:
import snowballstemmer
import nltk
pd.options.mode.chained_assignment = None  # default='warn'

# NOTE(review): this binds the stemmer class itself (not an instance) and the
# name is not used by the visible cells; left unchanged for compatibility.
stemmer = snowballstemmer.EnglishStemmer
stop = nltk.corpus.stopwords.words('english')
#stop.extend(lowfreq)
toktok = nltk.tokenize.toktok.ToktokTokenizer()

# Characters replaced by a space before tokenizing. Written as a raw string
# (with '-' escaped explicitly) so it contains no invalid string escapes; it
# matches the same characters as the previous non-raw pattern.
_PUNCTUATION_PATTERN = r'[!"#%\'()*+,\-./:;<=>?@\[\]^_`{|}~’”“′‘\\]'


def _clean_text(texts):
    """Return a cleaned copy of a Series of sentences.

    Steps: replace punctuation with spaces, lower-case, tokenize with the
    Toktok tokenizer, drop English stop words, and re-join the remaining
    tokens with single spaces. The input Series is not modified.
    """
    stop_words = set(stop)  # set membership instead of scanning the list per token
    spaced = texts.replace(_PUNCTUATION_PATTERN, ' ', regex=True)
    tokenized = spaced.str.lower().apply(toktok.tokenize)
    return tokenized.apply(
        lambda words: " ".join(word for word in words if word not in stop_words)
    )


# Apply the identical pipeline to both splits and assign the result in one
# step, rather than mutating a column selection in place.
intent_train_dataset['cleaned'] = _clean_text(intent_train_dataset['text'])
intent_test_dataset['cleaned'] = _clean_text(intent_test_dataset['text'])

Load Sentences from pandas

In [37]:
# Model inputs are the cleaned sentences; targets are the intent labels.
train_frame, test_frame = intent_train_dataset, intent_test_dataset
train_sentences, test_sentences = train_frame['cleaned'].values, test_frame['cleaned'].values
train_labels, test_labels = train_frame['intents'].values, test_frame['intents'].values

In [40]:
# Fit the vocabulary on the training sentences, then convert each split to
# integer sequences padded/truncated to maxlen.
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(train_sentences)
x_train, x_test = (
    sequence.pad_sequences(tokenizer.texts_to_sequences(sentences), maxlen=maxlen)
    for sentences in (train_sentences, test_sentences)
)
x_train.shape

(12707, 30)

Encode the Labels

In [41]:
# Fit the label binarizer on the training labels and encode both splits.
encoder = LabelBinarizer().fit(train_labels)
y_train = encoder.transform(train_labels)
y_test = encoder.transform(test_labels)

Build the Model

In [46]:
def _conv_pool_branch(embedded, filter_size):
    """Build one convolution + max-pool branch over the embedded sentence.

    The kernel spans `filter_size` tokens and the full embedding width; the
    pooling window covers every position the 'valid' convolution produces
    (maxlen - filter_size + 1), leaving one value per filter.
    """
    conv = Conv2D(num_filters,
                  kernel_size=(filter_size, embedding_vector_length),
                  padding='valid',
                  kernel_initializer='normal',
                  activation='relu')(embedded)
    return MaxPool2D(pool_size=(maxlen - filter_size + 1, 1),
                     strides=(1, 1),
                     padding='valid')(conv)


# Token ids -> learned embeddings -> add a trailing channel axis for Conv2D.
Inputs_W = Input(shape=(maxlen,))
Inputs_E = Embedding(vocab_size, embedding_vector_length, input_length=maxlen)(Inputs_W)
reshape = Reshape((maxlen, embedding_vector_length, 1))(Inputs_E)

# One branch per configured filter size, driven by filter_sizes instead of
# three copy-pasted layer definitions.
pooled_branches = [_conv_pool_branch(reshape, size) for size in filter_sizes]

concatenated_tensor = Concatenate(axis=1)(pooled_branches)
flatten = Flatten()(concatenated_tensor)
dropout = Dropout(drop)(flatten)
Dense_Label = Dense(num_labels)(dropout)
Classifier = Activation('softmax')(Dense_Label)

# Keras 2 keyword names are `inputs` / `outputs`; `input` / `output` are the
# legacy spellings.
model = Model(inputs=Inputs_W, outputs=Classifier)
model.summary()

optimizer = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0015, amsgrad=False)

model.compile(loss='categorical_crossentropy',
              optimizer=optimizer,
              metrics=['accuracy'])

# 10% of the training data is held out by Keras for validation.
history = model.fit(x_train, y_train,
                    batch_size=batch_size,
                    epochs=25,
                    verbose=1,
                    validation_split=0.1)



__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            (None, 30)           0                                            
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 30, 300)      4500000     input_3[0][0]                    
__________________________________________________________________________________________________
reshape_3 (Reshape)             (None, 30, 300, 1)   0           embedding_3[0][0]                
__________________________________________________________________________________________________
conv2d_4 (Conv2D)               (None, 29, 1, 20)    12020       reshape_3[0][0]                  
__________________________________________________________________________________________________
conv2d_5 (

Test the model

In [47]:
# Evaluate on the held-out split; score[1] is the accuracy metric requested
# in model.compile(metrics=['accuracy']).
score = model.evaluate(x_test, y_test,
                       batch_size=batch_size, verbose=1)

print('Test accuracy:', score[1])

text_labels = encoder.classes_

# Predict all test sequences in one batched call instead of one
# model.predict call per row, then map each arg-max index to its label text.
predictions = model.predict(x_test, batch_size=batch_size)
Result = list(text_labels[np.argmax(predictions, axis=1)])

# Append the list of results to the test dataframe.
# The CNN predictions get their own column so they do not overwrite the
# 'result_CustomRNN' column written by the TF-IDF model's evaluation.
pd.options.mode.chained_assignment = None  # default='warn'
intent_test_dataset['result_CustomCNN'] = Result


Test accuracy: 0.9807995105450131


Confusion Matrix

In [None]:
from pandas_ml import ConfusionMatrix

# Ground-truth labels are in the 'intents' column; no visible cell creates an
# 'ACM' column on this frame.
# Prefer a CNN-specific prediction column when the frame has one; otherwise
# fall back to 'result_CustomRNN'.
cnn_result_column = (
    'result_CustomCNN'
    if 'result_CustomCNN' in intent_test_dataset.columns
    else 'result_CustomRNN'
)
cm = ConfusionMatrix(intent_test_dataset['intents'].values,
                     intent_test_dataset[cnn_result_column].values)
cm.print_stats()

# Custom LSTM for Text Classification (with Learned Vector embeddings)

Load Keras Library

In [50]:
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Activation, Dense, Dropout, LSTM, Embedding, Bidirectional, GlobalAveragePooling1D
from keras.preprocessing import sequence
from sklearn.preprocessing import LabelBinarizer

Check if Keras uses GPU

In [51]:
from keras import backend as K
# Display the GPU devices visible to the Keras TensorFlow backend.
# NOTE(review): _get_available_gpus is a private (underscore-prefixed) helper;
# it may not exist in other Keras versions - confirm before upgrading.
K.tensorflow_backend._get_available_gpus()

['/job:localhost/replica:0/task:0/device:GPU:0']

Load Sentences from pandas

In [52]:
# The LSTM section goes back to the raw 'text' column (not 'cleaned').
text_column, label_column = 'text', 'intents'
train_sentences = intent_train_dataset[text_column].values
test_sentences = intent_test_dataset[text_column].values
train_labels = intent_train_dataset[label_column].values
test_labels = intent_test_dataset[label_column].values

Parameters for the model

In [65]:
# Hyper-parameters for the bidirectional LSTM model.
# lstm_size is the per-direction LSTM width and the embedding uses the same
# size; maxlen is the padded sequence length.
num_labels, vocab_size = 7, 20000
batch_size, maxlen = 100, 50
lstm_size = embedding_vector_length = 256

Tokenize

In [60]:
# Fit the vocabulary on the training sentences only.
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(train_sentences)


def _to_padded_sequences(sentences):
    """Convert sentences to integer sequences padded/truncated to maxlen."""
    return sequence.pad_sequences(tokenizer.texts_to_sequences(sentences), maxlen=maxlen)


x_train = _to_padded_sequences(train_sentences)
x_test = _to_padded_sequences(test_sentences)

Encode the Labels

In [61]:
# Binarize the intent labels using an encoder fitted on the training labels.
encoder = LabelBinarizer()
encoder.fit(train_labels)
y_train, y_test = map(encoder.transform, (train_labels, test_labels))

Build the Model

In [66]:
# Learned embeddings -> bidirectional LSTM -> softmax over intent labels.
model = Sequential([
    Embedding(vocab_size, embedding_vector_length, input_length=maxlen),
    Bidirectional(LSTM(lstm_size, dropout=0.2, recurrent_dropout=0.2)),
    Dense(num_labels),
    Activation('softmax'),
])
model.summary()

model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# 10% of the training data is held out by Keras for validation.
training_options = dict(batch_size=batch_size, epochs=10, verbose=1, validation_split=0.1)
history = model.fit(x_train, y_train, **training_options)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 50, 256)           5120000   
_________________________________________________________________
bidirectional_6 (Bidirection (None, 512)               1050624   
_________________________________________________________________
dense_15 (Dense)             (None, 7)                 3591      
_________________________________________________________________
activation_15 (Activation)   (None, 7)                 0         
Total params: 6,174,215
Trainable params: 6,174,215
Non-trainable params: 0
_________________________________________________________________
Train on 11436 samples, validate on 1271 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


Test the model

In [67]:
# Evaluate on the held-out split; score[1] is the accuracy metric requested
# in model.compile(metrics=['accuracy']).
score = model.evaluate(x_test, y_test,
                       batch_size=batch_size, verbose=1)

print('Test accuracy:', score[1])

text_labels = encoder.classes_

# Predict all test sequences in one batched call instead of one
# model.predict call per row, then map each arg-max index to its label text.
predictions = model.predict(x_test, batch_size=batch_size)
Result = list(text_labels[np.argmax(predictions, axis=1)])

# Append the list of results to the test dataframe
pd.options.mode.chained_assignment = None  # default='warn'
intent_test_dataset['result_CustomLSTM'] = Result


Test accuracy: 0.982373317457949


KeyboardInterrupt: 

Confusion Matrix

In [None]:
from pandas_ml import ConfusionMatrix

# Ground-truth labels are in the 'intents' column; no visible cell creates an
# 'ACM' column on this frame.
cm = ConfusionMatrix(intent_test_dataset['intents'].values,
                     intent_test_dataset['result_CustomLSTM'].values)
cm.print_stats()