In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
import nltk 
import string
import re
%matplotlib inline
pd.set_option('display.max_colwidth', 100)
from keras.layers import Dropout, Dense, GRU, Embedding
from keras.models import Sequential
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn import metrics
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.datasets import fetch_20newsgroups

In [2]:
# Load the raw utterance/intent table from the Excel workbook.
df = pd.read_excel('RNN-data.xlsx')

# Keep only the top-level intent: everything before the first '.' in `intent`.
df['intent_cleaned'] = df['intent'].str.split('.').str[0]

def remove_punct(text):
    """Strip ASCII punctuation and ASCII digit runs from ``text``.

    Characters listed in ``string.punctuation`` are deleted (not replaced by
    a space), then every run of the digits 0-9 is deleted as well.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string; whitespace is left untouched.
    """
    without_punct = text.translate(str.maketrans('', '', string.punctuation))
    return re.sub('[0-9]+', '', without_punct)

# English stopword list from NLTK (entries are lower-case).
stopword = nltk.corpus.stopwords.words('english')

def remove_stopwords(text):
    """Split ``text`` into word tokens and drop English stopwords.

    Tokens are produced by splitting on runs of non-word characters.
    Empty tokens (which ``re.split`` yields when the text starts or ends
    with a separator) are discarded, and the stopword comparison is
    case-insensitive so that e.g. "The" and "I" are removed as well.

    Args:
        text: The string to tokenize.

    Returns:
        A list of the remaining tokens, in their original order and case.
    """
    # Raw string: '\W' in a plain literal is an invalid escape sequence.
    tokens = re.split(r'\W+', text)
    return [word for word in tokens if word and word.lower() not in stopword]

# Shared Porter stemmer instance used by `stemming`.
ps = nltk.PorterStemmer()

def stemming(text):
    """Apply the Porter stemmer to every token.

    Args:
        text: An iterable of word tokens.

    Returns:
        A list with the stem of each token, in the same order.
    """
    return list(map(ps.stem, text))

# Shared WordNet lemmatizer instance used by `lemmatizer`.
wn = nltk.WordNetLemmatizer()

def lemmatizer(text):
    """Lemmatize every token and join the results into one string.

    Args:
        text: An iterable of word tokens.

    Returns:
        A single string with the lemmatized tokens separated by one space.
    """
    return ' '.join(map(wn.lemmatize, text))

def clean_data(x):
    """Normalize one raw utterance into a space-separated token string.

    Pipeline: drop non-ASCII characters, strip URLs, remove punctuation and
    digits, remove stopwords, stem, then lemmatize and re-join.

    Args:
        x: The raw utterance. Non-string values are tolerated: missing
            values (None/NaN) are treated as an empty string and anything
            else is converted with ``str``.

    Returns:
        The cleaned utterance as a single string.
    """
    if not isinstance(x, str):
        # Blank spreadsheet cells arrive as NaN; avoid AttributeError on .encode.
        x = '' if pd.isna(x) else str(x)
    x = x.encode('ascii', 'ignore').decode()  # drop non-ASCII characters
    # Only strip real http:// or https:// URLs. The previous pattern
    # ('https*\S+') also deleted ordinary tokens such as "https" or "httpd".
    x = re.sub(r'https?://\S+', '', x)
    x = remove_punct(x)      # remove punctuation and digits
    x = remove_stopwords(x)  # tokenize and remove stopwords
    x = stemming(x)          # stem each token
    # NOTE(review): lemmatizing already-stemmed tokens probably changes little;
    # kept to preserve the existing pipeline.
    x = lemmatizer(x)        # lemmatize and join back into a string
    return x

# Run the cleaning pipeline on every raw utterance.
df['utterance_cleaned'] = df['utterance'].apply(clean_data)

# RNN

In [3]:
def TFIDF(X_train, X_test, MAX_NB_WORDS=75000):
    """Vectorize train and test texts with TF-IDF.

    The vectorizer is fitted on ``X_train`` only and then applied to
    ``X_test``. Both results are converted to dense arrays, and the number
    of features is printed.

    Args:
        X_train: Training documents (passed to ``fit_transform``).
        X_test: Test documents (passed to ``transform``).
        MAX_NB_WORDS: Value forwarded as ``max_features`` to the vectorizer.

    Returns:
        A tuple ``(train_matrix, test_matrix)`` of dense arrays.
    """
    tfidf = TfidfVectorizer(max_features=MAX_NB_WORDS)
    train_matrix = tfidf.fit_transform(X_train).toarray()
    test_matrix = tfidf.transform(X_test).toarray()

    n_features = train_matrix.shape[1]
    print("tf-idf with", str(n_features), "features")

    return train_matrix, test_matrix

from sklearn.model_selection import train_test_split
# Features are the cleaned utterances; targets are the top-level intents.
X, y = df['utterance_cleaned'], df['intent_cleaned']

# Hold out one third of the rows; fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [4]:
import csv
import tensorflow as tf
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, LSTM, Dropout, Activation, Embedding, Bidirectional

# --- Text-encoding configuration ---
vocab_size = 5000       # keep only the most frequent words
embedding_dim = 64
max_length = 200        # every sequence is padded/truncated to this length
trunc_type = 'post'
padding_type = 'post'
oov_tok = '<OOV>'       # token substituted for out-of-vocabulary words
training_portion = .8

# The vocabulary is learned from the training split only.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index


def _encode_and_pad(texts):
    """Return (integer sequences, padded array) for ``texts`` using the fitted tokenizer."""
    sequences = tokenizer.texts_to_sequences(texts)
    padded = pad_sequences(
        sequences,
        maxlen=max_length,
        padding=padding_type,
        truncating=trunc_type,
    )
    return sequences, padded


train_sequences, train_padded = _encode_and_pad(X_train)
validation_sequences, validation_padded = _encode_and_pad(X_test)


In [48]:
from sklearn import preprocessing

# One-hot encode the intent labels. (Despite the name `le`, this is a
# OneHotEncoder, not a LabelEncoder.) It is fitted on the full label column
# `y` so that every intent gets a column regardless of which split it is in.
le = preprocessing.OneHotEncoder()
le.fit(pd.DataFrame(y))

y_train_encoded = le.transform(pd.DataFrame(y_train)).toarray()
y_test_encoded = le.transform(pd.DataFrame(y_test)).toarray()

# Size the output layer from the encoder instead of hard-coding the class
# count, so the model keeps working when the set of intents changes.
num_classes = y_train_encoded.shape[1]

# Embedding -> dropout -> bidirectional LSTM -> softmax over intents.
# NOTE(review): inputs are post-padded with zeros to `max_length` and the
# embedding does not mask them; consider `mask_zero=True` - confirm the
# effect on training before enabling.
model = Sequential()
model.add(Embedding(vocab_size, embedding_dim))
model.add(Dropout(0.5))
model.add(Bidirectional(LSTM(embedding_dim)))
model.add(Dense(num_classes, activation='softmax'))
model.summary()

# `lr=` is deprecated and `decay=` is rejected by current Keras optimizers.
# InverseTimeDecay with decay_steps=1 gives the same per-step schedule,
# lr / (1 + decay * step), that the legacy `decay` argument applied.
lr_schedule = tf.keras.optimizers.schedules.InverseTimeDecay(
    initial_learning_rate=0.01,
    decay_steps=1,
    decay_rate=1e-6,
)
opt = tf.keras.optimizers.Adam(learning_rate=lr_schedule)
model.compile(
    loss='categorical_crossentropy',
    optimizer=opt,
    metrics=['accuracy'],
)

OneHotEncoder()

In [72]:
# Train for a fixed number of epochs; the held-out split is used as the
# validation set and one summary line is logged per epoch (verbose=2).
num_epochs = 10
history = model.fit(
    x=train_padded,
    y=y_train_encoded,
    epochs=num_epochs,
    validation_data=(validation_padded, y_test_encoded),
    verbose=2,
)

Epoch 1/10
2/2 - 1s - loss: 2.5296 - accuracy: 0.0682 - val_loss: 2.1201 - val_accuracy: 0.3043
Epoch 2/10
2/2 - 0s - loss: 2.1323 - accuracy: 0.2500 - val_loss: 2.2385 - val_accuracy: 0.1304
Epoch 3/10
2/2 - 0s - loss: 2.0990 - accuracy: 0.2045 - val_loss: 2.1277 - val_accuracy: 0.3043
Epoch 4/10
2/2 - 0s - loss: 2.0433 - accuracy: 0.2500 - val_loss: 2.1045 - val_accuracy: 0.3478
Epoch 5/10
2/2 - 0s - loss: 1.9210 - accuracy: 0.4318 - val_loss: 2.0897 - val_accuracy: 0.3478
Epoch 6/10
2/2 - 0s - loss: 1.8264 - accuracy: 0.4318 - val_loss: 2.0900 - val_accuracy: 0.3478
Epoch 7/10
2/2 - 0s - loss: 1.6758 - accuracy: 0.5682 - val_loss: 2.0433 - val_accuracy: 0.3913
Epoch 8/10
2/2 - 0s - loss: 1.4744 - accuracy: 0.6136 - val_loss: 1.9799 - val_accuracy: 0.4348
Epoch 9/10
2/2 - 0s - loss: 1.1660 - accuracy: 0.6136 - val_loss: 1.9949 - val_accuracy: 0.3913
Epoch 10/10
2/2 - 0s - loss: 0.9297 - accuracy: 0.7273 - val_loss: 2.0872 - val_accuracy: 0.3478


In [82]:
# Decode the predicted class probabilities back to intent labels using the
# fitted one-hot encoder.
y_pred = le.inverse_transform(model.predict(validation_padded))

# Flatten to 1-D label vectors for the metric without overwriting `y_test`,
# so the cell stays idempotent and earlier cells can be re-run safely.
y_true_flat = np.asarray(y_test).ravel()
y_pred_flat = np.asarray(y_pred).ravel()

from sklearn.metrics import confusion_matrix
# Use every label that occurs in either vector. Restricting the labels to
# those present in the test split would silently drop predictions of any
# intent that does not appear in it.
cm_labels = np.unique(np.concatenate([y_true_flat, y_pred_flat]))
cf = confusion_matrix(y_true_flat, y_pred_flat, labels=cm_labels)

import seaborn as sn
df_cm = pd.DataFrame(cf, index=cm_labels, columns=cm_labels)
plt.figure(figsize=(10, 7))
ax = sn.heatmap(df_cm, annot=True, fmt='d')
# Rows are the true intents and columns the predicted intents.
ax.set_xlabel('Predicted intent')
ax.set_ylabel('True intent')
ax.set_title('Intent confusion matrix (held-out split)');

In [103]:
# Class distribution of the cleaned intent labels.
# NOTE(review): the recorded output lists both `application_system` and
# `applications_systems` - possibly the same intent spelled two ways; confirm.
intent_counts = y.value_counts()
intent_counts

computer                           18
microsoft_office_365               13
access_management                  11
network                             7
application_system                  7
mobility_suits                      3
cybersecurity                       2
voice_services                      1
change_management                   1
printer_scanner                     1
general_enquiry                     1
incident_and_problem_management     1
applications_systems                1
Name: intent_cleaned, dtype: int64