### import libraries

In [1]:
import os, sys
import csv

import numpy as np
import pandas as pd

from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from nltk import stem
from nltk.tokenize import WhitespaceTokenizer
from nltk.stem import WordNetLemmatizer

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from sklearn import model_selection
from keras.utils import to_categorical

from keras.models import Model
from keras.models import Sequential

from keras.layers import Input, Dense, Flatten
from keras.layers import Conv1D, MaxPooling1D
from keras.layers import Embedding

from keras.layers import SpatialDropout1D
from keras.layers import LSTM
from keras.layers import Dropout

from sklearn import metrics

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


### Set path and load train data

In [2]:
# set working directory
os.chdir("F://Prajwal_DS_Course//CUTe//7321c_CUTe/")

# Define path
PATH = os.getcwd()

# Define File name
FILE_NAME = "train.csv"

# Define full path with file name
FILE = os.path.join(PATH + "\\" + FILE_NAME)

# Print 
print(FILE)

F:\Prajwal_DS_Course\CUTe\7321c_CUTe\train.csv


In [3]:
cat = []
converse = []

with open(FILE) as csvfile:
    readCSV = csv.reader(csvfile, delimiter=",")
    headers = next(readCSV) # Ignore Header
    for row in readCSV:
        category = row[1]
        convers = row[2]
        cat.append(category)
        converse.append(convers)


In [4]:
#creating dataframe
dict_new = {
    'category': cat ,
    'text': converse
}

dataset = pd.DataFrame(dict_new)

dataset.head()

Unnamed: 0,category,text
0,ASK_A_DOCTOR,mom wants to know if the drugname needs some d...
1,ASK_A_DOCTOR,patients to discuss drugname she says she has ...
2,MISCELLANEOUS,letter of patient establishment request name s...
3,APPOINTMENTS,appointment question name mom appointments pat...
4,MISCELLANEOUS,please refax neurocog order to new wake medici...


In [5]:
# check for null values
dataset[dataset.isnull().any(axis=1)]

Unnamed: 0,category,text


In [6]:
# remove stop words & lemmatize words
w_tokenizer = WhitespaceTokenizer()
lemmatizer = WordNetLemmatizer()
stop = stopwords.words('english')

dataset["converse"] = dataset['text'].apply(lambda x: " ".join([lemmatizer.lemmatize(word) for word in x.split() if word not in (stop)]))

dataset = dataset.drop(["text"], axis=1)

print(dataset.shape)

dataset.head()

(45825, 2)


Unnamed: 0,category,converse
0,ASK_A_DOCTOR,mom want know drugname need dosage adjusting n...
1,ASK_A_DOCTOR,patient discus drugname say weird ta patient p...
2,MISCELLANEOUS,letter patient establishment request name spou...
3,APPOINTMENTS,appointment question name mom appointment pati...
4,MISCELLANEOUS,please refax neurocog order new wake medicine ...


### loading keras library

In [7]:
# Prepare tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(dataset["converse"])
    
word_Index = tokenizer.word_index

vocab_Size = len(word_Index) + 1
print("Found %s unique tokens" % vocab_Size)

Found 32695 unique tokens


In [8]:
# integer encode the documents
sequences = tokenizer.texts_to_sequences(dataset["converse"])

MAX_SEQUENCE_LENGTH = 150

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
label = pd.factorize(dataset['category'])[0]

print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', label.shape)

Shape of data tensor: (45825, 150)
Shape of label tensor: (45825,)


In [9]:
dataset['category'].value_counts()

PRESCRIPTION     12077
APPOINTMENTS     11098
MISCELLANEOUS     9736
ASK_A_DOCTOR      9440
LAB               3457
JUNK                17
Name: category, dtype: int64

In [10]:
X_train, X_Test, Y_train, Y_test = model_selection.train_test_split(data, 
                                                                    label, 
                                                                    random_state = 7, 
                                                                    stratify = label, 
                                                                    test_size = 0.15)

y_train = to_categorical(Y_train)
y_test = to_categorical(Y_test)

In [11]:
embeddings_index = {}
f = open(os.path.join(PATH, 'glove.6B.50d.txt'), encoding="utf8")
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
    
print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [12]:
embedding_Matrix = np.zeros((vocab_Size, 50))
for word, i in word_Index.items():
    embedding_Vector = embeddings_index.get(word)
    if embedding_Vector is not None:
        embedding_Matrix[i] = embedding_Vector

print (embedding_Matrix.shape)

(32695, 50)


### MLP Model

In [13]:
embedding_layer = Embedding(vocab_Size,
                            50,
                            weights=[embedding_Matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)

In [14]:
model = Sequential()
model.add(embedding_layer)
model.add(Flatten())
model.add(Dense(64, activation='relu'))
model.add(Dense(len(np.unique(label)), activation='softmax' ))
model.summary()

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.fit(X_train, y_train, epochs=3 )

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 150, 50)           1634750   
_________________________________________________________________
flatten_1 (Flatten)          (None, 7500)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 64)                480064    
_________________________________________________________________
dense_2 (Dense)              (None, 6)                 390       
Total params: 2,115,204
Trainable params: 480,454
Non-trainable params: 1,634,750
_________________________________________________________________
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x1c8b1cbf080>

In [15]:
Y_pred_t = model.predict(X_train)
y_pred_t = []
for i in Y_pred_t:
    y_pred_t.append(np.argmax(i))

Y_pred = model.predict(X_Test)
y_pred =[]
for i in Y_pred:
    y_pred.append(np.argmax(i))

# summarize the model
print()




In [16]:
# calculate accuracy of class predictions
print("Train Model Accuracy - " , metrics.accuracy_score(Y_train, y_pred_t))
print("Test Model Accuracy - " , metrics.accuracy_score(Y_test, y_pred))

Train Model Accuracy -  0.8743549587943826
Test Model Accuracy -  0.6851905731742799


### Build CNN Model with Pre-Embeddings

In [17]:
cnn_model = Sequential()
cnn_model.add(embedding_layer)
cnn_model.add(Conv1D(64, 5, activation='relu'))
cnn_model.add(MaxPooling1D(4))
cnn_model.add(Conv1D(64, 5, activation='relu'))
cnn_model.add(MaxPooling1D(4))
cnn_model.add(Flatten())
cnn_model.add(Dense(64, activation='relu'))
cnn_model.add(Dense(len(np.unique(label)), activation='softmax' ))
cnn_model.summary()

cnn_model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

cnn_model.fit(X_train, y_train, epochs=3 )

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 150, 50)           1634750   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 146, 64)           16064     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 36, 64)            0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 32, 64)            20544     
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 8, 64)             0         
_________________________________________________________________
flatten_2 (Flatten)          (None, 512)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 64)                32832     
__________

<keras.callbacks.History at 0x1c8b87a6320>

In [18]:
Y_pred_t = cnn_model.predict(X_train)
y_pred_t = []
for i in Y_pred_t:
    y_pred_t.append(np.argmax(i))

Y_pred = cnn_model.predict(X_Test)
y_pred =[]
for i in Y_pred:
    y_pred.append(np.argmax(i))


In [19]:
# calculate accuracy of class predictions
print("Train Model Accuracy - " , metrics.accuracy_score(Y_train, y_pred_t))
print("Test Model Accuracy - " , metrics.accuracy_score(Y_test, y_pred))

Train Model Accuracy -  0.8318400041077251
Test Model Accuracy -  0.7836776258364853


### Build LSTM Model

In [20]:
lstm_model = Sequential()
lstm_model.add(embedding_layer)
lstm_model.add(SpatialDropout1D(0.3))
lstm_model.add(LSTM(100))
lstm_model.add(Dense(64, activation = "relu"))
lstm_model.add(Dropout(0.25))
lstm_model.add(Dense(len(np.unique(label)), activation="softmax"))
lstm_model.summary()

lstm_model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

lstm_model.fit(X_train, y_train, epochs=3 )

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 150, 50)           1634750   
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 150, 50)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               60400     
_________________________________________________________________
dense_5 (Dense)              (None, 64)                6464      
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_6 (Dense)              (None, 6)                 390       
Total params: 1,702,004
Trainable params: 67,254
Non-trainable params: 1,634,750
_____________________________________________________________

<keras.callbacks.History at 0x1c8bb823630>

In [21]:
Y_pred_t = lstm_model.predict(X_train)
y_pred_t = []
for i in Y_pred_t:
    y_pred_t.append(np.argmax(i))

Y_pred = lstm_model.predict(X_Test)
y_pred =[]
for i in Y_pred:
    y_pred.append(np.argmax(i))


In [22]:
# calculate accuracy of class predictions
print("Train Model Accuracy - " , metrics.accuracy_score(Y_train, y_pred_t))
print("Test Model Accuracy - " , metrics.accuracy_score(Y_test, y_pred))

Train Model Accuracy -  0.7845498190033632
Test Model Accuracy -  0.7832411987198138
