In [17]:
import os

import numpy
import numpy as np
import pandas as pd
from keras.datasets import imdb
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers.embeddings import Embedding
from keras.models import Sequential
from keras.preprocessing import sequence, text
from sklearn.model_selection import train_test_split

# fix random seed for reproducibility
numpy.random.seed(7)

In [18]:
# Read the document dataset and drop rows that contain missing values.
# The CSV is read with explicit column names: 'class' and 'doc'.
# The file location can be overridden through the HEAVYWATER_DATA_CSV
# environment variable; when it is unset the original absolute path is used,
# so existing setups keep working unchanged.
data_csv_path = os.environ.get(
    "HEAVYWATER_DATA_CSV",
    "/Users/channa/Projects/heavywater/classification_heavywater/data/shuffled-full-set-hashed.csv",
)
heavywater_dataset_df = pd.read_csv(data_csv_path, names=['class', 'doc'])

# Rows with a null in any column are removed in place (they are dropped, not imputed).
heavywater_dataset_df.dropna(inplace=True)


In [19]:
# Convert every document into a fixed-length sequence of integer word indices.
# The model trained on this array starts with an Embedding layer, which looks
# up one vector per integer word index. A bag-of-words matrix from
# texts_to_matrix(mode='count') holds per-word counts rather than indices, so
# feeding it to the Embedding layer makes the layer embed counts instead of
# words. Index sequences padded to a common length are the matching input.
num_words_to_keep = 2000
# Length each document is padded/truncated to; it must equal the
# input_length given to the Embedding layer (2000).
max_review_length = 2000
tokenizer_obj = text.Tokenizer(num_words = num_words_to_keep, 
                   filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', 
                   lower=True, 
                   split=' ', 
                   char_level=False, 
                   oov_token=None, 
                   document_count=0)
tokenizer_obj.fit_on_texts(heavywater_dataset_df.doc)
# With num_words set, only the most frequent words are emitted, so every
# index is below num_words_to_keep; less frequent words are skipped.
doc_sequences = tokenizer_obj.texts_to_sequences(heavywater_dataset_df.doc)
# 0 is the padding value; by default pad_sequences pads and truncates at the
# start of each sequence. Resulting shape: (number of documents, max_review_length).
encoded_docs = sequence.pad_sequences(doc_sequences, maxlen = max_review_length, dtype = 'int32')

In [20]:
# Encode the document-type names stored in the 'class' column as integer ids.
# Ids range from 0 to 14; note that id 3 is not assigned to any class name.
class_name_to_id = {
    'BILL BINDER': 0,
    'DELETION OF INTEREST': 1,
    'RETURNED CHECK': 2,
    'BILL': 4,
    'POLICY CHANGE': 5,
    'CANCELLATION NOTICE': 6,
    'DECLARATION': 7,
    'CHANGE ENDORSEMENT': 8,
    'NON-RENEWAL NOTICE': 9,
    'BINDER': 10,
    'REINSTATEMENT NOTICE': 11,
    'EXPIRATION NOTICE': 12,
    'INTENT TO CANCEL NOTICE': 13,
    'APPLICATION': 14,
}
# The dataframe column is rewritten in place; values that are not keys of the
# mapping are left untouched by replace().
heavywater_dataset_df.replace({'class': class_name_to_id}, inplace=True)

# Label vector, in the same row order as the dataframe.
y = np.array(heavywater_dataset_df['class'])

In [21]:
# Hold out 33% of the documents as a test split; the fixed random_state
# makes the shuffle (and therefore the split) repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    encoded_docs,
    y,
    test_size=0.33,
    random_state=42,
)

In [22]:
# Show the shapes of the four split arrays, in the order:
# X_train, y_train, X_test, y_test.
tuple(part.shape for part in (X_train, y_train, X_test, y_test))

((41646, 2000), (41646,), (20513, 2000), (20513,))

In [23]:
# # truncate and pad input sequences
# max_review_length = 2000
# X_train = sequence.pad_sequences(X_train, maxlen = max_review_length, dtype = 'int32')
# X_test = sequence.pad_sequences(X_test, maxlen = max_review_length, dtype = 'int32')

In [24]:
# Smallest class id present in the training labels.
y_train.min()

0

In [27]:
# Build and train the classifier: Embedding -> LSTM -> softmax over class ids.
embedding_vector_length = 32
max_review_length = 2000
model = Sequential()
# The Embedding layer looks up a 32-dimensional vector for each integer in the
# input; inputs are rows of length max_review_length whose values must be
# integer indices below num_words_to_keep.
model.add(Embedding(num_words_to_keep, embedding_vector_length, input_length=max_review_length))
model.add(LSTM(32))
# 15 output units, one per integer class id (0..14). Labels are plain integer
# ids rather than one-hot vectors, hence the sparse categorical loss below.
model.add(Dense(15, activation='softmax'))
model.compile(loss='sparse_categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
model.summary()
# NOTE(review): the held-out test split doubles as validation data here, so
# the reported validation metrics are not an independent estimate.
# The History object is kept in a variable so the per-epoch metrics remain
# available afterwards and its repr is not echoed as the cell output.
history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=200, batch_size=1000)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 2000, 32)          64000     
_________________________________________________________________
lstm_4 (LSTM)                (None, 32)                8320      
_________________________________________________________________
dense_4 (Dense)              (None, 15)                495       
Total params: 72,815
Trainable params: 72,815
Non-trainable params: 0
_________________________________________________________________
None
Train on 41646 samples, validate on 20513 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch

Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78/200
Epoch 79/200
Epoch 80/200
Epoch 81/200
Epoch 82/200
Epoch 83/200
Epoch 84/200
Epoch 85/200
Epoch 86/200
Epoch 87/200
Epoch 88/200
Epoch 89/200
Epoch 90/200
Epoch 91/200
Epoch 92/200
Epoch 93/200
Epoch 94/200
Epoch 95/200
Epoch 96/200
Epoch 97/200
Epoch 98/200
Epoch 99/200
Epoch 100/200
Epoch 101/200
Epoch 102/200
Epoch 103/200
Epoch 104/200
Epoch 105/200
Epoch 106/200
Epoch 107/200
Epoch 108/200
Epoch 109/200
Epoch 110/200
Epoch 111/200
Epoch 112/200


Epoch 113/200
Epoch 114/200
Epoch 115/200
Epoch 116/200
Epoch 117/200
Epoch 118/200
Epoch 119/200
Epoch 120/200
Epoch 121/200
Epoch 122/200
Epoch 123/200
Epoch 124/200
Epoch 125/200
Epoch 126/200
Epoch 127/200
Epoch 128/200
Epoch 129/200
Epoch 130/200
Epoch 131/200
Epoch 132/200
Epoch 133/200
Epoch 134/200
Epoch 135/200
Epoch 136/200
Epoch 137/200
Epoch 138/200
Epoch 139/200
Epoch 140/200
Epoch 141/200
Epoch 142/200
Epoch 143/200
Epoch 144/200
Epoch 145/200
Epoch 146/200
Epoch 147/200
Epoch 148/200
Epoch 149/200
Epoch 150/200
Epoch 151/200
Epoch 152/200
Epoch 153/200
Epoch 154/200
Epoch 155/200
Epoch 156/200
Epoch 157/200
Epoch 158/200
Epoch 159/200
Epoch 160/200
Epoch 161/200
Epoch 162/200
Epoch 163/200
Epoch 164/200
Epoch 165/200
Epoch 166/200
Epoch 167/200
Epoch 168/200
Epoch 169/200
Epoch 170/200


Epoch 171/200
Epoch 172/200
Epoch 173/200
Epoch 174/200
Epoch 175/200
Epoch 176/200
Epoch 177/200
Epoch 178/200
Epoch 179/200
Epoch 180/200
Epoch 181/200
Epoch 182/200
Epoch 183/200
Epoch 184/200
Epoch 185/200
Epoch 186/200
Epoch 187/200
Epoch 188/200
Epoch 189/200
Epoch 190/200
Epoch 191/200
Epoch 192/200
Epoch 193/200
Epoch 194/200
Epoch 195/200
Epoch 196/200
Epoch 197/200
Epoch 198/200
Epoch 199/200
Epoch 200/200


<keras.callbacks.History at 0x13ef60ef0>