# Test DNN Classifier

This notebook tests a text classifier in which an embedding layer feeds into a convolution layer, using the news20 data set.

The first steps prepare the data for use with the network.

In [1]:
import sys
import os
import numpy as np

sys.path.insert(0, 'lib')

from lib import TextReader
from lib import GloveReader


# Root folder holding the data; the news20 corpus lives in a sub-folder of it.
basedir = 'data'
news20_dir = os.path.join(basedir, 'news20')

# The reader is given both the corpus folder and the base data folder.
reader = TextReader.TextReader(news20_dir, basedir)

# Read the labeled training documents, then unpack the vocabulary, the
# per-document words and the per-document class labels.
labeled_documents = reader.read_labeled_documents('mini20-train.txt')
vocab, all_words, all_classes = labeled_documents

# One-hot encode the class labels to use as network targets.
targets = reader.one_hot_encode_classes(all_classes)

# Convert each document's words into a sequence of vocabulary indices.
sequences = reader.make_index_sequences(vocab, all_words)


In [2]:
# Size of the vocabulary returned by reader.read_labeled_documents.
len(vocab)

19580

In [3]:
# Preview the first three rows of the index sequences built from vocab and all_words.
sequences.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5215,5216,5217,5218,5219,5220,5221,5222,5223,5224
0,1,1,1,1,1,1,1,1,1,1,...,795,6821,2344,13491,4705,703,19117,3337,0,3
0,1,1,1,1,1,1,1,1,1,1,...,15283,2410,16281,2410,14867,2831,17714,5077,0,3
0,1,1,1,1,1,1,1,1,1,1,...,7307,14026,11856,14143,13447,13856,15892,17114,0,3


In [4]:
# Preview the first three rows of the one-hot encoded class targets.
targets.head(3)

Unnamed: 0,alt.atheism,comp.graphics,comp.os.ms-windows.misc,comp.sys.ibm.pc.hardware,comp.sys.mac.hardware,comp.windows.x,misc.forsale,rec.autos,rec.motorcycles,rec.sport.baseball,rec.sport.hockey,sci.crypt,sci.electronics,sci.med,sci.space,soc.religion.christian,talk.politics.guns,talk.politics.mideast,talk.politics.misc,talk.religion.misc
0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0


Next, we preload the GloVe embeddings and create an embedding matrix for our training vocabulary.

In [5]:

embed_reader = GloveReader.GloveReader(base_dir=basedir)
# NOTE(review): the GloVe model is read even when the cached matrix below is
# reused. Presumably read_glove_model also prepares embed_reader for
# vocab_to_embedding_matrix -- confirm before making this load lazy.
glove1 = embed_reader.read_glove_model('model50')


# Cache the vocab embeddings on disk because they are expensive to recreate.
import pickle
output = os.path.join(basedir, 'news20', 'vocab_embeddings.pickle')

vocab_embedding = None
if os.path.exists(output):
    # SECURITY: pickle.load can execute arbitrary code. Only load cache files
    # that were written by this notebook; never point this at untrusted data.
    try:
        with open(output, 'rb') as fin:
            vocab_embedding = pickle.load(fin)
    except (pickle.UnpicklingError, EOFError) as err:
        # A truncated or corrupt cache (e.g. from an interrupted run) must not
        # block the notebook: discard it and rebuild below.
        print('Ignoring unreadable embedding cache {}: {}'.format(output, err))
        vocab_embedding = None

# The cache is keyed only by file name, so a matrix pickled for a different
# vocabulary would otherwise be reused silently. Assumes the matrix has one
# row per vocab entry -- TODO confirm against vocab_to_embedding_matrix.
if vocab_embedding is not None and vocab_embedding.shape[0] != len(vocab):
    print('Embedding cache {} does not match the current vocab; rebuilding.'.format(output))
    vocab_embedding = None

if vocab_embedding is None:
    vocab_embedding = reader.vocab_to_embedding_matrix(embed_reader, vocab)
    # Write to a temporary file and rename it into place so an interrupted
    # run never leaves a partially written cache behind.
    tmp_output = output + '.tmp'
    with open(tmp_output, 'wb') as fout:
        pickle.dump(vocab_embedding, fout)
    os.replace(tmp_output, output)

Skipping line 18137: Expected 51 fields in line 18137, saw 52
Skipping line 77306: Expected 51 fields in line 77306, saw 52
Skipping line 78481: Expected 51 fields in line 78481, saw 52
Skipping line 80636: Expected 51 fields in line 80636, saw 52
Skipping line 86603: Expected 51 fields in line 86603, saw 52
Skipping line 95766: Expected 51 fields in line 95766, saw 52
Skipping line 97253: Expected 51 fields in line 97253, saw 52
Skipping line 98622: Expected 51 fields in line 98622, saw 52
Skipping line 102606: Expected 51 fields in line 102606, saw 52
Skipping line 104608: Expected 51 fields in line 104608, saw 52
Skipping line 120311: Expected 51 fields in line 120311, saw 52
Skipping line 123556: Expected 51 fields in line 123556, saw 52
Skipping line 129697: Expected 51 fields in line 129697, saw 52
Skipping line 140365: Expected 51 fields in line 140365, saw 52
Skipping line 141336: Expected 51 fields in line 141336, saw 52
Skipping line 147469: Expected 51 fields in line 147469,

Now we can build the network and train it.

In [15]:
# Build the CNN on top of the pre-computed vocab embedding matrix.
from lib import CnnClassifier

# All network dimensions are derived from the prepared data.
num_outputs = targets.shape[1]  # one output per one-hot class column
max_sequence_length = sequences.shape[1]
embed_dim = vocab_embedding.shape[1]
# NOTE(review): the pool size is set to the number of classes -- confirm this
# coupling is intentional rather than a copy/paste of num_outputs.
pool_size = num_outputs
kernel_shape = 4

classifier = CnnClassifier.CnnClassifier()
model = classifier.build_network(
    len(vocab),
    max_sequence_length,
    num_outputs,
    pool_size,
    kernel_shape,
    embed_dim,
    embedding_matrix=vocab_embedding,
)
model.summary()

model.compile(
    optimizer='nadam',
    loss='categorical_crossentropy',
    metrics=['categorical_accuracy'],
)

(5225, 50, 1)
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 5225, 50)          979000    
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 5222, 1)           201       
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 261, 1)            0         
_________________________________________________________________
flatten_2 (Flatten)          (None, 261)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 20)                5240      
Total params: 984,441
Trainable params: 5,441
Non-trainable params: 979,000
_________________________________________________________________


Prepare the training data.

In [7]:

# Fixed seed so the train/validation split is identical after a kernel restart.
SPLIT_SEED = 42
train_percent = 0.8

rows = sequences.shape[0]
# Inputs and targets are indexed with the same permutation below, so they must
# have the same number of rows or the labels would no longer match the inputs.
assert targets.shape[0] == rows, 'sequences and targets must have the same number of rows'

# Shuffle row positions with a private seeded generator; this leaves numpy's
# global random state untouched.
indices = np.random.RandomState(SPLIT_SEED).permutation(rows)

shuffled_inputs = sequences.values[indices]
shuffled_targets = targets.values[indices]

# The first train_percent of the shuffled rows train the model; the remainder
# is held out for validation.
split_at = int(train_percent * rows)
trainX, validateX = np.split(shuffled_inputs, [split_at])
trainY, validateY = np.split(shuffled_targets, [split_at])


Train the model. Note: before training, run TensorBoard from the base directory to monitor progress.

```
tensorboard --logdir logs
```

In [16]:
from datetime import datetime
import keras

# Each run logs into its own timestamped directory under logs/scalars/model1.
run_stamp = datetime.now().strftime("%Y%m%d-%H%M%S")
logdir = os.path.join("logs", "scalars", "model1", run_stamp)

# Callback that writes the training logs into logdir for TensorBoard.
tensorboard_callback = keras.callbacks.TensorBoard(log_dir=logdir)

epochs = 1000

# Fit on the training split and evaluate on the validation split each epoch.
history = model.fit(
    trainX,
    trainY,
    epochs=epochs,
    validation_data=(validateX, validateY),
    callbacks=[tensorboard_callback],
)

Train on 1067 samples, validate on 267 samples
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/1

<keras.callbacks.History at 0x1eb8006b978>