In [1]:
import pandas as pd
import tensorflow as tf

import os
import sys
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, GlobalMaxPooling1D, Dropout
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model, load_model
from keras.initializers import Constant

import h5py

Using TensorFlow backend.


In [2]:
# Input locations, relative to the notebook's working directory.
NEWS_DIR = "data/20_newsgroup/"  # scanned below: one sub-directory per category
GLOVE_DIR = "data/glove.6B/"  # directory containing glove.6B.100d.txt
# Vectorization / model hyper-parameters.
MAX_SEQUENCE_LENGTH = 1000  # `maxlen` given to pad_sequences and the model input width
MAX_NUM_WORDS = 20000  # vocabulary cap given to the Tokenizer and the embedding matrix
EMBEDDING_DIM = 100  # width of each embedding row (the 100d GloVe file is loaded)
VALIDATION_SPLIT = 0.2  # fraction of the samples held out for validation

In [3]:
# Read the corpus from disk. Every sub-directory of NEWS_DIR is one category;
# every file inside it whose name is all digits is one post.
texts = []  # list of text samples (header stripped when one is found)
labels_index = {}  # category (directory) name -> numeric label id
labels = []  # label id of each entry in `texts`, in the same order

# Python 2's built-in open() has no `encoding` argument, so only pass it on 3.x.
text_open_kwargs = {} if sys.version_info < (3,) else {'encoding': 'latin-1'}

# Sorting both directory listings keeps label ids and sample order stable
# across runs and file systems.
for name in sorted(os.listdir(NEWS_DIR)):
    path = os.path.join(NEWS_DIR, name)
    if os.path.isdir(path):
        label_id = len(labels_index)
        labels_index[name] = label_id
        for fname in sorted(os.listdir(path)):
            if fname.isdigit():
                fpath = os.path.join(path, fname)
                # The context manager closes the handle even when read() raises,
                # which the previous manual open()/close() pair did not.
                with open(fpath, **text_open_kwargs) as f:
                    t = f.read()
                i = t.find('\n\n')  # skip header
                if 0 < i:
                    t = t[i:]
                texts.append(t)
                labels.append(label_id)

print('Found %s texts.' % len(texts))

Found 19997 texts.


In [4]:
# Parse the pre-trained GloVe file into a dict: word -> float32 coefficient vector.
embeddings_index = {}

# The GloVe text files are UTF-8. Without an explicit encoding, open() falls
# back to the platform's locale encoding and can fail or mis-decode on systems
# whose default is not UTF-8. Python 2's open() has no `encoding` argument.
glove_open_kwargs = {} if sys.version_info < (3,) else {'encoding': 'utf-8'}

with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'), **glove_open_kwargs) as f:
    for line in f:
        values = line.split()
        if not values:
            # A blank line (e.g. a trailing newline) carries no vector.
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %d word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [5]:
# vectorize the text samples into a 2D integer tensor

# Fit the vocabulary on the whole corpus. MAX_NUM_WORDS is passed as the
# tokenizer's `num_words` cap.
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(texts)

# One list of integer word ids per text; lengths still vary at this point.
sequences = tokenizer.texts_to_sequences(texts)

# word -> integer id. The recorded output reports far more entries than
# MAX_NUM_WORDS, so this dict is not truncated by the `num_words` cap.
word_index = tokenizer.word_index # dict

print('Found %s unique tokens.' % len(word_index))

Found 174074 unique tokens.


In [6]:
# Spot-check: number of word ids produced for one arbitrary sample (index 37).
len(sequences[37])

173

In [7]:
# Bring every sequence to exactly MAX_SEQUENCE_LENGTH ids so the samples stack
# into one 2D integer array (the recorded shape is (19997, 1000)).
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

# One-hot encode the integer label ids (the recorded shape is (19997, 20)).
# NOTE(review): this rebinds `labels` from the list of ids to the encoded
# array, so re-running this cell without re-running the loading cell feeds the
# already-encoded array back into to_categorical.
labels = to_categorical(np.asarray(labels))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

Shape of data tensor: (19997, 1000)
Shape of label tensor: (19997, 20)


In [8]:
# split the data into a training set and a validation set

# A dedicated, seeded generator makes the split reproducible after a kernel
# restart and leaves NumPy's global random state untouched.
SPLIT_SEED = 42

indices = np.arange(data.shape[0])
np.random.RandomState(SPLIT_SEED).shuffle(indices)  # shuffle samples and labels together
data = data[indices]
labels = labels[indices]
num_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

# Slice at an explicit boundary. With negative offsets, a validation count of 0
# turns data[:-0] into an empty training set and data[-0:] into a validation
# set holding every sample.
num_training_samples = data.shape[0] - num_validation_samples

x_train = data[:num_training_samples]
y_train = labels[:num_training_samples]
x_val = data[num_training_samples:]
y_val = labels[num_training_samples:]

In [9]:
# Build the embedding weight matrix: row i receives the pre-trained vector of
# the word whose tokenizer id is i.
print('Training emb matrix model...')

num_words = min(MAX_NUM_WORDS, len(word_index)+1)
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word_index.items():
    # Only ids below the vocabulary cap get a row in the matrix.
    if i < MAX_NUM_WORDS:
        embedding_vector = embeddings_index.get(word)
        # Words with no pre-trained vector keep their all-zero row.
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

# Wrap the matrix in an Embedding layer; trainable=False keeps the
# pre-trained vectors fixed during training.
embedding_layer = Embedding(
    input_dim=num_words,
    output_dim=EMBEDDING_DIM,
    embeddings_initializer=Constant(embedding_matrix),
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=False,
)

print('Done.')

Training emb matrix model...
Done.


In [10]:
# Define a 1D convnet with global max-pooling. This cell only wires the layers
# together; compiling and fitting happen in a later cell.
d = 0.2  # rate passed to both Dropout layers below

# Input: one row of MAX_SEQUENCE_LENGTH integer word ids per sample, mapped to
# vectors by the embedding layer (summary output: (None, 1000, 100)).
seq_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_seqs = embedding_layer(seq_input)

# Stage 1: 128 filters of width 10, dropout, then max-pooling with pool size 5
# (summary output: 991 steps after the convolution, 198 after pooling).
X1 = Conv1D(128, 10, activation='relu')(embedded_seqs)
X1 = Dropout(d)(X1)
X1 = MaxPooling1D(5)(X1)

# Stage 2: same layout as stage 1 (summary output: 189 steps after the convolution).
X2 = Conv1D(128, 10, activation='relu')(X1)
X2 = Dropout(d)(X2)
X2 = MaxPooling1D(5)(X2) # NOTE(review): original comment here was "#3" - presumably an alternative pool size; confirm

# Stage 3: 128 filters of width 5, then global max-pooling over the sequence axis.
X3 = Conv1D(128, 5, activation='relu')(X2)
X3 = GlobalMaxPooling1D()(X3)

# Classifier head: 64 ReLU units, then a softmax with one unit per category.
X4 = Dense(64, activation='relu')(X3)
preds = Dense(len(labels_index), activation='softmax')(X4)

model = Model(seq_input, preds)


In [11]:
# Print the layer-by-layer output shapes and parameter counts of the model.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 1000)              0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 1000, 100)         2000000   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 991, 128)          128128    
_________________________________________________________________
dropout_1 (Dropout)          (None, 991, 128)          0         
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 198, 128)          0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 189, 128)          163968    
_________________________________________________________________
dropout_2 (Dropout)          (None, 189, 128)          0         
__________

In [15]:
# Configure training: categorical cross-entropy loss (the targets are one-hot
# vectors), the RMSprop optimizer, and accuracy reported under the name 'acc'.
model.compile(loss='categorical_crossentropy', 
              optimizer='rmsprop', 
              metrics=['acc'])

# Train for 10 epochs in mini-batches of 128 samples, passing the held-out
# split as validation data.
model.fit(x_train, y_train, batch_size=128, epochs=10, validation_data=(x_val, y_val))

Train on 15998 samples, validate on 3999 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f27ef9216a0>

In [17]:
# model.save('model_Conv1D.h5')

# Persist the model's weights to the path 'weights' (whole-model saving is
# left disabled above).
model.save_weights('weights')

In [12]:
# model = load_model('model_Conv1D.h5')
# https://stackoverflow.com/questions/42328034/keras-load-model-not-working-after-training
# Restore the weights saved under 'weights' into the already-built `model`;
# the model-definition cell has to run first.
# NOTE(review): the recorded execution counts are out of order around this
# cell - presumably it was run in a later session to skip retraining; confirm.
model.load_weights('weights')

In [13]:
# Inverse of word_index: integer id -> word, used to turn sequences back into text.
index_word = dict((idx, word) for word, idx in word_index.items())
# seqs = tk.texts_to_sequences(txt1) #sequences = tokenizer.texts_to_sequences(texts)

def index_to_word(seq, unknown='<unk>'):
    """Turn a sequence of word ids back into a single space-joined string.

    Looks every id up in the module-level ``index_word`` mapping.

    Parameters
    ----------
    seq : iterable of int
        Word ids. The id 0 is rendered as a single space, so each 0 entry
        shows up as extra whitespace in the result.
    unknown : str, optional
        Text used for a non-zero id that has no entry in ``index_word``.
        Previously such an id put ``None`` in the word list and made
        ``' '.join`` raise ``TypeError``.

    Returns
    -------
    str
        The words (and placeholders) joined with single spaces; an empty
        string for an empty ``seq``.
    """
    words = []
    for i in seq:
        if i == 0:
            words.append(' ')
        else:
            words.append(index_word.get(i, unknown))
    return ' '.join(words)

def word_to_index(seq):
    """Map a sequence of words to their integer ids.

    Looks every word up in the module-level ``word_index`` mapping.

    Parameters
    ----------
    seq : iterable of str or None
        Tokens to convert.

    Returns
    -------
    list of int
        One id per input token. A token that is ``None`` or missing from
        ``word_index`` maps to 0, the same value the ad-hoc vectorization
        later in this notebook uses for unknown words. Previously the result
        could mix ints with ``None`` and the string ``' '``.
    """
    return [word_index.get(w, 0) for w in seq]


In [30]:
# Display the category name -> label id mapping built while loading the corpus.
labels_index

{'alt.atheism': 0,
 'comp.graphics': 1,
 'comp.os.ms-windows.misc': 2,
 'comp.sys.ibm.pc.hardware': 3,
 'comp.sys.mac.hardware': 4,
 'comp.windows.x': 5,
 'misc.forsale': 6,
 'rec.autos': 7,
 'rec.motorcycles': 8,
 'rec.sport.baseball': 9,
 'rec.sport.hockey': 10,
 'sci.crypt': 11,
 'sci.electronics': 12,
 'sci.med': 13,
 'sci.space': 14,
 'soc.religion.christian': 15,
 'talk.politics.guns': 16,
 'talk.politics.mideast': 17,
 'talk.politics.misc': 18,
 'talk.religion.misc': 19}

In [16]:
# This code allows you to see the mislabelled examples

pred = model.predict(x_val)

# Invert labels_index once (label id -> category name). Indexing
# list(labels_index.keys()) by label id only works when the dict's key order
# matches the id order, and it rebuilt the list for every printed line.
index_labels = {label_id: name for name, label_id in labels_index.items()}

# Inspect a small window of validation samples, clamped so that a validation
# set with fewer than 255 samples does not raise IndexError.
for i in range(250, min(255, len(pred))):
    num = int(np.argmax(pred[i]))  # predicted label id
    expected = int(np.argmax(y_val[i]))  # true label id, recovered from the one-hot row
    if num != expected:
        print('Expected category: %s' % index_labels[expected])
        print('Text: %s' % index_to_word(x_val[i]))
        print('Prediction category: %s' % index_labels[num])
        print('+++++')

Expected category: sci.space
Text:                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      

In [17]:
# TODO test on new sample texts

test_text = "It still applies, except the astronomy these days is coupled to data for studies into  Earth rotation, and \
purturbations. Every time there is a leap second added to the New Year, remember the military and science are still co-habiting nicely."

# Vectorize with the fitted tokenizer so the sample is preprocessed exactly
# like the training texts and only ids below the tokenizer's num_words cap are
# emitted. A raw word_index lookup on test_text.split() skips that
# preprocessing and can yield ids >= MAX_NUM_WORDS, which lie outside the
# embedding matrix.
# A separate name is used so the training `sequences` are not overwritten.
test_sequences = tokenizer.texts_to_sequences([test_text])
sequences2 = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)

pred2 = model.predict(sequences2)
print('Predicted category: %s' % np.argmax(pred2[0]))

Predicted category: 14


In [None]:
# char_to_ix = { ch:i for i,ch in enumerate(sorted(chars)) }
# ix_to_char = { i:ch for i,ch in enumerate(sorted(chars)) }
# print((list(char_to_ix.values())))

In [18]:

# TODO use LSTM