In [1]:
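'''This notebook loads a previously trained Keras text classification model
(architecture from a JSON file, weights from an HDF5 file) and uses it to
classify sample texts into the 20 Newsgroup categories.

The model is presumably the one produced by the Keras example that feeds
pre-trained word embeddings (GloVe embeddings) into a frozen Embedding layer
and trains on the 20 Newsgroup dataset; no training happens in this notebook.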

GloVe embedding data can be found at:
http://nlp.stanford.edu/data/glove.6B.zip
(source page: http://nlp.stanford.edu/projects/glove/)

20 Newsgroup data can be found at:
http://www.cs.cmu.edu/afs/cs.cmu.edu/project/theo-20/www/data/news20.html
'''

from __future__ import print_function

import os
import sys
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
from keras.initializers import Constant
from keras.models import model_from_json


Using TensorFlow backend.


In [2]:

# Root folder holding the datasets. Set the NEWSGROUP_BASE_DIR environment
# variable to run on another machine; when it is unset, the original
# hard-coded location (an external drive) is used.
BASE_DIR = os.environ.get('NEWSGROUP_BASE_DIR', '/Volumes/My Passport for Mac/data')
GLOVE_DIR = os.path.join(BASE_DIR, 'glove.6B')  # not read by the cells in this notebook
TEXT_DATA_DIR = os.path.join(BASE_DIR, '20_newsgroup')  # scanned for one sub-directory per category
MAX_SEQUENCE_LENGTH = 1000  # maxlen passed to pad_sequences
MAX_NUM_WORDS = 20000  # num_words passed to the Tokenizer
EMBEDDING_DIM = 100
VALIDATION_SPLIT = 0.2




In [3]:

# Read the 20 Newsgroup corpus from disk: every sub-directory of
# TEXT_DATA_DIR is one category, and every all-digit file name inside it
# is one message.
print('Processing text dataset')

texts = []  # message bodies, one entry per file read
labels = []  # label id of the message at the same position in `texts`
labels_index = {}  # label name -> numeric id
index_to_label_dict = {}  # numeric id -> label name

# Python 3's open() is given an explicit encoding; on Python 2 no extra
# keyword arguments are passed.
open_kwargs = {'encoding': 'latin-1'} if sys.version_info >= (3,) else {}

for category in sorted(os.listdir(TEXT_DATA_DIR)):
    category_dir = os.path.join(TEXT_DATA_DIR, category)
    if not os.path.isdir(category_dir):
        continue
    # Ids are handed out in sorted directory order, starting at 0.
    label_id = len(labels_index)
    labels_index[category] = label_id
    index_to_label_dict[label_id] = category
    for message_name in sorted(os.listdir(category_dir)):
        if not message_name.isdigit():
            continue
        message_path = os.path.join(category_dir, message_name)
        with open(message_path, **open_kwargs) as handle:
            message = handle.read()
        # The header ends at the first blank line; keep everything from
        # there on. The text is left untouched when no blank line is
        # found or when it sits at position 0.
        header_end = message.find('\n\n')
        if header_end > 0:
            message = message[header_end:]
        texts.append(message)
        labels.append(label_id)

print('Found %s texts.' % len(texts))
print(labels[0])


Processing text dataset
Found 19997 texts.
0


In [4]:
print("loading model .....")
# Folder holding the serialized model: architecture (JSON) and weights (HDF5).
MODEL_DIR = '/Volumes/My Passport for Mac/model'

# Rebuild the architecture from its JSON description. The context manager
# closes the file even if reading raises.
with open(os.path.join(MODEL_DIR, 'model.json'), 'r') as json_file:
    loaded_model_json = json_file.read()
loaded_model = model_from_json(loaded_model_json)
# load weights into new model
loaded_model.load_weights(os.path.join(MODEL_DIR, 'model.h5'))
print("Loaded model from disk")

# Configure loss, optimizer and metrics on the loaded model. This only
# compiles the model; no evaluation is run in this cell.
loaded_model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
print("done")


loading model .....
Loaded model from disk
done


In [None]:
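# Evaluation is disabled: X and Y (evaluation inputs and labels) are not
# defined anywhere in this notebook. Build them before re-enabling the
# two lines below.
#score = loaded_model.evaluate(X, Y, verbose=0)
#print("%s: %.2f%%" % (loaded_model.metrics_names[1], score[1]*100))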

In [6]:
# Fit a tokenizer on the whole corpus (vocabulary limited through
# num_words=MAX_NUM_WORDS) and encode every text as a sequence of integer
# word ids. Padding to a fixed length is not done in this cell.
# NOTE(review): the word ids must agree with the ones the loaded model was
# trained on -- presumably training fitted a Tokenizer on the same texts with
# the same settings; confirm.
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(texts)
# NOTE(review): `sequences` is not read by any later cell visible in this notebook.
sequences = tokenizer.texts_to_sequences(texts)

In [7]:
# Two hand-written messages used to try out the loaded classifier.
# Each message is built from adjacent string literals, which Python joins
# into one string. A backslash continuation *inside* a literal would instead
# embed the next line's indentation in the text as a long run of spaces.
test_texts = [
    ("First of all as far as I know, only male homosexuality is explicitly "
     "mentioned in the bibles, so you're off the hook there, I think. In "
     "any event, there are *plenty* of people in many denominations who "
     "do not consider a person's sexual identification of gay/lesbian/bisexual "
     "as an 'immoral lifestyle choice'"),

    ("The Washington Post said that the shooter never should have had a gun if "
     "the gun laws worked. Kinnunen was a well-known troublemaker, had been declared "
     "mentally incompetent to stand trial in Oklahoma. He had been convicted of a "
     "number of felonies. Nonetheless, The Washington Post blamed Texas gun laws for "
     "the tragedy, forgetting that Kinnunen could not get a gun legally in any U.S. "
     "jurisdiction. So did some presidential candidates, including Joe Biden"),
]
# Encode the messages with the tokenizer fitted on the corpus, then bring
# every sequence to MAX_SEQUENCE_LENGTH entries so they form one array.
test_sequences = tokenizer.texts_to_sequences(test_texts)
test_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)


In [8]:
# Run the loaded model on the padded test sequences and show the raw scores.
# Judging by the output recorded for this cell, there is one row per test
# text and 20 values per row (one per newsgroup category).
nn_output = loaded_model.predict(test_data)
print(nn_output)

[[5.3181246e-02 3.8055096e-05 1.3271233e-05 3.0078272e-06 1.0533590e-05
  2.2115337e-05 3.5311536e-05 6.2333827e-05 5.4103810e-05 1.2913652e-04
  2.6522033e-05 5.0886934e-05 9.5140194e-06 1.0347556e-03 1.0389760e-04
  7.5437325e-01 5.2675768e-03 1.4406247e-03 4.3075304e-03 1.7983635e-01]
 [1.1667347e-05 6.4375803e-11 2.6426641e-10 1.2541957e-11 4.7187143e-09
  1.6756381e-11 1.4207747e-08 2.3394100e-04 4.5673627e-07 1.1114547e-11
  3.1038458e-10 1.2027851e-05 1.1325712e-07 7.8183273e-09 1.5910898e-07
  6.8352572e-09 9.4905412e-01 1.2543119e-05 1.4235376e-02 3.6439486e-02]]


In [9]:
# Report the highest-scoring category for each test message. argmax along
# axis 1 yields, for each row of nn_output, the index of its largest score.
# zip pairs that index with the matching text, so no manual counter is needed.
for idx, text in zip(np.argmax(nn_output, axis=1), test_texts):
    print("Category: ", index_to_label_dict[idx])
    print("text: " , text)
    print("=====================================")

Category:  soc.religion.christian
text:  First of all as far as I know, only male homosexuality is explicitly mentioned in the bibles, so you're off the hook there, I think. In               any event, there are *plenty* of people in many denominations who               do not consider a person's sexual identification of gay/lesbian/bisexual               as an 'immoral lifestyle choice'
Category:  talk.politics.guns
text:  The Washington Post said that the shooter never should have had a gun if              the gun laws worked. Kinnunen was a well-known troublemaker, had been declared              mentally incompetent to stand trial in Oklahoma. He had been convicted of a              number of felonies. Nonetheless, The Washington Post blamed Texas gun laws for              the tragedy, forgetting that Kinnunen could not get a gun legally in any U.S.              jurisdiction. So did some presidential candidates, including Joe Biden
