In [1]:
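'''This notebook loads a previously trained Keras text classification model
from disk and uses it to classify new text samples into the 20 Newsgroup
categories (classification of newsgroup messages into 20 different categories).

It appears to be derived from the Keras example script that loads pre-trained
word embeddings (GloVe embeddings) into a frozen Keras Embedding layer and
trains a text classification model on the 20 Newsgroup dataset; the training
step itself is not part of the cells shown here.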

GloVe embedding data can be found at:
http://nlp.stanford.edu/data/glove.6B.zip
(source page: http://nlp.stanford.edu/projects/glove/)

20 Newsgroup data can be found at:
http://www.cs.cmu.edu/afs/cs.cmu.edu/project/theo-20/www/data/news20.html
'''

from __future__ import print_function

import os
import sys
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
from keras.initializers import Constant
from keras.models import model_from_json


Using TensorFlow backend.


In [2]:

# --- Tokenisation / model hyperparameters -----------------------------------
MAX_NUM_WORDS = 20000        # vocabulary cap handed to the Tokenizer
MAX_SEQUENCE_LENGTH = 1000   # length every sequence is padded/truncated to
EMBEDDING_DIM = 100          # not referenced by the cells in this section
VALIDATION_SPLIT = 0.2       # not referenced by the cells in this section

# --- On-disk layout (everything lives under BASE_DIR) ------------------------
BASE_DIR = 'data'
TEXT_DATA_DIR = os.path.join(BASE_DIR, '20_newsgroup')  # one sub-directory per newsgroup
GLOVE_DIR = os.path.join(BASE_DIR, 'glove.6B')          # not referenced by the cells in this section




In [3]:

# Prepare the text samples and their labels: walk TEXT_DATA_DIR, where every
# sub-directory is one newsgroup (one class), and collect the body of each
# message together with the numeric id of its class.
print('Processing text dataset')

texts = []  # list of text samples
labels_index = {}  # dictionary mapping label name to numeric id
index_to_label_dict = {}  # inverse mapping: numeric id -> label name
labels = []  # list of label ids, parallel to `texts`

# On Python 3 the files are decoded as latin-1; Python 2's built-in open()
# has no encoding parameter. This does not change per file, so build it once.
args = {} if sys.version_info < (3,) else {'encoding': 'latin-1'}

for name in sorted(os.listdir(TEXT_DATA_DIR)):
    path = os.path.join(TEXT_DATA_DIR, name)
    if not os.path.isdir(path):
        continue
    # Ids are handed out in sorted directory order, so the same directory
    # contents always produce the same name <-> id mapping.
    label_id = len(labels_index)
    labels_index[name] = label_id
    index_to_label_dict[label_id] = name
    for fname in sorted(os.listdir(path)):
        # Only purely numeric file names are treated as messages.
        if not fname.isdigit():
            continue
        fpath = os.path.join(path, fname)
        # Keep the file open only for the read itself.
        with open(fpath, **args) as f:
            t = f.read()
        i = t.find('\n\n')  # skip header
        if 0 < i:
            t = t[i:]
        texts.append(t)
        labels.append(label_id)

print('Found %s texts.' % len(texts))
# Sanity check: show the first label id. Guarded so that an empty dataset
# directory reports "Found 0 texts." instead of raising IndexError.
if labels:
    print(labels[0])


Processing text dataset
Found 19997 texts.
0


In [4]:
print("loading model .....")
# Directory holding the serialized model (model.json + model.h5). Defaults to
# the original location; set the NEWSGROUP_MODEL_DIR environment variable to
# load the files from somewhere else without editing the notebook.
MODEL_DIR = os.environ.get('NEWSGROUP_MODEL_DIR', '/Volumes/My Passport for Mac/model')

# load json and create model
# The context manager closes the file even if read() raises.
with open(os.path.join(MODEL_DIR, 'model.json'), 'r') as json_file:
    loaded_model_json = json_file.read()
loaded_model = model_from_json(loaded_model_json)
# load weights into new model
loaded_model.load_weights(os.path.join(MODEL_DIR, 'model.h5'))
print("Loaded model from disk")

# Compile the loaded model so it can be evaluated (see the disabled
# evaluate() call in the following cell).
# NOTE(review): loss/optimizer/metrics are assumed to match the settings the
# model was trained with -- confirm against the training script.
loaded_model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
print("done")


loading model .....
Loaded model from disk
done


In [5]:
#score = loaded_model.evaluate(X, Y, verbose=0)
#print("%s: %.2f%%" % (loaded_model.metrics_names[1], score[1]*100))

In [11]:
# Fit a tokenizer on the newsgroup corpus (vocabulary capped via
# MAX_NUM_WORDS) and encode every corpus text as a sequence of integer word
# indices. The sequences are not yet padded to a common length here.
# NOTE(review): the word index is rebuilt from the corpus instead of being
# loaded alongside the model; predictions are only meaningful if it matches
# the index used when the model was trained -- confirm.
# NOTE(review): `sequences` is not referenced by the cells shown in this
# section; only the fitted `tokenizer` is used below.
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

In [12]:
# Hand-written samples to classify with the loaded model.
# The long sample is assembled with implicit string concatenation: a
# backslash continuation *inside* a string literal would embed the next
# line's leading indentation into the text itself.
test_texts = [
    (
        "First of all as far as I know, only male homosexuality is explicitly "
        "mentioned in the bibles, so you're off the hook there, I think. In "
        "any event, there are *plenty* of people in many denominations who "
        "do not consider a person's sexual identification of gay/lesbian/bisexual "
        "as an 'immoral lifestyle choice'"
    ),
    "Messi is amazing soccer player from aregentina. He led Arsenal team in Europe for 2 years",
]
# Encode with the tokenizer fitted on the corpus, then bring every sequence to
# MAX_SEQUENCE_LENGTH (presumably the input length the model expects --
# confirm against the training script).
test_sequences = tokenizer.texts_to_sequences(test_texts)
test_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)


In [13]:
# Run the loaded model on the padded test sequences. Judging by the recorded
# output, the result holds one row per input text and one score per category.
nn_output = loaded_model.predict(test_data)
print(nn_output)

[[5.31812459e-02 3.80550955e-05 1.32712330e-05 3.00782722e-06
  1.05335903e-05 2.21153368e-05 3.53115356e-05 6.23338274e-05
  5.41038098e-05 1.29136519e-04 2.65220333e-05 5.08869343e-05
  9.51401944e-06 1.03475561e-03 1.03897597e-04 7.54373252e-01
  5.26757678e-03 1.44062471e-03 4.30753035e-03 1.79836348e-01]
 [1.65222154e-05 2.16297394e-05 1.04245455e-05 3.79907283e-06
  8.76977481e-07 1.85691010e-06 1.13361366e-02 1.83141510e-05
  2.34169303e-04 4.72299270e-02 9.39942181e-01 1.17472030e-06
  1.72001644e-04 1.29774671e-05 7.30134721e-04 7.92755054e-06
  1.44551359e-06 2.64768687e-06 9.04939152e-05 1.65403530e-04]]


In [14]:
# Report the highest-scoring category for every test text.
# np.argmax(..., axis=1) picks the best column (category id) in each row;
# zip pairs each prediction with its text, so no hand-maintained counter
# is needed to index test_texts.
for idx, text in zip(np.argmax(nn_output, axis=1), test_texts):
    print("Category: ", index_to_label_dict[idx])
    print("text: " , text)
    print("=====================================")

Category:  soc.religion.christian
text:  First of all as far as I know, only male homosexuality is explicitly mentioned in the bibles, so you're off the hook there, I think. In               any event, there are *plenty* of people in many denominations who               do not consider a person's sexual identification of gay/lesbian/bisexual               as an 'immoral lifestyle choice'
Category:  rec.sport.hockey
text:  Messi is amazing soccer player from aregentina. He led Arsenal team in Europe for 2 years
