In [53]:
from sklearn.datasets import fetch_20newsgroups
# Newsgroups used in this experiment; the same selection applies to both splits.
cats = [
    'alt.atheism',
    'sci.space',
    'talk.religion.misc',
    'comp.graphics',
]

# Fetch the 'train' and 'test' subsets for the selected categories, in that order.
training_data, test_data = (
    fetch_20newsgroups(subset = split_name, categories = cats)
    for split_name in ('train', 'test')
)

# Report the shape of each split's filename array as a plain list.
for split_bunch in (training_data, test_data):
    print(list(split_bunch.filenames.shape))

[2034]
[1353]


In [54]:
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.utils import np_utils

# Preprocessing hyper-parameters, named here instead of being inline literals.
VOCAB_SIZE = 2000    # passed to Tokenizer(num_words=...): bound on the word indices kept
MAX_SEQ_LEN = 150    # every document is padded or truncated to this many tokens

training_data_text = training_data.data
test_data_text = test_data.data

# The vocabulary is fitted on the training texts only; the test texts are
# merely converted with the already-fitted tokenizer.
token = Tokenizer(num_words = VOCAB_SIZE)
token.fit_on_texts(training_data_text)

# Texts -> lists of word indices -> fixed-length integer matrices.
x_train_seq = token.texts_to_sequences(training_data_text)
x_test_seq = token.texts_to_sequences(test_data_text)
x_train = sequence.pad_sequences(x_train_seq, maxlen = MAX_SEQ_LEN)
x_test = sequence.pad_sequences(x_test_seq, maxlen = MAX_SEQ_LEN)

# One-hot encode the labels. The width is pinned to the number of requested
# categories: without num_classes, to_categorical infers it from the largest
# label present in each array, so the two splits could end up with different
# widths (or a width that disagrees with the len(cats)-unit output layer).
y_train = np_utils.to_categorical(training_data.target, num_classes = len(cats))
y_test = np_utils.to_categorical(test_data.target, num_classes = len(cats))

In [55]:
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.embeddings import Embedding

# Derive the input dimensions from the preprocessing objects so the network
# cannot drift out of sync with the tokenizer / padding configuration.
vocab_size = token.num_words          # the num_words given to the Tokenizer
sequence_length = x_train.shape[1]    # the maxlen used when padding the sequences
dropout_rate = 0.35
hidden_units = (256, 128, 64)

model = Sequential()

# Word indices -> 32-dimensional embeddings, then flattened into one vector
# per document for the fully connected layers.
model.add(Embedding(output_dim = 32, input_dim = vocab_size,
                    input_length = sequence_length))
model.add(Dropout(dropout_rate))
model.add(Flatten())

# Stack of ReLU layers, each followed by dropout.
for units in hidden_units:
    model.add(Dense(units = units, activation = 'relu'))
    model.add(Dropout(dropout_rate))

# One softmax output unit per selected newsgroup.
model.add(Dense(units = len(cats), activation = 'softmax'))
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_10 (Embedding)     (None, 150, 32)           64000     
_________________________________________________________________
dropout_34 (Dropout)         (None, 150, 32)           0         
_________________________________________________________________
flatten_10 (Flatten)         (None, 4800)              0         
_________________________________________________________________
dense_33 (Dense)             (None, 256)               1229056   
_________________________________________________________________
dropout_35 (Dropout)         (None, 256)               0         
_________________________________________________________________
dense_34 (Dense)             (None, 128)               32896     
_________________________________________________________________
dropout_36 (Dropout)         (None, 128)               0         
__________

In [56]:
# Configure training: categorical cross-entropy loss, Adam optimizer, and
# accuracy reported alongside the loss.
model.compile(
    optimizer = 'adam',
    loss = 'categorical_crossentropy',
    metrics = ['accuracy'],
)

# Fit for 10 epochs in batches of 100, holding out 20% of the training
# samples for validation; verbose = 2 prints one summary line per epoch.
fit_options = dict(
    batch_size = 100,
    epochs = 10,
    verbose = 2,
    validation_split = 0.2,
)
train_history = model.fit(x_train, y_train, **fit_options)

Train on 1627 samples, validate on 407 samples
Epoch 1/10
 - 1s - loss: 1.3641 - acc: 0.3018 - val_loss: 1.3406 - val_acc: 0.3759
Epoch 2/10
 - 1s - loss: 1.3056 - acc: 0.3872 - val_loss: 1.3132 - val_acc: 0.4128
Epoch 3/10
 - 1s - loss: 1.2036 - acc: 0.5046 - val_loss: 1.1892 - val_acc: 0.5233
Epoch 4/10
 - 1s - loss: 0.8304 - acc: 0.7056 - val_loss: 0.8236 - val_acc: 0.6708
Epoch 5/10
 - 1s - loss: 0.3724 - acc: 0.8734 - val_loss: 0.7028 - val_acc: 0.7199
Epoch 6/10
 - 1s - loss: 0.1566 - acc: 0.9533 - val_loss: 0.7694 - val_acc: 0.7371
Epoch 7/10
 - 1s - loss: 0.0658 - acc: 0.9834 - val_loss: 0.8266 - val_acc: 0.7666
Epoch 8/10
 - 1s - loss: 0.0400 - acc: 0.9896 - val_loss: 0.8124 - val_acc: 0.7666
Epoch 9/10
 - 1s - loss: 0.0300 - acc: 0.9932 - val_loss: 0.9228 - val_acc: 0.7690
Epoch 10/10
 - 1s - loss: 0.0241 - acc: 0.9951 - val_loss: 0.9023 - val_acc: 0.7715


In [57]:
# Evaluate silently on the test split. The model was compiled with
# metrics = ['accuracy'], so the result is [loss, accuracy].
score = model.evaluate(x_test, y_test, verbose = 0)
test_accuracy = score[1]
print(test_accuracy)

0.705838876703


In [None]:
# Creates an HDF5 file 'my_model.h5' holding the trained model.
# save() is called on the model object itself: the documented entry point is
# Model.save, whereas the `.model` attribute is not part of the saving API
# and is not available on Sequential objects in newer Keras releases.
model.save('my_model.h5')


# Load our saved model back from disk; `model` now refers to the restored copy.
from keras.models import load_model
model = load_model('my_model.h5')