In [None]:
import glob
import itertools
import os
import pickle

import keras, numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import tensorflow as tf
from keras import utils
from keras.layers import Embedding, Dense, LSTM, GRU, Conv1D, Reshape
from keras.layers import Dense, Activation, Dropout
from keras.layers import Concatenate, Conv2D, Flatten, Input, MaxPool2D
from keras.models import Sequential
from keras.models import Sequential
from keras.models import Model
from keras.optimizers import Adam
from keras.preprocessing import text, sequence
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit

# Report the TensorFlow version in use. Later cells call tf.get_default_graph()
# and tf.reset_default_graph(), so the installed version must still expose them.
print("You have TensorFlow version", tf.__version__)

In [8]:
# Load every pipe-delimited article dump under ./data into one frame.
# glob.glob returns paths in arbitrary order; sorting pins the row order, which
# matters because the train/test split further down is purely positional.
files = sorted(glob.glob("./data/article_data*"))
if not files:
    raise FileNotFoundError("no files matched ./data/article_data*")

data = pd.concat(
    [pd.read_csv(f, sep='|', engine='python', header=None) for f in files],
    ignore_index=True,  # per-file frames would otherwise repeat index labels
)
data.columns = ['article_id', 'body', 'topic']

# Rows without article text cannot be tokenized, so drop them.
data = data.dropna(subset=['body']).reset_index(drop=True)
data.shape

(189295, 3)

In [9]:
# Work out how many rows go to training (85%) and how many are held out.
n_rows = len(data)
train_size = int(n_rows * .85)
test_size = n_rows - train_size
print ("Train size: %d" % train_size)
print ("Test size: %d" % test_size)

Train size: 160900
Test size: 28395


In [10]:
# Positional split: the leading `train_size` rows train, the remainder tests.
# NOTE(review): rows are not shuffled first -- confirm the file order is not
# correlated with topic.
body_col, topic_col = data['body'], data['topic']

train_body, test_body = body_col.iloc[:train_size], body_col.iloc[train_size:]
train_topic, test_topic = topic_col.iloc[:train_size], topic_col.iloc[train_size:]

In [34]:
nb_words = 20000  # vocabulary cap passed to the tokenizer
TOKENIZER_PATH = 'model/tokenizer.pickle'

tokenize = None
try:
    # NOTE: pickle.load can execute arbitrary code from the file; only load a
    # cache that this notebook wrote itself.
    with open(TOKENIZER_PATH, 'rb') as handle:
        tokenize = pickle.load(handle)
except (FileNotFoundError, EOFError, pickle.UnpicklingError,
        AttributeError, ImportError):
    # Missing, truncated or incompatible cache: rebuild it below. Other errors
    # (e.g. permission problems) are left to surface instead of being hidden.
    tokenize = None

# A cache fitted with a different vocabulary cap would yield matrices whose
# width disagrees with nb_words, so treat it as stale and refit.
if tokenize is None or getattr(tokenize, 'num_words', None) != nb_words:
    tokenize = text.Tokenizer(num_words=nb_words)
    tokenize.fit_on_texts(train_body)  # only fit on train
    # Create the cache directory first so the expensive fit is not lost to a
    # missing folder.
    os.makedirs(os.path.dirname(TOKENIZER_PATH), exist_ok=True)
    with open(TOKENIZER_PATH, 'wb') as handle:
        pickle.dump(tokenize, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Bag-of-words matrices, one row per article.
x_train = tokenize.texts_to_matrix(train_body)
x_test = tokenize.texts_to_matrix(test_body)

20000

In [35]:
# Assign each topic seen in training an integer class id, in order of first
# appearance.
topics = list(train_topic.unique())
n_classes = len(topics)
topic_to_index = {topic: idx for idx, topic in enumerate(topics)}

# Fail early, with a readable message, if the test slice holds a topic the
# training slice never saw (there would be no class id for it).
unseen_topics = set(test_topic.unique()) - set(topic_to_index)
if unseen_topics:
    raise ValueError('test set contains topics absent from training: %r'
                     % sorted(unseen_topics, key=str))

# Dict lookups replace a list.index scan per row.
ytrain_encoded = [topic_to_index[topic] for topic in train_topic]
ytest_encoded = [topic_to_index[topic] for topic in test_topic]

# Pin num_classes: otherwise to_categorical sizes each matrix from the largest
# id present, and y_test can end up narrower than y_train.
y_train = keras.utils.to_categorical(ytrain_encoded, num_classes=n_classes)
y_test = keras.utils.to_categorical(ytest_encoded, num_classes=n_classes)

# NOTE(review): the figure printed after 'less than 10 %' is the number of topics
# with <= 10 training rows divided by the number of training rows -- confirm
# that this is the intended statistic and label.
print('classes: ', n_classes, 'less than 10 %', str(sum(train_topic.value_counts() <= 10)/train_topic.shape[0]))
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)
print('y_train shape:', y_train.shape)
print('y_test shape:', y_test.shape)

classes:  282 less than 10 % 0.0001305158483530143
x_train shape: (160900, 20000)
x_test shape: (28395, 20000)
y_train shape: (160900, 282)
y_test shape: (28395, 282)


In [36]:
# Training schedule used by model.fit / model.evaluate. These are the values
# that were actually in effect: the earlier batch_size = 256 / epochs = 20
# assignments in this cell were overwritten before use and have been removed.
epochs = 100
batch_size = 30

# Tokenizer vocabulary; cnn_model() sizes its embedding from it.
word_index = tokenize.word_index

# CNN hyperparameters.
embedding_vector_length = 256
max_seq_len = 1000
vocabulary_size = nb_words
filter_sizes = [3,4,5]
num_filters = 512
drop = 0.5


In [None]:
def dnn_model(input_dim=None, hidden_units=512, dropout_rate=0.25):
    """Build and compile the feed-forward topic classifier.

    The network is Dense(hidden_units) -> ReLU -> Dropout -> Dense(n_classes)
    -> softmax, compiled with categorical cross-entropy and Adam.

    Args:
        input_dim: Width of each input row. Defaults to the module-level
            ``nb_words`` so the model follows the tokenizer's vocabulary cap
            instead of a hard-coded 20000.
        hidden_units: Size of the hidden Dense layer.
        dropout_rate: Dropout rate applied after the hidden activation.

    Returns:
        The compiled Sequential model.

    Side effects:
        Stores the current default TensorFlow graph in the module-level global
        ``graph`` and prints the model summary.
    """
    global graph
    graph = tf.get_default_graph()

    if input_dim is None:
        input_dim = nb_words

    print('building DNN model: ')
    model = Sequential()
    model.add(Dense(hidden_units, input_shape=(input_dim,)))
    model.add(Activation('relu'))
    model.add(Dropout(dropout_rate))
    model.add(Dense(n_classes))
    model.add(Activation('softmax'))

    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    # summary() prints the table itself and returns None, so it must not be
    # passed to print() (that produced a trailing "model built:  None").
    model.summary()
    print('model built.')
    return model

def cnn_model():
    """Build and compile a multi-filter-width convolutional topic classifier.

    Architecture: Embedding -> Reshape to a single-channel 2-D map -> one
    Conv2D + MaxPool2D branch per entry in ``filter_sizes`` (each kernel spans
    the full embedding width) -> Concatenate -> Flatten -> Dropout ->
    Dense(n_classes, softmax).

    The model input has shape ``(max_seq_len,)`` of integer word ids, e.g. the
    output of ``pad_sequences(tokenize.texts_to_sequences(...),
    maxlen=max_seq_len)``; it does not accept the ``texts_to_matrix`` arrays
    used by ``dnn_model``.

    Reads the module-level settings ``word_index``, ``embedding_vector_length``,
    ``max_seq_len``, ``filter_sizes``, ``num_filters``, ``drop`` and
    ``n_classes``.

    Returns:
        The compiled functional-API model. Checkpointing or other callbacks
        are not created here; pass them to ``fit`` if needed.
    """
    print('building CNN model: ')

    inputs = Input(shape=(max_seq_len,), dtype='int32')
    embedding = Embedding(len(word_index) + 1,
                          embedding_vector_length,
                          input_length=max_seq_len,
                          embeddings_initializer='glorot_normal',
                          embeddings_regularizer=keras.regularizers.l1(0.01))(inputs)

    # Conv2D needs a channel axis: (max_seq_len, embedding_vector_length, 1).
    reshape = Reshape((max_seq_len, embedding_vector_length, 1))(embedding)

    pooled_branches = []
    for filter_size in filter_sizes:
        conv = Conv2D(num_filters,
                      kernel_size=(filter_size, embedding_vector_length),
                      padding='valid',
                      kernel_initializer='normal',
                      activation='relu')(reshape)
        # Pool over every position the 'valid' convolution produced, leaving
        # one value per filter.
        pooled = MaxPool2D(pool_size=(max_seq_len - filter_size + 1, 1),
                           strides=(1, 1),
                           padding='valid')(conv)
        pooled_branches.append(pooled)

    concatenated_tensor = Concatenate(axis=1)(pooled_branches)
    flatten = Flatten()(concatenated_tensor)
    dropout = Dropout(drop)(flatten)
    # One output unit per topic, matching the one-hot y_train / y_test.
    output = Dense(units=n_classes, activation='softmax')(dropout)

    model = Model(inputs=inputs, outputs=output)

    adam = Adam(lr=1e-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
    # Multi-class one-hot targets call for categorical, not binary, cross-entropy.
    model.compile(optimizer=adam, loss='categorical_crossentropy', metrics=['accuracy'])

    # summary() prints the table itself and returns None.
    model.summary()
    print('model built.')
    return model

In [None]:
# Build the dense classifier and train it, with the fit settings gathered in
# one place (validation_split=0.1 holds out a tenth of the training arrays).
model = dnn_model()
fit_options = dict(
    batch_size=batch_size,
    epochs=epochs,
    verbose=1,
    validation_split=0.1,
)
history = model.fit(x_train, y_train, **fit_options)

building DNN model: 
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_15 (Dense)             (None, 512)               10240512  
_________________________________________________________________
activation_15 (Activation)   (None, 512)               0         
_________________________________________________________________
dropout_8 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_16 (Dense)             (None, 282)               144666    
_________________________________________________________________
activation_16 (Activation)   (None, 282)               0         
Total params: 10,385,178
Trainable params: 10,385,178
Non-trainable params: 0
_________________________________________________________________
model built:  None
Train on 144810 samples, validate on 16090 samples
Epoch 1/100


In [None]:
# Score the trained model on the held-out test arrays; the model was compiled
# with a single metric, so the result is [loss, accuracy].
score = model.evaluate(x_test, y_test, verbose=1, batch_size=batch_size)
test_loss, test_accuracy = score[0], score[1]
print('Test score:', test_loss)
print('Test accuracy:', test_accuracy)

In [None]:
# Show the first few test articles next to their true and predicted topics.
# The held-out text and labels live in test_body / test_topic (the names
# test_posts / test_tags used previously are not defined in this notebook).
n_preview = min(10, len(x_test))
preview_probs = model.predict(x_test[:n_preview])  # one batched call instead of one per row

for i, probs in enumerate(preview_probs):
    predicted_label = topics[np.argmax(probs)]
    print(test_body.iloc[i][:50], "...")
    # str() keeps the concatenation working when topic labels are not strings.
    print('Actual label:' + str(test_topic.iloc[i]))
    print("Predicted label: " + str(predicted_label) + "\n")

In [None]:

# Class probabilities for every test row. (The variable was previously
# misspelled `y_softmaxy_softma`, leaving `y_softmax` undefined below.)
y_softmax = model.predict(x_test)

# Collapse one-hot targets and softmax outputs to integer class ids. For a
# one-hot row the argmax is the position of its single non-zero entry.
y_test_1d = np.argmax(y_test, axis=1).tolist()
y_pred_1d = np.argmax(y_softmax, axis=1).tolist()

In [None]:
def plot_confusion_matrix(cm, classes,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """Draw a row-normalised confusion matrix on the current matplotlib figure.

    Args:
        cm: Square array of counts; rows are true labels, columns predictions.
        classes: Tick labels, one per row/column of ``cm``.
        title: Figure title.
        cmap: Matplotlib colormap used for the heat map.

    Each row is divided by its total so cells show the fraction of that true
    class assigned to each prediction. Rows with no samples are left as zeros
    rather than dividing by zero, which would fill them with NaN and make the
    text-colour threshold NaN as well.
    """
    cm = np.asarray(cm, dtype=float)
    row_totals = cm.sum(axis=1, keepdims=True)
    cm = np.divide(cm, row_totals, out=np.zeros_like(cm), where=row_totals != 0)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title, fontsize=30)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45, fontsize=22)
    plt.yticks(tick_marks, classes, fontsize=22)

    fmt = '.2f'
    # Cells darker than half the maximum get white text so they stay legible.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label', fontsize=25)
    plt.xlabel('Predicted label', fontsize=25)

In [None]:
# Pass the full label range explicitly: by default confusion_matrix only keeps
# labels that occur in y_true or y_pred, so any topic missing from both would
# shrink the matrix and shift it out of line with the `topics` tick labels.
cnf_matrix = confusion_matrix(y_test_1d, y_pred_1d, labels=np.arange(len(topics)))
plt.figure(figsize=(24,20))
plot_confusion_matrix(cnf_matrix, classes=topics, title="Confusion matrix")
plt.show()

In [50]:
# Reset TensorFlow's default graph.
# NOTE(review): the module-level `graph` captured inside dnn_model() and the
# existing `model` were created before this reset -- presumably they should be
# rebuilt before further use; confirm.
tf.reset_default_graph()