# Using a CNN for news topic categorization

In [1]:
# Toggle: True trains the model during this run and saves it afterwards;
# False skips training and loads the model from disk instead.
LEARN = False

In [2]:
from gensim.models import Word2Vec
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Convolution2D, MaxPooling2D
from keras.layers import Merge
from keras.callbacks import TensorBoard
from keras.callbacks import EarlyStopping
from keras.callbacks import ModelCheckpoint
from keras.regularizers import l2
from pandas import DataFrame
import numpy as np
from sklearn.metrics import confusion_matrix

# Seed NumPy's global random number generator with a fixed value.
# NOTE(review): presumably the deep-learning backend keeps its own RNG state that
# is not covered by this seed — confirm before relying on run-to-run reproducibility.
np.random.seed(0)

Using TensorFlow backend.


In [3]:
# Topic labels. Each label is also the variable part of the corpus file names below.
category_names = [
    'Sonstiges', 'Aktuell', 'Lifestyle', 'Wirtschaft',
    'Finanzen', 'Ausland', 'Lokal', 'Politik',
    'Sport', 'Technologie', 'Kultur',
]

# one class per topic label
num_models = len(category_names)

# (label, path) pairs of the complete, unsplit corpora
fulldata_paths = [(label, "corpus/corpus{}.txt".format(label)) for label in category_names]

# (label, path) pairs of the fixed training / validation split
train_paths = [(label, "data/corpus{}.training.txt".format(label)) for label in category_names]
validation_paths = [(label, "data/corpus{}.validation.txt".format(label)) for label in category_names]

# pretrained word2vec model that supplies the word vectors
base_model = Word2Vec.load('../wiki/data/wiki.de.200dim.word2vec.model')

# Replace the stored vectors by their L2-normalized form to save memory.
# Training cannot be continued on the model afterwards, which is acceptable
# because the embeddings are only read in this notebook.
base_model.init_sims(replace=True)

# dimensionality of a single word vector
k = base_model.vector_size
print("basemodel has {} dimensional vectors".format(k))

basemodel has 200 dimensional vectors


**TODO:** stopword filtering is currently disabled. Check whether filtering stopwords improves performance; since the CNN learns word "patterns", the filtering may distort those patterns too much.

In [4]:
def load_sets(paths, stopwords=None):
    """Read tokenized articles from one text file per category.

    Every line of a file is treated as one article and is split on whitespace
    into tokens.

    Args:
        paths: iterable of (category_name, file_path) pairs.
        stopwords: optional collection of tokens that are removed from every
            article. The default (None) keeps all tokens.

    Returns:
        Tuple (X, y): X is a list of token lists, y is the list of category
        names aligned with X. Lines without any remaining token are skipped.
    """
    # a set gives constant-time membership tests inside the inner loop
    excluded = frozenset(stopwords) if stopwords else frozenset()
    X, y = [], []

    for name, path in paths:
        with open(path) as cur_file:
            for line in cur_file:
                tokens = line.split()
                if excluded:
                    tokens = [token for token in tokens if token not in excluded]
                if tokens:
                    X.append(tokens)
                    y.append(name)
    print("loaded {} articles".format(len(X)))
    return X, y

load the raw train and validation datasets in string format

In [5]:
# training split first, then validation split; each call prints its article count
raw_train_X, raw_train_y = load_sets(paths=train_paths)
raw_validation_X, raw_validation_y = load_sets(paths=validation_paths)

loaded 13188 articles
loaded 5614 articles


calculate the average article length $N_{avg}$, where $X$ is the set of training articles and $\dim(x)$ is the number of words in article $x$

$$N_{avg}=\frac{\sum_{x\in X}{\dim(x)}}{\dim(X)}$$

In [6]:
# mean number of tokens per training article (printed for information only)
N_avg = sum([len(article) for article in raw_train_X]) / len(raw_train_X)
print("average article length is: {} words".format(N_avg))

# override to avoid out-of-mem errors while learning;
# from here on N_avg is the fixed number of word positions per article
N_avg = 200

# for every article shorter than N_avg: the (negative) number of tokens it falls short
padding = [len(article) - N_avg for article in raw_train_X if len(article) < N_avg]
# if no article is shorter than N_avg the list is empty and its mean is undefined
average_padding = sum(padding) / len(padding) if padding else 0
print(average_padding)

average article length is: 476 words
-95


convert the raw input article to matrices of word-vectors.
each article $x$ is represented as

$${ x }_{ 1:n }={ x }_{ 1 }\oplus { x }_{ 2 }\oplus \dots \oplus { x }_{ n }$$

where:
* ${ x }_{ i }\in { { R } }^{ k }$ is the $k$-dimensional word-embedding vector for the $i$-th word in the article.
* $\oplus$ is the concatenation operator

the result is a matrix for each article in the form:
$$x=\begin{bmatrix} { x }_{ 1,1 } & { x }_{ 1,2 } & \cdots  & { x }_{ 1,k } \\ { x }_{ 2,1 } & { x }_{ 2,2 } & \cdots  & { x }_{ 2,k } \\ \vdots  & \vdots  & \ddots  & \vdots  \\ { x }_{ n,1 } & { x }_{ n,2 } & \cdots  & { x }_{ n,k } \end{bmatrix}$$

where:
* ${ x }_{ n,k }$ is the value of the $k$-th dimension of the word-vector for word $n$

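the matrix always has $N_{avg}$ rows: longer articles are cropped, shorter articles are filled up by repeating their words from the beginning, and words that are missing from the embedding model are left as zero rows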

In [17]:
def articles_to_matrices(articles, word_dim, article_len, embeddings=None):
    """Convert tokenized articles into fixed-size matrices of word vectors.

    Args:
        articles: sequence of token lists, one per article.
        word_dim: length of a single word vector.
        article_len: number of rows (word positions) of every article matrix.
        embeddings: mapping from token to vector that raises KeyError for
            unknown tokens. Defaults to the module-level ``base_model``.

    Returns:
        float32 array of shape (len(articles), article_len, word_dim).
        Row i of an article holds the vector of token ``i % len(article)``,
        so longer articles are cropped and shorter ones repeat from their
        start. Tokens without a vector, and empty articles, leave zero rows.
    """
    if embeddings is None:
        embeddings = base_model

    X = np.zeros((len(articles), article_len, word_dim), dtype='float32')
    words_found = 0
    # every row of every article counts as one word slot
    words = len(articles) * article_len

    for article_index, raw_article in enumerate(articles):
        num_tokens = len(raw_article)
        if num_tokens == 0:
            # nothing to look up (and the modulo below would divide by zero);
            # the rows stay zero and count as skipped
            continue
        for position in range(article_len):
            # TODO: maybe skip non-available words rather than default-zero
            # so that more of a long article's words are used
            try:
                X[article_index, position] = embeddings[raw_article[position % num_tokens]]
            except KeyError:
                # word is not in the embedding vocabulary: keep the zero row
                continue
            words_found += 1

    words_skipped = words - words_found
    skipped_percent = words_skipped * 100.0 / words if words else 0.0
    print("skipped {} of {} words ({}%)".format(words_skipped, words, skipped_percent))
    return X

# both splits use the embedding width k and the fixed article length N_avg;
# the training split is converted first, then the validation split
train_X, validation_X = (
    articles_to_matrices(raw_articles, k, N_avg)
    for raw_articles in (raw_train_X, raw_validation_X)
)

skipped 255864 of 2637600 words (9.70063694268%)
skipped 108097 of 1122800 words (9.62744923406%)


convert the string train input data to a one-hot vector that can be used on the output layer of the cnn

In [18]:
def categories_to_one_hot(categories):
    """Encode a sequence of category labels as one-hot rows.

    Returns:
        Tuple (y, category_names): ``category_names`` holds the sorted distinct
        labels found in ``categories``; row i of ``y`` is the one-hot encoding
        of ``categories[i]`` with column j standing for ``category_names[j]``.
    """
    labels, label_indices = np.unique(categories, return_inverse=True)
    return np_utils.to_categorical(label_indices), labels

# Encode both splits in a single call so that they share one label -> column
# mapping. Encoding them separately yields different column meanings (and
# different widths) as soon as a category is missing from one of the splits.
num_train = len(raw_train_y)
all_y, category_names = categories_to_one_hot(list(raw_train_y) + list(raw_validation_y))
train_y, target_y = all_y[:num_train], all_y[num_train:]
# NOTE: category_names is rebound here to the sorted labels that index the
# one-hot columns; it no longer has the order of the original list.

# number of validation articles per category (column sums of the one-hot targets)
stat = target_y.sum(axis=0).astype(int)

validation_stats = DataFrame(stat, category_names, ['dim(V_i)'])
print(validation_stats)

             dim(V_i)
Aktuell            20
Ausland           483
Finanzen          333
Kultur            208
Lifestyle         328
Lokal             249
Politik          1596
Sonstiges         770
Sport             449
Technologie       361
Wirtschaft        817


Build the model from [Convolutional Neural Networks for Sentence Classification](https://arxiv.org/abs/1408.5882)

**Notes:**
* Don't use L2 norm constraints on weight vectors (see [A Sensitivity Analysis of (and Practitioners' Guide to) Convolutional Neural Networks for Sentence Classification](https://arxiv.org/abs/1510.03820); info from http://www.wildml.com/2015/12/implementing-a-cnn-for-text-classification-in-tensorflow)
* the input has 3 filter branches
* the fully connected layer has ~~one~~ no hidden layer

**Architecture** (from original paper):
![Architecture](http://d3kbpzbmcynnmx.cloudfront.net/wp-content/uploads/2015/11/Screen-Shot-2015-11-06-at-8.03.47-AM.png)

In [19]:
# number of filters of each size
num_filters = 128
# filter heights in words; every filter spans the full embedding width k,
# so the kernels are (3 x k), (4 x k) and (5 x k)
filter_sizes = [3, 4, 5]
num_filter_branches = len(filter_sizes)

# add the channel dimension (only 1 channel) as the last axis
# tip with np.expand_dims is from http://www.wildml.com/2015/12/implementing-a-cnn-for-text-classification-in-tensorflow
# the ndim guard keeps this cell re-runnable: without it every run would append another axis
if train_X.ndim == 3:
    train_X = np.expand_dims(train_X, -1)
if validation_X.ndim == 3:
    validation_X = np.expand_dims(validation_X, -1)

# number of word positions per article
sequence_length = train_X.shape[1]

# create the filter branches
filter_branches = []
for filter_size in filter_sizes:
    branch = Sequential()
    # 'valid' convolution with full-width kernels: one feature per window
    # position and filter, i.e. (sequence_length - filter_size + 1) x 1 maps.
    # ('same' padding would also slide along the embedding axis and produce
    # sequence_length x k maps per filter, which is very memory hungry.)
    branch.add(Convolution2D(num_filters, filter_size, k, init='uniform', border_mode='valid',
                             input_shape=train_X.shape[1:], W_regularizer=l2(0.01)))
    branch.add(Activation('relu'))
    # max-over-time pooling: the window covers every position of the feature
    # map, so each filter contributes exactly one value for the whole article
    branch.add(MaxPooling2D(pool_size=(sequence_length - filter_size + 1, 1)))
    filter_branches.append(branch)

# merge the branches by concatenating
merged_filters = Merge(filter_branches, mode='concat')

# create the final model with the filter layers and the fully connected layers
model = Sequential()
model.add(merged_filters)
model.add(Flatten())
model.add(Dropout(0.5))
model.add(Dense(num_models))
model.add(Activation('softmax'))

# compile the model with an accuracy measurement
model.compile(loss='categorical_crossentropy',
              optimizer='adadelta',
              metrics=['acc'])

In [20]:
# Training callbacks, in order: TensorBoard logging, keeping the best model,
# and early stopping. Checkpointing watches 'acc' and early stopping watches 'loss'.
callbacks = [
    TensorBoard(log_dir='logs', histogram_freq=0, write_graph=True, write_images=False),
    ModelCheckpoint('checkpoints', monitor='acc', verbose=1, save_best_only=True, mode='max'),
    EarlyStopping(monitor='loss', min_delta=0.01, patience=3, verbose=1, mode='auto'),
]

train the model

In [21]:
# Every filter branch is a separate input of the merged model, so the same
# data has to be passed once per branch. A list only stores references to the
# one array; np.tile would copy the data and crashed the kernel.
tiled_train_X = [train_X] * num_filter_branches
tiled_validation_X = [validation_X] * num_filter_branches

if LEARN:
    model.fit(tiled_train_X, train_y,
              validation_data=(tiled_validation_X, target_y),
              nb_epoch=1, batch_size=128,
              callbacks=callbacks)

In [22]:
if not LEARN:
    # no training in this run: replace the freshly built model by the stored one
    from keras.models import load_model
    model = load_model('data/news.cnn.model')
else:
    # persist the model that was trained in this run
    model.save('data/news.cnn.model')

In [23]:
# the model has one input per filter branch; all of them receive the same validation data
predicted_y = model.predict([validation_X] * num_filter_branches)

In [43]:
# TODO: weight the output by the number of elements.
# Why? because it may work although it won't make any sense...
# Each class score is scaled by max(stat) / stat[class], which boosts the
# categories with few validation articles.
# NOTE: predicted_y is modified in place, so running this cell again applies
# the weighting a second time; re-run the prediction cell first.
predicted_y *= max(stat)
# np.maximum guards against a division by zero for categories without articles
predicted_y /= np.maximum(stat, 1)

# convert the weighted class scores to a one-hot matrix with a single 1 per
# row at the index of the highest score; argmax selects exactly one column
# even when several classes tie
predicted_y_singlehot = np.zeros(predicted_y.shape)
predicted_y_singlehot[np.arange(len(predicted_y)), np.argmax(predicted_y, axis=1)] = 1
    

In [44]:
# confusion counts: the row is the predicted category, the column the target category
classification_matrix = np.zeros([num_models, num_models], dtype=int)

for predicted_row, target_row in zip(predicted_y_singlehot, target_y):
    predicted_index = np.flatnonzero(predicted_row == 1)
    target_index = np.flatnonzero(target_row == 1)
    classification_matrix[predicted_index, target_index] += 1

result = DataFrame(classification_matrix, index=category_names, columns=category_names)
print(result)

             Aktuell  Ausland  Finanzen  Kultur  Lifestyle  Lokal  Politik  \
Aktuell           20      483       333     208        328    249     1596   
Ausland            0        0         0       0          0      0        0   
Finanzen           0        0         0       0          0      0        0   
Kultur             0        0         0       0          0      0        0   
Lifestyle          0        0         0       0          0      0        0   
Lokal              0        0         0       0          0      0        0   
Politik            0        0         0       0          0      0        0   
Sonstiges          0        0         0       0          0      0        0   
Sport              0        0         0       0          0      0        0   
Technologie        0        0         0       0          0      0        0   
Wirtschaft         0        0         0       0          0      0        0   

             Sonstiges  Sport  Technologie  Wirtschaft  
Aktuel