In [1]:
from gensim.models import Word2Vec
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Convolution2D, MaxPooling2D
from keras.layers import Merge
import numpy as np

Using TensorFlow backend.


In [2]:
# Article categories; each one has its own corpus file and its own output class.
category_names = [
    'Sonstiges', 'Aktuell', 'Lifestyle', 'Wirtschaft', 'Finanzen', 'Ausland',
    'Lokal', 'Politik', 'Sport', 'Technologie', 'Kultur',
]

num_models = len(category_names)

# (category, path) pairs for the full corpora
fulldata_paths = list(zip(
    category_names,
    map("corpus/corpus{}.txt".format, category_names)))

# (category, path) pairs for the fixed training / validation split
train_paths = list(zip(
    category_names,
    map("data/corpus{}.training.txt".format, category_names)))
validation_paths = list(zip(
    category_names,
    map("data/corpus{}.validation.txt".format, category_names)))

# Load the previously trained word2vec model that supplies the word vectors
# (the path suggests a German Wikipedia model -- confirm against the
# training notebook).
base_model = Word2Vec.load('../wiki/data/wiki.de.word2vec.model')

# k is the dimensionality of every word vector; it becomes the width of the
# per-article input matrices built later.
k = base_model.vector_size
print("basemodel has {} dimensional vectors".format(k))

basemodel has 400 dimensional vectors


**TODO:** stopword filtering is currently commented out. Check whether filtering stopwords improves performance; since the CNN learns word "patterns", removing stopwords may distort those patterns too much.

In [3]:
def load_sets(paths, encoding="utf-8"):
    """Read tokenised articles and their category labels from corpus files.

    Every line of every file is treated as one article and is split on
    whitespace into tokens.

    Args:
        paths: iterable of ``(category_name, file_path)`` pairs.
        encoding: text encoding of the corpus files. Decoding explicitly
            keeps non-ASCII words (e.g. German umlauts) identical on every
            platform instead of depending on the locale default.

    Returns:
        ``(X, y)`` where ``X`` is a list of token lists (one per line) and
        ``y`` is the list of category names, aligned with ``X``.
    """
    # io.open accepts an encoding on both Python 2 and Python 3.
    import io

    X, y = [], []

    for name, path in paths:
        with io.open(path, encoding=encoding) as cur_file:
            for line in cur_file:
                # Stopword filtering is intentionally disabled for now
                # (see the TODO in the notebook text).
                X.append(line.split())
                y.append(name)
    print("loaded {} articles".format(len(X)))
    return X, y

load the raw train and validation datasets in string format

In [4]:
raw_train_X, raw_train_y = load_sets(train_paths)
raw_validation_X, raw_validation_y = load_sets(validation_paths)

loaded 28132 articles
loaded 12042 articles


calculate the average article length $N_{avg}$ (in words) over the training set

$$N_{avg}=\frac { \sum _{ x\in X }^{  }{ dim(x) }  }{ dim(X) } $$

In [5]:
# Average number of tokens per training article.
# Floor division keeps N_avg an integer on Python 2 and Python 3 alike; it is
# used further down as an array dimension and as a range() bound, both of
# which reject floats.
N_avg = sum(len(article) for article in raw_train_X) // len(raw_train_X)
print("average article length is: {} words".format(N_avg))

average article length is: 223 words


convert the raw input article to matrices of word-vectors.
each article $x$ is represented as

$${ x }_{ 1:n }={ x }_{ 1 }\oplus { x }_{ 2 }\oplus \dots \oplus { x }_{ n }$$

where:
* ${ x }_{ i }\in { { R } }^{ k }$ is the $k$-dimensional word-embedding vector for the $i$-th word in the article.
* $\oplus$ is the concatenation operator

the result is a matrix for each article in the form:
$$x=\begin{bmatrix} { x }_{ 1,1 } & { x }_{ 1,2 } & \cdots  & { x }_{ 1,k } \\ { x }_{ 2,1 } & { x }_{ 2,2 } & \cdots  & { x }_{ 2,k } \\ \vdots  & \vdots  & \ddots  & \vdots  \\ { x }_{ n,1 } & { x }_{ n,2 } & \cdots  & { x }_{ n,k } \end{bmatrix}$$

where:
* ${ x }_{ n,k }$ is the value of the $k$-th dimension of the word-vector for word $n$

the matrix is padded or cropped to a length of $N_{avg}$

In [6]:
def articles_to_matrices(articles, word_dim, article_len, model=None):
    """Turn tokenised articles into one dense array of word vectors.

    Each article becomes an ``article_len x word_dim`` matrix whose i-th row
    is the embedding of the article's i-th word. Articles longer than
    ``article_len`` are cropped; shorter ones are padded with zero rows.
    Words missing from the embedding model are represented by a zero row.

    Args:
        articles: sequence of token lists.
        word_dim: dimensionality of a single word vector.
        article_len: number of word rows per article (coerced to ``int``).
        model: mapping from word to vector that raises ``KeyError`` for
            unknown words. Defaults to the notebook-level ``base_model``.

    Returns:
        float32 array of shape ``(len(articles), article_len, word_dim)``.
    """
    if model is None:
        model = base_model
    # An average computed with true division arrives as a float, but array
    # shapes and slice bounds must be integers.
    article_len = int(article_len)

    X = np.zeros((len(articles), article_len, word_dim), dtype='float32')
    for article_idx, raw_article in enumerate(articles):
        # The slice crops long articles; rows beyond a short article are
        # simply never written and stay zero (padding).
        for word_idx, word in enumerate(raw_article[:article_len]):
            # TODO: maybe skip non-available words rather than default-zero,
            # so that words beyond article_len can fill the freed rows
            try:
                X[article_idx, word_idx] = model[word]
            except KeyError:
                # Out-of-vocabulary word: keep the zero vector. Only the
                # lookup miss is ignored; any other error still surfaces.
                pass
    return X

# Embed both splits with identical settings: k-dimensional word vectors and
# N_avg word rows per article.
train_X, validation_X = [
    articles_to_matrices(raw_articles, k, N_avg)
    for raw_articles in (raw_train_X, raw_validation_X)
]

convert the string train input data to a one-hot vector that can be used on the output layer of the cnn

In [7]:
def categories_to_one_hot(categories, classes=None):
    """Encode category names as one-hot rows for the softmax output layer.

    Args:
        categories: sequence of category names, one per sample.
        classes: optional ordered sequence of all category names. Column
            ``i`` of the result then always stands for ``classes[i]``, even
            if some category does not occur in ``categories``. When omitted,
            the columns are derived from the distinct values present in
            ``categories`` (sorted, as returned by ``np.unique``).

    Returns:
        2-D array with one row per sample and one column per class.

    Raises:
        ValueError: if ``classes`` is given and ``categories`` contains a
            name that is not in it.
    """
    if classes is None:
        # Original behaviour: the mapping depends on this split only.
        _, int_y = np.unique(categories, return_inverse=True)
        return np_utils.to_categorical(int_y)

    column_of = {name: column for column, name in enumerate(classes)}
    unknown = set(categories) - set(column_of)
    if unknown:
        raise ValueError("unknown categories: {}".format(sorted(unknown)))
    int_y = np.array([column_of[name] for name in categories], dtype='int64')
    # Passing the class count pins the number of columns, so a split that
    # lacks the last class still gets the full width.
    return np_utils.to_categorical(int_y, len(classes))

# Use one shared class list for both splits so their columns are guaranteed to
# mean the same category. np.unique returns sorted values, so a sorted list
# reproduces the previous column order.
label_order = sorted(category_names)
train_y = categories_to_one_hot(raw_train_y, label_order)
target_y = categories_to_one_hot(raw_validation_y, label_order)

Build the model from [Convolutional Neural Networks for Sentence Classification](https://arxiv.org/abs/1408.5882)

**Notes:**
* Don't use L2 norm constraints on weight vectors (see [A Sensitivity Analysis of (and Practitioners' Guide to) Convolutional Neural Networks for Sentence Classification](https://arxiv.org/abs/1510.03820); info from http://www.wildml.com/2015/12/implementing-a-cnn-for-text-classification-in-tensorflow)
* the input has 3 
* the fully connected layer has one hidden layer

In [None]:
# number of filters of each size
num_filters = 32
# square filter sizes (3x3, 4x4, and 5x5)
filter_sizes = [3, 4, 5]
num_filter_branches = len(filter_sizes)

# add the channel dimension (only 1 channel)
# tip with np.expand_dims is from http://www.wildml.com/2015/12/implementing-a-cnn-for-text-classification-in-tensorflow
# The ndim guard makes this cell safe to re-run: without it every execution
# would append yet another trailing axis to the same arrays.
if train_X.ndim == 3:
    train_X = np.expand_dims(train_X, -1)
if validation_X.ndim == 3:
    validation_X = np.expand_dims(validation_X, -1)

# Build one small convolutional stack per filter size. Every branch takes the
# same input shape (one article matrix plus the channel axis).
filter_branches = []
for filter_size in filter_sizes:
    branch = Sequential()
    branch_layers = (
        Convolution2D(num_filters, filter_size, filter_size,
                      init='uniform', border_mode='same',
                      input_shape=train_X.shape[1:]),
        Activation('relu'),
        MaxPooling2D(pool_size=(2, 2)),
        # Dropout(0.25) is deliberately left out of the branches for now
    )
    for branch_layer in branch_layers:
        branch.add(branch_layer)
    filter_branches.append(branch)

# Concatenate the outputs of all branches into a single layer.
merged_filters = Merge(filter_branches, mode='concat')

# Final model: merged filter branches, then a fully connected classifier with
# one hidden layer and a softmax over the categories.
model = Sequential()
classifier_layers = (
    merged_filters,
    Flatten(),
    Dense(512, init='uniform'),
    Activation('relu'),
    Dropout(0.5),
    Dense(num_models),
    Activation('softmax'),
)
for classifier_layer in classifier_layers:
    model.add(classifier_layer)

# Compile with categorical cross-entropy and report accuracy while training.
model.compile(
    loss='categorical_crossentropy',
    optimizer='sgd',
    metrics=['acc'],
)

train the model

In [None]:
# The merged model has one input per filter branch, so the same data has to be
# supplied once for every branch.
# np.tile caused the kernel to crash (probably because it copies the data);
# a list only holds references to the same array, so no extra memory is used.
# The list length follows num_filter_branches so that it stays correct when
# filter_sizes is changed.
tiled_train_X = [train_X] * num_filter_branches
tiled_validation_X = [validation_X] * num_filter_branches

model.fit(tiled_train_X, train_y,
          validation_data=(tiled_validation_X, target_y),
          nb_epoch=2, batch_size=128)

Train on 28132 samples, validate on 12042 samples
Epoch 1/2
  512/28132 [..............................] - ETA: 79628s - loss: 5.1524 - acc: 0.1680