# Categorization using averaged word vectors as document feature

In [1]:
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
from pandas import DataFrame
from sklearn.metrics import accuracy_score
from sklearn.cross_validation import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import defaultdict
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
import numpy as np
from nltk.corpus import stopwords as sw

stopwords = sw.words('german')
# gensims LineSentence generator replaces umlauts with 
# u, a or o so add these variants to the stopwordlist.
# Iterate over a snapshot of the list: the loop body appends to `stopwords`,
# and growing a list while iterating over it makes the loop visit its own
# additions. The resulting list is the same, the loop is just well-defined.
for stopword in list(stopwords):
    stopword = stopword.replace(u'ü', 'u')
    stopword = stopword.replace(u'ö', 'o')
    stopword = stopword.replace(u'ä', 'a')
    # only add the transliterated form if it is not already a stopword
    if stopword not in stopwords:
        stopwords.append(stopword)



Get a list of:
* the full corpora split into categories -> ```fulldata_paths```
* a subset of each category corpus used for training -> ```train_paths```
* a subset of each category corpus used for validation -> ```validation_paths```

The categories were split into training / validation sets by using the ```mail/generateSets.py``` script with a 70 / 30 split between training and validation. (Note: the article counts printed further down, 26689 for training and 2959 for validation, correspond to roughly a 90 / 10 split, so the ratio stated here may be outdated.)

In [2]:
# Article categories. Each name is used both to build the corpus file names
# below and as the class label of the articles loaded from that file.
category_names = ['Sonstiges', 'Lifestyle', 
          'Wirtschaft', 'Finanzen', 'Lokal', 
          'Politik', 'Sport', 'Technologie', 'Kultur']

# number of categories (later used as the side length of the classification matrix)
num_models = len(category_names)

# the list of full corpora, as (category name, file path) pairs
fulldata_paths = [(x, "corpus/corpus{}.txt".format(x)) for x in category_names]

# the corpora with a fixed split for training and validation,
# again as (category name, file path) pairs
train_paths = [(x, "data/corpus{}.training.txt".format(x)) for x in category_names]
validation_paths = [(x, "data/corpus{}.validation.txt".format(x)) for x in category_names]

# previously saved word2vec model that supplies the word vectors for both vectorizers
# (presumably trained on German Wikipedia, judging by the path -- TODO confirm)
base_model = Word2Vec.load('../wiki/data/wiki.de.word2vec.model')

In [3]:
def load_sets(paths, stop_words=None):
    """Load tokenized articles and their category labels from corpus files.

    Every line of a file is treated as one article. The line is split on
    whitespace and stopwords are dropped; articles that have no tokens left
    afterwards are skipped.

    Args:
        paths: iterable of (category name, file path) pairs.
        stop_words: optional iterable of tokens to drop. Defaults to the
            module-level ``stopwords`` list.

    Returns:
        A tuple ``(X, y)`` where ``X`` is a list of token lists and ``y`` is
        the list of category names, one per entry of ``X``.
    """
    # Build the lookup once as a set: membership tests against the plain
    # list would scan all stopwords for every single token of the corpus.
    excluded = frozenset(stopwords if stop_words is None else stop_words)

    X, y = [], []

    for name, path in paths:
        with open(path) as cur_file:
            for line in cur_file:
                tokens = [token for token in line.split() if token not in excluded]
                if tokens:
                    X.append(tokens)
                    y.append(name)
    print("loaded {} articles".format(len(X)))
    return X, y

## word2vec Vectorizers
These vectorizers are used to transform a set of vectors to a single vector. They are used to transform a list of word embeddings to a single vector that represents the whole article.

Both variations simply build the average of all word-vectors. The TFIDF variation however uses the word frequency and inverse-document frequency to weight the word vectors.

The implementation is mostly adapted from [Text Classification With Word2Vec](http://nadbordrozd.github.io/blog/2016/05/20/text-classification-with-word2vec/) by Nadbor Drozd.

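The ```MeanEmbeddingVectorizer``` generates a document vector $  \overrightarrow { d } $ from the word vectors of a document by calculating

$$ \overrightarrow { d } =\frac { \sum _{ i=1 }^{ dim(d) }{ \overrightarrow { { w }_{ d,i } }  }  }{ dim(d) } $$

where: 
* $ \overrightarrow { {w}_{d,i} } $ is the word vector of the $ i $-th word of document $ d $
* $ dim(d) $ is the number of words of $ d $ that are contained in the word2vec vocabulary; words without a vector are skipped, and a document without any known word is mapped to the zero vector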

In [4]:
class MeanEmbeddingVectorizer(object):
    """Represent each tokenized document by the plain mean of its word vectors.

    Args:
        word2vec: object that maps words to vectors and exposes
            ``vector_size``. Either the mapping itself (supports ``in`` and
            ``[]``) or a model that offers such a mapping as ``.wv``.
    """

    def __init__(self, word2vec):
        self.word2vec = word2vec
        self.dim = word2vec.vector_size

    def fit(self, X, y=None):
        """Nothing to learn; present so the class can be used as a pipeline step.

        ``y`` is optional so that the vectorizer can also be fitted on its
        own, without labels.
        """
        return self

    def transform(self, X):
        """Return an array with one averaged word vector per document in ``X``.

        Words without a vector are ignored. A document without any known
        word is mapped to the zero vector of length ``self.dim``.
        """
        # Newer gensim releases only allow `word in ...` / `...[word]` on the
        # keyed vectors stored in `model.wv`, not on the model itself, so use
        # `.wv` when it exists and fall back to the object as given.
        vectors = getattr(self.word2vec, "wv", self.word2vec)
        return np.array([
            # `or` substitutes a single zero vector when no word is known,
            # because np.mean of an empty list would yield nan
            np.mean([vectors[w] for w in words if w in vectors]
                    or [np.zeros(self.dim)], axis=0)
            for words in X
        ])

The ```TfidfEmbeddingVectorizer``` uses the same averaging strategy as the ```MeanEmbeddingVectorizer```, however it also weights every word vector $ \overrightarrow { {w}_{d,i} }$ with the inverse document frequency (IDF) of its word to put more weight on words appearing in fewer documents. Because a word contributes once for every occurrence in the document, the sum as a whole is weighted by term frequency-inverse document frequency (TF-IDF). Words that were not seen while fitting the weights receive the largest IDF of all known words.

$$ \overrightarrow { d } =\frac { \sum _{ i=1 }^{ dim(d) }{ \overrightarrow { { w }_{ d,i } } *idf({ w }_{ d,i }) }  }{ dim(d) }  $$

In [5]:
class TfidfEmbeddingVectorizer(object):
    """Represent each tokenized document by the mean of its IDF-weighted word vectors.

    Args:
        word2vec: object that maps words to vectors and exposes
            ``vector_size``. Either the mapping itself (supports ``in`` and
            ``[]``) or a model that offers such a mapping as ``.wv``.
    """

    def __init__(self, word2vec):
        self.word2vec = word2vec
        # word -> weight mapping, filled by fit()
        self.word2weight = None
        self.dim = word2vec.vector_size

    def fit(self, X, y=None):
        """Learn one weight per word from the tokenized documents in ``X``.

        The weight of a word is the ``idf_`` value a ``TfidfVectorizer``
        computes for it. ``y`` is ignored and optional.
        """
        # the documents are already token lists, so the analyzer passes them through
        tfidf = TfidfVectorizer(analyzer=lambda x: x)
        tfidf.fit(X)
        # if a word was never seen - it must be at least as infrequent
        # as any of the known words - so the default idf is the max of
        # known idf's
        max_idf = max(tfidf.idf_)
        self.word2weight = defaultdict(
            lambda: max_idf,
            [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])

        return self

    def transform(self, X):
        """Return an array with one weighted, averaged word vector per document.

        Words without a vector are ignored. A document without any known
        word is mapped to the zero vector of length ``self.dim``.
        """
        # Newer gensim releases only allow `word in ...` / `...[word]` on the
        # keyed vectors stored in `model.wv`, not on the model itself, so use
        # `.wv` when it exists and fall back to the object as given.
        vectors = getattr(self.word2vec, "wv", self.word2vec)
        return np.array([
                # `or` substitutes a single zero vector when no word is known,
                # because np.mean of an empty list would yield nan
                np.mean([vectors[w] * self.word2weight[w]
                         for w in words if w in vectors] or
                        [np.zeros(self.dim)], axis=0)
                for words in X
            ])

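A support vector classifier with a linear kernel (```SVC(kernel="linear")```) is used for classification of the document vectors. Note that the pipeline variables (```rf_...```) and the step label "random forest" do not match the classifier that is actually used.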

In [6]:
# NOTE: despite the `rf_` prefix and the "random forest" step label, both
# pipelines classify with a linear-kernel SVC. They differ only in the
# vectorizer: plain mean (rf_w2v) vs. weighted mean (rf_w2v_tfidf).
rf_w2v = Pipeline([("word2vec vectorizer", MeanEmbeddingVectorizer(base_model)), 
                        ("random forest", SVC(kernel="linear"))])
rf_w2v_tfidf = Pipeline([("word2vec vectorizer", TfidfEmbeddingVectorizer(base_model)), 
                        ("random forest", SVC(kernel="linear"))])

## Cross validation Score

In this section, the ```cross_val_score``` function of scikit-learn is used to validate the model. 

However, to be able to equally compare the different classification strategies, a fixed training and validation set is used in the next section.

In [7]:
# all articles of every category; used for the cross validation only
X, y = load_sets(fulldata_paths)



loaded 29648 articles


In [8]:
# mean of the two fold scores (cv=2) for the pipeline with plain mean vectors
score_rf = cross_val_score(rf_w2v, X, y, cv=2).mean()

In [9]:
# mean of the two fold scores (cv=2) for the pipeline with weighted mean vectors
score_rf_tfidf = cross_val_score(rf_w2v_tfidf, X, y, cv=2).mean()

In [10]:
# side-by-side comparison of the two vectorizers
print('Score simple: {}'.format(score_rf))
print('Score TFIDF:  {}'.format(score_rf_tfidf))

Score simple: 0.684127562906
Score TFIDF:  0.631745857118


## Training

In [11]:
# Only the tfidf-weighted pipeline is used for the fixed-split evaluation below.
# NOTE(review): this choice was originally justified with "performs slightly
# better in the cross validation", but the recorded cross validation output
# shows the opposite (TFIDF 0.632 vs. simple 0.684) -- confirm which variant
# should be evaluated here.
# According to the original note, the tfidf variant also needs twice the time to compute.

# create a new instance to make sure the model isn't pre trained from the previous step
# (the "extra trees" step label is a misnomer: the classifier is a linear-kernel SVC)
test_model =  Pipeline([("word2vec vectorizer", TfidfEmbeddingVectorizer(base_model)), 
                        ("extra trees", SVC(kernel="linear"))])

load the training data

In [12]:
# training portion of the fixed split: token lists and their category labels
train_X, train_y = load_sets(train_paths)



loaded 26689 articles


train the model

In [13]:
# fit returns self; assign it to a dummy variable to stop jupyter from printing the model
_ = test_model.fit(train_X, train_y)

In [14]:
# validation portion of the fixed split; these files are separate from the training files
validate_X, validate_y = load_sets(validation_paths)
# predicted category name for every validation article
predicted_y = test_model.predict(validate_X)



loaded 2959 articles


## Validation

This section performs the same validation steps that were used when validating the log-likelihood score approach for article classification, so the steps aren't as well documented. See the other document for a complete explanation.

In [15]:
# Count matrix of predictions vs. true labels.
# Orientation: ROW = predicted category, COLUMN = true (target) category,
# both in the order of `category_names`.
classification_matrix = np.zeros([num_models, num_models], dtype=int)

for target, predicted in zip(validate_y, predicted_y):
    target_index = category_names.index(target)
    predicted_index = category_names.index(predicted)
    classification_matrix[predicted_index, target_index] += 1
    
# label rows and columns with the category names for display
result = DataFrame(classification_matrix, category_names, category_names)
print(result)  

             Sonstiges  Lifestyle  Wirtschaft  Finanzen  Lokal  Politik  \
Sonstiges          263         72          13         4     21       43   
Lifestyle           29         68          14         5      4        9   
Wirtschaft          19         14         348        50      4       36   
Finanzen             1          2          20        91      0        6   
Lokal                8          0           5         1     79       15   
Politik             85         20          69         6     40      770   
Sport                4          1           3         1      0        1   
Technologie          8          8          25         0      3        4   
Kultur              30         18           5         1      0        6   

             Sport  Technologie  Kultur  
Sonstiges        5           18      38  
Lifestyle        2            6      12  
Wirtschaft       3           18       4  
Finanzen         0            2       0  
Lokal            0            3       0

## Accuracy

In [16]:
# Normalize every row of the count matrix by its row sum.
# Because a row collects everything that was *predicted* as one category,
# the diagonal holds the share of correct predictions per predicted category
# (i.e. per-class precision), and each row sums to 1 (or to 0 if empty).
# the max(, 1) function surrounding sum makes sure we don't divide by 0 if no match occurred
accuracy_matrix = [category / float(max([sum(category) ,1])) for category in classification_matrix]

result = DataFrame(accuracy_matrix, category_names, category_names)
print(result)  

             Sonstiges  Lifestyle  Wirtschaft  Finanzen     Lokal   Politik  \
Sonstiges     0.551363   0.150943    0.027254  0.008386  0.044025  0.090147   
Lifestyle     0.194631   0.456376    0.093960  0.033557  0.026846  0.060403   
Wirtschaft    0.038306   0.028226    0.701613  0.100806  0.008065  0.072581   
Finanzen      0.008197   0.016393    0.163934  0.745902  0.000000  0.049180   
Lokal         0.072072   0.000000    0.045045  0.009009  0.711712  0.135135   
Politik       0.084493   0.019881    0.068588  0.005964  0.039761  0.765408   
Sport         0.015385   0.003846    0.011538  0.003846  0.000000  0.003846   
Technologie   0.037915   0.037915    0.118483  0.000000  0.014218  0.018957   
Kultur        0.236220   0.141732    0.039370  0.007874  0.000000  0.047244   

                Sport  Technologie    Kultur  
Sonstiges    0.010482     0.037736  0.079665  
Lifestyle    0.013423     0.040268  0.080537  
Wirtschaft   0.006048     0.036290  0.008065  
Finanzen     0.000000

In [17]:
# Overall accuracy: correctly classified articles (the diagonal of the
# count matrix) divided by all classified articles.
# true_positives starts as a float so the division below is a float division.
true_positives = 0.0
num_samples = 0
for x in range(num_models):
    true_positives += classification_matrix[x][x]
    num_samples += sum(classification_matrix[x])
    
average_score = true_positives / num_samples

print('score: {}'.format(average_score))

score: 0.701926326462


In [18]:
# make sure we calculate the "same" accuracy as scikit-learn's accuracy_score
# just to prevent dumb mistakes...
# normalize=True returns the fraction of correct predictions instead of their count
score = accuracy_score(validate_y, predicted_y, normalize=True)
print('score: {}'.format(score))

score: 0.701926326462


In [19]:
# phew