# Categorization using averaged word vectors as document feature

In [1]:
from collections import defaultdict

import numpy as np
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
from nltk.corpus import stopwords as sw
from pandas import DataFrame
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

# ``sklearn.cross_validation`` was deprecated in scikit-learn 0.18 and removed
# in 0.20. Prefer its replacement and only fall back to the old module on
# installations that predate ``sklearn.model_selection``.
try:
    from sklearn.model_selection import cross_val_score
except ImportError:
    from sklearn.cross_validation import cross_val_score

# German stop-word list from the NLTK corpus; load_sets() drops these tokens
# from every article.
stopwords = sw.words('german')



get a list of:
* the full corpora split into categories -> ```fulldata_paths```
* a subset of each category corpus used for training -> ```train_paths```
* a subset of each category corpus used for validation -> ```validation_paths```

the categories were split into training / validation by using the ```mail/generateSets.py``` script with a 70 / 30 split between training and validation

In [2]:
# Article categories. The order matters: it is reused as the row/column order
# whenever results are tabulated per category.
category_names = [
    'Sonstiges', 'Aktuell', 'Lifestyle', 'Wirtschaft', 'Finanzen', 'Ausland',
    'Lokal', 'Politik', 'Sport', 'Technologie', 'Kultur',
]

num_models = len(category_names)


def _category_paths(template):
    """Pair every category name with the corpus path built from ``template``."""
    return [(name, template.format(name)) for name in category_names]


# one file per category holding the complete corpus
fulldata_paths = _category_paths("corpus/corpus{}.txt")

# fixed per-category split into a training and a validation file
train_paths = _category_paths("data/corpus{}.training.txt")
validation_paths = _category_paths("data/corpus{}.validation.txt")

# pre-trained word2vec model that provides the word vectors
base_model = Word2Vec.load('../wiki/data/wiki.de.word2vec.model')

In [4]:
def load_sets(paths, stop_words=None):
    """Read tokenised documents and their category labels from corpus files.

    Every line of every file is treated as one document: it is split on
    whitespace and all stop words are removed. Blank lines are kept as empty
    token lists, so the result has exactly one entry per input line.

    Parameters
    ----------
    paths : iterable of (name, path) tuples
        ``name`` is the category label assigned to every line of the file
        found at ``path``.
    stop_words : iterable of str, optional
        Tokens to drop. Defaults to the module-level ``stopwords`` list.

    Returns
    -------
    (X, y) : tuple of two lists of equal length
        ``X[i]`` is the token list of document ``i`` and ``y[i]`` its label.
    """
    if stop_words is None:
        stop_words = stopwords
    # Build the lookup once: set membership is O(1), whereas testing against a
    # list would rescan the whole stop-word list for every single token.
    ignored = frozenset(stop_words)

    X, y = [], []
    for name, path in paths:
        with open(path) as cur_file:
            for line in cur_file:
                X.append([token for token in line.split()
                          if token not in ignored])
                y.append(name)
    return X, y

## word2vec Vectorizers
These vectorizers are used to transform a set of vectors to a single vector. They are used to transform a list of word embeddings to a single vector that represents the whole article.

Both variations simply build the average of all word-vectors. The TFIDF variation however uses the word frequency and inverse-document frequency to weight the word vectors.

The implementation is mostly adapted from [Text Classification With Word2Vec](http://nadbordrozd.github.io/blog/2016/05/20/text-classification-with-word2vec/) by Nadbor Drozd.

The ```MeanEmbeddingVectorizer``` generates a document vector $  \overrightarrow { d } $ from a list of word vectors by calculating

$$ \overrightarrow { d } =\frac { \sum _{ i=1 }^{ dim(d) }{ \overrightarrow { { w }_{ d,i } }  }  }{ dim(d) } $$

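where:
* $ \overrightarrow { {w}_{d,i} } $ is the word vector of the $ i $-th word of document $ d $
* $ dim(d) $ is the number of words of document $ d $ for which the word2vec model provides a vector (words unknown to the model are skipped)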

In [5]:
class MeanEmbeddingVectorizer(object):
    """Represent each tokenised document by the plain mean of its word vectors.

    Parameters
    ----------
    word2vec : object
        Source of the word vectors. It must expose ``vector_size`` and support
        ``word in obj`` / ``obj[word]`` either directly or through a ``wv``
        attribute.
    """

    def __init__(self, word2vec):
        self.word2vec = word2vec
        self.dim = word2vec.vector_size

    def fit(self, X, y=None):
        """Nothing is learned; provided so the class works as a pipeline step.

        ``y`` is optional (and ignored) so that ``fit(X)`` works as well.
        """
        return self

    def transform(self, X):
        """Return one averaged vector per document in ``X``.

        Words without a vector are skipped. A document without any known word
        (including an empty one) becomes the zero vector. An empty ``X``
        yields an array of shape ``(0, dim)``.
        """
        # Newer gensim releases expose the lookup table as ``model.wv`` and no
        # longer allow indexing the model itself; use ``wv`` when it exists
        # and fall back to the object itself otherwise.
        vectors = getattr(self.word2vec, 'wv', self.word2vec)
        rows = [
            np.mean([vectors[w] for w in words if w in vectors]
                    or [np.zeros(self.dim)], axis=0)
            for words in X
        ]
        if not rows:
            # np.array([]) would be 1-D; keep the (n_documents, dim) layout.
            return np.zeros((0, self.dim))
        return np.array(rows)

The ```TfidfEmbeddingVectorizer``` uses the same averaging strategy as the ```MeanEmbeddingVectorizer```, however it also weights every word vector $ \overrightarrow { {w}_{d,i} }$ with the term frequency-inverse document frequency (TF-IDF) of the word to put more weight on words appearing in fewer documents.

$$ \overrightarrow { d } =\frac { \sum _{ i=1 }^{ dim(d) }{ \overrightarrow { { w }_{ d,i } } *tfidf(\overrightarrow { { w }_{ d,i } } ) }  }{ dim(d) }  $$

In [6]:
class TfidfEmbeddingVectorizer(object):
    """Average the word vectors of a document, each scaled by the word's IDF.

    Parameters
    ----------
    word2vec : object
        Source of the word vectors. It must expose ``vector_size`` and support
        ``word in obj`` / ``obj[word]`` either directly or through a ``wv``
        attribute.

    Attributes
    ----------
    word2weight : mapping or None
        Word -> IDF weight, set by ``fit``; ``None`` before fitting.
    """

    def __init__(self, word2vec):
        self.word2vec = word2vec
        self.word2weight = None
        self.dim = word2vec.vector_size

    def fit(self, X, y=None):
        """Learn the IDF weight of every word occurring in the documents ``X``.

        ``y`` is optional (and ignored) so that ``fit(X)`` works as well.
        """
        # The documents are already token lists, so the analyzer is identity.
        tfidf = TfidfVectorizer(analyzer=lambda x: x)
        tfidf.fit(X)
        # if a word was never seen - it must be at least as infrequent
        # as any of the known words - so the default idf is the max of
        # known idf's
        max_idf = max(tfidf.idf_)
        self.word2weight = defaultdict(
            lambda: max_idf,
            [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])

        return self

    def _weight(self, word):
        """Look up the weight of ``word`` without modifying ``word2weight``."""
        weights = self.word2weight
        if word in weights:
            return weights[word]
        factory = getattr(weights, 'default_factory', None)
        if factory is None:
            # Plain mapping without a default: keep the normal lookup error.
            return weights[word]
        # Indexing a defaultdict would insert the missing word and thereby
        # grow the fitted state on every transform(); ask for the default
        # value directly instead.
        return factory()

    def transform(self, X):
        """Return one IDF-weighted averaged vector per document in ``X``.

        Words without a vector are skipped. A document without any known word
        (including an empty one) becomes the zero vector. An empty ``X``
        yields an array of shape ``(0, dim)``.
        """
        # Newer gensim releases expose the lookup table as ``model.wv`` and no
        # longer allow indexing the model itself; use ``wv`` when it exists
        # and fall back to the object itself otherwise.
        vectors = getattr(self.word2vec, 'wv', self.word2vec)
        rows = [
            np.mean([vectors[w] * self._weight(w)
                     for w in words if w in vectors] or
                    [np.zeros(self.dim)], axis=0)
            for words in X
        ]
        if not rows:
            # np.array([]) would be 1-D; keep the (n_documents, dim) layout.
            return np.zeros((0, self.dim))
        return np.array(rows)

a simple random forest classifier is used for classification of the document vectors

In [7]:
# Two pipelines that differ only in how the document vector is built (plain
# mean vs. IDF-weighted mean); the classifier settings are identical.
# NOTE: the classifier step is labelled "extra trees" although the estimator
# is a RandomForestClassifier; the label is kept so step names stay stable.
# A fixed random_state makes the forests, and therefore the scores,
# reproducible from run to run.
etree_w2v = Pipeline([
    ("word2vec vectorizer", MeanEmbeddingVectorizer(base_model)),
    ("extra trees", RandomForestClassifier(n_estimators=200, random_state=42)),
])
etree_w2v_tfidf = Pipeline([
    ("word2vec vectorizer", TfidfEmbeddingVectorizer(base_model)),
    ("extra trees", RandomForestClassifier(n_estimators=200, random_state=42)),
])

## Cross validation Score

In this section, the `cross_val_score` function of scikit-learn is used to validate the model.

However, to be able to equally compare the different classification strategies, a fixed training and validation set is used in the next section.

In [8]:
# Load every article of every category from the full corpora as token lists
# (X) together with their category labels (y).
X, y = load_sets(fulldata_paths)



In [12]:
# 2-fold cross-validation of the plain mean-embedding pipeline; keep the mean
# of the per-fold scores.
score_etree = cross_val_score(etree_w2v, X, y, cv=2).mean()

In [13]:
# Same 2-fold cross-validation for the TF-IDF weighted pipeline.
score_etree_tfidf = cross_val_score(etree_w2v_tfidf, X, y, cv=2).mean()

In [14]:
# Report both cross-validation scores, one per line.
score_lines = [
    'Score simple: {}'.format(score_etree),
    'Score TFIDF:  {}'.format(score_etree_tfidf),
]
print('\n'.join(score_lines))

Score simple: 0.58850162925
Score TFIDF:  0.586455167602


## Training

In [15]:
# Only the TF-IDF weighted pipeline is evaluated on the fixed split.
# NOTE: in the cross-validation output recorded in this notebook both variants
# score practically the same (simple 0.5885 vs. TF-IDF 0.5865), i.e. TF-IDF is
# not measurably better there, and it needs about twice the time to compute.

# create a new instance to make sure the model isn't pre-trained from the
# previous step; the fixed random_state makes the forest reproducible
test_model = Pipeline([
    ("word2vec vectorizer", TfidfEmbeddingVectorizer(base_model)),
    ("extra trees", RandomForestClassifier(n_estimators=200, random_state=42)),
])

load the training data

In [16]:
# Token lists and labels of the fixed training split.
train_X, train_y = load_sets(train_paths)



train the model

In [17]:
# fit() returns the pipeline itself; binding it to a throwaway name keeps
# Jupyter from printing the model as the cell output.
_ = test_model.fit(train_X, train_y)

In [18]:
# Load the fixed validation split and predict a category for each article.
validate_X, validate_y = load_sets(validation_paths)
predictions = test_model.predict(validate_X)



## Validation

This section performs the same validation steps that were used when validating the log-likelihood score approach for article classification, so the steps aren't as well documented here. See the other document for a complete explanation.

In [19]:
# Count matrix: one row per *predicted* category, one column per *actual*
# category, both ordered like category_names.
classification_matrix = np.zeros((num_models, num_models), dtype=int)

for actual, guess in zip(validate_y, predictions):
    column = category_names.index(actual)
    row = category_names.index(guess)
    classification_matrix[row, column] += 1

result = DataFrame(data=classification_matrix,
                   index=category_names,
                   columns=category_names)
print(result)

             Sonstiges  Aktuell  Lifestyle  Wirtschaft  Finanzen  Ausland  \
Sonstiges          409        7        110          35        12       35   
Aktuell              0        0          0           0         0        0   
Lifestyle           10        0         66           4         1        1   
Wirtschaft          34        4         37         522       113       20   
Finanzen             1        0          0           7       125        0   
Ausland              6        0          1           4         2       40   
Lokal                1        0          0           0         0        9   
Politik            824       23        320         949       334      752   
Sport                8        2          1           1         1        2   
Technologie         11        0          4          22         4        1   
Kultur               4        0          7           0         0        0   

             Lokal  Politik  Sport  Technologie  Kultur  
Sonstiges       2

## Accuracy

In [20]:
def _row_shares(row):
    """Divide a row of counts by its total.

    The total is clamped to at least 1 so that a row without any counts does
    not cause a division by zero.
    """
    return row / float(max([sum(row), 1]))


# Normalise every row of the count matrix by its own total.
accuracy_matrix = [_row_shares(row) for row in classification_matrix]

result = DataFrame(data=accuracy_matrix,
                   index=category_names,
                   columns=category_names)
print(result)

             Sonstiges   Aktuell  Lifestyle  Wirtschaft  Finanzen   Ausland  \
Sonstiges     0.503075  0.008610   0.135301    0.043050  0.014760  0.043050   
Aktuell       0.000000  0.000000   0.000000    0.000000  0.000000  0.000000   
Lifestyle     0.097087  0.000000   0.640777    0.038835  0.009709  0.009709   
Wirtschaft    0.040189  0.004728   0.043735    0.617021  0.133570  0.023641   
Finanzen      0.007407  0.000000   0.000000    0.051852  0.925926  0.000000   
Ausland       0.072289  0.000000   0.012048    0.048193  0.024096  0.481928   
Lokal         0.009346  0.000000   0.000000    0.000000  0.000000  0.084112   
Politik       0.118732  0.003314   0.046110    0.136744  0.048127  0.108357   
Sport         0.022792  0.005698   0.002849    0.002849  0.002849  0.005698   
Technologie   0.045455  0.000000   0.016529    0.090909  0.016529  0.004132   
Kultur        0.095238  0.000000   0.166667    0.000000  0.000000  0.000000   

                Lokal   Politik     Sport  Technolo

In [21]:
# Collect the diagonal of the normalised matrix (entry [k][k] of every
# category k) and report its unweighted average.
scores_diagonal = []
for idx in range(num_models):
    scores_diagonal.append(accuracy_matrix[idx][idx])

average_score = sum(scores_diagonal) / len(scores_diagonal)
print('score: {}'.format(average_score))

score: 0.618435112815
