# Categorization using averaged word vectors as document feature

In [1]:
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
from pandas import DataFrame
from sklearn.metrics import accuracy_score
from sklearn.cross_validation import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import defaultdict
from sklearn.pipeline import Pipeline
import numpy as np
from nltk.corpus import stopwords as sw

# German stop word list from NLTK; load_sets() drops these tokens from every article
stopwords = sw.words('german')



get a list of:
* the full corpora split into categories -> ```fulldata_paths```
* a subset of each category corpus used for training -> ```train_paths```
* a subset of each category corpus used for validation -> ```validation_paths```

the categories were split into training / validation by using the ```mail/generateSets.py``` script with a 70 / 30 split between training and validation

In [2]:
# the article categories; every category has its own corpus files
category_names = [
    'Sonstiges', 'Aktuell', 'Lifestyle', 'Wirtschaft', 'Finanzen', 'Ausland',
    'Lokal', 'Politik', 'Sport', 'Technologie', 'Kultur',
]

num_models = len(category_names)


def _paths_for(template):
    """Return a (category name, file path) pair for every category."""
    return [(name, template.format(name)) for name in category_names]


# the list of full corpora
fulldata_paths = _paths_for("corpus/corpus{}.txt")

# the corpora with a fixed split for training and validation
train_paths = _paths_for("data/corpus{}.training.txt")
validation_paths = _paths_for("data/corpus{}.validation.txt")

# word2vec model loaded from disk; it is handed to the embedding vectorizers
base_model = Word2Vec.load('../wiki/data/wiki.de.200dim.word2vec.model')

In [3]:
def load_sets(paths, stop_words=None):
    """Load tokenized documents and their category labels.

    Every line of every file is treated as one document: it is split on
    whitespace and all stop words are removed. Lines that end up without
    tokens are kept as empty documents.

    Args:
        paths: iterable of ``(category name, file path)`` pairs.
        stop_words: optional collection of tokens to drop. Defaults to the
            module-level ``stopwords`` list.

    Returns:
        A tuple ``(X, y)`` where ``X`` is a list of token lists (one per
        line) and ``y`` is the list of category names aligned with ``X``.
    """
    if stop_words is None:
        stop_words = stopwords
    # a set gives constant-time membership tests; a list would be scanned
    # linearly for every single token
    excluded = frozenset(stop_words)

    X, y = [], []
    for name, path in paths:
        with open(path) as cur_file:
            for line in cur_file:
                X.append([token for token in line.split() if token not in excluded])
                y.append(name)
    return X, y

## word2vec Vectorizers
These vectorizers are used to transform a set of vectors to a single vector. They are used to transform a list of word embeddings to a single vector that represents the whole article.

Both variations simply build the average of all word-vectors. The TFIDF variation however uses the word frequency and inverse-document frequency to weight the word vectors.

The implementation is mostly adapted from [Text Classification With Word2Vec](http://nadbordrozd.github.io/blog/2016/05/20/text-classification-with-word2vec/) by Nadbor Drozd

The ```MeanEmbeddingVectorizer``` generates a document vector $  \overrightarrow { d } $ from a list of word vectors by calculating

$$ \overrightarrow { d } =\frac { \sum _{ i=1 }^{ dim(d) }{ \overrightarrow { { w }_{ d,i } }  }  }{ dim(d) } $$

where: 
* $ \overrightarrow { {w}_{d,i} } $ is the word vector of the $ i $-th word of document $ d $
* $ dim(d) $ is the number of words of document $ d $ that have a word vector

In [4]:
class MeanEmbeddingVectorizer(object):
    """Represent a tokenized document by the mean of its word vectors.

    Args:
        word2vec: word-to-vector lookup that supports ``word in word2vec``,
            ``word2vec[word]`` and exposes a ``vector_size`` attribute.
    """

    def __init__(self, word2vec):
        self.word2vec = word2vec
        self.dim = word2vec.vector_size

    def fit(self, X, y=None):
        """Do nothing and return ``self``; there is nothing to learn.

        ``y`` is optional so the vectorizer can also be fitted without
        labels, e.g. via ``fit(X)``.
        """
        return self

    def transform(self, X):
        """Return one averaged word vector per document.

        Args:
            X: iterable of documents, each an iterable of tokens.

        Returns:
            Array with one row per document. Tokens that are missing from
            ``word2vec`` are skipped; a document without any known token
            becomes the zero vector of length ``dim``.
        """
        return np.array([self._document_vector(words) for words in X])

    def _document_vector(self, words):
        """Average the vectors of all known tokens in ``words``."""
        vectors = [self.word2vec[w] for w in words if w in self.word2vec]
        if not vectors:
            # no known word at all: fall back to the zero vector
            return np.zeros(self.dim)
        return np.mean(vectors, axis=0)

The ```TfidfEmbeddingVectorizer``` uses the same averaging strategy as the ```MeanEmbeddingVectorizer```, however it also weights every word vector $ \overrightarrow { {w}_{d,i} }$ with the inverse document frequency (IDF) of the word to put more weight on words appearing in fewer documents. The term frequency enters only implicitly: a word that occurs several times in a document is also summed several times.

$$ \overrightarrow { d } =\frac { \sum _{ i=1 }^{ dim(d) }{ \overrightarrow { { w }_{ d,i } } *idf({ w }_{ d,i }) }  }{ dim(d) }  $$

In [5]:
class TfidfEmbeddingVectorizer(object):
    """Represent a tokenized document by an IDF-weighted mean of word vectors.

    Every word vector is multiplied by the inverse document frequency of
    its word (learned in ``fit``) before the vectors are averaged. A word
    that occurs several times in a document contributes once per occurrence.

    Args:
        word2vec: word-to-vector lookup that supports ``word in word2vec``,
            ``word2vec[word]`` and exposes a ``vector_size`` attribute.
    """

    def __init__(self, word2vec):
        self.word2vec = word2vec
        self.word2weight = None
        # weight used for words that were not seen during fit
        self.default_weight = None
        self.dim = word2vec.vector_size

    def fit(self, X, y=None):
        """Learn the IDF weight of every token in ``X`` and return ``self``.

        Args:
            X: iterable of documents, each a list of tokens.
            y: ignored; optional so the vectorizer can be fitted without labels.
        """
        # the documents are already tokenized, so the analyzer passes them through
        tfidf = TfidfVectorizer(analyzer=lambda x: x)
        tfidf.fit(X)
        # if a word was never seen - it must be at least as infrequent
        # as any of the known words - so the default idf is the max of
        # known idf's
        max_idf = max(tfidf.idf_)
        self.default_weight = max_idf
        self.word2weight = defaultdict(
            lambda: max_idf,
            [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])

        return self

    def transform(self, X):
        """Return one IDF-weighted averaged word vector per document.

        Args:
            X: iterable of documents, each an iterable of tokens.

        Returns:
            Array with one row per document. Tokens that are missing from
            ``word2vec`` are skipped; a document without any known token
            becomes the zero vector of length ``dim``.

        Raises:
            ValueError: if ``fit`` has not been called yet.
        """
        if self.word2weight is None:
            raise ValueError(
                "TfidfEmbeddingVectorizer is not fitted yet; "
                "call fit() before transform()")
        return np.array([self._document_vector(words) for words in X])

    def _weight(self, word):
        """Look up the weight of ``word`` without modifying the fitted weights."""
        weights = self.word2weight
        if word in weights or self.default_weight is None:
            return weights[word]
        # indexing a defaultdict would insert the unseen word as a side
        # effect, so the default is returned directly instead
        return self.default_weight

    def _document_vector(self, words):
        """Average the weighted vectors of all known tokens in ``words``."""
        vectors = [self.word2vec[w] * self._weight(w)
                   for w in words if w in self.word2vec]
        if not vectors:
            # no known word at all: fall back to the zero vector
            return np.zeros(self.dim)
        return np.mean(vectors, axis=0)

a simple random forest classifier is used for classification of the document vectors

In [6]:
# Both pipelines use the same classifier settings and differ only in the
# document vectorizer. The fixed random_state makes the forests - and with
# them the reported scores - reproducible across runs.
rf_w2v = Pipeline([
    ("word2vec vectorizer", MeanEmbeddingVectorizer(base_model)),
    ("random forest", RandomForestClassifier(n_estimators=200, random_state=42)),
])
rf_w2v_tfidf = Pipeline([
    ("word2vec vectorizer", TfidfEmbeddingVectorizer(base_model)),
    ("random forest", RandomForestClassifier(n_estimators=200, random_state=42)),
])

## Cross validation Score

In this section, the `cross_val_score` function of scikit-learn is used to validate the model.

However, to be able to equally compare the different classification strategies, a fixed training and validation set is used in the next section.

In [7]:
# all articles of the full (unsplit) category corpora together with their category labels
X, y = load_sets(fulldata_paths)



In [8]:
# mean score over a 2-fold cross validation of the plain averaging pipeline
score_rf = cross_val_score(rf_w2v, X, y, cv=2).mean()

In [9]:
# mean score over a 2-fold cross validation of the weighted averaging pipeline
score_rf_tfidf = cross_val_score(rf_w2v_tfidf, X, y, cv=2).mean()

In [10]:
# report both cross validation scores side by side
for label, value in (('Score simple: {}', score_rf),
                     ('Score TFIDF:  {}', score_rf_tfidf)):
    print(label.format(value))

Score simple: 0.59461255141
Score TFIDF:  0.595011215514


## Training

In [11]:
# Only the weighted (TFIDF) model is evaluated further since it performs slightly
# better in the cross validation; however, it also needs twice the time to compute.

# Build a fresh pipeline so that nothing fitted in the previous step is reused.
# The classifier step is a random forest and is labelled as such; the fixed
# random_state makes the training run reproducible.
test_model = Pipeline([
    ("word2vec vectorizer", TfidfEmbeddingVectorizer(base_model)),
    ("random forest", RandomForestClassifier(n_estimators=200, random_state=42)),
])

load the training data

In [12]:
# tokenized training articles and their category labels
train_X, train_y = load_sets(train_paths)



train the model

In [13]:
# fit() returns the model itself; assigning it to a dummy variable keeps
# Jupyter from printing the model as the cell output
_ = test_model.fit(train_X, train_y)

In [14]:
# load the held-out validation articles and predict a category for each of them
validate_X, validate_y = load_sets(validation_paths)
predicted_y = test_model.predict(validate_X)



## Validation

This section performs the same validation steps that were used when validating the log-likelihood score approach for article classification, so the steps aren't as well documented. See the other document for a complete explanation.

In [15]:
# count matrix: rows are the predicted category, columns the actual category
classification_matrix = np.zeros((num_models, num_models), dtype=int)

# position of every category name inside the matrix
category_index = {name: position for position, name in enumerate(category_names)}

for target, predicted in zip(validate_y, predicted_y):
    classification_matrix[category_index[predicted], category_index[target]] += 1

result = DataFrame(classification_matrix, category_names, category_names)
print(result)

             Sonstiges  Aktuell  Lifestyle  Wirtschaft  Finanzen  Ausland  \
Sonstiges          509        3        131          30        10       28   
Aktuell              0        0          0           0         0        0   
Lifestyle           14        0         87           9         0        2   
Wirtschaft          39        5         47         615       133       27   
Finanzen             0        0          5           6       162        1   
Ausland              4        0          1           2         2       71   
Lokal                4        0          0           0         1        0   
Politik           1027       28        387        1183       390      900   
Sport               13        0          1           1         0        2   
Technologie         24        4          7          39         4        1   
Kultur               8        0         12           1         0        2   

             Lokal  Politik  Sport  Technologie  Kultur  
Sonstiges       2

## Accuracy

In [16]:
# normalize every row (one predicted category) by its number of predictions;
# clamping the row total to at least 1 makes sure we don't divide by 0 if a
# category was never predicted
accuracy_matrix = []
for predicted_counts in classification_matrix:
    row_total = max(predicted_counts.sum(), 1)
    accuracy_matrix.append(predicted_counts / float(row_total))

result = DataFrame(accuracy_matrix, category_names, category_names)
print(result)

             Sonstiges   Aktuell  Lifestyle  Wirtschaft  Finanzen   Ausland  \
Sonstiges     0.566185  0.003337   0.145717    0.033370  0.011123  0.031146   
Aktuell       0.000000  0.000000   0.000000    0.000000  0.000000  0.000000   
Lifestyle     0.095238  0.000000   0.591837    0.061224  0.000000  0.013605   
Wirtschaft    0.037827  0.004850   0.045587    0.596508  0.129001  0.026188   
Finanzen      0.000000  0.000000   0.028409    0.034091  0.920455  0.005682   
Ausland       0.030769  0.000000   0.007692    0.015385  0.015385  0.546154   
Lokal         0.028777  0.000000   0.000000    0.000000  0.007194  0.000000   
Politik       0.118879  0.003241   0.044797    0.136937  0.045144  0.104179   
Sport         0.028953  0.000000   0.002227    0.002227  0.000000  0.004454   
Technologie   0.071006  0.011834   0.020710    0.115385  0.011834  0.002959   
Kultur        0.085106  0.000000   0.127660    0.010638  0.000000  0.021277   

                Lokal   Politik     Sport  Technolo

In [17]:
# correct predictions lie on the diagonal of the classification matrix
true_positives = np.trace(classification_matrix, dtype=float)
# every validation sample was counted exactly once in the matrix
num_samples = classification_matrix.sum()

average_score = true_positives / num_samples

print('score: {}'.format(average_score))

score: 0.464457731274


In [18]:
# make sure we calculate the "same" accuracy as scikit-learn's accuracy_score
# just to prevent dumb mistakes...
score = accuracy_score(validate_y, predicted_y, normalize=True)
print('score: {}'.format(score))

score: 0.464457731274


In [46]:
# phew