In [11]:
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
from pandas import DataFrame
import numpy as np

construct the file names for the learned models and the validation sets

In [12]:
# category names; each one has its own trained model and its own validation file
model_names = [
    'Sonstiges', 'Aktuell', 'Lifestyle', 'Wirtschaft',
    'Finanzen', 'Ausland', 'Lokal', 'Politik',
    'Sport', 'Technologie', 'Kultur',
]
num_models = len(model_names)

# apply each path template's bound ``format`` method to every category name
model_paths = list(map("data/corpus{}+base.word2vec.model".format, model_names))
validation_paths = list(map("data/corpus{}.validation.txt".format, model_names))

load the validation sets

In [13]:
# one LineSentence per validation file, in the same order as validation_paths
validators = list(map(LineSentence, validation_paths))

# Validation

validate each validation set with all models and calculate the score (log likelihood) of each sentence for each model.

scores is an array in the form:

    [for each model:
        [for each validation set:
            [score of each sentence]
        ]
    ]

In [14]:
def get_scores(model_path):
    """Score every validation set with the model saved at ``model_path``.

    Loads the Word2Vec model from ``model_path`` and calls ``model.score``
    once for each entry of the module-level ``validators`` list.

    Returns a list holding one ``model.score`` result per validation set,
    in the order of ``validators``.
    """
    print('loading model {}'.format(model_path))
    model = Word2Vec.load(model_path)

    print('validating model...')
    validation_scores = []
    for validator in validators:
        # score (log likelihood) of the sentences of this validation set
        validation_scores.append(model.score(validator))
    return validation_scores

main loop that calculates the scores for all models

In [15]:
# scores[m][v] holds the sentence scores that model m gave to validation set v
scores = [get_scores(model_path) for model_path in model_paths]

loading model data/corpusSonstiges+base.word2vec.model
validating model...
loading model data/corpusAktuell+base.word2vec.model
validating model...
loading model data/corpusLifestyle+base.word2vec.model
validating model...
loading model data/corpusWirtschaft+base.word2vec.model
validating model...
loading model data/corpusFinanzen+base.word2vec.model
validating model...
loading model data/corpusAusland+base.word2vec.model
validating model...
loading model data/corpusLokal+base.word2vec.model
validating model...
loading model data/corpusPolitik+base.word2vec.model
validating model...
loading model data/corpusSport+base.word2vec.model
validating model...
loading model data/corpusTechnologie+base.word2vec.model
validating model...
loading model data/corpusKultur+base.word2vec.model
validating model...


# Log Likelihood

output the average score (log likelihood) of all sentences in a validation set for one model

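rows = validation set (category) of the sentences, columns = model that scored them; each cell is the average log likelihood that a sentence of the row's category is generated by the column's model

e.g: the highest (least negative) value in each row marks the model, and therefore the category, that sentences of this validation set are most likely classified to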

In [16]:
# mean sentence score for every (model, validation set) pair
average_scores = [
    [sum(sentence_scores) / len(sentence_scores) for sentence_scores in per_model]
    for per_model in scores
]

# average_scores is indexed [model][validation set]; flip it so that the rows
# of the table are validation sets and the columns are models
result = DataFrame(np.array(average_scores).T, index=model_names, columns=model_names)
print(result)

               Sonstiges      Aktuell    Lifestyle   Wirtschaft     Finanzen  \
Sonstiges   -3325.388849 -4051.485363 -3779.714496 -3927.573504 -3983.312528   
Aktuell     -2781.446618 -2859.790542 -2819.763587 -2798.698792 -2933.390316   
Lifestyle   -3400.145797 -3625.067260 -3197.680930 -3378.521383 -3654.712815   
Wirtschaft  -2279.561870 -2357.683867 -2137.304999 -1903.845588 -2280.090361   
Finanzen    -2994.836114 -3446.413571 -3134.862014 -2998.078684 -2566.185452   
Ausland     -1971.585375 -2067.931050 -2031.573275 -1994.008609 -2125.969119   
Lokal       -4569.039392 -5141.734328 -4660.747572 -5070.361464 -4758.087463   
Politik     -3311.139584 -3631.725968 -3318.819777 -3315.297906 -3493.275739   
Sport       -3976.421223 -4321.111574 -4077.288453 -4277.969966 -4276.502763   
Technologie -3563.286074 -3741.564945 -3590.740977 -3625.159370 -3687.282498   
Kultur      -7991.894719 -8471.777363 -8127.494379 -8513.776776 -8656.303435   

                 Ausland        Lokal  

# Classification Quality
calculate the number of categorizations for every category

rows = categories of the validation sets

columns = number of sentences of the row's validation set that were classified into the column's category

e.g: the highest number in each row should be on the diagonal of the matrix

first step is to transpose the scores array (switch the first two dimensions from model->category to category->model), then numpy.argmax is used to find, for every sentence of the category, the index of the model that gave it the highest score

In [17]:
# classification_matrix[c][m] = number of sentences of validation set c whose
# highest score came from model m (every row is filled in by the loop below)
classification_matrix = np.empty([num_models, num_models], dtype=int)

for category_index in range(num_models):
    # reduce scores from [model][category][sentence] to [model][sentence] for this category
    category_scores = [per_model[category_index] for per_model in scores]
    # for every sentence, the index of the model that gave it the highest score
    classifications = np.argmax(category_scores, axis=0)
    # count in a single pass how many sentences were assigned to each model;
    # minlength keeps a 0 entry for models that never win in this category
    classification_matrix[category_index] = np.bincount(classifications, minlength=num_models)

result = DataFrame(classification_matrix, model_names, model_names)
print(result)

             Sonstiges  Aktuell  Lifestyle  Wirtschaft  Finanzen  Ausland  \
Sonstiges          344        2         86          28        11       66   
Aktuell              4        1          0           4         0        2   
Lifestyle           62        2        138          20         2       12   
Wirtschaft          21        0         42         474        57       30   
Finanzen             8        1         12          85       157        7   
Ausland             24        1          6          16         1      220   
Lokal               21        1          7          14         0        8   
Politik             70        1         24          77         8      243   
Sport               12        0          6           4         0        6   
Technologie         13        2         28          30         4       11   
Kultur              46        1         34           8         1       10   

             Lokal  Politik  Sport  Technologie  Kultur  
Sonstiges        

# Accuracy

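calculate the accuracy matrix: every row of the classification matrix is divided by its row total, so each cell is the fraction of the row's validation sentences assigned to the column's category and the diagonal holds the per-category accuracy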

In [18]:
# turn each row of counts into fractions of that row's total
accuracy_matrix = []
for category_counts in classification_matrix:
    row_total = float(sum(category_counts))
    accuracy_matrix.append(category_counts / row_total)

result = DataFrame(accuracy_matrix, index=model_names, columns=model_names)
print(result)

             Sonstiges   Aktuell  Lifestyle  Wirtschaft  Finanzen   Ausland  \
Sonstiges     0.563934  0.003279   0.140984    0.045902  0.018033  0.108197   
Aktuell       0.222222  0.055556   0.000000    0.222222  0.000000  0.111111   
Lifestyle     0.235741  0.007605   0.524715    0.076046  0.007605  0.045627   
Wirtschaft    0.030523  0.000000   0.061047    0.688953  0.082849  0.043605   
Finanzen      0.028777  0.003597   0.043165    0.305755  0.564748  0.025180   
Ausland       0.060150  0.002506   0.015038    0.040100  0.002506  0.551378   
Lokal         0.109375  0.005208   0.036458    0.072917  0.000000  0.041667   
Politik       0.054988  0.000786   0.018853    0.060487  0.006284  0.190888   
Sport         0.033149  0.000000   0.016575    0.011050  0.000000  0.016575   
Technologie   0.045775  0.007042   0.098592    0.105634  0.014085  0.038732   
Kultur        0.280488  0.006098   0.207317    0.048780  0.006098  0.060976   

                Lokal   Politik     Sport  Technolo