In [5]:
%matplotlib inline

from gensim.models import Word2Vec
import sys
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import matplotlib.cm as cm
from nltk.corpus import stopwords as sw
from nltk.stem.snowball import GermanStemmer
from gensim import corpora
from gensim.models import LdaModel
from sklearn import cluster
from sklearn.metrics.cluster import homogeneity_completeness_v_measure
from sklearn.metrics.cluster import contingency_matrix
from sklearn.neighbors import kneighbors_graph
from sklearn.preprocessing import StandardScaler
from collections import Counter
from scipy import spatial
from sklearn import preprocessing
from pandas import DataFrame
import pickle


# There are quite a few long-running processes in this notebook.
# Enabling logging is a good way to monitor the status of these tasks.
# Disable logging for "presentation notebooks", since the logging writes to stderr.
import logging

stopwords = sw.words('german')
# Per the original author's note, gensim's LineSentence generator replaces
# umlauts with u, a or o, so the de-accented spelling of every stopword is
# added to the list as well.
_UMLAUT_REPLACEMENTS = ((u'ü', 'u'), (u'ö', 'o'), (u'ä', 'a'))
# Walk over a snapshot so the list is not extended while it is being iterated.
for original_word in list(stopwords):
    plain_word = original_word
    for umlaut, replacement in _UMLAUT_REPLACEMENTS:
        plain_word = plain_word.replace(umlaut, replacement)
    if plain_word not in stopwords:
        stopwords.append(plain_word)

# Fix numpy's global random state for the rest of the notebook.
np.random.seed(0)

In [6]:
# Category names; every category has its own corpus file(s) on disk.
category_names = [
    'Sonstiges', 'Aktuell', 'Lifestyle', 'Wirtschaft', 'Finanzen', 'Ausland',
    'Lokal', 'Politik', 'Sport', 'Technologie', 'Kultur',
]

num_models = len(category_names)


def _paths_by_category(template):
    """Return ``(category, path)`` pairs, filling ``template`` with each category name."""
    return [(name, template.format(name)) for name in category_names]


# the list of full corpora
fulldata_paths = _paths_by_category("corpus/corpus{}.txt")

# the corpora with a fixed split for training and validation
train_paths = _paths_by_category("data/corpus{}.training.txt")
validation_paths = _paths_by_category("data/corpus{}.validation.txt")

In [7]:
def load_sets(paths):
    """Load, tokenize, stopword-filter and stem the documents of several files.

    Every line of a file is treated as one document. Lines are split on
    whitespace, tokens found in the module-level ``stopwords`` are dropped and
    the remaining tokens are stemmed with ``GermanStemmer``. Documents that end
    up without any token are skipped.

    Args:
        paths: iterable of ``(name, path)`` pairs; ``name`` is used as the
            label of every document read from ``path``.

    Returns:
        A tuple ``(X, y)`` of two equally long lists: ``X`` holds the stemmed
        token list of each document, ``y`` the matching label.
    """
    log = logging.getLogger('load-sets')
    stemmer = GermanStemmer()
    # A set gives O(1) membership tests; the module-level stopword list would
    # otherwise be scanned once per token.
    stopword_set = set(stopwords)
    X, y = [], []

    for name, path in paths:
        # Binary mode yields byte strings on Python 2 and 3 alike, so the
        # explicit utf-8 decode below is valid on both.
        with open(path, 'rb') as cur_file:
            log.info('now loading path {} ...'.format(path))

            # get the number of lines for logging and rewind the file to start
            lines = sum(1 for _ in cur_file)
            cur_file.seek(0)

            for num, line in enumerate(cur_file):
                if num % 1000 == 0:
                    log.info('preprocessed {} of {} lines'.format(num, lines))
                # Decode *before* the stopword check so tokens are compared as
                # text; comparing raw bytes never matches non-ASCII stopwords.
                words = (raw.decode('utf-8') for raw in line.split())
                tokens = [stemmer.stem(word) for word in words
                          if word not in stopword_set]
                if tokens:
                    X.append(tokens)
                    y.append(name)
    print("loaded {} articles".format(len(X)))
    return X, y

In [8]:
# Load the full corpus of every category: X holds the stemmed token list of
# each document, target the category name it was loaded from.
X, target = load_sets(fulldata_paths)



loaded 32200 articles


In [9]:
# Build the gensim token dictionary from the tokenized documents.
dictionary = corpora.Dictionary(X)

In [10]:
# Convert every tokenized document into its bag-of-words form via the dictionary.
corpus = [dictionary.doc2bow(text) for text in X]

In [12]:
ldamodel_filename = 'data/corpus.stemmed.lda.model'
try:
    # Reuse the cached model when it exists; training is long-running.
    # NOTE(review): the cache is loaded without checking that it was trained on
    # the current `corpus`/`dictionary` - delete the file after changing either.
    ldamodel = LdaModel.load(ldamodel_filename)
except IOError:
    # No cached model: train from scratch, with INFO logging as progress report.
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    try:
        ldamodel = LdaModel(corpus, num_topics=100, id2word=dictionary, passes=20, random_state=0)
    finally:
        # Silence logging again even if training fails or is interrupted, so
        # later cells do not keep writing log output to stderr.
        logging.disable(logging.CRITICAL)
    ldamodel.save(ldamodel_filename)

2017-01-30 07:21:13,659 : INFO : using symmetric alpha at 0.01
2017-01-30 07:21:13,660 : INFO : using symmetric eta at 0.01
2017-01-30 07:21:13,662 : INFO : using serial LDA version on this node
2017-01-30 07:23:09,948 : INFO : running online LDA training, 100 topics, 20 passes over the supplied corpus of 32200 documents, updating model once every 2000 documents, evaluating perplexity every 20000 documents, iterating 50x with a convergence threshold of 0.001000
2017-01-30 07:23:09,949 : INFO : PROGRESS: pass 0, at document #2000/32200
2017-01-30 07:23:22,065 : INFO : merging changes from 2000 documents into a model of 32200 documents
2017-01-30 07:23:29,130 : INFO : topic #75 (0.010): 0.007*"polizei" + 0.006*"sei" + 0.005*"mehr" + 0.005*"sagt" + 0.005*"neu" + 0.005*"dass" + 0.004*"word" + 0.004*"frau" + 0.004*"jahr" + 0.004*"mann"
2017-01-30 07:23:29,257 : INFO : topic #57 (0.010): 0.007*"mann" + 0.007*"polizei" + 0.006*"jahrig" + 0.005*"dass" + 0.005*"jahr" + 0.005*"sei" + 0.004*"erst

In [13]:
# Show the 5 highest-weighted words of each of the 100 topics.
ldamodel.print_topics(num_topics=100, num_words=5)

[(0,
  u'0.033*"jahr" + 0.014*"alt" + 0.011*"damal" + 0.011*"buch" + 0.011*"spat"'),
 (1,
  u'0.031*"mensch" + 0.019*"tot" + 0.018*"word" + 0.013*"verletzt" + 0.013*"jahr"'),
 (2,
  u'0.040*"cannabis" + 0.027*"lindn" + 0.026*"astronomi" + 0.023*"christian" + 0.019*"strafraum"'),
 (3,
  u'0.045*"bachelorett" + 0.044*"recht" + 0.030*"hotel" + 0.028*"city" + 0.028*"versucht"'),
 (4,
  u'0.020*"gesetz" + 0.018*"china" + 0.013*"muss" + 0.012*"npd" + 0.012*"verbot"'),
 (5,
  u'0.037*"of" + 0.031*"prinz" + 0.026*"neu" + 0.023*"britisch" + 0.020*"harry"'),
 (6,
  u'0.018*"star" + 0.013*"erst" + 0.010*"schon" + 0.008*"best" + 0.008*"final"'),
 (7,
  u'0.023*"frau" + 0.016*"\u201ezdf" + 0.015*"tierisch" + 0.013*"arzt" + 0.013*"medizin"'),
 (8,
  u'0.221*"lanz" + 0.032*"jurg" + 0.028*"gauck" + 0.022*"pet" + 0.021*"shopping"'),
 (9,
  u'0.039*"bank" + 0.033*"wien" + 0.029*"story\u201c" + 0.029*"lindenberg" + 0.019*"mobbing"'),
 (10,
  u'0.056*"bad" + 0.044*"nico" + 0.033*"wurttemberg" + 0.021*"sig

In [27]:
mapping_filename = 'data/corpus.stemmed.lda.mapping'
# X_by_topic holds one dict per document, mapping topic id -> topic weight as
# returned by the LDA model for that document.
# try to load the cached file
# NOTE: pickle can execute arbitrary code on load - only use cache files that
# were written by this notebook.
try:
    with open(mapping_filename, 'rb') as mapping_file:
        X_by_topic = pickle.load(mapping_file)
except IOError:
    # No cache yet: infer the topic distribution of every document.
    X_by_topic = []
    for i, text in enumerate(corpus):
        # ldamodel[text] yields (topic_id, weight) pairs; store them as a dict.
        X_by_topic.append(dict(ldamodel[text]))
        if i % 1000 == 0:
            print(i)
    with open(mapping_filename, 'wb') as mapping_file:
        pickle.dump(X_by_topic, mapping_file)

In [67]:
# For every article pick the topic with the largest weight. The article is put
# into the extra "unknown" bucket when the normalized gap between the best and
# the second-best topic does not exceed MIN_CONFIDENCE_THRESHOLD.
topic_for_article = []
MIN_CONFIDENCE_THRESHOLD = 0.00
# Index of the extra bucket for articles without a confident topic.
UNKNOWN_TOPIC = 100
for art in X_by_topic:
    sorted_number_in_category = sorted(art.values(), reverse=True)
    total_weight = float(sum(sorted_number_in_category))

    # An article without any topic weight cannot be normalized (max() on an
    # empty dict and the division below would both fail): treat it as unknown.
    if total_weight <= 0:
        topic_for_article.append(UNKNOWN_TOPIC)
        continue

    max_category = max(art, key=art.get)
    max_category_value = float(sorted_number_in_category[0]) / total_weight
    if len(sorted_number_in_category) > 1:
        second_max_category_value = float(sorted_number_in_category[1]) / total_weight
    else:
        second_max_category_value = 0.0

    if max_category_value - second_max_category_value > MIN_CONFIDENCE_THRESHOLD:
        topic_for_article.append(max_category)
    else:
        topic_for_article.append(UNKNOWN_TOPIC)

In [68]:
# Encode the category names as integers so they can be used as column indices.
le = preprocessing.LabelEncoder()
int_target = le.fit_transform(target)

# Contingency table: one row per predicted topic (row 100 is the extra
# "unknown" bucket), one column per category; cells count the articles.
target_map = np.zeros((101, num_models), dtype=float)
for topic_row, category_col in zip(topic_for_article, int_target):
    target_map[topic_row, category_col] += 1

# The counts are left un-normalized; dividing each row by its sum would turn
# them into per-topic category shares.

result = DataFrame(target_map, index=range(101), columns=le.classes_)
print(result)

     Aktuell  Ausland  Finanzen  Kultur  Lifestyle  Lokal  Politik  Sonstiges  \
0        2.0     16.0       3.0    26.0        9.0    5.0     49.0       43.0   
1        3.0     97.0       0.0     7.0       46.0   10.0    150.0      253.0   
2        0.0      0.0       1.0     0.0        2.0    0.0      1.0        7.0   
3        0.0      0.0       0.0     1.0        4.0    1.0      1.0        4.0   
4        2.0     15.0       4.0     1.0       13.0   16.0     90.0       21.0   
5        0.0      1.0       2.0     8.0       17.0    0.0      1.0       14.0   
6        0.0     10.0       1.0   127.0       90.0    8.0     13.0      123.0   
7        0.0      1.0       0.0     8.0       21.0    1.0      0.0        7.0   
8        0.0      0.0       0.0     0.0        0.0    0.0      0.0        0.0   
9        0.0      0.0       1.0     0.0        1.0    1.0      0.0        5.0   
10       0.0      0.0       0.0     3.0        3.0    2.0      0.0        6.0   
11       0.0     24.0       

In [69]:
# Number of articles per category, ordered like le.classes_. These weights are
# not applied in the computation below.
weights_map = Counter(target)
weights = np.asarray([weights_map[cur_class] for cur_class in le.classes_], dtype=int)

# Per-topic homogeneity: share of the dominant category among all articles
# assigned to the topic; topics without any article keep the value 0.
# NOTE(review): all 101 rows are averaged, including the last "unknown" row
# and empty topics - an earlier comment said the last row should be excluded.
row_totals = target_map.sum(axis=1)
row_peaks = target_map.max(axis=1)
homogenity = np.zeros([101], dtype=float)
np.divide(row_peaks, row_totals, out=homogenity, where=row_totals > 0)

print("average homogenity: {}".format(np.average(homogenity)))

average homogenity: 0.492153688185
