In [4]:
%matplotlib inline

from gensim.models import Word2Vec
import sys
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import matplotlib.cm as cm
from nltk.corpus import stopwords as sw
#from nltk.stem.snowball import GermanStemmer
from gensim import corpora
from gensim.models import LdaModel
from sklearn import cluster
from sklearn.metrics.cluster import homogeneity_completeness_v_measure
from sklearn.metrics.cluster import contingency_matrix
from sklearn.neighbors import kneighbors_graph
from sklearn.preprocessing import StandardScaler
from collections import Counter
from scipy import spatial
from sklearn import preprocessing
from pandas import DataFrame
import pickle


# there are quite a few long-running processes in this notebook.
# activated logging is a good way to get a status of these tasks
# disable logging for "presentation notebooks" since the logging uses stderr
import logging

stopwords = sw.words('german')
# gensims LineSentence generator replaces umlauts with 
# u, a or o so add these variants to the stopwordlist
for stopword in stopwords:
    stopword = stopword.replace(u'ü', 'u')
    stopword = stopword.replace(u'ö', 'o')
    stopword = stopword.replace(u'ä', 'a')
    if stopword not in stopwords:
        stopwords.append(stopword)
        
np.random.seed(0)

In [5]:
category_names = ['Sonstiges', 'Aktuell', 'Lifestyle', 
          'Wirtschaft', 'Finanzen', 'Ausland', 'Lokal', 
          'Politik', 'Sport', 'Technologie', 'Kultur']

num_models = len(category_names)

# the list of full corpora
fulldata_paths = [(x, "corpus/corpus{}.txt".format(x)) for x in category_names]

# the corpora with a fixed split for training and validation
train_paths = [(x, "data/corpus{}.training.txt".format(x)) for x in category_names]
validation_paths = [(x, "data/corpus{}.validation.txt".format(x)) for x in category_names]

base_model = Word2Vec.load('../wiki/data/wiki.de.200dim.word2vec.model')

# we can precompute the L2-normalized vectors to save lots of memory
# we can't continue learning after they are normalized but the model is static
# in this usecase anyways
base_model.init_sims(replace=True)

k = base_model.vector_size
print("basemodel has {} dimensional vectors".format(k))

basemodel has 200 dimensional vectors


In [6]:
def load_sets(paths):
    log = logging.getLogger('load-sets')
    #stemmer = GermanStemmer()
    X, y = [], []

    for name, path in paths:
        with open(path) as cur_file:
            log.info('now loading path {} ...'.format(path))
            
            #get the number of lines for logging and rewind the file to start
            lines = sum([1 for line in cur_file])
            cur_file.seek(0)
            
            for num, line in enumerate(cur_file):
                if num % 1000 == 0:
                    log.info('preprocessed {} of {} lines'.format(num, lines))
                tokens = [x for x in line.split() if x not in stopwords]
                if len(tokens) > 0:
                    X.append(tokens)
                    y.append(name)
    print("loaded {} articles".format(len(X)))
    return X, y

In [7]:
X, target = load_sets(fulldata_paths)



loaded 32200 articles


In [8]:
dictionary = corpora.Dictionary(X)

In [9]:
corpus = [dictionary.doc2bow(text) for text in X]

In [10]:
ldamodel_filename = 'data/corpus.lda.model'
try:
    ldamodel = LdaModel.load(ldamodel_filename)
except IOError:
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    ldamodel = LdaModel(corpus, num_topics=100, id2word = dictionary, passes=20, random_state=0)
    logging.disable(logging.CRITICAL);
    ldamodel.save(ldamodel_filename)

In [11]:
ldamodel.print_topics(num_topics=100, num_words=5)

[(0,
  u'0.054*"apple" + 0.035*"android" + 0.033*"smartphone" + 0.027*"iphone" + 0.023*"windows"'),
 (1,
  u'0.073*"berlin" + 0.069*"prosieben" + 0.065*"\u201eanne" + 0.059*"will\u201c" + 0.039*"xy"'),
 (2,
  u'0.036*"turkei" + 0.021*"erdogan" + 0.018*"regierung" + 0.014*"turkische" + 0.013*"parlament"'),
 (3,
  u'0.036*"klum" + 0.032*"haushalts" + 0.032*"microsoft" + 0.022*"\u201esex" + 0.015*"reality"'),
 (4,
  u'0.014*"auto" + 0.012*"autos" + 0.011*"neue" + 0.010*"hersteller" + 0.009*"bmw"'),
 (5,
  u'0.048*"\u201eard" + 0.022*"golem" + 0.019*"intelligenz" + 0.018*"roboter" + 0.014*"boni"'),
 (6,
  u'0.022*"stinkt" + 0.021*"ruckrunde" + 0.021*"\u201estar" + 0.020*"\u201evom" + 0.015*"depression"'),
 (7,
  u'0.053*"sommerhaus" + 0.046*"dfb" + 0.042*"mehr\u201c" + 0.032*"frankfurt" + 0.020*"park"'),
 (8,
  u'0.096*"eu" + 0.020*"deutschland" + 0.018*"kommission" + 0.018*"europa" + 0.017*"europaischen"'),
 (9,
  u'0.102*"spiegel" + 0.083*"online" + 0.023*"rohe" + 0.019*"modus" + 0.018*"

In [12]:
#precompute the top words from the LDA model
WORDS_PER_TOPIC = 10
topwords_in_topic = np.zeros([100, WORDS_PER_TOPIC, k])
for topic in range(100):
    words_added = 0
    for top_word_id, probability in ldamodel.get_topic_terms(topic, topn=100):
        top_word = dictionary.get(top_word_id)
        try:
            top_word_vec = base_model[top_word]
            topwords_in_topic[topic, words_added] = top_word_vec
            words_added += 1
            if words_added == WORDS_PER_TOPIC:
                break
        except:
            pass

calculate the (cosine) similarity 
$$cos(\theta )=\frac { { w }_{ d,i }\cdot { v }_{ t,j } }{ \left| { w }_{ d,i } \right| *\left| { v }_{ t,j } \right|  } $$

of each word of article $d$ with each of the top 5 words ${v}_{t,j}$ with $j\in[0..4]$ of each topic $t$

find the best matching topic

$$ \underset { { v }_{ t,j } }{ argmax } \ cos(\theta ) $$ 

for each word ${ w }_{ d,i }$

```X_by_topic``` is a list of dictionaries where each dictionary maps $$ t \rightarrow \sum _{ t\in T }^{  }{ \begin{cases} 1\ if\ \underset { { v }_{ t,j } }{ argmax } \ cos(\theta )\in T \\ 0\ otherwise \end{cases} } $$
which is the count for how many words of the article the topic $T$ is the best match

In [13]:
mapping_filename = 'data/corpus.lda.mapping'
# try to load the cached file
try:
    X_by_topic = pickle.load(open(mapping_filename, 'rb'))
except IOError:
    #if there is no previously saved file, create a new mapping

    def get_word(word):
        try:
            return base_model[word]
        except:
            return base_model.seeded_vector(word)

    X_by_topic = []
    for i, article in enumerate(X):
        words_vecs = np.asarray([get_word(word) for word in article], dtype=float)
        best_topics_for_words = [-1 for word in article]
        best_similarities_for_words = [-1 for word in article]
        for topic in range(100):
            topic_vecs = topwords_in_topic[topic]

            # consine calculation of a matrix
            # code adapted from http://stackoverflow.com/questions/17627219/whats-the-fastest-way-in-python-to-calculate-cosine-similarity-given-sparse-mat
            similarity = np.dot(words_vecs, topic_vecs.T)

            norm_words = np.linalg.norm(words_vecs, axis=1)
            norm_topics = np.linalg.norm(topic_vecs, axis=1)

            # there should be a way to 
            for word_num in range(similarity.shape[0]):
                for j in range(similarity.shape[1]):
                    similarity[word_num,j] /= norm_words[word_num] * norm_topics[j]


            best_word_indices_in_topic = np.argmax(similarity, axis=1)

            for word_num in range(words_vecs.shape[0]):
                new_similarity = similarity[word_num, best_word_indices_in_topic[word_num]]
                if new_similarity > best_similarities_for_words[word_num]:
                    best_similarities_for_words[word_num] = new_similarity
                    best_topics_for_words[word_num] = topic

        X_by_topic.append(Counter(best_topics_for_words))

        if i%100==0:
            print(i)  

    #save the mapping data
    pickle.dump(X_by_topic, open(mapping_filename, 'wb'))

In [25]:
topic_for_article = []
MIN_CONFIDENCE_THRESHOLD = 0.00
for art in X_by_topic:
    max_category = max(art.iterkeys(), key=lambda x: art[x])
    sorted_number_in_category = sorted(art.itervalues(), reverse=True)
    max_category_value = float(sorted_number_in_category[0] if len(art) > 0 else 0)
    second_max_category_value = float(sorted_number_in_category[1] if len(art) > 1 else 0)
    
    max_category_value /= sum(sorted_number_in_category)
    second_max_category_value /= sum(sorted_number_in_category)
    
    #print(max_category_value - second_max_category_value)
    
    if max_category_value - second_max_category_value > MIN_CONFIDENCE_THRESHOLD:
        topic_for_article.append(max_category)
    else:
        topic_for_article.append(100)    

In [26]:
le = preprocessing.LabelEncoder()
int_target = le.fit_transform(target)

target_map = np.zeros([101, num_models], dtype=float)
for pred_topic, cur_target in zip(topic_for_article, int_target):
    target_map[pred_topic][cur_target] += 1

#for row in range(100):
#    target_map[row] = target_map[row] / np.sum(target_map[row])
    
result = DataFrame(target_map, range(101), le.classes_)
print(result)

     Aktuell  Ausland  Finanzen  Kultur  Lifestyle  Lokal  Politik  Sonstiges  \
0        0.0      0.0       3.0     1.0        1.0    1.0      0.0        0.0   
1        0.0      7.0       8.0     2.0       11.0   52.0     40.0       34.0   
2        6.0    199.0       8.0     4.0       14.0    7.0    374.0       55.0   
3        0.0      0.0       1.0     1.0        1.0    2.0      1.0        4.0   
4        3.0      6.0      29.0     4.0       20.0   13.0     13.0       56.0   
5        0.0      0.0       0.0     0.0        0.0    1.0      2.0        1.0   
6        0.0      0.0       1.0     0.0        8.0    1.0      0.0        4.0   
7        1.0     13.0       3.0     0.0        7.0    8.0     30.0        3.0   
8        3.0    112.0      46.0     8.0       12.0   10.0    266.0       25.0   
9        0.0      2.0       1.0     7.0        2.0    0.0      3.0        9.0   
10       0.0      0.0       0.0     0.0        1.0   38.0      5.0        6.0   
11       0.0      3.0       

In [27]:
homogenity = np.zeros([101], dtype=float)

#weights_map = Counter(target)
#weights = np.asarray([weights_map[cur_class] for cur_class in le.classes_], dtype=int)
#exclude last row because it is "unknown"
for row in range(100):
    weighted_row = target_map[row]# * weights
    max_index = np.argmax(weighted_row)
    max_value = weighted_row[max_index]
    row_sum = np.sum(weighted_row)
    homogenity[row] = max_value / row_sum if row_sum > 0 else 0
    
print("average homogenity: {}".format(np.average(homogenity)))

average homogenity: 0.440342218905
