# 02-topic-modelling-spiegel

Calculate the topic distribution for every article in every week, over a range of years.
So first I'll put all the documents together, then calculate the words-per-topic matrix, then run a long model, and finally produce the documents-per-topic matrix.

In [1]:
import os
import string
import nltk
#nltk.download('stopwords')
#nltk.download('punkt')
import time

import pandas as pd
import numpy as np
from functools import reduce
from scipy.stats import entropy
from datetime import datetime

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_context("poster", font_scale=1.5, rc={"lines.linewidth": 2.5})
sns.mpl.rc("figure", figsize=(9,6))

import warnings; warnings.filterwarnings('ignore')

In [2]:
def tokenize(text):
    stopwords = set(nltk.corpus.stopwords.words('german'))
    with open('data/german_stopwords.txt') as f:
        more_stopwords = [ line[:-1] for line in f ]
    stopwords = stopwords.union(set(more_stopwords))
    
    minlength = 3
    
    invalidChars = { '¡', '§', '©', '\xad', '°', '²', '³', 'µ', '¹', '¿', '×', '\u200b', 
                    '•', '‣', '…', '⁄', '₂', '€', '™', '▇', '■', '▶', '◆', '●', '★', '✽',
                    '❏', '➝', '主', '原', '年', '後', '歸', '物', '舧', '舰'}
    invalidChars = invalidChars.union(set(string.punctuation.replace("-", "–„“")))
    for token in nltk.word_tokenize(text):
        t = token.lower()
        if (len(t)<minlength) or (t in stopwords) or (t.replace('ß','ss') in stopwords) \
        or (t in string.punctuation) or (t[0] in string.punctuation) \
        or any(char in invalidChars for char in token):
            continue
        yield t
        
def normalise(vec):
    return vec / np.dot(vec,vec)

def combine_vectors(vectors):
    return normalise(np.sum(vectors, axis=0))

def important_words(vectorizer, vec, n):
    return sorted(zip(vectorizer.get_feature_names(), vec), key=lambda x:x[1], reverse=True)[:n]

In [3]:
years = range(1947,2017)

infiles = [ 'data/%d.csv' % d for d in years ]

# create DataFrame for all articles
df = pd.DataFrame()

for infile in infiles:

    df_year = pd.read_csv(infile, index_col=0)
    df_year = df_year[pd.notnull(df_year['text'])]

    # uncomment for short run
    #df = df.head(50)
    
    df = df.append(df_year)

In [5]:
len(df_year), len(df)

(3578, 308463)

### Now let's estimate how long it's going to take.
_________________________________

25 topics (time in minutes)
[
[100, 0.24],
[200, 0.42],
[500, 0.66],
[1000, 1.14],
[2000, 2.10],
[5000, 5.22]
]

Ok, from this I get that the time in minutes is:
T = 0.1514 + 0.0010*n_documents


- 10K documents               ->  10.15 minutes
- 28178 (2009-2016) documents ->  28.32 minutes
- 100K documents              ->  100.15 minutes (<2 hours)
- 1M documents                ->  1000.15 minutes (~17 hours)
_________________________________

50 topics (time in minutes)
[
[100, 0.45],
[200, 0.75],
[500, 1.30],
[1000, 2.23],
[2000, 4.14],
[5000, 9.77]
]
Ok, from this I get that the time in minutes is:
T = 0.33 + 0.00189*n_documents

- 10K documents               -> 19.23 minutes
- 28178 (2009-2016) documents -> 53.59 minutes
- 100K documents              -> 189.33 minutes (3 hours)
- 1M documents                -> 1890.33 minutes (31.5 hours)
_________________________________

100 topics (time in minutes)
[
[100, 0.80],
[200, 1.83],
[500, 2.25],
[1000, 4.51],
[2000, 8.09],
[5000, 18.97]
]

Ok, from this I get that the time in minutes is:
T = 0.7077 + 0.003659*n_documents

- 10K documents               ->  37 minutes
- 28178 (2009-2016) documents ->  103 minutes (< 2 hours)
- 100K documents              ->  366.61 minutes (6 hours)
- 1M documents                ->  3659.70 minutes (2.5 days)

_________________________________

On the number of tokens:

28178 documents:
- n_topics=50, min_df=0.010, max_df=0.8: 5866 different tokens, total 6847730
- n_topics=50, min_df=0.005, max_df=0.8: 10757 different tokens, total 8015999
- n_topics=50, min_df=0.001, max_df=0.8: 37335 different tokens, total 9960130


In [6]:
from sklearn.feature_extraction.text import CountVectorizer
start = time.time()

# list of text documents
text = df.text.values
doc_ids = df.filename.values

# create the transform
vectorizer = CountVectorizer(tokenizer=tokenize, min_df=0.005, max_df=0.8)

# tokenize and build vocab
vectorizer.fit(text)

# summarize
#print(vectorizer.vocabulary_)

# encode document-term matrix
dtm = vectorizer.transform(text)

# summarize encoded vector
print('Shape of document-term matrix (documents, tokens):', dtm.shape)
print('Total number of tokens:', dtm.sum() )
#print(type(dtm))
#print(dtm.toarray())

end = time.time()
print((end - start)/60.0,'minutes')

Shape of document-term matrix (documents, tokens): (308463, 8041)
Total number of tokens: 55589355
85.26838853756587 minutes


In [7]:
# summarize encoded vector
print('Shape of document-term matrix (documents, tokens):', dtm.shape)
print('Total number of tokens:', dtm.sum() )
#print(type(dtm))
#print(dtm.toarray())

end = time.time()
print(round((end - start)/60.0,2),'minutes')

from scipy.sparse import save_npz
save_npz('data/dtm_matrix.npz', dtm)

Shape of document-term matrix (documents, tokens): (308463, 8041)
Total number of tokens: 55589355
85.27 minutes


In [33]:
from scipy.sparse import load_npz
dtm = load_npz('data/dtm_matrix.npz')

dtm

<308463x8041 sparse matrix of type '<class 'numpy.int64'>'
	with 41382887 stored elements in Compressed Sparse Row format>

In [34]:
nnz_per_row = dtm.getnnz(axis=1)
non_null_rows = np.where(nnz_per_row > 0)[0]
null_rows     = np.where(nnz_per_row <= 0)[0]

dtm = dtm[dtm.getnnz(1)>0]
dtm

<308443x8041 sparse matrix of type '<class 'numpy.int64'>'
	with 41382887 stored elements in Compressed Sparse Row format>

In [35]:
import lda

#n_topics = 30
#n_topics = 50
#n_topics = 70

n_topics = 90

topic_model = lda.LDA(n_topics=n_topics, n_iter=1500, random_state=1)

In [36]:
start = time.time()

document_topic_distributions = topic_model.fit_transform(dtm)

end = time.time()
print((end - start)/60.0,'minutes')

INFO:lda:n_documents: 308443
INFO:lda:vocab_size: 8041
INFO:lda:n_words: 55589355
INFO:lda:n_topics: 90
INFO:lda:n_iter: 1500
INFO:lda:<0> log likelihood: -758086395
INFO:lda:<10> log likelihood: -598041747
INFO:lda:<20> log likelihood: -536046676
INFO:lda:<30> log likelihood: -524010456
INFO:lda:<40> log likelihood: -518544323
INFO:lda:<50> log likelihood: -515508551
INFO:lda:<60> log likelihood: -513485874
INFO:lda:<70> log likelihood: -512084028
INFO:lda:<80> log likelihood: -511049326
INFO:lda:<90> log likelihood: -510263866
INFO:lda:<100> log likelihood: -509665559
INFO:lda:<110> log likelihood: -509174543
INFO:lda:<120> log likelihood: -508799831
INFO:lda:<130> log likelihood: -508469247
INFO:lda:<140> log likelihood: -508154061
INFO:lda:<150> log likelihood: -507904839
INFO:lda:<160> log likelihood: -507691883
INFO:lda:<170> log likelihood: -507526573
INFO:lda:<180> log likelihood: -507373941
INFO:lda:<190> log likelihood: -507228515
INFO:lda:<200> log likelihood: -507109034
INF

1148.4681915998458 minutes


In [37]:
vocab = vectorizer.get_feature_names()
topic_names = ['Topic %d'%k for k in range(1, n_topics + 1)]

topic_word_distributions = pd.DataFrame(topic_model.components_, columns=vocab, index=topic_names)

document_topic_distributions = pd.DataFrame(document_topic_distributions,
                                            columns=topic_names,
                                            index=doc_ids[non_null_rows])

In [38]:
document_topic_distributions.to_csv('data/document_topic_distributions_'+str(n_topics)+'topics.csv')
topic_word_distributions.to_csv('data/topic_word_distributions_'+str(n_topics)+'topics.csv')

In [42]:
topic_word_distributions.loc['Topic 2'].sort_values(ascending=False).head(15)

herr         0.014514
frage        0.013728
brief        0.010296
herrn        0.009725
fragen       0.006816
könne        0.006811
antwort      0.006206
wissen       0.005848
erklärung    0.005674
sache        0.005607
tage         0.005143
rede         0.004489
erklärt      0.004470
oktober      0.004419
schrieb      0.004390
Name: Topic 2, dtype: float64

In [40]:
document_topic_distributions.head()

Unnamed: 0,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5,Topic 6,Topic 7,Topic 8,Topic 9,Topic 10,...,Topic 81,Topic 82,Topic 83,Topic 84,Topic 85,Topic 86,Topic 87,Topic 88,Topic 89,Topic 90
d-41122662,0.00625,0.00625,0.00625,0.13125,0.06875,0.06875,0.00625,0.00625,0.00625,0.06875,...,0.00625,0.00625,0.00625,0.00625,0.00625,0.00625,0.00625,0.00625,0.00625,0.00625
d-41122648,0.005,0.005,0.005,0.005,0.005,0.005,0.005,0.005,0.005,0.005,...,0.005,0.005,0.005,0.005,0.005,0.005,0.005,0.005,0.055,0.005
d-41122673,0.047287,0.000775,0.000775,0.008527,0.000775,0.000775,0.000775,0.000775,0.000775,0.008527,...,0.000775,0.000775,0.000775,0.000775,0.000775,0.000775,0.000775,0.000775,0.000775,0.000775
d-41122667,0.00087,0.00087,0.00087,0.00087,0.00087,0.00087,0.00087,0.00087,0.00087,0.00087,...,0.035652,0.00087,0.00087,0.00087,0.00087,0.00087,0.00087,0.00087,0.00087,0.00087
d-41122630,0.000408,0.086122,0.000408,0.000408,0.00449,0.094286,0.135102,0.000408,0.000408,0.000408,...,0.000408,0.000408,0.00449,0.000408,0.037143,0.00449,0.000408,0.000408,0.000408,0.000408


In [41]:
topic_word_distributions.head()

Unnamed: 0,000,00187,100,1000,101,110,115,120,1200,125,...,überzeugt,überzeugte,überzeugung,überzogen,üblich,übliche,üblichen,übrigen,übt,übung
Topic 1,1.26947e-08,1.26947e-08,1.26947e-08,1.26947e-08,1.26947e-08,1.26947e-08,1.26947e-08,1.26947e-08,1.26947e-08,1.26947e-08,...,0.0001053787,1.282165e-06,1.26947e-08,1.26947e-08,1.26947e-08,1.26947e-08,1.26947e-08,0.000301,1.26947e-08,1.26947e-08
Topic 2,1.099912e-08,1.099912e-08,1.099912e-08,1.099912e-08,1.099912e-08,1.099912e-08,1.099912e-08,1.099912e-08,1.099912e-08,1.099912e-08,...,0.0009360364,1.099912e-08,0.000727053,1.099912e-08,0.000601663,0.000200195,0.0003849803,0.001498,1.099912e-08,1.099912e-08
Topic 3,0.00594957,1.725008e-08,0.002803155,0.0005261447,1.725008e-08,0.0003053437,0.0002001182,0.000591695,1.725008e-08,0.0002846436,...,1.725008e-08,1.725008e-08,1.725008e-08,1.725008e-08,1.725008e-08,7.591761e-05,1.725008e-08,0.000854,1.725008e-08,1.725008e-08
Topic 4,1.315365e-08,1.315365e-08,1.315365e-08,0.0002959702,1.315365e-08,1.315365e-08,1.315365e-08,1.315365e-08,1.315365e-08,1.315365e-08,...,0.0008352698,1.315365e-08,0.0003630538,1.315365e-08,1.315365e-08,1.315365e-08,1.315365e-08,0.000431,1.315365e-08,0.0001262882
Topic 5,0.0107809,1.816493e-08,0.0009627593,0.0008210729,1.816493e-08,1.816493e-08,3.65115e-06,0.0002089148,0.0003487848,3.453153e-05,...,1.816493e-08,1.816493e-08,1.816493e-08,1.816493e-08,0.0003851146,0.000168952,0.0004214445,0.000797,1.816493e-08,1.816493e-08
