In [1]:
import os
import string
import nltk
#nltk.download('stopwords')
#nltk.download('punkt')
import time

import pandas as pd
import numpy as np
from functools import reduce
from scipy.stats import entropy
from datetime import datetime

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
# Global plot styling: large "poster" context with thicker lines, and a
# default figure size of 9x6 inches.
sns.set_context("poster", font_scale=1.5, rc={"lines.linewidth": 2.5})
mpl.rc("figure", figsize=(9, 6))

import warnings; warnings.filterwarnings('ignore')

In [2]:
# Filled on the first tokenize() call so the stopword resources are read once
# per kernel session instead of once per document.
_TOKENIZE_STOPWORDS = None

# Any token containing one of these characters is discarded. The set is the
# hand-picked symbols below plus ASCII punctuation in which the hyphen has been
# swapped for "–„“", so hyphenated words are kept.
_TOKENIZE_INVALID_CHARS = frozenset({
    '¡', '§', '©', '\xad', '°', '²', '³', 'µ', '¹', '¿', '×', '\u200b',
    '•', '‣', '…', '⁄', '₂', '€', '™', '▇', '■', '▶', '◆', '●', '★', '✽',
    '❏', '➝', '主', '原', '年', '後', '歸', '物', '舧', '舰',
}) | frozenset(string.punctuation.replace("-", "–„“"))


def _load_stopwords():
    """Return the NLTK German stopwords merged with the extra stopword file."""
    global _TOKENIZE_STOPWORDS
    if _TOKENIZE_STOPWORDS is None:
        stopwords = set(nltk.corpus.stopwords.words('german'))
        # NOTE(review): the file is opened with the platform default encoding;
        # confirm the file's encoding before pinning one explicitly.
        with open('../der_spiegel/german_stopwords.txt') as f:
            # rstrip('\n') instead of line[:-1]: slicing cut off the last
            # letter of the final entry when the file lacks a trailing newline.
            stopwords.update(line.rstrip('\n') for line in f)
        _TOKENIZE_STOPWORDS = frozenset(stopwords)
    return _TOKENIZE_STOPWORDS


def tokenize(text):
    """Yield the lower-cased tokens of ``text`` that survive filtering.

    ``text`` is split with ``nltk.word_tokenize``. A token is dropped when it
    is shorter than three characters, is a stopword (also after replacing
    'ß' with 'ss'), starts with an ASCII punctuation character, or contains
    any character from ``_TOKENIZE_INVALID_CHARS``.

    Parameters
    ----------
    text : str
        Raw document text.

    Yields
    ------
    str
        Each accepted token, lower-cased.
    """
    stopwords = _load_stopwords()
    minlength = 3

    for token in nltk.word_tokenize(text):
        t = token.lower()
        # The former `t in string.punctuation` substring test is covered by the
        # first-character check: a token of length >= 3 that is a substring of
        # string.punctuation necessarily starts with a punctuation character.
        if (len(t) < minlength
                or t in stopwords
                or t.replace('ß', 'ss') in stopwords
                or t[0] in string.punctuation
                or any(char in _TOKENIZE_INVALID_CHARS for char in token)):
            continue
        yield t
        
def normalise(vec):
    """Scale a 1-D vector to unit Euclidean length.

    The previous version divided by ``np.dot(vec, vec)``, i.e. the *squared*
    norm, which produced a vector of length ``1/|vec|`` rather than 1.

    Parameters
    ----------
    vec : array-like
        One-dimensional numeric vector.

    Returns
    -------
    numpy.ndarray
        ``vec / ||vec||`` as a float array. An all-zero vector has no
        direction and is returned as zeros instead of NaNs.
    """
    vec = np.asarray(vec, dtype=float)
    norm = np.linalg.norm(vec)
    if norm == 0:
        return vec
    return vec / norm

def combine_vectors(vectors):
    """Sum ``vectors`` along axis 0 and pass the total through ``normalise``."""
    summed = np.sum(vectors, axis=0)
    return normalise(summed)

def important_words(vectorizer, vec, n):
    """Return the ``n`` highest-weighted (term, weight) pairs of ``vec``.

    Parameters
    ----------
    vectorizer : object
        Fitted vectorizer exposing ``get_feature_names_out()`` (scikit-learn
        >= 1.0) or the older ``get_feature_names()``.
    vec : iterable of numbers
        One weight per vocabulary term, in vocabulary order.
    n : int
        Number of pairs to return.

    Returns
    -------
    list of (str, number)
        Pairs sorted by descending weight, truncated to ``n`` entries.
    """
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer the replacement and fall back for older installations.
    if hasattr(vectorizer, 'get_feature_names_out'):
        names = vectorizer.get_feature_names_out()
    else:
        names = vectorizer.get_feature_names()
    pairs = zip((str(name) for name in names), vec)
    return sorted(pairs, key=lambda pair: pair[1], reverse=True)[:n]

In [4]:
# Read the tab-separated export (first column becomes the index) and keep
# only the rows that actually have a text value.
df_all_raw_texts = pd.read_csv(
    'data/C8_all_raw_texts_and_labels.csv',
    sep='\t',
    encoding='utf-8',
    index_col=0,
)
df_all_raw_texts = df_all_raw_texts[df_all_raw_texts['text'].notnull()]
df_all_raw_texts.head()

Unnamed: 0,lfdn,text,time_window,wave_code,answer_key,tags
0,177.0,99,2009-09-18 to 2009-09-27,ZA5339,a03s,
1,178.0,Die arbeitslosigkeit.,2009-09-18 to 2009-09-27,ZA5339,a03s,labor market
2,179.0,ka,2009-09-18 to 2009-09-27,ZA5339,a03s,
3,180.0,Dass die Finanzen geordnet werden und keine Ne...,2009-09-18 to 2009-09-27,ZA5339,a03s,budget and debt
4,181.0,Schulpolitik sollte besser sein,2009-09-18 to 2009-09-27,ZA5339,a03s,education


In [5]:
# First five answers whose wave_code is 'ZA6817'.
df_all_raw_texts.loc[df_all_raw_texts['wave_code'] == 'ZA6817'].head()['text']

34184                                Soziale Gerechtigkeit
34185    groe Schere zwischen arm und reich /Ungerecht...
34186                                           Sicherheit
34187                                              Bildung
34188                                              Bildung
Name: text, dtype: object

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
start = time.time()

# list of text documents and their row labels
text = df_all_raw_texts.text.values
doc_ids = df_all_raw_texts.index.values

# create the transform; float min_df/max_df are document-frequency proportions,
# so terms in fewer than 0.5% or more than 80% of documents are dropped
vectorizer = CountVectorizer(tokenizer=tokenize, min_df=0.005, max_df=0.8)

# learn the vocabulary and encode the document-term matrix in one pass;
# calling fit() and then transform() tokenised every document twice
dtm = vectorizer.fit_transform(text)

# summarize encoded matrix
print('Shape of document-term matrix (documents, tokens):', dtm.shape)
print('Total number of tokens:', dtm.sum() )

end = time.time()
print((end - start)/60.0,'minutes')

Shape of document-term matrix (documents, tokens): (34149, 55)
Total number of tokens: 24862
1.1572377999623618 minutes


In [40]:
import lda

# Number of topics; also used further down for the topic labels and the
# output file names.
# NOTE(review): the recorded log and tables below report 10 topics, so they
# appear to come from a run made before this value was set to 15 - re-run the
# following cells to refresh them.
n_topics = 15

# Fixed random_state so repeated fits give the same topics.
topic_model = lda.LDA(
    n_topics=n_topics,
    n_iter=1500,
    random_state=1,
)

In [41]:
# Fit the topic model on the document-term matrix, keep the per-document
# topic distributions, and report the wall-clock time in minutes.
start = time.time()
document_topic_distributions = topic_model.fit_transform(dtm)
end = time.time()

print((end - start) / 60.0, 'minutes')

INFO:lda:n_documents: 34149
INFO:lda:vocab_size: 55
INFO:lda:n_words: 24862
INFO:lda:n_topics: 10
INFO:lda:n_iter: 1500
INFO:lda:<0> log likelihood: -159794
INFO:lda:<10> log likelihood: -132133
INFO:lda:<20> log likelihood: -123203
INFO:lda:<30> log likelihood: -117534
INFO:lda:<40> log likelihood: -113410
INFO:lda:<50> log likelihood: -110649
INFO:lda:<60> log likelihood: -108695
INFO:lda:<70> log likelihood: -107396
INFO:lda:<80> log likelihood: -106725
INFO:lda:<90> log likelihood: -106626
INFO:lda:<100> log likelihood: -106277
INFO:lda:<110> log likelihood: -105936
INFO:lda:<120> log likelihood: -105458
INFO:lda:<130> log likelihood: -105305
INFO:lda:<140> log likelihood: -105115
INFO:lda:<150> log likelihood: -104403
INFO:lda:<160> log likelihood: -103985
INFO:lda:<170> log likelihood: -103976
INFO:lda:<180> log likelihood: -104281
INFO:lda:<190> log likelihood: -103997
INFO:lda:<200> log likelihood: -103967
INFO:lda:<210> log likelihood: -103735
INFO:lda:<220> log likelihood: -1

0.19066436290740968 minutes


In [42]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer the replacement and fall back for older installations.
if hasattr(vectorizer, 'get_feature_names_out'):
    vocab = list(vectorizer.get_feature_names_out())
else:
    vocab = vectorizer.get_feature_names()

# Human-readable, 1-based topic labels.
topic_names = ['Topic %d'%k for k in range(1, n_topics + 1)]

# One row per topic, one column per vocabulary term.
topic_word_distributions = pd.DataFrame(topic_model.components_, columns=vocab, index=topic_names)

# One row per document (labelled with the original row index), one column per topic.
document_topic_distributions = pd.DataFrame(document_topic_distributions, columns=topic_names, index=doc_ids)

In [43]:
# Persist both distributions; the topic count is embedded in the file names.
document_topic_distributions.to_csv(
    path_or_buf='data/document_topic_distributions_' + str(n_topics) + 'topics.csv'
)
topic_word_distributions.to_csv(
    path_or_buf='data/topic_word_distributions_' + str(n_topics) + 'topics.csv'
)

In [44]:
# Preview the first five documents' topic distributions.
document_topic_distributions.head(n=5)

Unnamed: 0,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5,Topic 6,Topic 7,Topic 8,Topic 9,Topic 10
0,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1
1,0.05,0.55,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05
2,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1
3,0.55,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05
4,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1


In [45]:
# Preview the word distributions of the first five topics.
topic_word_distributions.head(n=5)

Unnamed: 0,110,altersarmut,arbeit,arbeitslosigkeit,arbeitsmarkt,arbeitsplätze,arm,armut,asylanten,asylpolitik,...,sicherheit,soziale,staatsverschuldung,steuern,ungerechtigkeit,verschuldung,volk,wirtschaft,wirtschaftskrise,zuwanderung
Topic 1,4e-06,0.039177,4e-06,0.152865,0.038326,4e-06,4e-06,4e-06,0.063873,4e-06,...,4e-06,4e-06,4e-06,0.0379,4e-06,0.002133,4e-06,0.026829,4e-06,4e-06
Topic 2,4e-06,4e-06,4e-06,0.285033,4e-06,0.019032,4e-06,0.0004,4e-06,4e-06,...,4e-06,4e-06,0.072153,0.133203,4e-06,0.089992,4e-06,4e-06,0.065414,4e-06
Topic 3,4e-06,4e-06,0.040888,0.009667,4e-06,4e-06,4e-06,0.053524,4e-06,4e-06,...,4e-06,4e-06,4e-06,0.025277,4e-06,4e-06,4e-06,4e-06,4e-06,0.00372
Topic 4,4e-06,0.044068,4e-06,4e-06,0.007705,4e-06,4e-06,4e-06,4e-06,0.142889,...,0.159145,4e-06,4e-06,4e-06,4e-06,4e-06,4e-06,4e-06,4e-06,0.124066
Topic 5,4e-06,0.022665,4e-06,4e-06,4e-06,4e-06,4e-06,0.010758,4e-06,4e-06,...,0.005381,0.512381,4e-06,4e-06,0.159402,4e-06,4e-06,4e-06,4e-06,4e-06


In [46]:
# The 15 highest-weighted words of 'Topic 3'.
(
    topic_word_distributions
    .loc['Topic 3']
    .sort_values(ascending=False)
    .head(15)
)

geld           0.165026
deutschland    0.128974
politiker      0.078798
euro           0.078055
probleme       0.063931
ausländer      0.060214
menschen       0.056498
armut          0.053524
schulden       0.052409
regierung      0.052409
arbeit         0.040888
bevölkerung    0.033826
problem        0.030481
steuern        0.025277
bürger         0.023047
Name: Topic 3, dtype: float64