In [1]:
import numpy as np
import pandas as pd
import os

import gensim
import gensim.corpora as corpora
from gensim.models import CoherenceModel
import spacy
nlp = spacy.load("de_core_news_sm")

In [2]:
# install 'pip install pyLDAvis' first (Python library for interactive topic model visualization)
import pyLDAvis 
import pyLDAvis.gensim_models

In [3]:
# Loading speech data:
opendiscourse = pd.read_csv('speeches.csv')

In [4]:
extra_stop = ["herr", "dame", "präsident", "präsidentin", "bundespräsident", "bundespräsidentin", "frau", "damen", "herren", "kollege", "kollegin", "genau", "danke", "lieb", "tagesordnungspunkt", "bitte", "nächster", "nächste", "redner", "rednerin", "deutsch", "deutschland", "staatsangehörigkeit", "staatsbürgerschaft", "gesetzentwurf", "staatsangehörigkeitsrecht", "staatsangehörigkeitsgesetz"]
tokenized_corpus = [] # final tokenized corpus will be saved here

def tokenize(a,b):
    # go through the entries for our timeframes
    for i in range(a,b): 
    
        # iterate through the 'speechContent' column in row i 
        # replace new-line commands with white space
        # remove parentheses, brackets, hyphens
        speech = opendiscourse['speechContent'].iloc[i].replace('\n', ' ').replace('\xa0',' ').replace('({','').replace('})','').replace('-','').replace('--','')       
 
        # feed words to spaCy German language model
        tokenized_speech = nlp(speech) 
    
        # remove stop words, punctuation, numbers; lemmatize words
        speech_corpus = []
        for t in tokenized_speech:
            if not t.is_stop and not t.is_punct and not t.like_num and t.lemma_.lower() not in extra_stop:
                speech_corpus.append(t.lemma_.lower())
        while ' ' in speech_corpus:
            speech_corpus.remove(' ')
        while '  ' in speech_corpus:
            speech_corpus.remove('  ')
        while '   ' in speech_corpus:
            speech_corpus.remove('   ')
            
        tokenized_corpus.append(speech_corpus)

Topic modeling wird für jede Gesetzesänderung einzeln durchgefuhrt. Da dieselbe Liste (tokenized_corpus) jedes Mal benutzt wird, muss der folgende Code (tokenized_corpus.clear()) vor jeder neuen Gesetzesänderung ausgeführt werden, um die Ergebnisse der letzten Gesetzesänderung zu löschen. Wir lassen uns die Liste anzeigen, um sicherzustellen, dass sie leer ist.

In [None]:
# clears the list of all entries (used before tokenizing a new session)
tokenized_corpus.clear()
print(tokenized_corpus)

In [5]:
# 2nd law change/2. Gesetzesänderung
# section relevant for discussion of citizenship law changes
# 2014-06-05 / 18039
tokenize(802050,802084)
# 2014-07-03 / 18046
tokenize(803506,803532)
print(tokenized_corpus)

In [None]:
# 3rd law change/3. Gesetzesänderung
# section relevant for discussion of citizenship law changes
# 2019-05-16 / 19101
tokenize(871973,871995)
# 2019-06-27 / 19107
tokenize(873697,873735)
print(tokenized_corpus)

In [None]:
# 4th law change/4. Gesetzesänderung
# section relevant for discussion of citizenship law changes
# 2021-04-22 / 19224
tokenize(906355,906360)
print(tokenized_corpus)

[['schließen', 'aussprache', 'präsidium', 'kolbe', 'schätzung', 'teilen', 'thema', 'sicher', 'län', 'g', 'beratungszeit', 'verdienen', 'ausdruck', 'nachweislich', 'deutlich', 'län', 'ger', 'debattieren', 'beginn', 'debatte', 'gemeinsam', 'beschließen', 'interfraktionell', 'überweisung', 'gesetzent', 'wurfs', 'drucksache', 'tag', 'ordnung', 'aufgeführt', 'ausschuß', 'vorschlagen', 'alternativ', 'vorschlag', 'fall', 'überweisung', 'beschließen', '--', 'beratung', 'bundesregierung', 'eingebracht', 'entwurf', 'gesetz', 'änderung', 'drucksach', 'überweisungsvorschlag', 'innenausschuss', 'auswärtiger', 'ausschuss', 'ausschuss', 'verbraucherschutz', '--', 'beratung', 'abgeordneter', 'jan', 'korte', 'sevim', 'dağdele', 'dr.', 'andré', 'hahn', 'abgeordneter', 'fraktion', 'linke', 'eingebracht', 'entwurf', 'gesetz', 'aufhebung', 'optionsregelung', 'drucksache', 'überweisungsvorschlag', 'innenausschuss', 'ausschuss', 'verbraucherschutz', 'ausschuss', 'familie', 'senior', 'jugend', 'ausschuss', 'm

In [None]:
# building dictionary/mapping words to IDs
words_id = corpora.Dictionary(tokenized_corpus)

# creating corpus/corpus becomes a bag of words
corpus = [words_id.doc2bow(txt) for txt in tokenized_corpus]

In [None]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=words_id,
                                            # This number of topics may or may not be appropriate
                                           num_topics=3, 
                                           random_state=50, # sets seed for reproducibility
                                           passes=20, # number of times the model is trained on the corpus
                                           per_word_topics=True) 

# per_word_topics=True: extracts the most likely topics given a word, 
# otherwise words that are not indicative for a topic are going to be omitted.

In [None]:
# First 20 words in the topics
# + probability that those words belong to the topics
lda_model.show_topics(num_words=20,num_topics=3)

In [None]:
# check for optimal number of topics k using the c_v coherence score
# in order to use the visualization in the next cell, k_final in this cell 
# needs to be set to the number of topics desired for the visualization
k_init = 1
k_final = 3
for k in range(k_init,k_final+1):
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=words_id,
                                           num_topics=k, 
                                           random_state=50,
                                           passes=20,
                                           per_word_topics=True)
    # perplexity (lower score is better) 
    per_lda = lda_model.log_perplexity(corpus)
    
    # coherence score (higher score is better)
    coherence_model_lda = CoherenceModel(model=lda_model, texts=tokenized_corpus, dictionary=words_id, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    #print(coherence_lda)
    
    print('Anzahl der Topics:',k,'Perplexität:',per_lda,'Kohärenz:',coherence_lda)

In [None]:
# visualizing the topics
id2word=words_id
pyLDAvis.enable_notebook(local=True)
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)

pyLDAvis.display(vis)