In [1]:
import pandas as pd
import numpy as np
import os

import gensim
import gensim.corpora as corpora
from gensim.models import CoherenceModel
import spacy
nlp = spacy.load("de_core_news_sm")

In [2]:
# run 'pip install pyLDAvis' first (Python library for interactive topic model visualization)
import pyLDAvis 
import pyLDAvis.gensim_models

In [3]:
# load speech data:
opendiscourse = pd.read_csv('speeches.csv')

In [4]:
extra_stop = ["herr", "dame", "präsident", "präsidentin", "bundespräsident", "bundespräsidentin", "frau", "damen", "herren", "kollege", "kollegin", "genau", "danke", "lieb", "tagesordnungspunkt", "bitte", "nächster", "nächste", "redner", "rednerin", "deutsch", "deutschland", "staatsangehörigkeit", "staatsbürgerschaft", "gesetzentwurf", "staatsangehörigkeitsrecht", "staatsangehörigkeitsgesetz"]
tokenized_corpus = [] # final tokenized corpus will be saved here

def tokenize(a,b):
    # go through the entries for our timeframes
    for i in range(a,b): 
    
        # iterate through the 'speechContent' column in row i 
        # replace new-line commands with white space
        # remove parentheses, brackets, hyphens
        speech = opendiscourse['speechContent'].iloc[i].replace('\n', ' ').replace('\xa0',' ').replace('({','').replace('})','').replace('-','').replace('--','')       
 
        # feed words to spaCy German language model
        tokenized_speech = nlp(speech) 
    
        # remove stop words, punctuation, numbers; lemmatize words
        speech_corpus = []
        for t in tokenized_speech:
            if not t.is_stop and not t.is_punct and not t.like_num and t.lemma_.lower() not in extra_stop:
                speech_corpus.append(t.lemma_.lower())
        while ' ' in speech_corpus:
            speech_corpus.remove(' ')
        while '  ' in speech_corpus:
            speech_corpus.remove('  ')
        while '   ' in speech_corpus:
            speech_corpus.remove('   ')
            
        tokenized_corpus.append(speech_corpus)

The topic modeling process will be performed for each year separately. Since the same list (tokenized_corpus) is used each time, the following code block (tokenized_corpus.clear()) must be executed before the beginning of each new topic modeling process in order to clear the results from the previous process. We'll print the list after it's been cleared in order to be sure that it's empty.

In [None]:
# clears the list of all entries (used before tokenizing a new session)
tokenized_corpus.clear()
print(tokenized_corpus)

In [5]:
# 2nd law change/2. Gesetzesänderung
# section relevant for discussion of citizenship law changes
# 2014-06-05 / 18039
tokenize(802050,802084)
# 2014-07-03 / 18046
tokenize(803506,803532)
print(tokenized_corpus)

[['schließen', 'aussprache', 'präsidium', 'kolbe', 'schätzung', 'teilen', 'thema', 'sicher', 'län', 'g', 'beratungszeit', 'verdienen', 'ausdruck', 'nachweislich', 'deutlich', 'län', 'ger', 'debattieren', 'beginn', 'debatte', 'gemeinsam', 'beschließen', 'interfraktionell', 'überweisung', 'gesetzent', 'wurfs', 'drucksache', 'tag', 'ordnung', 'aufgeführt', 'ausschuß', 'vorschlagen', 'alternativ', 'vorschlag', 'fall', 'überweisung', 'beschließen', '--', 'beratung', 'bundesregierung', 'eingebracht', 'entwurf', 'gesetz', 'änderung', 'drucksach', 'überweisungsvorschlag', 'innenausschuss', 'auswärtiger', 'ausschuss', 'ausschuss', 'verbraucherschutz', '--', 'beratung', 'abgeordneter', 'jan', 'korte', 'sevim', 'dağdele', 'dr.', 'andré', 'hahn', 'abgeordneter', 'fraktion', 'linke', 'eingebracht', 'entwurf', 'gesetz', 'aufhebung', 'optionsregelung', 'drucksache', 'überweisungsvorschlag', 'innenausschuss', 'ausschuss', 'verbraucherschutz', 'ausschuss', 'familie', 'senior', 'jugend', 'ausschuss', 'm

In [None]:
# 3rd law change/3. Gesetzesänderung
# section relevant for discussion of citizenship law changes
# 2019-05-16 / 19101
tokenize(871973,871995)
# 2019-06-27 / 19107
tokenize(873697,873735)
print(tokenized_corpus)

In [None]:
# 4th law change/4. Gesetzesänderung
# section relevant for discussion of citizenship law changes
# 2021-04-22 / 19224
tokenize(906355,906360)
print(tokenized_corpus)

In [6]:
# building dictionary/mapping words to IDs
words_id = corpora.Dictionary(tokenized_corpus)

# creating corpus/corpus becomes a bag of words
corpus = [words_id.doc2bow(txt) for txt in tokenized_corpus]

In [7]:
# running the LDA model for our chosen number of topics (num_topics)
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=words_id,
                                            # This number of topics may or may not be appropriate
                                           num_topics=3, 
                                           random_state=50, # seed for reproducibility
                                           passes=20, # the number of times the model is trained on our corpus
                                           per_word_topics=True)

In [8]:
# displays the first 20 words in each topic
# & the probability that those words belong to the topic
lda_model.show_topics(num_words=20,num_topics=3)

[(0,
  '0.009*"mensch" + 0.007*"doppelt" + 0.005*"optionspflicht" + 0.005*"fall" + 0.005*"land" + 0.005*"besonderer" + 0.005*"sagen" + 0.004*"finden" + 0.004*"jung" + 0.004*"entscheiden" + 0.004*"unser" + 0.004*"eltern" + 0.003*"seite" + 0.003*"mehrstaatigkeit" + 0.003*"aufwachsen" + 0.003*"gehören" + 0.003*"entscheidung" + 0.003*"spd" + 0.003*"union" + 0.003*"gemeinwesen"'),
 (1,
  '0.010*"mensch" + 0.009*"sagen" + 0.009*"deutsche" + 0.007*"land" + 0.006*"gesetz" + 0.005*"kind" + 0.005*"fraktion" + 0.005*"optionspflicht" + 0.005*"linke" + 0.004*"gebären" + 0.004*"jung" + 0.004*"beck" + 0.004*"spd" + 0.004*"wort" + 0.004*"eltern" + 0.004*"beratung" + 0.004*"bundesregierung" + 0.004*"bündnis" + 0.004*"schritt" + 0.004*"wissen"'),
 (2,
  '0.009*"gesetz" + 0.009*"optionspflicht" + 0.008*"mensch" + 0.006*"land" + 0.006*"doppelt" + 0.005*"leben" + 0.004*"frage" + 0.004*"klar" + 0.004*"jung" + 0.004*"beck" + 0.003*"lassen" + 0.003*"bleiben" + 0.003*"wert" + 0.003*"unser" + 0.003*"europäisch"

In [9]:
# Check for the optimal number of topics k using the c_v coherence score
# In order to use the visualization in the next cell, k_final in this cell 
# needs to be set to the number of topics desired for the visualization
k_init = 1
k_final = 3
for k in range(k_init,k_final+1):
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=words_id,
                                           num_topics=k, 
                                           random_state=50,
                                           passes=20,
                                           per_word_topics=True)
    # perplexity (lower score is better) 
    per_lda = lda_model.log_perplexity(corpus)
    
    # coherence score (higher score is better)
    coherence_model_lda = CoherenceModel(model=lda_model, texts=tokenized_corpus, dictionary=words_id, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    #print(coherence_lda)
    
    print('Anzahl der Topics:',k,'Perplexität:',per_lda,'Kohärenz:',coherence_lda)

Anzahl der Topics: 1 Perplexität: -7.5509288773027725 Kohärenz: 0.3187631648648244
Anzahl der Topics: 2 Perplexität: -7.56097157345689 Kohärenz: 0.3834119191100726
Anzahl der Topics: 3 Perplexität: -7.586579967889806 Kohärenz: 0.32836635586562907


In [10]:
# visualizing the topics
id2word=words_id
pyLDAvis.enable_notebook(local=True)
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)

pyLDAvis.display(vis)