In [2]:
import gensim
import gensim.corpora as corpora
from gensim.models import CoherenceModel
import spacy
# German spaCy pipeline; tokenize() below calls it on every speech and reads each
# token's lemma_, is_stop, is_punct and like_num attributes.
nlp = spacy.load("de_core_news_sm")

import numpy as np
import pandas as pd
import os

In [3]:
# Loading speech data.
# tokenize() below reads the 'speechContent' column of this frame by integer row position.
opendiscourse = pd.read_csv('speeches.csv')

In [1]:
# Additional stop words on top of spaCy's own list. tokenize() compares them against
# the lowercased lemma of each token, so entries must be lowercase. Adapt if necessary.
extra_stop = [
    # forms of address
    "herr", "dame", "präsident", "präsidentin",
    "bundespräsident", "bundespräsidentin",
    "frau", "damen", "herren", "kollege", "kollegin",
    # filler words and procedural vocabulary
    "genau", "danke", "tagesordnungspunkt", "bitte",
    "nächster", "nächste", "redner", "rednerin",
]

# Final tokenized corpus: one list of lemmas per speech, (re)filled in place by tokenize().
tokenized_corpus = list()

def tokenize(a, b):
    """Rebuild the global ``tokenized_corpus`` from a slice of ``opendiscourse``.

    The rows at integer positions ``a`` (inclusive) to ``b`` (exclusive) are read
    from the ``'speechContent'`` column, cleaned, run through the spaCy pipeline
    ``nlp`` and reduced to lowercased lemmas. One list of lemmas is appended to
    ``tokenized_corpus`` per speech, in row order.

    Parameters
    ----------
    a : int
        Position of the first row to tokenize (passed to ``.iloc``).
    b : int
        Position one past the last row to tokenize.

    Returns
    -------
    None
        The result is stored in the module-level list ``tokenized_corpus``,
        which is emptied in place at the start of every call. Only the most
        recent call's speeches are therefore kept.

    Notes
    -----
    * Each cell is expected to be a string, because ``str.replace`` is called on it.
    * Hyphens are deleted rather than replaced by a space, so hyphenated words
      are joined into one token.
    * A token is dropped if spaCy flags it as stop word, punctuation or
      number-like, if its lowercased lemma is listed in ``extra_stop``, or if
      its lemma is empty or consists only of whitespace.
    """
    # clear all list elements from previous uses (in place, so existing references stay valid)
    tokenized_corpus.clear()

    # Set lookup instead of scanning the list once per token.
    stop_lemmas = set(extra_stop)

    # go through the entries for our time period
    for i in range(a, b):
        raw_speech = opendiscourse['speechContent'].iloc[i]

        # Replace line breaks and non-breaking spaces with plain spaces and strip
        # the '({' / '})' markers and all hyphens. A separate pass for '--' is not
        # needed: removing every '-' already removes every '--'.
        speech = (
            raw_speech
            .replace('\n', ' ')
            .replace('\xa0', ' ')
            .replace('({', '')
            .replace('})', '')
            .replace('-', '')
        )

        speech_corpus = []
        for t in nlp(speech):  # feed to our German language model
            if t.is_stop or t.is_punct or t.like_num:
                continue
            lemma = t.lemma_.lower()
            # Drop whitespace-only lemmas of any length (not just 1-3 spaces),
            # as well as the additional stop words.
            if not lemma.strip() or lemma in stop_lemmas:
                continue
            speech_corpus.append(lemma)

        tokenized_corpus.append(speech_corpus)

In [None]:
# 2014-06-05 / 18039
# NOTE: tokenize(a, b) reads row positions a (inclusive) to b (exclusive) and clears
# tokenized_corpus first, so only the most recently executed session cell
# determines the corpus used by the cells further down.
# entire session
#tokenize(800829,802312)

# section relevant for discussion of citizenship law changes
tokenize(802050,802084)

In [None]:
# 2014-07-03 / 18046
# NOTE: tokenize() clears tokenized_corpus first, so running this cell replaces
# any previously tokenized session.
# entire session
# NOTE(review): the start position 801077 lies inside the 2014-06-05 session range
# (800829-802312) listed in the previous cell - confirm this range before using it.
#tokenize(801077,803694)

# section relevant for discussion of citizenship law changes
tokenize(803506,803532)

In [4]:
# 2019-05-16 / 19101
# NOTE: tokenize() clears tokenized_corpus first, so running this cell replaces
# any previously tokenized session.
# entire session
#tokenize(869977,872117)

# section relevant for discussion of citizenship law changes
tokenize(871973,871995)

In [5]:
# 2019-06-27 / 19107
# NOTE: tokenize() clears tokenized_corpus first, so running this cell replaces
# any previously tokenized session.
# entire session
#tokenize(873269,873963)

# section relevant for discussion of citizenship law changes
tokenize(873697,873735)

In [None]:
# 2021-04-22 / 19224
# NOTE: tokenize() clears tokenized_corpus first, so running this cell replaces
# any previously tokenized session.
# entire session
#tokenize(905667,906363)

# section relevant for discussion of citizenship law changes
tokenize(906355,906360)

In [6]:
# Inspect the result of the last tokenize() call.
# This prints every speech's full lemma list, so the output can be very long.
print(tokenized_corpus)

[['schließen', 'aussprache', 'abstimmung', 'bundesregierung', 'eingebracht', 'gesetzentwurf', 'anpassung', 'betreuer', 'vormündervergütung', 'ausschuss', 'verbraucherschutz', 'empfehlen', 'beschlussempfehlung', 'drucksache', 'gesetzentwurf', 'bundesregierung', 'drucksach', 'annehmen', 'gesetzentwurf', 'zustimmen', 'handzeichen', 'stimmen', 'enthalten', 'gesetzentwurf', 'beratung', 'stimme', 'koalitionsfraktion', 'afdfraktion', 'fraktion', 'linke', 'fraktion', 'bündnis', 'grüne', 'enthaltung', 'fdpfraktion', 'annehmen', 'beratung', 'schlussabstimmung', 'gesetzentwurf', 'zustimmen', 'erheben', 'stimmen', 'enthalten', 'gesetzentwurf', 'stimme', 'de', 'r', 'koalitionsfraktion', 'afdfraktion', 'fraktion', 'linke', 'fraktion', 'bündnis', 'grüne', 'enthaltung', 'fdpfraktion', 'annehmen'], ['verehrt', 'bundesregierung', 'bringen', 'entwurf', 'gesetz', 'änderung', 'staatsangehörigkeitsgesetz', 'klingen', 'rechtstechnik', 'verbergen', 'dringender', 'sicherheitspolitisch', 'anliegen', 'weit', 'ba

In [None]:
# Dictionary: assigns an integer id to every distinct lemma in the tokenized speeches.
words_id = corpora.Dictionary(documents=tokenized_corpus)

# Bag-of-words corpus: each speech is converted by doc2bow into its
# (token id, count) representation.
corpus = list(map(words_id.doc2bow, tokenized_corpus))

In [None]:
# Fit the LDA topic model on the bag-of-words corpus.
# per_word_topics=True makes the model additionally compute, for each word, a list of
# topics sorted from most to least likely (see the gensim LdaModel documentation).
lda_model = gensim.models.LdaModel(
    corpus=corpus,
    id2word=words_id,
    num_topics=7,          # this number of topics may or may not be appropriate
    passes=20,             # number of times the model is trained on the corpus
    random_state=50,       # sets seed for reproducibility
    per_word_topics=True,
)

In [None]:
# Word distribution in topics: show the 20 highest-weighted words for each of the 7 topics,
# together with each word's probability within that topic
lda_model.show_topics(num_words=20,num_topics=7)

In [None]:
# Topics the model considers most relevant for a single term.
# The term is written in lowercase because tokenize() stores lowercased lemmas.
print(lda_model.get_term_topics('staatsangehörigkeit'))

In [None]:
# checking for "optimal" number of topics k using the c_v coherence score

# be aware that this takes a long time!
k_init = 5
k_final = 7
for k in range(k_init, k_final + 1):
    # Each candidate gets its own name so the sweep does not overwrite `lda_model`,
    # which the inspection cells above rely on.
    candidate_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                      id2word=words_id,
                                                      num_topics=k,
                                                      random_state=50,
                                                      passes=20,
                                                      per_word_topics=True)

    # log_perplexity() returns the per-word likelihood bound, NOT the perplexity:
    # for the bound, higher (closer to zero) is better.
    per_lda = candidate_model.log_perplexity(corpus)
    # gensim defines perplexity as 2^(-bound); for this value, lower is better.
    perplexity_lda = 2 ** (-per_lda)

    # c_v coherence of the topics' top words, estimated from the tokenized texts
    # (higher score is better)
    coherence_model_lda = CoherenceModel(model=candidate_model, texts=tokenized_corpus, dictionary=words_id, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()

    # Labelled output so the direction of each metric cannot be confused.
    print(f"k={k}  bound={per_lda:.4f}  perplexity={perplexity_lda:.2f}  c_v={coherence_lda:.4f}")