In [None]:
from collections import Counter

import pandas as pd
import stanza
from nltk.stem import SnowballStemmer
from tqdm import tqdm

In [None]:
# German Snowball stemmer, applied to the nouns extracted from each speech
stemmer = SnowballStemmer(language="german")

In [None]:
# Stanza pipeline for German text: tokenization and POS tagging only,
# with GPU use requested
nlp = stanza.Pipeline(
    lang='de',
    processors='tokenize,pos',
    use_gpu=True,
)

In [None]:
# Load the topic word list (semicolon-separated CSV)
topic_list = pd.read_csv('topic_list.csv', sep=';')

# Strip every '.' from the Score strings, then convert them to float.
# regex=False forces '.' to be a literal dot: the default of `regex` differs
# between pandas versions, and under regex semantics '.' matches any character.
# NOTE(review): presumably '.' is a thousands separator in this file - confirm.
topic_list['Score'] = (
    topic_list['Score']
    .str.replace('.', '', regex=False)
    .astype(float)
)

In [None]:
# Read the pickled contributions and keep only speeches dated after 2010-01-01.
# NOTE(review): the previous comment said "after 2018" while the filter uses
# 2010-01-01 - confirm which cutoff is intended.
# NOTE: unpickling can execute arbitrary code; only load trusted files.
contributions = pd.read_pickle('speech_content.pkl')
# 'date' is interpreted as seconds since the Unix epoch
contributions['date'] = pd.to_datetime(contributions['date'], unit='s')
# .copy() makes the filtered frame independent of the unfiltered one, so the
# column assignments in later cells do not raise SettingWithCopyWarning
contributions = contributions[contributions['date'] > '2010-01-01'].copy()

In [None]:
# Convert speeches to lowercase and remove punctuation (every character that
# is neither a word character nor whitespace).
# The pattern is a raw string so that \w and \s reach the regex engine
# unchanged instead of being flagged as invalid string escape sequences.
# NOTE(review): lowercasing and stripping punctuation before POS tagging may
# reduce tagger accuracy for German (nouns are capitalised) - consider
# normalising after tagging instead.
contributions['speech_content'] = (
    contributions['speech_content']
    .str.lower()
    .str.replace(r'[^\w\s]', '', regex=True)
)

In [None]:
# define processing function
def process_speech(row, min_nouns=10):
    """Assign the best-matching topic to a single speech.

    The text is run through the module-level Stanza pipeline ``nlp``
    (tokenization and POS tagging). Every word tagged ``NOUN`` is stemmed
    with the module-level ``stemmer``. The stems are counted against
    ``topic_list['Word']``, each count is multiplied by the matching
    ``topic_list['Score']``, and the products are summed per
    ``topic_list['Topic']``.

    ``topic_list`` is only read: the per-speech counts live in local objects,
    so calls do not leave per-speech columns behind on the shared frame.

    NOTE(review): the comparison is an exact string match between stems and
    ``topic_list['Word']`` - presumably the word list already contains
    stemmed, lowercase forms; confirm.

    Args:
        row: Text of one speech (one value of
            ``contributions['speech_content']``).
        min_nouns: Minimum number of nouns a speech must contain to be
            assigned a topic. Defaults to 10.

    Returns:
        The topic label with the highest summed weighted count, or
        ``'No Topic'`` when the input is missing/blank, when fewer than
        ``min_nouns`` nouns are found, or when none of the topic words
        occurs in the speech.
    """
    # Missing (None/NaN) or blank speeches cannot be tagged.
    if not isinstance(row, str) or not row.strip():
        return 'No Topic'

    # Analyze the text (Stanza tokenization + POS tagging)
    doc = nlp(row)

    # Keep only the nouns and reduce them to their stems
    noun_stems = [
        stemmer.stem(word.text)
        for sentence in doc.sentences
        for word in sentence.words
        if word.upos == 'NOUN'
    ]

    # Too little material to classify: skip the ranking altogether
    if len(noun_stems) < min_nouns:
        return 'No Topic'

    # Count each stem once, then look the topic words up in the counter
    # (a Counter yields 0 for words that do not occur).
    stem_counts = Counter(noun_stems)
    occurrences = pd.Series(
        [stem_counts[word] for word in topic_list['Word']],
        index=topic_list.index,
    )

    # Without a single hit every topic would score 0 and the "winner" would
    # be decided by sort tie-breaking alone, so report no topic instead.
    if not occurrences.any():
        return 'No Topic'

    # Weight the counts by the word scores and rank the topics by their sum
    weighted = topic_list['Score'] * occurrences
    ranked_topics = (
        weighted.groupby(topic_list['Topic']).sum().sort_values(ascending=False)
    )
    if ranked_topics.empty:
        return 'No Topic'

    return ranked_topics.index[0]

In [None]:
# Run topic modelling: label every speech in order, showing a progress bar
topics = list(map(process_speech, tqdm(contributions['speech_content'])))

In [None]:
# Add the topics to the DataFrame
# `topics` holds one label per entry of contributions['speech_content'], in
# iteration order, so the list lines up with the rows positionally.
contributions['topic'] = topics

In [None]:
# Save dataframe to pickle
# Writes the contributions frame, including the 'topic' column added above,
# to 'speech_content_topics.pkl' (path relative to the working directory).
contributions.to_pickle('speech_content_topics.pkl')