In [None]:
import pandas as pd 
from IPython.display import display
import textwrap
import json 
from gensim import corpora
from gensim.models.ldamulticore import LdaMulticore
from gensim.models.coherencemodel import CoherenceModel
from tqdm import tqdm
from collections import Counter
import numpy as np
from gensim.models import LdaModel
import matplotlib.pyplot as plt

## Explanation

- Input: a DataFrame with per-speech LDA topic probabilities (`topic_<id>` columns) and the trained LDA model
- Enables manual inspection of topic labels
- Creates plots of the yearly occurrence of specified topics
- Option to save a topic-specific dataset

### Import dataset and model

In [None]:
# number of topics; selects the model directory below and is passed to print_topics
n_topics = 50
# CSV read into `df`; the functions below read `topic_<id>` and `year` columns from it
dataset_path = "data/parllaw/final.csv"
# NOTE(review): the "10" path segment is not explained in this notebook — confirm what it encodes
model_path = f"data/lda/{n_topics}_topics/10/model.model"
# NOTE: the functions defined further down use `model` and `df` as default argument
# values, which Python binds when the `def` cell runs — re-run those cells after
# reloading either object here.
df = pd.read_csv(dataset_path)
model = LdaModel.load(model_path)

### Print top k=5 words of every topic to inspect LDA composition

In [None]:
def print_topics(model, n_topics, k_words=5):
    '''
    Print one line per topic of the LDA model: "Topic <id>: word1, word2, ...".

    Parameters:
    - model: object exposing show_topics(formatted, num_topics, num_words), e.g. a gensim LdaModel
    - n_topics: int, number of topics to request from the model
    - k_words: int, number of top words to list for every topic
    '''
    # Request exactly k_words words per topic. Without num_words the model returns
    # its default of 10 words, so any k_words > 10 used to be silently cut off.
    topics = model.show_topics(formatted=False, num_topics=n_topics, num_words=k_words)
    for idx, topic in topics:
        label = ", ".join(word for word, prob in topic)
        print(f"Topic {idx}: {label}")

In [None]:
# list the topics of the loaded model with their top words (k_words defaults to 5)
print_topics(model, n_topics)

### Explore topics by their ID

In [None]:
# shared plotting configuration
# line colour used for each party block
party_block_colors = {
    "left": "purple",
    "greens": "green",
    "social_democratic": "red",
    "christian_conservative": "black",
    "liberal": "orange",
    "right_populist": "blue",
}
# election years (every fifth year from 1999 to 2024), drawn as vertical lines and x-ticks
election_years = list(range(1999, 2025, 5))

In [None]:
def find_topic_id(keyword, model=model, n_topics=50, top_n=10):
    """
    Look up which topic(s) list `keyword` among their `top_n` highest-weighted words.

    Only the topics returned by `model.show_topics(num_topics=n_topics)` are searched,
    and the comparison is an exact string match.

    Returns:
    - None when no topic matches
    - the topic id when exactly one topic matches
    - the list of matching topic ids (after printing a warning) when several match
    """
    candidate_topics = model.show_topics(formatted=False, num_topics=n_topics, num_words=top_n)
    matches = [
        topic_idx
        for topic_idx, weighted_words in candidate_topics
        if any(word == keyword for word, _ in weighted_words)
    ]
    if len(matches) == 1:
        return matches[0]
    if len(matches) == 0:
        return None
    # ambiguous keyword: tell the user and hand back every candidate
    print(f"Warning: keyword '{keyword}' found in multiple topics: {matches}")
    return matches

In [None]:
def plot_abs_numbers_over_time(topic_id, model=model, df=df):
    '''
    Placeholder — not implemented yet.
    Judging by the name, this is meant to plot absolute speech counts over time for one topic (TODO confirm).
    Parameters:
    - topic_id: int, id of topic to plot
    - model: LdaModel, trained LDA model
    - df: pd.DataFrame, dataframe with speeches and topic probabilities
    Raises:
    - NotImplementedError: always, until the TODO below is resolved
    '''
    # A comment alone is not a valid function body (the cell failed to parse),
    # so fail loudly on call instead of breaking the whole notebook run.
    # TODO: implement
    raise NotImplementedError("plot_abs_numbers_over_time is not implemented yet")

In [None]:
def _print_speeches(speeches, topic_column, text_column="translatedText", width=80):
    '''
    Print every row of `speeches` as a numbered, line-wrapped text preceded by its topic probability.
    '''
    for rank, (_, speech) in enumerate(speeches.iterrows(), start=1):
        print(f"Speech {rank} with topic probability {speech[topic_column]:.4f}:")
        # str() keeps a missing (NaN) text from crashing textwrap
        print(textwrap.fill(str(speech[text_column]), width=width))
        print("\n" + "-"*80 + "\n")


def explore_topic_id(topic_id, model=model, df=df, top_n_terms=10, speech_number=10, prob_threshold=0.3):
    '''
    Explore topic with id topic_id in LDA model and dataframe df.
    Prints top terms, number of speeches, top speeches, just passing speeches, and plots number and ratio of speeches per year and party block.
    If fewer than speech_number speeches reach the threshold, all of them are printed.
    Parameters:
    - topic_id: int, id of topic to explore
    - model: LdaModel, trained LDA model
    - df: pd.DataFrame, dataframe with speeches and topic probabilities; needs the columns
      topic_<topic_id>, translatedText, year and a party block column ('block', or 'party_block' as fallback)
    - top_n_terms: int, number of top terms to print
    - speech_number: int, maximum number of top and just passing speeches to print
    - prob_threshold: float, probability threshold to select speeches
    Returns:
    - df_selected_topic: pd.DataFrame, dataframe with speeches selected for the topic
    '''
    topic_column = f'topic_{topic_id}'
    # NOTE(review): this function originally read 'block' while combined_ratio_plot reads
    # 'party_block' from the same dataframe; accept either until the schema is confirmed.
    block_column = 'block' if 'block' in df.columns else 'party_block'

    # speeches whose probability for this topic reaches the threshold
    df_selected_topic = df[df[topic_column] >= prob_threshold]

    # top terms of the topic with their probabilities
    topic_terms = model.show_topic(topicid=topic_id, topn=top_n_terms)
    print(f"Topic {topic_id} terms:")
    for word, prob in topic_terms:
        print(f"  {word}: {prob:.4f}")
    # first three words serve as a short label in messages and plot titles
    first_3_words = [word for word, prob in topic_terms[:3]]

    # number of speeches in topic
    n_speeches = df_selected_topic.shape[0]
    print(f"Number of speeches in topic {topic_id} ({first_3_words}) with prob >= {prob_threshold}: {n_speeches}")

    # head() caps at the number of available rows, so a sparsely populated topic no
    # longer raises IndexError; max() keeps a negative speech_number printing nothing
    n_to_print = max(speech_number, 0)

    # highest-probability speeches
    print(f"Top speeches for topic {topic_id} ({first_3_words}):\n")
    _print_speeches(
        df_selected_topic.sort_values(by=topic_column, ascending=False).head(n_to_print),
        topic_column,
    )

    # speeches just above the threshold
    print(f"Just passing speeches for topic {topic_id} ({first_3_words}):\n")
    _print_speeches(
        df_selected_topic.sort_values(by=topic_column).head(n_to_print),
        topic_column,
    )

    # plots: number and ratio of speeches per year and party block
    party_blocks = df_selected_topic[block_column].unique()
    years = sorted(df_selected_topic['year'].unique())
    # count every (block, year) group once instead of re-filtering the frames per cell
    selected_counts = df_selected_topic.groupby([block_column, 'year']).size()
    total_counts = df.groupby([block_column, 'year']).size()

    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(10, 8), sharex=True)
    for party_block in party_blocks:
        counts_per_year = []
        normalized_counts_per_year = []
        for year in years:
            # groups without any speech are absent from the Series -> count 0
            count = int(selected_counts.get((party_block, year), 0))
            total_count = int(total_counts.get((party_block, year), 0))
            counts_per_year.append(count)
            normalized_counts_per_year.append(count / total_count if total_count > 0 else 0)
        ax1.plot(years, counts_per_year, label=party_block, marker='o', color=party_block_colors.get(party_block))
        ax2.plot(years, normalized_counts_per_year, label=party_block, marker='o', color=party_block_colors.get(party_block))
    # mark election years on both panels
    for year in election_years:
        ax1.axvline(x=year, color='gray', linestyle='-', alpha=0.5)
        ax2.axvline(x=year, color='gray', linestyle='-', alpha=0.5)
    ax1.set_xticks(election_years)
    ax2.set_xticks(election_years)
    # add horizontal grid lines
    ax1.yaxis.grid(True, linestyle='-', color='gray', alpha=0.5)
    ax2.yaxis.grid(True, linestyle='-', color='gray', alpha=0.5)
    ax1.set_title(f"Number of speeches per year for topic {topic_id} ({first_3_words})")
    ax1.set_ylabel("Number of speeches")
    ax1.legend()
    ax2.set_title(f"Ratio of speeches per year for topic {topic_id} ({first_3_words})")
    ax2.set_ylabel("Ratio of speeches")
    ax2.set_xlabel("Year")
    ax2.legend()
    plt.tight_layout()
    plt.show()

    return df_selected_topic

In [None]:
def combined_ratio_plot(topic_id_list, model=model, df = df, prob_threshold=0.3):
    '''
    plots ratio of speeches per party over the years for each topic in topic_id_list
    (one subplot per topic, two subplots per row)
    Parameters:
    - topic_id_list: list of int, list of topic ids to plot (must not be empty)
    - model: LdaModel, trained LDA model
    - df: pd.DataFrame, dataframe with speeches and topic probabilities; needs the columns
      topic_<id>, year and a party block column ('party_block', or 'block' as fallback)
    - prob_threshold: float, probability threshold to select speeches
    Raises:
    - ValueError: if topic_id_list is empty
    '''
    n_selected = len(topic_id_list)
    if n_selected == 0:
        # plt.subplots would otherwise fail on a zero-row grid with a less helpful message
        raise ValueError("topic_id_list must contain at least one topic id")
    ncols = 2
    nrows = (n_selected + ncols - 1) // ncols
    fig, axes = plt.subplots(nrows, ncols, figsize=(15, 5 * nrows), sharex=True)

    # flatten axes to iterate easily (handles case nrows==1 or >1)
    axes = np.array(axes).reshape(-1)

    # NOTE(review): this function originally read 'party_block' while explore_topic_id reads
    # 'block' from the same dataframe; accept either until the schema is confirmed.
    block_column = 'party_block' if 'party_block' in df.columns else 'block'
    # drop missing blocks: NaN cannot be ordered against string labels by sorted()
    party_blocks = sorted(df[block_column].dropna().unique())
    years = sorted(df['year'].unique())
    # denominators do not depend on the topic -> compute them once
    total_counts = df.groupby([block_column, 'year']).size()

    for position, (ax, topic_id) in enumerate(zip(axes, topic_id_list)):
        # speeches per (block, year) that reach the threshold for this topic
        selected_counts = (
            df[df[f'topic_{topic_id}'] >= prob_threshold]
            .groupby([block_column, 'year'])
            .size()
        )
        for party_block in party_blocks:
            normalized_counts_per_year = []
            for year in years:
                # groups without any speech are absent from the Series -> count 0
                count = int(selected_counts.get((party_block, year), 0))
                total_count = int(total_counts.get((party_block, year), 0))
                normalized_counts_per_year.append(count / total_count if total_count > 0 else 0)
            ax.plot(years, normalized_counts_per_year, label=party_block,
                    marker='o', color=party_block_colors.get(party_block, 'gray'))

        # Decorate each subplot exactly once. This used to run once per party block,
        # which stacked the semi-transparent election-year lines on top of each other.
        ax.set_xticks(election_years)
        ax.set_xticklabels([str(y) for y in election_years])
        # ensure ticks/labels are shown for every subplot (even when sharex=True)
        ax.tick_params(axis='x', which='both', bottom=True, top=False, labelbottom=True)
        ax.xaxis.set_visible(True)
        for year in election_years:
            ax.axvline(x=year, color='gray', linestyle='-', alpha=0.5)
        ax.yaxis.grid(True, linestyle='-', color='gray', alpha=0.5)
        first_words = [word for word, prob in model.show_topic(topicid=topic_id, topn=4)]
        ax.set_title(f"Topic {topic_id} {first_words}", fontsize=12)
        # y-label only on the left-most column
        if position % ncols == 0:
            ax.set_ylabel("Ratio of speeches")
        ax.spines['right'].set_visible(False)
        ax.spines['top'].set_visible(False)

    # turn off any unused subplots
    for ax in axes[n_selected:]:
        ax.axis('off')

    # place legend in the first subplot (upper left)
    axes[0].legend(loc='upper left', fontsize = 12)

    plt.suptitle(f"Ratio of speeches about {n_selected} selected topics over time", fontsize=16, fontweight='bold')
    plt.tight_layout()
    plt.show()