In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from gensim.models import LdaModel
import plotly.express as px
import plotly.io as pio
import textwrap
import preamble
import src.constants as const
from src.print_topics import print_topics
from tueplots import bundles
from tueplots.constants.color import rgb
import src.constants as const  

In [2]:
# import lda model and data
lda_model = LdaModel.load(const.PATH_MODEL)
df = pd.read_parquet(const.PATH_ALL_SPEECHES)

In [3]:
print_topics(lda_model)

TypeError: print_topics() missing 1 required positional argument: 'n_topics'

In [5]:
# interpreted topics based on top words and top speeches and identified suitable labels for plotting
topic_labels = [
    "EU Security & Defense", # 0
    "Debate Etiquette & Brexit", # 1
    "EU Finances", # 2
    "Workers and Industry", # 3
    "Fishing", # 4
    "Budgetary Control", # 5
    "Economic Development", # 6
    "Human Rights", # 7
    "Rule of Law", # 8
    "Taxation", # 9
    "Gender Equality", # 10
    "Terrorism & Political Violence", # 11
    "Food Safety", # 12
    "Economic Crisis", # 13
    "Climate & Energy", # 14
    "Trade Relations", # 15
    "International Conflicts", # 16
    "Education and Culture", # 17
    "Intra-European disputes", # 18
    "Migration and Asylum", # 19
    "Legislative Process", # 20
    "Russia–Ukraine", # 21
    "Social Policy & Labor", # 22
    "Data Protection", # 23
    "Agriculture", # 24
    "Election Law", # 25
    "Market Regulation", # 26
    "Natural Disasters & Epidemics", # 27
    "Sanctions & Condemnations", # 28
    "Children’s Rights" # 29
]

In [17]:
topic_list = []
for idx, topic in lda_model.show_topics(formatted=False, num_topics=const.N_TOPICS):
    label = ", ".join([word for word, prob in topic[:3]])
    topic_list.append(label)

In [7]:
# select topic with highest probability for each speech
df['dominant_topic_id'] = df.loc[:, 'topic_0':'topic_29'].idxmax(axis=1).apply(lambda x: int(x.split('_')[1])).astype(int)
# give dominant topic its label
df['dominant_topic'] = df['dominant_topic_id'].apply(lambda x: topic_labels[x])

In [8]:
# histogram of dominant topics
plt.figure(figsize=(10,6))
plt.hist(df['dominant_topic'].sort_values(), bins=np.arange(-0.5, 30.5, 1), edgecolor='black')
plt.xticks(rotation=90)
plt.xlabel('Dominant Topic')
plt.ylabel('Number of Speeches')
plt.title('Distribution of Dominant Topics in Speeches')
plt.show()

RuntimeError: latex was not able to process the following string:
b'Climate & Energy'

Here is the full command invocation and its output:

latex -interaction=nonstopmode --halt-on-error file.tex

This is pdfTeX, Version 3.141592653-2.6-1.40.24 (TeX Live 2022) (preloaded format=latex)
 restricted \write18 enabled.
entering extended mode
(./file.tex
LaTeX2e <2021-11-15> patch level 1
L3 programming layer <2022-02-24>
(/usr/local/texlive/2022/texmf-dist/tex/latex/base/article.cls
Document Class: article 2021/10/04 v1.4n Standard LaTeX document class
(/usr/local/texlive/2022/texmf-dist/tex/latex/base/size10.clo))
(/usr/local/texlive/2022/texmf-dist/tex/latex/type1cm/type1cm.sty)
(/usr/local/texlive/2022/texmf-dist/tex/latex/cm-super/type1ec.sty
(/usr/local/texlive/2022/texmf-dist/tex/latex/base/t1cmr.fd))
(/usr/local/texlive/2022/texmf-dist/tex/latex/base/inputenc.sty)
(/usr/local/texlive/2022/texmf-dist/tex/latex/geometry/geometry.sty
(/usr/local/texlive/2022/texmf-dist/tex/latex/graphics/keyval.sty)
(/usr/local/texlive/2022/texmf-dist/tex/generic/iftex/ifvtex.sty
(/usr/local/texlive/2022/texmf-dist/tex/generic/iftex/iftex.sty)))
(/usr/local/texlive/2022/texmf-dist/tex/latex/psnfss/times.sty)
(/usr/local/texlive/2022/texmf-dist/tex/latex/underscore/underscore.sty)
(/usr/local/texlive/2022/texmf-dist/tex/latex/base/textcomp.sty)
(/usr/local/texlive/2022/texmf-dist/tex/latex/psnfss/ot1ptm.fd)
(/usr/local/texlive/2022/texmf-dist/tex/latex/l3backend/l3backend-dvips.def)
No file file.aux.
*geometry* driver: auto-detecting
*geometry* detected driver: dvips
! Misplaced alignment tab character &.
l.29 {\rmfamily Climate &
                          Energy}%
No pages of output.
Transcript written on file.log.




<Figure size 1000x600 with 1 Axes>

In [None]:
df_dominant = df.groupby(['dominant_topic', 'year']).agg({
    'text': 'count'}).reset_index().rename(columns={'text': 'count'})

# Calculate total speeches per year
year_totals = df_dominant.groupby('year')['count'].sum().reset_index()
year_totals.columns = ['year', 'total_year']

# Merge and calculate ratio
df_dominant = df_dominant.merge(year_totals, on='year')
df_dominant['ratio'] = df_dominant['count'] / df_dominant['total_year']

# add most probable speech for each topic and year to the dataframe
df_dominant['most_probable_speech'] = None
for i, row in df_dominant.iterrows():
    topic = row['dominant_topic']
    year = row['year']
    prob_col = f"topic_{topic_labels.index(topic)}"
    filtered_df = df[(df['year'] == year)]
    top_speech = filtered_df.sort_values(by=prob_col, ascending=False).iloc[0]
    speech_text = top_speech['translatedText'][:500] + "..." if len(top_speech['translatedText']) > 500 else top_speech['translatedText']
    speech_text = "<br>".join(textwrap.wrap(speech_text, width=80))
    df_dominant.at[i, 'most_probable_speech'] = speech_text

# add column with top 5 words for each topic to later inspect on hover
for i in range(const.N_TOPICS):
    topic_terms = lda_model.show_topic(i, topn=5)
    top_words = ", ".join([word for word, prob in topic_terms])
    df_dominant.loc[df_dominant['dominant_topic'] == topic_labels[i], 'top_words'] = top_words

# sort in reverse alphabetical order to line up legend with plot order
df_dominant = df_dominant.sort_values(by='dominant_topic', ascending=False)

NameError: name 'df' is not defined

In [10]:
# delete original dataframe to save memory
del df

In [11]:
# sanity check: make sure that for each year, the sum of ratios equals 1
check = df_dominant.groupby('year')['ratio'].sum().reset_index()
assert all(np.isclose(check['ratio'], 1.0)), "Ratios do not sum to 1 for all years!"

# sanity check: make sure that for each year, each topic has only one entry
check2 = df_dominant.groupby(['year', 'dominant_topic']).size().reset_index(name='counts')
assert all(check2['counts'] == 1), "There are duplicate entries for some year-topic combinations!"

### Construct Stacked Area Chart

In [12]:
pio.renderers.default = "browser"
fig = px.area(df_dominant, 
              x='year', 
              y='ratio', 
              color='dominant_topic',
              hover_data={'dominant_topic': True, 
                         'top_words': True, 
                        'ratio': ':.1%',
                         'count': True, 
                         'most_probable_speech': True},
              labels={'dominant_topic': 'Topic', 
                    'top_words': 'Top 5 Keywords', 
                     'year': 'Year', 
                     'ratio': 'Proportion', 
                     'count': 'Number of Speeches', 
                     'most_probable_speech': 'Most Representative Speech'},
              title='Topics Discussed in the European Parliament (2004-2024)',
              subtitle='As identified by LDA Topic Modeling, expressed as proportion of total speeches per year',
              range_y=[0,1]
              )

# Format y-axis as percentage
fig.update_yaxes(tickformat='.0%', title='Proportion of Speeches')

# Customize legend
fig.update_layout(
    legend=dict(
        title=dict(text='Topics', font=dict(size=15, family='Arial')),
        orientation="v",
        yanchor="top",
        y=0.99,
        xanchor="left",
        x=1.02,
        traceorder="reversed",
        font=dict(size=10)
    ),
    title=dict(
        font=dict(size=16, family='Arial'),
        x=0.5,
        xanchor='center'
    ),
    hovermode='closest',
    
)

fig.show()

### Construct report figure

In [15]:
df_migration = pd.read_parquet("data/speech_embeddings.parquet")

In [16]:
pio.renderers.default = "notebook"
fig = px.area(df_migration, 
              x='year', 
              y='ratio',
              color = 'block',
              color_discrete_map= const.COLOR_MAP_BLOCK)
fig.show()

ValueError: Value of 'y' is not the name of a column in 'data_frame'. Expected one of ['Unnamed: 0', 'speaker', 'text', 'date', 'agenda', 'speechnumber', 'procedure_ID', 'partyfacts_ID', 'period', 'chair', 'MEP', 'commission', 'written', 'multispeaker', 'link', 'translatedText', 'translationSource', 'year', 'block', 'party', 'migration_prob', 'jinaai/jina-embeddings-v3', 'jinaai/jina-embeddings-v4', 'Snowflake/snowflake-arctic-embed-l-v2.0', 'BAAI/bge-m3', 'sentence-transformers/all-MiniLM-L6-v2', 'sentence-transformers/all-mpnet-base-v2', 'Qwen/Qwen3-Embedding-0.6B', 'google/embeddinggemma-300m'] but received: ratio

In [None]:
# figure for the report: selected topics on the left, migration grouped by parties on the right
plt.rcParams.update(bundles.icml2024(column="half", nrows=1, ncols=2))
fig, (ax1, ax2) = plt.subplots(1, 2)
