In [40]:
import torch
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer

import pandas as pd
import numpy as np

import plotly.express as px

import os
import re

from tqdm.auto import tqdm

# Root folder of the input data, given as a path relative to the directory
# the notebook is executed from.
DATA_DIR = "../../data"

In [41]:
# Hugging Face Hub id of the PopBERT checkpoint. Keeping it in a single place
# guarantees that tokenizer and model are always loaded from the same repo.
POPBERT_CHECKPOINT = "luerhard/PopBERT"

# load tokenizer
tokenizer = AutoTokenizer.from_pretrained(POPBERT_CHECKPOINT)
# load model
model = AutoModelForSequenceClassification.from_pretrained(POPBERT_CHECKPOINT)


def predict_popbert(model, tokenizer, text):
    """Score one piece of text on the four PopBERT dimensions.

    Each logit is passed through a sigmoid independently, so the returned
    values are per-label probabilities that do not have to sum to 1.

    Args:
        model: Sequence-classification model. Called with the tokenizer
            output as keyword arguments; its result must expose `.logits`
            with one row per input and at least four columns.
        tokenizer: Callable that turns `text` into PyTorch tensors.
        text: The text to classify.

    Returns:
        dict mapping each label name to its probability (NumPy scalar), in
        the order Anti-Elitism, People-Centrism, Left-Wing Ideology,
        Right-Wing Ideology.
    """
    # NOTE(review): the index -> label mapping is assumed to match the output
    # order of the checkpoint — confirm against the model card.
    label_dict = {
        0: "Anti-Elitism",
        1: "People-Centrism",
        2: "Left-Wing Ideology",
        3: "Right-Wing Ideology"
    }

    # Encode text with tokenizer. Truncate to the model's maximum sequence
    # length; without this an over-long input fails inside the model.
    encodings = tokenizer(text, return_tensors="pt", truncation=True)

    # Move the inputs to the device the model lives on. Objects without a
    # `device` attribute are called with the tensors unchanged.
    device = getattr(model, "device", None)
    if device is not None:
        encodings = {name: tensor.to(device) for name, tensor in encodings.items()}

    # Predict without tracking gradients
    with torch.inference_mode():
        out = model(**encodings)
        # Get probabilities of the first (and only) input; bring them back to
        # the CPU so the NumPy conversion also works for GPU models.
        probs = torch.sigmoid(out.logits)[0].cpu().numpy()

    # Create output dict
    return {label: probs[idx] for idx, label in label_dict.items()}


def plot_probabilities(probabilities, title=None):
    """Visualize label probabilities in a bar chart, one bar per label.

    Args:
        probabilities: Mapping from label name to probability.
        title: Optional figure title.

    Returns:
        The Plotly figure.
    """
    fig = px.bar(
        x=list(probabilities.keys()),
        y=list(probabilities.values()),
        # Replace the default "x"/"y" axis titles with meaningful ones
        labels={"x": "Dimension", "y": "Probability"},
        title=title,
        # Pin the axis to the full probability range so that charts for
        # different inputs are visually comparable.
        range_y=[0, 1],
    )
    return fig

# Test on custom input

In [42]:
# Sample sentence that is scored below
text = (
    "Das ist Klassenkampf von oben, das ist Klassenkampf im Interesse von "
    "Vermögenden und Besitzenden gegen die Mehrheit der Steuerzahlerinnen und "
    "Steuerzahler auf dieser Erde."
)

# Score the sentence, then render the per-label probabilities as a bar chart
probabilities = predict_popbert(model=model, tokenizer=tokenizer, text=text)
fig = plot_probabilities(probabilities=probabilities)
fig.show()

# Test on Europarl speeches

In [43]:
df = pd.read_csv(os.path.join(DATA_DIR, "debates/europarl_speeches.csv"))

# Example: Speeches by Manfred Weber (CSU/EVP)
# NOTE(review): this matches on the family name only, so any other speaker
# called "Weber" is included as well — confirm against the data.
is_weber = df["familyName"] == "Weber"

# Only look at speeches from 2020 onwards. This is a plain string comparison,
# which orders correctly only if `date` holds ISO-formatted (YYYY-MM-DD...)
# strings — TODO confirm.
is_recent = df["date"] > "2020"

# Take an explicit copy so that columns added to df_weber later neither
# trigger pandas' SettingWithCopyWarning nor write into a temporary slice of df.
df_weber = df[is_weber & is_recent].copy()

In [44]:
# This code runs ~3 minutes on my local machine (macbook M2)


def split_into_sentences(text):
    """Split a speech into non-empty, whitespace-stripped sentence fragments.

    The split is a naive cut at every '.', '!' or '?', so abbreviations and
    decimal numbers are cut as well. Fragments that are empty or contain only
    whitespace (e.g. the remainder after the final full stop) are dropped so
    they are never sent to the model. Non-string input (such as NaN for a
    missing text) yields an empty list.
    """
    if not isinstance(text, str):
        return []
    fragments = (fragment.strip() for fragment in re.split(r'[.!?]', text))
    return [fragment for fragment in fragments if fragment]


# Split speeches into sentences. `assign` returns a new frame instead of
# writing into df_weber in place, which avoids SettingWithCopyWarning when
# df_weber is a slice of another frame.
df_weber = df_weber.assign(sentences=df_weber["text"].apply(split_into_sentences))

# Create empty lists to save results
rede_id_list = []
topic_list = []
confidence_list = []

# Loop through all speeches
for rede_id, sentences in tqdm(df_weber["sentences"].items(), total=len(df_weber)):

    # Loop through all sentences
    for sentence in sentences:

        # Get PopBERT output
        probabilities = predict_popbert(model, tokenizer, sentence)

        # Extract label and probability for most probable category
        topic = max(probabilities, key=probabilities.get)
        confidence = probabilities[topic]

        rede_id_list.append(rede_id)
        topic_list.append(topic)
        confidence_list.append(confidence)

  0%|          | 0/32 [00:00<?, ?it/s]

In [50]:
# One row per classified sentence: most probable label, its probability and
# the index of the speech the sentence belongs to
df_topics_weber = pd.DataFrame(
    {"topic": topic_list, "confidence": confidence_list, "rede_id": rede_id_list})

# Define confidence threshold, only keep predictions whose confidence is at
# or above it
threshold = 0.7
df_topics_weber = df_topics_weber[df_topics_weber["confidence"] >= threshold]

# Create histogram of most-common topics
histogram = df_topics_weber["topic"].value_counts()
limit = 10
px.bar(
    x=histogram.index[:limit],
    y=histogram.values[:limit],
    title="Manfred Weber (CSU/EVP) - 2020-2024"
).update_layout(
    # Explicit axis titles so the figure can be read on its own
    xaxis_title="Most probable dimension",
    yaxis_title=f"Number of sentences (confidence >= {threshold})",
)