In [2]:
import pandas as pd
import plotly.express as px

import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

import os
import re

from tqdm.auto import tqdm


# Root folder for input data; a relative path, so it resolves against the
# directory the notebook kernel is started in.
DATA_DIR = "../../data"

In [8]:
# Load model and tokenizer from the Hugging Face Hub.
# NOTE(review): trust_remote_code=True lets transformers execute Python code
# shipped in the model repository on this machine — only keep it for sources
# you trust, and consider pinning a specific `revision`.
model = AutoModelForSequenceClassification.from_pretrained(
    "manifesto-project/manifestoberta-xlm-roberta-56policy-topics-context-2023-1-1", trust_remote_code=True)
# The tokenizer comes from the base "xlm-roberta-large" checkpoint, not from
# the fine-tuned model repository.
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-large")


def predict_manifestoberta(model, tokenizer, sentence, context):
    """Classify one sentence with manifestoberta and return label probabilities.

    Args:
        model: Sequence-classification model. It is called as
            ``model(**inputs)``; the result must expose ``.logits`` and the
            model must expose ``config.id2label``.
        tokenizer: Tokenizer called with ``(sentence, context)`` as a text pair.
        sentence: The sentence to classify.
        context: Text surrounding the sentence. For sentences without
            additional context, just pass the sentence itself.
            Example: context = "These principles are under threat."

    Returns:
        dict mapping each label name to its probability (a float between 0 and
        1, not a percentage and not rounded), ordered from most to least
        probable.
    """
    # Tokenize sentence and context as a pair
    inputs = tokenizer(
        sentence,
        context,
        return_tensors="pt",
        max_length=300,  # we limited the input to 300 tokens during finetuning
        padding="max_length",
        truncation=True,
    )

    # Inference only: switch autograd off so the forward pass does not build
    # (and keep alive) a computation graph on every call.
    with torch.no_grad():
        logits = model(**inputs).logits

    # Softmax over the class dimension; [0] picks the single item in the batch
    class_probabilities = torch.softmax(logits, dim=1).tolist()[0]

    # Attach the label name to each class probability
    labelled = {
        model.config.id2label[index]: probability
        for index, probability in enumerate(class_probabilities)
    }

    # Order by probability, highest first
    return dict(sorted(labelled.items(), key=lambda item: item[1], reverse=True))


def plot_probabilities(probabilities, limit=5):
    """Return a plotly bar chart of the first ``limit`` entries of ``probabilities``.

    Entries are taken in the dict's own order; labels go on the x axis and
    their probabilities on the y axis.
    """
    labels = list(probabilities.keys())[:limit]
    values = list(probabilities.values())[:limit]
    return px.bar(x=labels, y=values)

# Test on custom input

In [9]:

# Example input: one sentence plus the passage it appears in
sentence = "These principles are under threat."
context = "Human rights and international humanitarian law are fundamental pillars of a secure global system. These principles are under threat. Some of the world's most powerful states choose to sell arms to human-rights abusing states."

# Classify the sentence in its context
probabilities = predict_manifestoberta(
    model=model, tokenizer=tokenizer, sentence=sentence, context=context)

# Chart the five leading categories
fig = plot_probabilities(probabilities, 5)
fig.show()

# Test on europarl speeches

In [10]:
# Load the table of European Parliament speeches
df = pd.read_csv(os.path.join(DATA_DIR, "debates/europarl_speeches.csv"))

# Example: Speeches by Manfred Weber (CSU/EVP)
# NOTE(review): this matches on the family name only, so any other speaker
# called "Weber" would be included as well — confirm against the data.
df_weber = df.loc[df["familyName"] == "Weber"]

# Keep only rows whose `date` compares greater than "2020"; assuming the column
# holds ISO-formatted date strings, that means speeches from 2020 onwards.
# NOTE(review): this cell was previously annotated "2019-2024" although the
# cut-off is 2020 — confirm which start year is intended.
# .copy() makes df_weber an independent frame, so adding columns to it later
# neither touches `df` nor triggers SettingWithCopyWarning.
df_weber = df_weber.loc[df_weber["date"] > "2020"].copy()

In [11]:
# This code runs ~9 minutes on my local machine (macbook M2)


def _split_into_sentences(text):
    """Split ``text`` on '.', '!' and '?' and return the non-empty fragments.

    Fragments are stripped of surrounding whitespace; empty or whitespace-only
    fragments (e.g. the one after the final full stop) are dropped so they are
    never classified or used as context. Non-string input (such as a missing
    value) yields an empty list.
    """
    if not isinstance(text, str):
        return []
    fragments = (fragment.strip() for fragment in re.split(r'[.!?]', text))
    return [fragment for fragment in fragments if fragment]


# Split speeches into sentences.
# .assign returns a new frame instead of writing into df_weber in place, so
# re-running this cell gives the same result and no slice of `df` is modified.
df_weber = df_weber.assign(
    sentences=df_weber["text"].apply(_split_into_sentences))

# Parallel result lists: one entry per classified sentence
rede_id_list = []
topic_list = []
confidence_list = []

# Loop through all speeches (rede_id is the speech's index label in df_weber)
for rede_id, sentences in tqdm(df_weber["sentences"].items(), total=len(df_weber)):

    # Loop through all sentences, including the first and the last one
    for i, sentence in enumerate(sentences):
        # Context is the sentence together with its neighbours. The window is
        # clamped at the speech boundaries, so the first/last sentence simply
        # gets one neighbour (or only itself in a one-sentence speech).
        context = " ".join(sentences[max(i - 1, 0):i + 2])
        # Get manifestoberta output
        probabilities = predict_manifestoberta(
            model, tokenizer, sentence, context)
        # Extract topic and probability for most probable category
        topic, confidence = max(probabilities.items(),
                                key=lambda item: item[1])

        rede_id_list.append(rede_id)
        topic_list.append(topic)
        confidence_list.append(confidence)

  0%|          | 0/32 [00:00<?, ?it/s]

In [13]:
# Settings for this cell: minimum confidence to keep a prediction, and how
# many topics to show in the chart
threshold = 0.7
limit = 10

# One row per classified sentence
df_topics_weber = pd.DataFrame({
    "topic": topic_list,
    "confidence": confidence_list,
    "rede_id": rede_id_list,
})

# Drop predictions whose confidence is below the threshold
is_confident = df_topics_weber["confidence"] >= threshold
df_topics_weber = df_topics_weber[is_confident]

# Count sentences per topic (value_counts orders by frequency, highest first)
histogram = df_topics_weber["topic"].value_counts()
top_topics = histogram.head(limit)

# Bar chart of the most common topics
px.bar(
    x=top_topics.index,
    y=top_topics.values,
    title="Manfred Weber (CSU/EVP) - 2020-2024"
)