# Apply Hix & Høyland (2024) to policy summaries
This notebook is based on the script `eu-policy-feedback/existing_measurements/hix_hoyland_2024/hix_hoyland_2024.R` and is optimised to process a large number (>70k) of EU legislative acts on Google Colab hardware, e.g., a GPU.


Connect to a GPU for best performance.

### Setup

In [1]:
pip install datasets tqdm

Collecting datasets
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-2.21.0-py3-none-any.whl (527 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m527.3/527.3 kB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[

In [2]:
import pandas as pd
import re
from itertools import islice
import math
from transformers import pipeline, AutoTokenizer
from datasets import Dataset
from tqdm import tqdm

In [3]:
from google.colab import drive
# Mount Google Drive at /content/drive; the input CSV and the result files are read from / written to
# "/content/drive/MyDrive/EU Policy Feedback/" in the cells below.
drive.mount('/content/drive')

Mounted at /content/drive


### Load data

In [4]:
# Input table with one row per legal act; later cells read its 'eurlex_summary_clean' and 'CELEX' columns
summaries_csv_path = "/content/drive/MyDrive/EU Policy Feedback/all_dir_reg_summaries.csv"
all_dir_reg = pd.read_csv(summaries_csv_path)

### Preprocessing

"We split the preambles into segments of 100 words…"

In [5]:
# Define split function
def split_into_segments(text, segment_size=100):
    """Split ``text`` into consecutive chunks of at most ``segment_size`` words.

    Words are runs of non-whitespace characters. Leading, trailing and repeated
    whitespace never produces empty words, so a blank string yields no segments
    and segments never start or end with a stray space.

    Args:
        text: The text to split. Anything that is not a ``str`` (e.g. ``None``
            or a NaN coming from pandas) is treated as missing.
        segment_size: Maximum number of words per segment; must be at least 1.

    Returns:
        list[str]: Segments in document order, words joined by single spaces.
        Only the last segment may hold fewer than ``segment_size`` words.
        Empty for missing or whitespace-only input.

    Raises:
        ValueError: If ``segment_size`` is smaller than 1.
    """
    if segment_size < 1:
        raise ValueError("segment_size must be a positive integer")
    if not isinstance(text, str):
        return []  # Missing value: nothing to split

    # str.split() without a separator drops the empty strings that
    # re.split(r'\s+', ...) returns for leading/trailing whitespace.
    words = text.split()
    return [
        " ".join(words[start:start + segment_size])
        for start in range(0, len(words), segment_size)
    ]

In [6]:
# Split every summary (column 'eurlex_summary_clean') into 100-word segments: one list per row
summary_segments = all_dir_reg['eurlex_summary_clean'].apply(split_into_segments)

# One column per segment position, sized by the longest summary (0 if there are no rows)
max_segments = max((len(segments) for segments in summary_segments), default=0)
segment_columns = [f'summary_segment_{i+1}' for i in range(max_segments)]

# Expand the lists into columns; rows with fewer segments are padded with missing values.
# Reusing all_dir_reg's index keeps the rows aligned in the concat below even when the
# index is not the default 0..n-1 (e.g. after filtering).
summary_segments_df = pd.DataFrame(
    summary_segments.to_list(),
    columns=segment_columns,
    index=all_dir_reg.index,
)

# Drop segment columns left over from an earlier run of this cell, so that re-running it
# replaces them instead of appending duplicate columns.
stale_segment_columns = [col for col in all_dir_reg.columns if col.startswith('summary_segment')]
all_dir_reg = pd.concat(
    [all_dir_reg.drop(columns=stale_segment_columns), summary_segments_df],
    axis=1,
)


### 1. Classification

RoBERTa classifier (`niksmer/RoBERTa-RILE`) trained on the corpus of party manifestos.

"We […] classify each segment as left, neutral, or right"

In [7]:
# Left / neutral / right (RILE) classifier, loaded as a Hugging Face text-classification pipeline
rile_model_id = "niksmer/RoBERTa-RILE"
RoBERT_classifier = pipeline(
    task="text-classification",
    model=rile_model_id,
    device=0,  # GPU index; set to -1 to run on CPU
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/879 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

In [8]:
# Load the tokenizer of the same checkpoint as the RILE classifier ("niksmer/RoBERTa-RILE"),
# so that the token counts used for truncation below are the ones that model's tokenizer produces.
tokenizer = AutoTokenizer.from_pretrained("niksmer/RoBERTa-RILE")

# Fix error: Function to truncate text if it exceeds 512 tokens
def truncate_text(text, max_tokens=510): # play it safe
    """Return ``text`` limited to at most ``max_tokens`` tokenizer ids.

    The ids come from the module-level ``tokenizer`` and are counted exactly as it
    returns them. Text within the limit is handed back unchanged. Longer text is
    rebuilt by decoding its first ``max_tokens`` ids with special tokens skipped,
    i.e. the result is the tokenizer's decoding rather than a slice of the input string.
    """
    token_ids = tokenizer(text, truncation=False)["input_ids"]

    # Guard clause: text that already fits is returned untouched
    if len(token_ids) <= max_tokens:
        return text

    kept_ids = token_ids[:max_tokens]
    return tokenizer.decode(kept_ids, skip_special_tokens=True)

# Iterate over the DataFrame and truncate text in relevant columns
def truncate_long_segments(df, max_tokens=510): # play it safe
    """Truncate every text cell in the ``summary_segment*`` columns of ``df``.

    Each string cell is passed through ``truncate_text``; non-string cells (the
    missing values that pad shorter documents) are left as they are. Columns whose
    name does not start with ``"summary_segment"`` are not touched.

    Args:
        df: DataFrame holding the segment columns. It is modified in place.
        max_tokens: Token limit forwarded to ``truncate_text``.

    Returns:
        The same ``df`` object that was passed in (not a copy).
    """
    # Determine the segment columns once instead of re-scanning all columns for every row
    segment_cols = [col for col in df.columns if col.startswith("summary_segment")]

    def _truncate_if_text(value):
        # Missing segments are passed through untouched
        return truncate_text(value, max_tokens=max_tokens) if isinstance(value, str) else value

    # Work column by column: this avoids writing into the frame while iterating over its rows
    for col in segment_cols:
        df[col] = df[col].map(_truncate_if_text)
    return df

# Apply the truncation to the DataFrame.
# NOTE: truncate_long_segments edits its argument in place and returns that same object, so
# all_dir_reg_truncated and all_dir_reg refer to one (now truncated) DataFrame, not two.
all_dir_reg_truncated = truncate_long_segments(all_dir_reg)

In [9]:
# Define the classify_batch function
def classify_batch(batch):
    """Return the label ``RoBERT_classifier`` predicts for each text in ``batch["text"]``.

    The whole batch is handed to the pipeline in a single call, with ``batch_size``
    set to the number of texts, so the model processes the texts together instead of
    one forward pass per text.

    Args:
        batch: Mapping with a ``"text"`` entry holding a sequence of strings.

    Returns:
        list[str]: One label per input text, in input order. Empty for an empty batch.
    """
    texts = list(batch["text"])
    if not texts:
        return []  # Skip the model call entirely for an empty batch

    # For a list input the pipeline returns one {'label': ..., 'score': ...} dict per text
    predictions = RoBERT_classifier(texts, batch_size=len(texts))
    return [prediction["label"] for prediction in predictions]

# Create a dataset from the DataFrame
def prepare_dataset(df):
    """Flatten the ``summary_segment*`` columns of ``df`` into a Hugging Face ``Dataset``.

    Rows are visited in order and, within a row, segment columns in column order.
    Every cell that holds a string becomes one record ``{"idx": <row label>, "text": <segment>}``;
    non-string cells (missing segments) are skipped.
    """
    segment_cols = [name for name in df.columns if name.startswith('summary_segment')]

    records = []
    for row_label, row in df.iterrows():
        records.extend(
            {"idx": row_label, "text": row[name]}
            for name in segment_cols
            if isinstance(row[name], str)  # only real text, not the missing-value padding
        )
    return Dataset.from_list(records)

# Prepare the dataset for classification: one record per text segment, tagged with the
# index label ("idx") of the row it came from
dataset = prepare_dataset(all_dir_reg_truncated)

# Number of segments handed to classify_batch per loop iteration
batch_size = 32  # Adjust this value as needed

# Apply the classifier to the dataset with progress display.
# Despite its name, dataset_dict is the plain list of segment texts (the "text" column).
results = []
dataset_dict = dataset.to_dict()["text"]
for i in tqdm(range(0, len(dataset_dict), batch_size), desc="Classifying segments"):
    batch = dataset_dict[i:i + batch_size]
    results.extend(classify_batch({"text": batch}))

# Map the results back to the original DataFrame.
# results is in dataset order, so zipping pairs every label with its record; labels are
# collected per source row. Rows without any text segment keep an empty list.
label_map = {idx: [] for idx in all_dir_reg.index}
for item, label in zip(dataset, results):
    label_map[item["idx"]].append(label)

# Create the final DataFrame: one row per act, its segment labels joined into a single
# comma-separated string (an empty string when the row had no segments)
RoBERT_df = pd.DataFrame({
    'CELEX': all_dir_reg['CELEX'],
    'RoBERT_rile_labels': [", ".join(label_map[idx]) for idx in all_dir_reg.index]
})

Classifying segments:   0%|          | 0/492 [00:00<?, ?it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Classifying segments: 100%|██████████| 492/492 [02:51<00:00,  2.86it/s]


In [10]:
# Persist the per-act RILE labels to Drive (without the DataFrame index)
rile_labels_csv_path = "/content/drive/MyDrive/EU Policy Feedback/RoBERT_df_summaries.csv"
RoBERT_df.to_csv(rile_labels_csv_path, index=False)

### ManiBERT
Classifier fine-tuned to identify the Comparative Manifesto Project (CMP) policy-issue codes

In [11]:
# ManiBERT policy-issue classifier, loaded as a Hugging Face text-classification pipeline
manibert_model_id = "niksmer/ManiBERT"
ManiBERT_classifier = pipeline(
    task="text-classification",
    model=manibert_model_id,
    device=0,  # GPU index; set to -1 to run on CPU
)

config.json:   0%|          | 0.00/5.05k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

In [12]:
# Select relevant columns and reshape the DataFrame
def prepare_maniBERT_dataset(df):
    """Reshape the wide segment table into a long Hugging Face ``Dataset``.

    Every ``summary_segment*`` column is melted so that each non-missing segment
    becomes one row with the columns ``CELEX``, ``segment`` (the name of the column
    the text came from) and ``text``.

    Args:
        df: DataFrame with a ``CELEX`` column and ``summary_segment*`` columns.

    Returns:
        Dataset with the columns ``CELEX``, ``segment`` and ``text``.
    """
    segment_columns = [col for col in df.columns if col.startswith("summary_segment")]

    # Pivot longer and drop the missing values that pad documents with fewer segments
    df_long = df.melt(
        id_vars=["CELEX"],
        value_vars=segment_columns,
        var_name="segment",
        value_name="text",
    ).dropna(subset=["text"])

    # dropna leaves gaps in the index. Without preserve_index=False, from_pandas stores that
    # leftover index as an extra `__index_level_0__` column, which would then end up in the
    # exported results.
    return Dataset.from_pandas(df_long, preserve_index=False)

# Prepare the dataset: one row per non-missing segment, with its CELEX id, the name of the
# segment column it came from, and the segment text
maniBERT_dataset = prepare_maniBERT_dataset(all_dir_reg_truncated)

# Function to classify text using ManiBERT
def classify_text(batch):
    """Return the label ``ManiBERT_classifier`` predicts for each text in ``batch['text']``.

    The whole batch is handed to the pipeline in a single call, with ``batch_size``
    set to the number of texts, so the model processes the texts together instead of
    one forward pass per text.

    Args:
        batch: Mapping with a ``'text'`` entry holding a sequence of strings.

    Returns:
        list[str]: One label per input text, in input order. Empty for an empty batch.
    """
    texts = list(batch['text'])
    if not texts:
        return []  # Skip the model call entirely for an empty batch

    # For a list input the pipeline returns one {'label': ..., 'score': ...} dict per text
    predictions = ManiBERT_classifier(texts, batch_size=len(texts))
    return [prediction['label'] for prediction in predictions]

# Apply the classifier to the dataset using batched processing with progress display.
# With batched=True the lambda receives batch_size rows at a time; the list it returns
# under 'ManiBERT_label' is stored as a new column of the dataset.
batch_size = 32  # Adjust this value as needed
maniBERT_dataset = maniBERT_dataset.map(lambda batch: {'ManiBERT_label': classify_text(batch)},
                                        batched=True,
                                        batch_size=batch_size,
                                        desc="Processing segments")

# Convert the dataset back to a DataFrame (one row per segment, including its label)
ManiBERT_df = maniBERT_dataset.to_pandas()

Processing segments:   0%|          | 0/15713 [00:00<?, ? examples/s]

In [13]:
# Persist the per-segment ManiBERT labels to Drive (without the DataFrame index)
manibert_labels_csv_path = "/content/drive/MyDrive/EU Policy Feedback/ManiBERT_df_summaries.csv"
ManiBERT_df.to_csv(manibert_labels_csv_path, index=False)


### Object Checks

In [None]:
# Sanity check of the ManiBERT output: (rows, columns), then the last rows.
# The bare last expression lets the notebook render the frame as a table instead of plain text.
print(ManiBERT_df.shape)
ManiBERT_df.tail()

In [None]:
# Sanity check of the segment-expanded input frame: (rows, columns), then the first rows.
# The bare last expression lets the notebook render the frame as a table instead of plain text.
print(all_dir_reg.shape)
all_dir_reg.head()

