# Recreate Hix Hoyland (2024)
This notebook is based on the script `eu-policy-feedback/existing_measurements/hix_hoyland_2024/hix_hoyland_2024.R` and is optimised to process a large number (>70k) of EU legislative acts on Google Colab hardware, e.g., a GPU.


Connect to a GPU for best performance.

### Setup

In [None]:
pip install datasets tqdm

In [None]:
import fnmatch
import functools
import math
import random
import re
import string
from collections import Counter
from itertools import islice

import nltk
import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from tqdm import tqdm
from transformers import pipeline, AutoTokenizer

# Download the NLTK resources used by the preprocessing step:
#   - 'stopwords' for the English stop-word list,
#   - 'punkt' / 'punkt_tab' for nltk.word_tokenize. Recent NLTK releases load
#     'punkt_tab' instead of 'punkt'; both are requested so the notebook works
#     with either. A resource unknown to the installed release is reported by
#     nltk.download and skipped.
for nltk_resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

In [None]:
# Mount Google Drive at /content/drive so the input CSV and the result files
# referenced below (under /content/drive/MyDrive) are reachable.
from google.colab import drive
drive.mount('/content/drive')

### Load data

In [None]:
# Project folder on the mounted Google Drive.
DATA_DIR = "/content/drive/MyDrive/EU Policy Feedback"
# Number of acts to draw for a quick run; set to None to process the full corpus.
N_SAMPLES = 5000
# Seed for the row sample, so the same acts are drawn on every run.
SEED = 42

all_dir_reg = pd.read_csv(f"{DATA_DIR}/all_dir_reg.csv")
# Alternative input: a smaller pre-made sample file.
#all_dir_reg = pd.read_csv(f"{DATA_DIR}/all_dir_reg_sample.csv")

if N_SAMPLES is not None:
    # Cap the sample size at the number of rows: DataFrame.sample raises a
    # ValueError when asked for more rows than exist (without replacement).
    all_dir_reg = (
        all_dir_reg
        .sample(n=min(N_SAMPLES, len(all_dir_reg)), random_state=SEED)
        .reset_index(drop=True)
    )

### Preprocessing
"For each piece of legislation, we classified each sentence in the preamble, until the phrase “Adopted this directive/regulation”, using a RoBERT-classifier trained on the corpus of party manifestos"

Get preamble string until “Adopted this directive/regulation”

In [None]:
# Enacting formula that ends the preamble, e.g. "HAS ADOPTED THIS DIRECTIVE".
# Whitespace between the words is matched loosely (\s+) so the phrase is still
# found when the raw text wraps it across lines or pads it with extra spaces.
_ENACTING_FORMULA = re.compile(
    r'adopted\s+this\s+(?:directive|regulation)', re.IGNORECASE
)


def extract_preamble(text):
    """Return the part of an act's text that precedes the enacting formula.

    Parameters
    ----------
    text : str or any
        Raw text of the act.

    Returns
    -------
    str or None
        ``None`` when ``text`` is not a string; the text up to (not including)
        the first case-insensitive occurrence of "adopted this directive" or
        "adopted this regulation"; or the unchanged text when no such phrase
        is present.
    """
    if not isinstance(text, str):
        return None

    match = _ENACTING_FORMULA.search(text)
    return text[:match.start()] if match else text

In [None]:
# Add a 'preamble' column by running extract_preamble on every value of the
# 'act_raw_text' column (all_dir_reg must already contain that column).
all_dir_reg['preamble'] = all_dir_reg['act_raw_text'].apply(extract_preamble)


Additional Preprocessing (Appropriate for word embedding)

In [None]:
# Clean the text

# Token-level patterns, compiled once instead of once per token.
_MIXED_ALNUM_TOKEN = re.compile(r'\b(?=\w*[A-Za-z])(?=\w*\d)\w+\b')
_NUMERIC_TOKEN = re.compile(r'^\d+(/\d+)*$')


@functools.lru_cache(maxsize=None)
def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loaded only once."""
    # Stand-in for the 'marimo' list mentioned in the original comment.
    return frozenset(stopwords.words('english'))


@functools.lru_cache(maxsize=None)
def _compile_stop_word_matcher(patterns):
    """Split stop-word entries into exact words and one compiled glob regex.

    ``patterns`` is a tuple of lower-case entries. Entries containing a glob
    metacharacter (``*``, ``?`` or ``[``) are translated with
    ``fnmatch.translate`` and OR-ed into a single regex; all others are matched
    by set membership. Returns ``(exact_words, glob_regex_or_None)``.
    """
    exact_words = set()
    glob_sources = []
    for pattern in patterns:
        if any(meta in pattern for meta in '*?['):
            glob_sources.append(fnmatch.translate(pattern))
        else:
            exact_words.add(pattern)
    glob_regex = re.compile('|'.join(glob_sources)) if glob_sources else None
    return frozenset(exact_words), glob_regex


def process_text(text, procedural_stop_words):
    """Tokenize ``text`` and drop tokens that carry no substantive content.

    Parameters
    ----------
    text : str or any
        Text to clean. Non-string input is returned unchanged.
    procedural_stop_words : iterable of str
        Corpus-specific words to remove (compared case-insensitively).
        Entries may be glob patterns, e.g. ``"article*"`` removes "article",
        "articles", ...; entries without wildcards must match the whole token.

    Returns
    -------
    str or any
        The surviving tokens joined by single spaces, or the original input
        when it is not a string.
    """
    if not isinstance(text, str):
        return text

    exact_stop, glob_stop = _compile_stop_word_matcher(
        tuple(word.lower() for word in procedural_stop_words)
    )
    common_stop = _english_stopwords()

    kept = []
    for token in nltk.word_tokenize(text):
        # Drop tokens containing any non-alphanumeric character (punctuation,
        # symbols, URLs, ...). Purely numeric tokens pass this test and are
        # removed by the numeric filter below. Because no surviving token can
        # contain punctuation, a separate letter-punctuation filter would
        # never match and is therefore not applied.
        if not token.isalnum():
            continue

        lowered = token.lower()
        if lowered in common_stop:
            continue

        # Corpus-specific procedural words: exact entries and glob entries.
        if lowered in exact_stop:
            continue
        if glob_stop is not None and glob_stop.match(lowered):
            continue

        # Tokens mixing letters and digits.
        if _MIXED_ALNUM_TOKEN.match(token):
            continue

        # Tokens shorter than 3 characters.
        if len(token) < 3:
            continue

        # Tokens made up of digits only.
        if _NUMERIC_TOKEN.match(token):
            continue

        kept.append(token)

    return ' '.join(kept)

# Corpus-specific procedural words to strip from the preambles.
# NOTE(review): entries ending in '*' look like glob patterns carried over
# from the R script; confirm that process_text treats them as wildcards.
procedural_stop_words = [
    "article*", "shall", "annex", "commission", "decision", "member",
    "european", "state*", "measure*", "regard", "directive",
    "ii", "iii",
    "first", "second", "third", "fourth",
    "1st", "2nd", "3rd", "4th",
    "thereof", "act*", "add*", "adopt*", "also", "dateformat",
]

# Clean the 'preamble' column in place, but only if it has been created.
if 'preamble' in all_dir_reg.columns:
    all_dir_reg['preamble'] = all_dir_reg['preamble'].apply(
        process_text, args=(procedural_stop_words,)
    )


In [None]:
# Perform Subsampling

# Assuming 'all_dir_reg' is your dataframe and 'procedural_stop_words' is your stop word list
# First, you need to have a combined list of tokens for each document
# Let's assume all_dir_reg now contains a column 'preamble' which is the result of our previous processing

# Count how often each token occurs across all cleaned preambles. Rows whose
# preamble is missing (None/NaN, i.e. not a string) contribute nothing; they
# are skipped explicitly because Counter.update cannot consume a float NaN.
word_frequencies = Counter()
for preamble_text in all_dir_reg['preamble']:
    if isinstance(preamble_text, str):
        word_frequencies.update(preamble_text.split())

total_words = sum(word_frequencies.values())

# Subsampling threshold: words whose relative frequency is above t get a
# positive discard probability.
t = 1e-5

# Discard probability per word: 1 - sqrt(t / relative_frequency), clipped at 0
# so that rare words (relative frequency below t) are always kept.
word_probs = {
    word: max(1 - np.sqrt(t / (freq / total_words)), 0)
    for word, freq in word_frequencies.items()
}

def subsample_document(tokens, word_probs, rng=None):
    """Randomly drop tokens according to their discard probability.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of one document.
    word_probs : dict
        Maps a token to its probability of being discarded. Tokens missing
        from the mapping are treated as probability 0 and always kept.
    rng : random.Random, optional
        Source of randomness. Defaults to the module-level ``random``
        generator; pass a seeded ``random.Random`` for reproducible output
        that does not depend on global state.

    Returns
    -------
    list of str
        The tokens that were kept, in their original order.
    """
    draw = (rng if rng is not None else random).random
    # A token is kept when the uniform draw in [0, 1) is at least its discard
    # probability, so probability 0 always keeps and probability 1 always drops.
    return [token for token in tokens if draw() >= word_probs.get(token, 0)]

# Seed the global RNG used by subsample_document so that re-running the
# notebook drops the same tokens (42 matches the seed used for row sampling).
random.seed(42)

# Apply subsampling to each document. Non-string preambles (None/NaN for acts
# without text) are passed through unchanged instead of crashing on .split().
all_dir_reg['subsampled_text'] = all_dir_reg['preamble'].apply(
    lambda text: ' '.join(subsample_document(text.split(), word_probs))
    if isinstance(text, str) else text
)


"We split the preambles into segments of 100 words…"

In [None]:
def split_into_segments(text, segment_size=100):
    """Split ``text`` into consecutive segments of ``segment_size`` words.

    Parameters
    ----------
    text : str or any
        Text to split on runs of whitespace.
    segment_size : int, default 100
        Maximum number of words per segment; must be at least 1.

    Returns
    -------
    list of str
        Segments with their words joined by single spaces. The last segment
        may be shorter. An empty list is returned for non-string input and
        for text that contains no words (empty or whitespace-only).

    Raises
    ------
    ValueError
        If ``segment_size`` is smaller than 1.
    """
    if not isinstance(text, str):
        return []
    if segment_size < 1:
        raise ValueError("segment_size must be a positive integer")

    # str.split() without arguments ignores leading/trailing whitespace and
    # yields no empty strings, so empty text produces no (empty) segment.
    words = text.split()
    return [
        " ".join(words[start:start + segment_size])
        for start in range(0, len(words), segment_size)
    ]

In [None]:
# Split every document into word segments. The subsampled text (with the
# additional preprocessing) is used; switch the column to 'preamble' to
# segment the text without subsampling.
segment_lists = all_dir_reg['subsampled_text'].apply(split_into_segments)

# Columns left over from a previous run of this cell; they are dropped below
# so that re-running the cell does not create duplicate segment columns.
stale_segment_columns = [
    col for col in all_dir_reg.columns if col.startswith('preamble_segment')
]

# Widest document determines the number of segment columns (0 if there are
# no documents at all).
max_segments = max((len(segments) for segments in segment_lists), default=0)
segment_columns = [f'preamble_segment_{i+1}' for i in range(max_segments)]

# Expand the list of segments into separate columns; shorter documents are
# padded with missing values. The frame is built on all_dir_reg's own index so
# that the concat below aligns rows by construction.
if max_segments:
    preamble_segments_df = pd.DataFrame(
        segment_lists.to_list(), columns=segment_columns, index=all_dir_reg.index
    )
else:
    preamble_segments_df = pd.DataFrame(index=all_dir_reg.index)

all_dir_reg = pd.concat(
    [all_dir_reg.drop(columns=stale_segment_columns), preamble_segments_df],
    axis=1,
)


### 1. Classification

RoBERT-classifier trained on the corpus of party manifestos.

"We […] classify each segment as left, neutral, or right"

In [None]:
# Define the Hugging Face pipeline for the RILE (left / neutral / right)
# classifier. Run on the first GPU when one is available and fall back to the
# CPU (-1) otherwise, so the notebook also works on a CPU-only runtime.
RoBERT_classifier = pipeline(
    task="text-classification",
    model="niksmer/RoBERTa-RILE",
    device=0 if torch.cuda.is_available() else -1  # CPU: -1 | GPU: 0
)

In [None]:
# Load the tokenizer of the niksmer/RoBERTa-RILE checkpoint; truncate_text
# below uses it to count tokens and to cut over-long segments.
tokenizer = AutoTokenizer.from_pretrained("niksmer/RoBERTa-RILE")

def truncate_text(text, max_tokens=510):
    """Shorten ``text`` so the classifier's 512-token limit is not exceeded.

    The text is encoded with the module-level ``tokenizer``. If the resulting
    ``input_ids`` list is no longer than ``max_tokens`` the text is returned
    untouched; otherwise the first ``max_tokens`` ids are decoded back into a
    string (special tokens skipped) and that string is returned. The default
    of 510 leaves a small safety margin below 512.
    """
    token_ids = tokenizer(text, truncation=False)["input_ids"]

    # Guard clause: nothing to do for texts that already fit.
    if len(token_ids) <= max_tokens:
        return text

    return tokenizer.decode(token_ids[:max_tokens], skip_special_tokens=True)

def truncate_long_segments(df, max_tokens=510): # play it safe
    """Return a copy of ``df`` whose segment columns are token-truncated.

    Every column whose name starts with ``"preamble_segment"`` is processed:
    string cells are passed through ``truncate_text`` with ``max_tokens``;
    non-string cells (missing segments) are left as they are. All other
    columns are carried over unchanged.

    The input frame is not modified, so the result is a distinct object from
    ``df`` and the untruncated segments stay available to the caller.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the segment columns.
    max_tokens : int, default 510
        Token budget forwarded to ``truncate_text``.

    Returns
    -------
    pandas.DataFrame
        New frame with the same index and columns as ``df``.
    """
    truncated = df.copy()
    segment_columns = [
        col for col in truncated.columns if col.startswith("preamble_segment")
    ]
    # Work column by column rather than cell by cell through iterrows/df.at.
    for col in segment_columns:
        truncated[col] = pd.Series(
            [
                truncate_text(cell, max_tokens=max_tokens)
                if isinstance(cell, str) else cell
                for cell in truncated[col]
            ],
            index=truncated.index,
            dtype=object,  # keep missing cells exactly as they were
        )
    return truncated

# Run the truncation over all segment columns of all_dir_reg (default token
# budget); the classification cells below read from all_dir_reg_truncated.
all_dir_reg_truncated = truncate_long_segments(all_dir_reg)

In [None]:
def classify_batch(batch):
    """Classify a batch of text segments with the RoBERTa-RILE pipeline.

    Parameters
    ----------
    batch : dict
        Mapping with a ``"text"`` entry holding the segments to classify.

    Returns
    -------
    list of str
        One predicted label per segment, in input order (empty for an empty
        batch).
    """
    texts = list(batch["text"])
    if not texts:
        return []

    # Hand the whole list to the pipeline in a single call so the segments are
    # run through the model as one batch instead of one forward pass per text.
    # truncation/max_length make the tokenizer cut anything above the model's
    # 512-token limit, independent of any earlier truncation step.
    predictions = RoBERT_classifier(
        texts, batch_size=len(texts), truncation=True, max_length=512
    )
    return [prediction['label'] for prediction in predictions]

def prepare_dataset(df):
    """Flatten the segment columns of ``df`` into a Hugging Face ``Dataset``.

    Each string cell found in a column whose name starts with
    ``'preamble_segment'`` becomes one record ``{"idx": <row label>,
    "text": <segment>}``. Records are emitted row by row, and within a row in
    column order; non-string cells (missing segments) are skipped.
    """
    segment_columns = [
        name for name in df.columns if name.startswith('preamble_segment')
    ]
    records = [
        {"idx": row_label, "text": cell}
        for row_label, row in df.iterrows()
        for cell in (row[name] for name in segment_columns)
        if isinstance(cell, str)
    ]
    return Dataset.from_list(records)

# Build the flat segment dataset from the truncated frame
# (prepare_dataset(all_dir_reg) would use the untruncated one instead).
dataset = prepare_dataset(all_dir_reg_truncated)

# Number of segments handed to classify_batch at a time; adjust as needed.
batch_size = 32

# Plain list of all segment texts, in dataset order.
dataset_dict = dataset.to_dict()["text"]

# Classify the segments chunk by chunk, with a progress bar over the chunks.
results = []
chunk_starts = range(0, len(dataset_dict), batch_size)
for start in tqdm(chunk_starts, desc="Classifying segments"):
    chunk = dataset_dict[start:start + batch_size]
    results += classify_batch({"text": chunk})

# Group the labels by the row they came from; rows without any segment keep
# an empty list.
label_map = {row_label: [] for row_label in all_dir_reg.index}
for record, predicted_label in zip(dataset, results):
    label_map[record["idx"]].append(predicted_label)

# One row per act: its CELEX id and the comma-separated labels of its segments.
joined_labels = [", ".join(label_map[row_label]) for row_label in all_dir_reg.index]
RoBERT_df = pd.DataFrame({
    'CELEX': all_dir_reg['CELEX'],
    'RoBERT_rile_labels': joined_labels
})

In [None]:
# Persist the per-act RILE labels to Google Drive (row index not written).
RoBERT_df.to_csv("/content/drive/MyDrive/EU Policy Feedback/RoBERT_df_add_preprocessing.csv", index=False)

### ManiBERT
Classifier fine-tuned to identify the Comparative Manifesto Project (CMP) policy-issue codes

In [None]:
# Define the Hugging Face pipeline for ManiBERT. Run on the first GPU when one
# is available and fall back to the CPU (-1) otherwise, so the notebook also
# works on a CPU-only runtime.
ManiBERT_classifier = pipeline(
    task="text-classification",
    model="niksmer/ManiBERT",
    device=0 if torch.cuda.is_available() else -1  # CPU: -1 | GPU: 0
)

In [None]:
def prepare_maniBERT_dataset(df):
    """Reshape the segment columns of ``df`` to long format for ManiBERT.

    The frame is melted so that each row holds one segment: ``CELEX`` (the id
    column), ``segment`` (name of the source column) and ``text`` (the segment
    itself). Rows with a missing ``text`` are dropped, and the result is
    converted to a Hugging Face ``Dataset``.
    """
    segment_columns = [
        name for name in df.columns if name.startswith("preamble_segment")
    ]
    long_df = pd.melt(
        df,
        id_vars=["CELEX"],
        value_vars=segment_columns,
        var_name="segment",
        value_name="text",
    )
    long_df = long_df.dropna(subset=["text"])
    return Dataset.from_pandas(long_df)

# Build the long-format segment dataset for ManiBERT from the same truncated
# frame that the RILE classification step reads.
maniBERT_dataset = prepare_maniBERT_dataset(all_dir_reg_truncated)

def classify_text(batch):
    """Classify a batch of text segments with the ManiBERT pipeline.

    Parameters
    ----------
    batch : dict
        Mapping with a ``'text'`` entry holding the segments to classify.

    Returns
    -------
    list of str
        One predicted label per segment, in input order (empty for an empty
        batch).
    """
    texts = list(batch['text'])
    if not texts:
        return []

    # Hand the whole list to the pipeline in a single call so the segments are
    # run through the model as one batch instead of one forward pass per text.
    # truncation/max_length make the tokenizer cut anything above a 512-token
    # input, independent of any earlier truncation step.
    predictions = ManiBERT_classifier(
        texts, batch_size=len(texts), truncation=True, max_length=512
    )
    return [prediction['label'] for prediction in predictions]

# Number of segments passed to classify_text per call; adjust as needed.
batch_size = 32


def _add_maniBERT_label(batch):
    """Return the new 'ManiBERT_label' column for one batch of the dataset."""
    return {'ManiBERT_label': classify_text(batch)}


# Label every segment; Dataset.map feeds the rows through in batches and shows
# a progress bar with the given description.
maniBERT_dataset = maniBERT_dataset.map(
    _add_maniBERT_label,
    batched=True,
    batch_size=batch_size,
    desc="Processing segments",
)

# Back to pandas for export.
ManiBERT_df = maniBERT_dataset.to_pandas()

In [None]:
# Persist the per-segment ManiBERT labels to Google Drive (row index not written).
ManiBERT_df.to_csv("/content/drive/MyDrive/EU Policy Feedback/ManiBERT_df_add_preprocessing.csv", index=False)


### Object Checks

In [None]:
# prompt: print head of all_dir_reg

#print(all_dir_reg.head())
#print(RoBERT_df.shape)
#print(RoBERT_df.head())

#print(ManiBERT_df.shape)
#print(ManiBERT_df.tail())

In [None]:
# prompt: get size of dataframe

#print(all_dir_reg.shape)

