# Recreate Hix Hoyland (2024)
This notebook is based on the script `eu-policy-feedback/existing_measurements/hix_hoyland_2024/hix_hoyland_2024.R` and is optimised for processing a large number (>70k) of EU legislative acts on Google Colab hardware, e.g., a GPU.


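Connect to a GPU runtime before running the notebook: both classification pipelines below are created with `device=0`, i.e. they target the first GPU.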

### Setup

In [None]:
pip install datasets tqdm

In [None]:
import pandas as pd
import re
from itertools import islice
import math
from transformers import pipeline
from datasets import Dataset
from tqdm import tqdm

In [None]:
# Mount Google Drive at /content/drive; the input CSV is read from there and the
# result CSVs are written back to it further down.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Load data

In [23]:
# Read the directives/regulations table from the mounted Drive folder.
# Later cells rely on its 'CELEX' and 'act_raw_text' columns.
all_dir_reg = pd.read_csv(
    filepath_or_buffer="/content/drive/MyDrive/EU Policy Feedback/all_dir_reg.csv",
)

### Preprocessing
"For each piece of legislation, we classified each sentence in the preamble, until the phrase “Adopted this directive/regulation”, using a RoBERT-classifier trained on the corpus of party manifestos"

Extract the preamble: the text of each act up to the phrase “Adopted this directive/regulation”.

In [None]:
# Phrase that closes the preamble ("... ADOPTED THIS DIRECTIVE/REGULATION").
# Matching is case-insensitive, and `\s+` tolerates line breaks or repeated
# spaces between the words, which a single literal space would miss.
_PREAMBLE_END_PATTERN = re.compile(
    r'adopted\s+this\s+(?:directive|regulation)',
    re.IGNORECASE,
)


# Function to extract the preamble text
def extract_preamble(text):
    """Return the part of ``text`` that precedes the adoption formula.

    Parameters
    ----------
    text : str
        Raw text of a legal act. Non-string values (e.g. the NaN pandas uses
        for a missing cell, or None) are treated as an empty document.

    Returns
    -------
    str
        Everything before the first occurrence of "adopted this directive" or
        "adopted this regulation" (case-insensitive). If the phrase does not
        occur, the whole text is returned unchanged. Non-string input yields
        an empty string.
    """
    # A missing cell must not abort the whole `.apply` run with a TypeError.
    if not isinstance(text, str):
        return ""

    match = _PREAMBLE_END_PATTERN.search(text)
    if match is None:
        return text
    return text[:match.start()]

# Derive the 'preamble' column by cutting each act's raw text ('act_raw_text')
# with extract_preamble.
all_dir_reg['preamble'] = all_dir_reg['act_raw_text'].map(extract_preamble)


"We split the preambles into segments of 100 words…"

In [None]:
# Define split function
def split_into_segments(text, segment_size=100):
    """Split ``text`` into consecutive segments of at most ``segment_size`` words.

    Words are maximal runs of non-whitespace characters. Leading, trailing and
    repeated whitespace never produce empty "words", so every segment except
    possibly the last holds exactly ``segment_size`` real words.

    Parameters
    ----------
    text : str
        Text to split. Non-string values (e.g. NaN / None) are treated as empty.
    segment_size : int, default 100
        Maximum number of words per segment; must be at least 1.

    Returns
    -------
    list of str
        The segments in reading order, words joined by single spaces. Empty or
        whitespace-only input yields an empty list.

    Raises
    ------
    ValueError
        If ``segment_size`` is smaller than 1.
    """
    if segment_size < 1:
        raise ValueError("segment_size must be a positive integer")

    # str.split() without arguments splits on whitespace runs and drops the
    # empty strings that re.split(r'\s+', ...) returns at the text boundaries.
    words = text.split() if isinstance(text, str) else []
    return [
        " ".join(words[start:start + segment_size])
        for start in range(0, len(words), segment_size)
    ]

# Split every preamble into its list of word segments
all_dir_reg['preamble_segment'] = all_dir_reg['preamble'].apply(split_into_segments)

# Drop segment columns left over from an earlier run of this cell, so that
# re-running it does not append duplicate `preamble_segment_<n>` columns.
stale_segment_columns = [
    col for col in all_dir_reg.columns if str(col).startswith('preamble_segment_')
]
all_dir_reg = all_dir_reg.drop(columns=stale_segment_columns)

# Unnest the segments into separate columns: one column per segment position
segment_counts = all_dir_reg['preamble_segment'].apply(len)
max_segments = int(segment_counts.max()) if len(segment_counts) else 0
segment_columns = [f'preamble_segment_{i+1}' for i in range(max_segments)]

# Expand the list of segments into separate columns. Rows with fewer segments
# than `max_segments` are padded with missing values. The frame reuses the index
# of `all_dir_reg` because concat(axis=1) aligns rows on index labels, not on
# position.
preamble_segments_df = pd.DataFrame(
    all_dir_reg['preamble_segment'].to_list(),
    columns=segment_columns,
    index=all_dir_reg.index,
)
all_dir_reg = pd.concat([all_dir_reg, preamble_segments_df], axis=1).drop(columns=['preamble_segment'])


### 1. Classification

A RoBERTa classifier (`niksmer/RoBERTa-RILE`) trained on the corpus of party manifestos.

"We […] classify each segment as left, neutral, or right"

In [None]:
# Build the text-classification pipeline around the niksmer/RoBERTa-RILE model
RoBERT_classifier = pipeline(
    "text-classification",
    model="niksmer/RoBERTa-RILE",
    device=0,  # Use GPU (device 0)
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Create a dataset from the DataFrame
def prepare_dataset(df):
    """Flatten the wide segment columns of ``df`` into a Hugging Face ``Dataset``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns starting with ``'preamble_segment'`` hold the text
        segments of each row; missing segments are None or NaN.

    Returns
    -------
    datasets.Dataset
        One record per non-missing segment with the keys ``"idx"`` (index label
        of the originating row in ``df``) and ``"text"`` (the segment). Records
        are ordered row by row and, within a row, by column order.
    """
    # The segment columns are the same for every row, so look them up once
    # instead of re-scanning the column names per row.
    segment_columns = [col for col in df.columns if col.startswith('preamble_segment')]

    data = [
        {"idx": idx, "text": segment}
        for idx, row_segments in zip(
            df.index, df[segment_columns].itertuples(index=False, name=None)
        )
        for segment in row_segments
        # pd.isna covers both None and NaN; which one pandas uses as padding
        # depends on the column dtype.
        if not pd.isna(segment)
    ]
    return Dataset.from_list(data)

# Prepare the dataset for classification
dataset = prepare_dataset(all_dir_reg)

# Set the desired batch size
batch_size = 32  # Adjust this value as needed


def classify_batch(batch):
    """Classify one batch of segments with the RoBERTa-RILE pipeline.

    Parameters
    ----------
    batch : dict
        Mapping with a ``"text"`` entry holding a non-empty list of strings.

    Returns
    -------
    list of dict
        One prediction per input text, in input order; each has a ``"label"`` key.
    """
    texts = batch["text"]
    # batch_size makes the pipeline push the whole slice through the model
    # together instead of one text at a time. truncation=True clips segments
    # longer than the model's maximum input length instead of raising.
    return RoBERT_classifier(texts, batch_size=len(texts), truncation=True)


# Apply the classifier to the dataset with progress display
results = []
dataset_columns = dataset.to_dict()
# .get() keeps an empty dataset (which has no columns at all) from raising KeyError
dataset_dict = dataset_columns.get("text", [])
segment_row_ids = dataset_columns.get("idx", [])
for i in tqdm(range(0, len(dataset_dict), batch_size), desc="Classifying segments"):
    batch = dataset_dict[i:i + batch_size]
    results.extend(classify_batch({"text": batch}))

# zip() below would silently drop labels on a length mismatch, so fail loudly instead
assert len(results) == len(dataset_dict), "classifier returned a different number of predictions than segments"

# Map the results back to the original DataFrame
label_map = {idx: [] for idx in all_dir_reg.index}
for row_id, prediction in zip(segment_row_ids, results):
    label_map[row_id].append(prediction["label"])

# Create the final DataFrame: one row per act, its segment labels joined by ", "
RoBERT_df = pd.DataFrame({
    'CELEX': all_dir_reg['CELEX'],
    'RoBERT_rile_labels': [", ".join(label_map[idx]) for idx in all_dir_reg.index]
})

Classifying segments: 100%|██████████| 41/41 [00:09<00:00,  4.51it/s]


In [None]:
# Persist the per-act RoBERTa labels to Drive (row index not written)
RoBERT_df.to_csv(
    path_or_buf="/content/drive/MyDrive/EU Policy Feedback/RoBERT_df.csv",
    index=False,
)

### ManiBERT
Classifier fine-tuned to identify the Comparative Manifesto Project (CMP) policy-issue codes

In [None]:
# Build the text-classification pipeline around the niksmer/ManiBERT model
ManiBERT_classifier = pipeline(
    "text-classification",
    model="niksmer/ManiBERT",
    device=0,  # Use GPU (device 0)
)

config.json:   0%|          | 0.00/5.05k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

In [None]:
# Reshape the wide segment columns into one row per (act, segment)
def prepare_maniBERT_dataset(df):
    """Return the preamble segments of ``df`` as a long-format Hugging Face Dataset.

    The frame is melted with 'CELEX' as identifier: every column whose name
    starts with "preamble_segment" becomes a row with the column name in
    'segment' and its content in 'text'. Rows without text are dropped.
    """
    segment_cols = [name for name in df.columns if name.startswith("preamble_segment")]

    long_format = pd.melt(
        df,
        id_vars=["CELEX"],
        value_vars=segment_cols,
        var_name="segment",
        value_name="text",
    )
    long_format = long_format.dropna(subset=["text"])

    # NOTE(review): dropna leaves gaps in the index; Dataset.from_pandas may
    # carry that index along as an extra column - confirm this is wanted.
    return Dataset.from_pandas(long_format)

# Build the long-format segment dataset that ManiBERT will label
maniBERT_dataset = prepare_maniBERT_dataset(df=all_dir_reg)

# Function to classify text using ManiBERT
def classify_text(batch):
    """Return the ManiBERT label for every text in ``batch``.

    Parameters
    ----------
    batch : dict
        Mapping with a ``'text'`` entry holding a list of strings, as passed by
        ``Dataset.map(..., batched=True)``.

    Returns
    -------
    list of str
        The predicted label of each text, in input order. An empty batch gives
        an empty list without calling the classifier.
    """
    texts = list(batch['text'])
    if not texts:
        return []

    # One pipeline call per batch lets the model process the texts together;
    # calling it once per text runs them through the GPU one by one.
    # truncation=True clips over-long segments instead of raising mid-run.
    predictions = ManiBERT_classifier(texts, batch_size=len(texts), truncation=True)
    return [prediction['label'] for prediction in predictions]

# Label every segment with ManiBERT, batch by batch, with a progress bar
batch_size = 32  # Adjust this value as needed
maniBERT_dataset = maniBERT_dataset.map(
    lambda segment_batch: dict(ManiBERT_label=classify_text(segment_batch)),
    batched=True,
    batch_size=batch_size,
    desc="Processing segments",
)

# Materialise the labelled dataset as a pandas DataFrame
ManiBERT_df = maniBERT_dataset.to_pandas()

Processing segments:   0%|          | 0/653 [00:00<?, ? examples/s]

In [None]:
# Persist the per-segment ManiBERT labels to Drive (row index not written)
ManiBERT_df.to_csv(
    path_or_buf="/content/drive/MyDrive/EU Policy Feedback/ManiBERT_df.csv",
    index=False,
)


In [25]:
# Preview the first rows of all_dir_reg. As the last expression of the cell it
# is rendered as a rich table, which print() would flatten to wrapped text.
all_dir_reg.head()


        CELEX Date_document  \
0  32019L2121    2019-11-27   
1  32020L0262    2019-12-19   
2  32019L1922    2019-11-18   
3  32019L2034    2019-11-27   
4  32019L1995    2019-11-21   

                                        act_raw_text   Act_type  
0  DIRECTIVE (EU) 2019/2121 OF THE EUROPEAN PARLI...  Directive  
1  COUNCIL DIRECTIVE (EU) 2020/262\n\nof 19 Decem...  Directive  
2  COMMISSION DIRECTIVE (EU) 2019/1922\n\nof 18 N...  Directive  
3  DIRECTIVE (EU) 2019/2034 OF THE EUROPEAN PARLI...  Directive  
4  COUNCIL DIRECTIVE (EU) 2019/1995\n\nof 21 Nove...  Directive  


In [24]:
# Size of all_dir_reg as (rows, columns); the cell's last expression is displayed
all_dir_reg.shape



(75570, 4)
