# Protest event detection replication

This replicates the core result of "A Generalized Approach to Protest Event Detection in German Local News" by Wiedemann et al. 2022.

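This notebook is meant to be run on Google Colab with a GPU runtime. All data and model paths are relative (`drive/MyDrive/...`), so Google Drive has to be mounted at `drive` in the working directory before the cells below are run. The original cell outputs were not preserved; the final evaluation results are copied as text at the end of the notebook.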

In [None]:
# Install the Hugging Face libraries that the cells below import.
!pip install transformers datasets evaluate

In [None]:
from datasets import load_dataset

# Every split is a CSV file in the same Drive folder; only the suffix of the
# file name differs between splits.
data_dir = "drive/MyDrive/0 Protest Impact/Protest Event Replication/data"
split_names = ["train", "dev", "test", "test.time", "test.loc"]

# The file names use "-" where the split names use "." (split "test.time" is
# stored as "glpn_test-time.csv").
split_files = {
    split: f"{data_dir}/glpn_{split.replace('.', '-')}.csv" for split in split_names
}

dataset = load_dataset("csv", data_files=split_files)
dataset

In [None]:
# Show the raw `labels` values of the first ten training rows.
dataset["train"][:10]["labels"]

In [None]:
from datasets import ClassLabel

# Cast the `labels` column to a two-class feature (irrelevant / relevant) and
# rename it to `label`, the column name the later cells refer to.
relevance = ClassLabel(names=["irrelevant", "relevant"])
dataset = dataset.cast_column("labels", relevance).rename_column("labels", "label")

# Show the first ten training labels after the conversion.
dataset["train"][:10]["label"]

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Checkpoint to fine-tune; "deepset/gelectra-base" is the alternative.
checkpoint = "deepset/gelectra-large"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

# Keep only the part after the "deepset/" prefix; later cells use it to build
# the checkpoint and model directories.
model_name = checkpoint.split("/")[1]

In [None]:
def tokenize_function(examples):
    """Tokenize the ``excerpt`` field of a batch of examples.

    Every sequence is truncated and padded to exactly 512 tokens.
    """
    texts = examples["excerpt"]
    return tokenizer(texts, truncation=True, padding="max_length", max_length=512)


# Apply the tokenizer to the loaded dataset in batches.
tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets

In [None]:
import evaluate
import numpy as np

# F1 metric, used both during training and in the final evaluation cell.
metric = evaluate.load("f1")


def compute_metrics(eval_pred):
    """Compute the F1 metric for one evaluation pass.

    ``eval_pred`` unpacks into ``(logits, labels)``; the predicted class of
    each example is the arg-max over the last axis of the logits.
    """
    scores, references = eval_pred
    predicted = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted, references=references)


import inspect

from transformers import Trainer, TrainingArguments

# `evaluation_strategy` was renamed to `eval_strategy` in newer transformers
# releases, which no longer accept the old keyword. The install cell does not
# pin a version, so pass whichever name the installed release understands.
if "eval_strategy" in inspect.signature(TrainingArguments).parameters:
    eval_strategy_kwarg = {"eval_strategy": "epoch"}
else:
    eval_strategy_kwarg = {"evaluation_strategy": "epoch"}

# Hyperparameters for fine-tuning; the dev split is evaluated after every epoch.
training_args = TrainingArguments(
    output_dir=f"drive/MyDrive/0 Protest Impact/Protest Event Replication/model/{model_name}/checkpoints",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    lr_scheduler_type="linear",
    warmup_ratio=0.1,
    learning_rate=5e-6,
    weight_decay=0.2,
    num_train_epochs=6,
    **eval_strategy_kwarg,
)

In [None]:
# Train on the tokenized train split; score the dev split with compute_metrics.
train_split = tokenized_datasets["train"]
dev_split = tokenized_datasets["dev"]

trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_split,
    eval_dataset=dev_split,
)

In [None]:
# Directory the fine-tuned model is saved to and reloaded from. The training
# checkpoint directory configured in TrainingArguments is a subfolder of it.
model_location = (
    f"drive/MyDrive/0 Protest Impact/Protest Event Replication/model/{model_name}/"
)

In [None]:
from pathlib import Path

# Reuse a previously saved model instead of retraining. Test for the saved
# `config.json` rather than for the directory itself: the checkpoint folder
# lives inside `model_location`, so the directory can exist (for example after
# an interrupted run) without containing a loadable model.
if (Path(model_location) / "config.json").is_file():
    model = AutoModelForSequenceClassification.from_pretrained(model_location)
else:
    trainer.train()

In [None]:
# Only write the model when this session actually trained it. If the model was
# loaded from `model_location` instead, `trainer` still holds the model it was
# constructed with (not fine-tuned), and saving that would overwrite the
# fine-tuned weights on disk.
if trainer.state.global_step > 0:
    trainer.save_model(model_location)

In [None]:
from collections import Counter

# Count how often each label occurs in the main test split.
test_labels = dataset["test"]["label"]
Counter(test_labels)

In [None]:
from evaluate import TextClassificationEvaluator

# One evaluator instance and one label mapping serve all three test splits.
evaluator = TextClassificationEvaluator()
# Maps the label strings "LABEL_0"/"LABEL_1" to the integer class ids.
label_ids = {"LABEL_0": 0, "LABEL_1": 1}

for test_set in ["test", "test.time", "test.loc"]:
    eval_results = evaluator.compute(
        model_or_pipeline=model,
        tokenizer=tokenizer,
        data=dataset[test_set],
        input_column="excerpt",
        label_column="label",
        label_mapping=label_ids,
        metric=metric,
    )
    # Split name on one line, its result dictionary on the next.
    print(test_set, eval_results, sep="\n")

Results for `gelectra-large` (output of the evaluation cell above, one entry per test set):

- **test**: `{'f1': 0.9345238095238095, 'total_time_in_seconds': 33.56910016100028, 'samples_per_second': 16.29474717453078, 'latency_in_seconds': 0.06136947012979941}`
- **test.time**: `{'f1': 0.8907882241215573, 'total_time_in_seconds': 41.59012401600012, 'samples_per_second': 18.08121562010006, 'latency_in_seconds': 0.05530601597872356}`
- **test.loc**: `{'f1': 0.7636363636363636, 'total_time_in_seconds': 23.54653733000032, 'samples_per_second': 20.59750838107598, 'latency_in_seconds': 0.0485495615051553}`