In [1]:
import torch

# Use the first CUDA GPU when PyTorch reports one as available; otherwise run on CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"
device

'cuda:0'

In [1]:
%%capture
from protest_impact.data.protests.detection import load_aglpn_dataset, load_glpn_dataset

# Load both datasets (GLPN first, then AGLPN). The cell starts with %%capture,
# so anything the loaders print is not shown.
glpn, aglpn = load_glpn_dataset(), load_aglpn_dataset()

In [2]:
# Inspect the AGLPN DatasetDict: split names, feature columns and row counts.
aglpn

DatasetDict({
    train: Dataset({
        features: ['text', 'meta', '_input_hash', '_task_hash', 'spans', 'options', 'accept', '_view_id', 'config', 'answer', '_timestamp', 'label'],
        num_rows: 650
    })
    train.positive: Dataset({
        features: ['text', 'meta', 'score', '_input_hash', '_task_hash', 'spans', 'options', 'accept', '_view_id', 'config', 'answer', '_timestamp', 'label'],
        num_rows: 500
    })
    test: Dataset({
        features: ['text', 'meta', '_input_hash', '_task_hash', 'spans', 'options', 'accept', '_view_id', 'config', 'answer', '_timestamp', 'label'],
        num_rows: 500
    })
})

In [10]:
from datasets import DatasetDict, concatenate_datasets
from transformers import AutoModelForSequenceClassification, AutoTokenizer

from protest_impact.data.news import kwic_dataset
from protest_impact.data.protests.detection.train import evaluate_, train_model

In [16]:
# Build the fine-tuning set from the two AGLPN training splits and pass it
# through kwic_dataset (n=4); aglpn["test"] stays held out for evaluation.
train = kwic_dataset(
    concatenate_datasets([aglpn["train"], aglpn["train.positive"]]),
    n=4,
)
dataset = DatasetDict({"train": train})

# Start from the pretrained checkpoint and move the model to the selected device.
model_name = "deepset/gelectra-large"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model_vanilla = AutoModelForSequenceClassification.from_pretrained(model_name)
model_vanilla = model_vanilla.to(device)

# NOTE(review): the run name says "train_and_test", but aglpn["test"] is not in
# this training set, and a later cell reuses the same name for a run that does
# include it. Confirm what train_model does with the name (e.g. output paths)
# and whether the two runs should be named differently.
model = train_model(
    model_vanilla, tokenizer, "aglpn_train_and_test", dataset, n_epochs=6
)

Step,Training Loss


In [18]:
from sklearn.metrics import classification_report
from transformers import pipeline


def evaluate_detail(model, tokenizer, test_set):
    """Classify ``test_set`` with ``model`` and print a per-class report.

    A ``text-classification`` pipeline is created on the module-level
    ``device`` with ``padding="max_length"``, ``truncation=True`` and
    ``max_length=512``. Three things are printed: the mean of the gold labels,
    the mean of the predicted labels, and scikit-learn's
    ``classification_report``.

    Args:
        model: Sequence-classification model handed to ``pipeline``.
        tokenizer: Tokenizer handed to ``pipeline``.
        test_set: Re-iterable collection of examples (it is iterated twice),
            each exposing a ``"text"`` entry and a numeric ``"label"`` entry.

    Returns:
        Tuple ``(predictions, y_pred, y_true)``: the raw pipeline outputs, the
        predicted class ids and the gold class ids.

    Raises:
        ValueError: If ``test_set`` contains no examples.
    """
    y_true = [example["label"] for example in test_set]
    if not y_true:
        # Fail early with a clear message instead of a ZeroDivisionError below.
        raise ValueError("test_set is empty; nothing to evaluate")

    classifier = pipeline(
        "text-classification",
        model=model,
        tokenizer=tokenizer,
        device=device,
        padding="max_length",
        truncation=True,
        max_length=512,
    )
    predictions = list(classifier(example["text"] for example in test_set))

    # Map label names back to class ids through the model config rather than
    # reading the last character of the name: that only works for the default
    # "LABEL_0".."LABEL_9" names and breaks for custom names or >= 10 classes.
    name_to_id = {
        name: int(class_id)
        for class_id, name in classifier.model.config.id2label.items()
    }
    y_pred = [name_to_id[prediction["label"]] for prediction in predictions]

    print(sum(y_true) / len(y_true))
    print(sum(y_pred) / len(y_pred))
    print(classification_report(y_true, y_pred))
    return predictions, y_pred, y_true


# Score the fine-tuned model on the AGLPN test split, preprocessed with the same
# kwic_dataset(n=4) call that is applied to the training data. The trailing
# semicolon keeps the returned tuple out of the cell output.
evaluate_detail(model, tokenizer, kwic_dataset(aglpn["test"], n=4));

  0%|          | 0/500 [00:00<?, ?ex/s]

0.154
0.144
              precision    recall  f1-score   support

           0       0.96      0.97      0.96       423
           1       0.81      0.75      0.78        77

    accuracy                           0.93       500
   macro avg       0.88      0.86      0.87       500
weighted avg       0.93      0.93      0.93       500



In [21]:
# NOTE(review): the model was fine-tuned on kwic_dataset(..., n=4) output, but
# glpn["test"] is scored here without that preprocessing — confirm this is
# intended before comparing against the AGLPN numbers.
# The trailing semicolon keeps the returned tuple out of the cell output.
evaluate_detail(model, tokenizer, glpn["test"]);

0.603290676416819
0.28153564899451555
              precision    recall  f1-score   support

           0       0.54      0.97      0.69       217
           1       0.96      0.45      0.61       330

    accuracy                           0.66       547
   macro avg       0.75      0.71      0.65       547
weighted avg       0.79      0.66      0.64       547



In [22]:
# NOTE(review): as with glpn["test"], this split is scored without the
# kwic_dataset(..., n=4) preprocessing used for training — confirm intended.
# The trailing semicolon keeps the returned tuple out of the cell output.
evaluate_detail(model, tokenizer, glpn["test.loc"]);

0.18556701030927836
0.061855670103092786
              precision    recall  f1-score   support

           0       0.85      0.98      0.92       395
           1       0.80      0.27      0.40        90

    accuracy                           0.85       485
   macro avg       0.83      0.63      0.66       485
weighted avg       0.84      0.85      0.82       485



In [None]:
# Retrain on every AGLPN split, including aglpn["test"], after kwic_dataset (n=4).
# With the test split folded in, no AGLPN split remains held out for this model.
train = kwic_dataset(
    concatenate_datasets(
        [aglpn["train"], aglpn["train.positive"], aglpn["test"]]
    ),
    n=4,
)
dataset = DatasetDict({"train": train})

# Start again from the pretrained checkpoint; this rebinds `tokenizer`,
# `model_vanilla` and `model` from the earlier training cell.
model_name = "deepset/gelectra-large"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model_vanilla = AutoModelForSequenceClassification.from_pretrained(model_name)
model_vanilla = model_vanilla.to(device)

model = train_model(
    model_vanilla, tokenizer, "aglpn_train_and_test", dataset, n_epochs=6
)

  0%|          | 0/1650 [00:00<?, ?ex/s]

Some weights of the model checkpoint at deepset/gelectra-large were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at deepset/gelectra-large and are newly initialized: ['classifier.dense.bias', 'classifie

In [None]:
from transformers import pipeline

# Same pipeline settings as in evaluate_detail, this time kept at notebook
# level so the classifier can be reused for bulk prediction.
pipeline_options = dict(
    model=model,
    tokenizer=tokenizer,
    device=device,
    padding="max_length",
    truncation=True,
    max_length=512,
)
classifier = pipeline("text-classification", **pipeline_options)

In [None]:
from datasets import load_dataset

from protest_impact.util import project_root

# Load the news file as a single split named "main", using project_root as the
# dataset path.
data = load_dataset(
    str(project_root), data_files=dict(main="protest_news_shuffled_v2.jsonl")
)

Using custom data configuration protest-impact-data-55e8ce6b4a6effe8


Downloading and preparing dataset json/protest-impact-data to /root/.cache/huggingface/datasets/json/protest-impact-data-55e8ce6b4a6effe8/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating main split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/protest-impact-data-55e8ce6b4a6effe8/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
# Classify every text of the "main" split with the pipeline built above.
# NOTE(review): the texts are passed as-is, whereas the model was fine-tuned on
# kwic_dataset(..., n=4) output — confirm this difference is intended.
predictions = classifier(data["main"]["text"])

In [None]:
import json

# Persist the predictions as JSON Lines: one serialized record per line.
with open("predictions.jsonl", "w") as out_file:
    for prediction in predictions:
        out_file.write(json.dumps(prediction) + "\n")

In [None]:
# Number of texts predicted as LABEL_1, shown together with their share of all
# predictions.
n = sum(1 for a in predictions if a["label"] == "LABEL_1")
n, n / len(predictions)

(20879, 0.10787059042344334)

In [None]:
# Row positions of the predictions, split by predicted label.
predicted_labels = [p["label"] for p in predictions]
idx_pos = [i for i, label in enumerate(predicted_labels) if label == "LABEL_1"]
idx_neg = [i for i, label in enumerate(predicted_labels) if label == "LABEL_0"]

In [None]:
import random

from protest_impact.data.news import kwic

# Print 10 positives picked by a fixed-seed shuffle, each passed through
# kwic(n=4), with a separator after every entry.
pos = data["main"][idx_pos]["text"]
random.seed(20230212)
random.shuffle(pos)
for pos_text in pos[:10]:
    print(kwic(pos_text, n=4), "\n\n---\n")

In [None]:
# Print 30 negatives picked by a fixed-seed shuffle (same seed as the positive
# sample); the texts are printed in full, with a separator after every entry.
neg = data["main"][idx_neg]["text"]
random.seed(20230212)
random.shuffle(neg)
# The loop variable is deliberately not called `n`: that name holds the LABEL_1
# count computed in an earlier cell and would otherwise be overwritten by a
# text string.
for neg_text in neg[:30]:
    print(neg_text, "\n\n---\n")