In [2]:
from datasets import load_dataset

dataset = load_dataset("grammarly/detexd-benchmark", split='test')
print(dataset)
print(dataset.to_pandas().label.value_counts())

Dataset({
    features: ['text', 'annotator_1', 'annotator_2', 'annotator_3', 'label'],
    num_rows: 1023
})
label
0    687
1    336
Name: count, dtype: int64


In [None]:
# Download hatebert models
# https://arxiv.org/pdf/2010.12472.pdf
# https://osf.io/tbd58/
!wget https://files.de-1.osf.io/v1/resources/tbd58/providers/osfstorage/?zip= -O hatebert.zip
!mkdir hatebert
!unzip hatebert.zip -d hatebert
!rm hatebert.zip

!unzip hatebert/HateBERT_fine_tuned_models/HateBERT_abuseval.zip -d hatebert/HateBERT_fine_tuned_models
!unzip hatebert/HateBERT_fine_tuned_models/HateBERT_hateval.zip -d hatebert/HateBERT_fine_tuned_models
!unzip hatebert/HateBERT_fine_tuned_models/HateBERT_offenseval.zip -d hatebert/HateBERT_fine_tuned_models
!rm hatebert/HateBERT_fine_tuned_models/HateBERT_abuseval.zip
!rm hatebert/HateBERT_fine_tuned_models/HateBERT_hateval.zip
!rm hatebert/HateBERT_fine_tuned_models/HateBERT_offenseval.zip

In [None]:
from transformers import pipeline
from sklearn.metrics import precision_recall_fscore_support
from tqdm.auto import tqdm
from sklearn.metrics import precision_recall_curve, f1_score
import numpy as np
from transformers.pipelines.pt_utils import KeyDataset
import pandas as pd

metrics = []
for name in tqdm(['hatebert/HateBERT_fine_tuned_models/HateBERT_abuseval',
                  'hatebert/HateBERT_fine_tuned_models/HateBERT_hateval',
                  'hatebert/HateBERT_fine_tuned_models/HateBERT_offenseval']):
    pipe = pipeline("text-classification", model=name, device=0, batch_size=8)
    pipe.model.config.id2label = [0, 1]
    preds = tqdm(pipe(KeyDataset(dataset, 'text'), truncation=True, top_k=None), total=len(dataset))
    scores = np.array([next(p['score']
                            for p in pr if p['label'] == 1)
                       for pr in preds])

    precision, recall, thresholds = precision_recall_curve(dataset['label'], scores)
    f_scores = 2*(precision*recall)/(precision+recall)
    optimal_threshold_index = np.argmax(f_scores)
    optimal_threshold = thresholds[optimal_threshold_index]
    for tag, threshold in [('', 0.5), ('_opt', optimal_threshold)]:
        preds = scores > threshold
        metrics.append((name + tag,) + precision_recall_fscore_support(dataset['label'], preds, average='binary')[:-1])
    
metrics = pd.DataFrame(metrics, columns=['model', 'precision', 'recall', 'f1'])
metrics.model = metrics.model.str.split('/').str[-1]

In [4]:
metrics.style.format('{:.1%}', subset=['precision', 'recall', 'f1'])

Unnamed: 0,model,precision,recall,f1
0,HateBERT_abuseval,86.7%,11.6%,20.5%
1,HateBERT_abuseval_opt,57.0%,70.2%,62.9%
2,HateBERT_hateval,95.2%,6.0%,11.2%
3,HateBERT_hateval_opt,41.1%,86.0%,55.6%
4,HateBERT_offenseval,75.4%,31.0%,43.9%
5,HateBERT_offenseval_opt,60.1%,72.6%,65.8%


In [5]:
# evaluate HateBERT on HatEval (Basile et al., 2019) (reported f-score was .645±.001)
# run this first: founta_basile_comparison.ipynb
import datasets

df = pd.read_csv('basile_data/preds.csv')
pipe = pipeline("text-classification", model='hatebert/HateBERT_fine_tuned_models/HateBERT_hateval', device=0, batch_size=8)
pipe.model.config.id2label = [0, 1]
preds = tqdm(pipe(KeyDataset(datasets.Dataset.from_pandas(df), 'text'), truncation=True), total=len(df))
preds = [p['label'] for p in preds]
df['hateval_pred'] = preds
df.to_csv('basile_data/preds.csv', index=False)
print(dict(zip(['precision', 'recall', 'f-score'],
               [f'{x:.1%}' for x in precision_recall_fscore_support(df.real, preds, average='binary')[:-1]])))

  0%|          | 0/2805 [00:00<?, ?it/s]

{'precision': '48.3%', 'recall': '96.4%', 'f-score': '64.3%'}


In [19]:
# Note, there's a mismatch with the paper: 64.3% != .645
# possible reasons?
# 1) some floating-point fluctuations, different versions of torch, etc
# 2) two positive classes are treated separately