Autor@ Félix Fautsch
### 'Superset' Dataset Ver1.0: 14.05.2025
Creation of a superset dataset consisting of GAHD(2024), HASOC(2019), Bretschneider(2017), IWG(2016), RP-Mod & RP-Crowd(2021) and HOCON34k(2025).
Initial superset (no HOCON34k) has been forked from Huggingface: https://huggingface.co/datasets/manueltonneau/german-hate-speech-superset

Notes: In future versions maybe remove RP-Crowd (As of 16.05.2025 RP-Mod & RP-Crowd has been completely removed)

In [2]:
import pandas as pd
import fsspec
import re
from bs4 import BeautifulSoup
from sklearn.model_selection import train_test_split
import emoji
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, fbeta_score, matthews_corrcoef
import joblib

#### Functions for cleaning the Datasets

In [3]:
def normalize_split_words(text):
    def fix_word(word):
        if re.fullmatch(r'(?:[a-zA-Z][\W_]{0,2}){2,}[a-zA-Z]', word):
            return re.sub(r'[\W_]+', '', word)
        else:
            return word

    words = text.split()
    words = [fix_word(word) for word in words]
    return ' '.join(words)

def remove_emojis(text):
    return emoji.replace_emoji(text, replace='')

def clean_text(text):
    text = BeautifulSoup(text, "html.parser").get_text()
    text = re.sub(r'http\S+|www\S+', '', text)
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'#', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    text = remove_emojis(text)
    text = normalize_split_words(text)
    return text

#### Datasets

In [4]:
# ------- Huggingface Dataset -------
path = "hf://datasets/manueltonneau/german-hate-speech-superset/de_hf_112024.csv" # Huggingface Token needed
with fsspec.open(path, mode="rt") as f:
    df_hf = pd.read_csv(f)
df_hf = df_hf[df_hf['dataset'] != 'RP-mod-crowd'] # eliminating RP-mod-crowd dataset
df_hf = df_hf.dropna(subset=["labels"])
df_hf["labels"] = df_hf["labels"].astype(int)
df_hf = df_hf.rename(columns={"labels": "label"})
df_hf = df_hf[["text", "label"]]
df_hf["text"] = df_hf["text"].apply(clean_text)


# ------- HOCON34k Dataset -------
df_hocon34k = pd.read_csv("/Users/felixfautsch/VS_Python/Projektstudium/hatespeech_hocon34k.csv")
print(df_hf.head())
print(df_hocon34k.head())
df_hocon34k = df_hocon34k[["text", "label_hs", "split_all"]]
df_hocon34k = df_hocon34k.rename(columns={"label_hs": "label"})
df_hocon34k["text"] = df_hocon34k["text"].apply(clean_text)
df_hocon34k = df_hocon34k[df_hocon34k["split_all"].isin(["train", "val", "test"])]
df_hocon34k = df_hocon34k.reset_index(drop=True)
print("-" * 90,"\nDatasets after cleaning")
print(df_hf.head())
print(df_hocon34k.head())


# ------- Splitting Huggingface Dataset -------
df_hf = df_hf.sample(frac=1, random_state=42).reset_index(drop=True)
train_df, temp_df = train_test_split(df_hf, test_size=0.2, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)
train_df["split_all"] = "train"
val_df["split_all"] = "val"
test_df["split_all"] = "test"
df_hf = pd.concat([train_df, val_df, test_df], ignore_index=True)
print("-" * 90,"\nDatasets after splitting")
print(df_hf.head())
print(df_hocon34k.head())


# ------- Combining both datasets -------
df_superset = pd.concat([df_hf,df_hocon34k], ignore_index=True).reset_index(drop=True)
df_superset["text"] = df_superset["text"].astype(str)
df_superset["label"] = df_superset["label"].astype(int)  
df_superset = df_superset.drop(columns=["split_all"])
print(df_superset["label"].dtype)
print(pd.Series(df_superset["label"]).unique())
print("-" * 90,"\nSuperset")
print(df_superset.head())
print("-" * 90,f"\n{df_superset["label"].value_counts().sort_index()}")


# ------- Saving the Superset -------
#df_superset.to_csv("/Users/felixfautsch/VS_Python/Projektstudium/superset.csv", index=False) 


# ------- Balanced Dataset -------
df_hatespeech = df_superset[df_superset["label"] == 1][["text", "label"]] 
df_nohatespeech = df_superset[df_superset["label"] == 0].sample(frac=0.4, random_state=42) # 29.295 (with RP-mod-crowd) | 18108 (without RP-mod-crowd)
df_balanced = pd.concat([df_hatespeech, df_nohatespeech]).sample(frac=1, random_state=42)
print("-" * 90,f"\nBalanced Distribution:\n{df_balanced["label"].value_counts().sort_index()}")
train_df, temp_df = train_test_split(df_balanced, test_size=0.2, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)
print("-" * 90,f"\nBalanced Train Distribution:\n{train_df["label"].value_counts(normalize=True)}")


# ------- Saving the balanced Dataset -------
#df_balanced.to_csv("/Users/felixfautsch/VS_Python/Projektstudium/balanced.csv", index=False) 

  text = BeautifulSoup(text, "html.parser").get_text()


                                                text  label
0       gleich an die wand stellen und erschiessen..      1
1  nicht dass ich der Grundbotschaft dieses Posts...      0
2  Das mit dem "an die Wand stellen und erschiessen"      0
3  Seit dem "an die Wand stellen und erschiessen"...      0
4  Ja ja die Kriminelle Heimatpartei FPÖ von Kind...      0
   newspaper_id    post_id  annotator_id  phase split_all split_12  \
0             6  463609874            34      2       NaN      NaN   
1             6  463609874            35      2       NaN      NaN   
2             6  463609874            36      2       NaN      NaN   
3             6  463609874            37      2       NaN      NaN   
4             6  463609874            38      2       NaN      NaN   

                         label  label_hs  label_context  \
0  Hatespeech (enough context)         1              1   
1  Hatespeech (enough context)         1              1   
2  Hatespeech (enough context)         1  

  text = BeautifulSoup(text, "html.parser").get_text()


------------------------------------------------------------------------------------------ 
Datasets after cleaning
                                                text  label
0       gleich an die wand stellen und erschiessen..      1
1  nicht dass ich der Grundbotschaft dieses Posts...      0
2  Das mit dem "an die Wand stellen und erschiessen"      0
3  Seit dem "an die Wand stellen und erschiessen"...      0
4  Ja ja die Kriminelle Heimatpartei FPÖ von Kind...      0
                                                text  label split_all
0  Das würde ich auch befürworten. Jedochhier geh...      0     train
1  Das finde ich auch. Je kleiner das Hirn, desto...      1     train
2  Was wäre daran schlimmdiese Subjekte ins Gulag...      1     train
3      irgentwas ist dran, daß Löwen a Katzenart is.      0     train
4  Nicht die Schwachmaten und Namenstänzer von Li...      0     train
------------------------------------------------------------------------------------------ 
Datasets aft

#### Quick Logistic Regression with n-gram(1,3) vectorization
S-Score of 0.7468997106112016 on the HOCON34k split_all testset

In [5]:
train_numpy = np.array(train_df)
m, n = train_numpy.shape # m=23018(texts) n=2(labels)
print(train_numpy[:,0]) # all rows from column 0
print(train_numpy[:,1]) # all rows from column 1
texts_train = train_numpy[:,0] # texts
labels_train = train_numpy[:,1] # labels
labels_train = labels_train.astype(int)

val_numpy = np.array(val_df)
m, n = train_numpy.shape 
texts_val = val_numpy[:,0]
labels_val = val_numpy[:,1]
labels_val = labels_val.astype(int)

test_numpy = np.array(test_df)
m, n = test_numpy.shape 
texts_test = test_numpy[:,0]
labels_test = test_numpy[:,1]
labels_test = labels_test.astype(int)

vectorizer = TfidfVectorizer(ngram_range=(1, 3))
vectors_train = vectorizer.fit_transform(texts_train)
vectors_val = vectorizer.transform(texts_val)
vectors_test = vectorizer.transform(texts_test)

def compute_metrics(labels, preds):
    accuracy = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    f2 = fbeta_score(labels, preds, beta=2, average='binary')
    mcc = matthews_corrcoef(labels, preds)
    mcc_normalized = (mcc + 1) / 2
    S = (f2 + mcc_normalized) / 2

    metrics = {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "f2": f2,
        "mcc": mcc,
        "mcc_normalized": mcc_normalized,
        "S": S,
    }

    return metrics

model = LogisticRegression(max_iter=10000,class_weight='balanced',multi_class='ovr')
model.fit(vectors_train, labels_train)
joblib.dump(model, 'balanced_superset_logreg.pkl')

y_pred = model.predict(vectors_val)
#print(classification_report(labels_val, y_pred))
metrics_val = compute_metrics(labels_val, y_pred)
print(metrics_val)
loaded_model = joblib.load('balanced_superset_logreg.pkl')
y_pred = loaded_model.predict(vectors_test)
metrics_test = compute_metrics(labels_test, y_pred)
print(metrics_test)


# ------- baseline test (on standard hocon34k test split_all) -------
baseline_test = df_hocon34k[df_hocon34k["split_all"].isin(["test"])]
baseline_test = baseline_test.drop(columns="split_all")
baseline_numpy = np.array(baseline_test)
texts_baseline = baseline_numpy[:,0]
labels_baseline = baseline_numpy[:,1]
labels_baseline = labels_baseline.astype(int)
vectors_baseline = vectorizer.transform(texts_baseline)
y_pred = loaded_model.predict(vectors_baseline)
metrics_test = compute_metrics(labels_baseline, y_pred)
print("-" * 90,f"\nBaseline Test:\n{metrics_test}")
satz = "Das Wetter ist heute wirklich schön."
satz_vector = vectorizer.transform([satz])  # needs to be wrapped inside of a list
print(loaded_model.predict(satz_vector))
prob = loaded_model.predict_proba(satz_vector)
print("Prediction:", loaded_model.predict(satz_vector)[0])
print("Probability:", prob[0])

['Danke für Info. Sollte die Redaktion von RO24 auch lesen und entsprechend journalistisch "reagieren" und das Foto jetzt nicht verwenden um nicht falsche Eindrücke von den Regierenden zu vermitteln. Gehört sich nicht in Bayern und auch woanders.'
 'Egal, was die Leute sagen, ich weigere mich zu akzeptieren, dass schwarze Affen den Weißen gleichgestellt sind'
 'Das ist Achmed. Achmed ist IstanbulTerrorist und Mehrfachmörder. Früher war Ahmed Flüchtling in Österreich. NurSo LINK'
 ...
 'Das ist richtig, nur was ist Ihr Schluss daraus? Afghanen nach Österreich oder Belgien abschieben? Das DublinAbkommen war eine Farce und hat u.a. auch Deutschland ein Alibi gegeben, die Länder an der Außengrenze im Stich zu lassen. Es hat nicht mehr funktioniert, als die Länder an den Außengrenzen (nachvollziehbarerweise) nicht mehr alles alleine schultern wollten.'
 'Und wieder hat es nichts mit Otte zu tun, bestimmt zum 20 Mal. Sie sind nichts weiter als ein Troll-Clown, der unfähig ist, in der Sache z



{'accuracy': 0.6934306569343066, 'precision': 0.5796862097440132, 'recall': 0.6530232558139535, 'f1': 0.6141732283464567, 'f2': 0.6369080021774632, 'mcc': 0.3631029724077295, 'mcc_normalized': 0.6815514862038647, 'S': 0.659229744190664}
{'accuracy': 0.6883252258512856, 'precision': 0.5557418273260687, 'recall': 0.6436893203883495, 'f1': 0.5964912280701754, 'f2': 0.6239412761151891, 'mcc': 0.34728592946714726, 'mcc_normalized': 0.6736429647335737, 'S': 0.6487921204243814}
------------------------------------------------------------------------------------------ 
Baseline Test:
{'accuracy': 0.8247151621384751, 'precision': 0.46010362694300516, 'recall': 0.8489483747609943, 'f1': 0.5967741935483871, 'f2': 0.7262021589793916, 'mcc': 0.5351945244860234, 'mcc_normalized': 0.7675972622430117, 'S': 0.7468997106112016}
[0]
Prediction: 0
Probability: [0.6956111 0.3043889]


#### Analysis of the logistic regression model

In [6]:
print(model.coef_) # (classes, weights) weights for every feature, in this case there are 305653 x terms and corresponding weights
print(model.coef_.shape) 

feature_names = vectorizer.get_feature_names_out()
coefficients = model.coef_[0]
top_features = sorted(zip(coefficients, feature_names), key=lambda x: abs(x[0]), reverse=True)
for coef, feat in top_features[:20]:
    print(f"{feat}: {coef:.4f}")

print(model.intercept_)

print(model.classes_)  

print(model.get_params())

[[-0.02029426 -0.05669088 -0.05669088 ... -0.04630341 -0.04630341
  -0.04630341]]
(1, 693043)
link: -9.3790
frauen: 6.2110
sind: 4.7105
du: 4.1511
ausländer: 4.1418
schwarze: 3.7550
juden: 3.3737
sollten: 3.2935
menschen: 3.2200
muslime: 3.1012
schwarzen: 2.9493
alle: 2.7297
einwanderer: 2.6533
diese: 2.6332
pack: 2.6191
das: -2.6056
auch: -2.5703
sie: 2.5313
lügenpresse: 2.3744
würde: 2.3639
[-0.34516065]
[0 1]
{'C': 1.0, 'class_weight': 'balanced', 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 10000, 'multi_class': 'ovr', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}


#### simple BERT

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import Dataset, DataLoader
import torch
from torch.optim import AdamW
from torch.nn import functional as F
from tqdm import tqdm
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, fbeta_score, matthews_corrcoef


MODEL_NAME = 'bert-base-german-cased'


# ------- Dataset -------
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = int(self.labels[idx])
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
            return_attention_mask=True
        )

        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(label, dtype=torch.long)
        }

train_dataset = TextDataset(texts_train, labels_train, tokenizer)
val_dataset = TextDataset(texts_val, labels_val, tokenizer)
test_dataset = TextDataset(texts_test, labels_test, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)
test_loader = DataLoader(test_dataset, batch_size=16)


# ------- Training -------
model = BertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

def compute_metrics(labels, preds):
    accuracy = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    f2 = fbeta_score(labels, preds, beta=2, average='binary')
    mcc = matthews_corrcoef(labels, preds)
    mcc_normalized = (mcc + 1) / 2
    S = (f2 + mcc_normalized) / 2

    metrics = {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "f2": f2,
        "mcc": mcc,
        "mcc_normalized": mcc_normalized,
        "S": S,
    }

    return metrics

def evaluate(model, data_loader, device):
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    metrics = compute_metrics(all_labels, all_preds)
    return metrics

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

optimizer = AdamW(model.parameters(), lr=2e-5)

epochs = 3
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in tqdm(train_loader, desc=f"Training Epoch {epoch+1}"):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    print(f"\nEpoch {epoch+1} | Train Loss: {avg_loss:.4f}")

    val_metrics = evaluate(model, val_loader, device)
    print(f"Validation Metrics: {val_metrics}")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Training Epoch 1:   0%|          | 1/1439 [00:40<16:04:36, 40.25s/it]


KeyboardInterrupt: 