# Training custom BERT-BiGRU on the Balanced Superset Version 1.5 With Cross-Validation

Autor@ Félix Fautsch
### 'Superset' Dataset Ver1.5: 21.05.2025
Creation of a superset dataset consisting of GAHD(2024), HASOC(2019), Bretschneider(2017), IWG(2016), RP-Mod & RP-Crowd(2021), DeTox(2022) and HOCON34k(2025).
Initial superset (no HOCON34k) has been forked from Huggingface: https://huggingface.co/datasets/manueltonneau/german-hate-speech-superset

Notes: As of 16.05.2025 RP-Mod & RP-Crowd has been completely removed

We have taken only the comments from DeTox with a hatespeech value > 0.2.

The inspiration for the architecture comes from the following research paper: <br>
Islam, S., Hosen, M. J., Taznin, F. R., Sultana, N., Haque, M. I., & Rana, S. (2024, June). An Efficient Framework for Transliteration Sentence Identification of Low Resource Languages Using Hybrid BERT-BiGRU. In 2024 15th International Conference on Computing Communication and Networking Technologies (ICCCNT) (pp. 1-8). IEEE.

In [None]:
%pip install emoji

Collecting emoji
  Downloading emoji-2.14.1-py3-none-any.whl.metadata (5.7 kB)
Downloading emoji-2.14.1-py3-none-any.whl (590 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/590.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.9/590.6 kB[0m [31m3.5 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m583.7/590.6 kB[0m [31m10.1 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m590.6/590.6 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji
Successfully installed emoji-2.14.1


In [None]:
import pandas as pd
import fsspec
import re
from bs4 import BeautifulSoup
from sklearn.model_selection import train_test_split
import emoji
import numpy as np
import torch
import torch.nn as nn
from transformers import BertTokenizer, BertForSequenceClassification, BertModel
from torch.utils.data import Dataset, DataLoader
import torch
from torch.optim import AdamW
from torch.nn import functional as F
from tqdm import tqdm
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, fbeta_score, matthews_corrcoef
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, fbeta_score, matthews_corrcoef, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import os

#### Functions for cleaning the Datasets

In [None]:
def normalize_split_words(text):
    def fix_word(word):
        if re.fullmatch(r'(?:[a-zA-Z][\W_]{0,2}){2,}[a-zA-Z]', word):
            return re.sub(r'[\W_]+', '', word)
        else:
            return word

    words = text.split()
    words = [fix_word(word) for word in words]
    return ' '.join(words)

def remove_emojis(text):
    return emoji.replace_emoji(text, replace='')

def clean_text(text):
    text = BeautifulSoup(text, "html.parser").get_text()
    text = re.sub(r'http\S+|www\S+', '', text)
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'#', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    text = remove_emojis(text)
    text = normalize_split_words(text)
    return text

#### Datasets

In [None]:
# ------- Balanced Dataset -------
df_balanced = pd.read_csv("/content/balanced_ver1.5.csv")
print(df_balanced.head())
print("-" * 90,f"\nBalanced Distribution:\n{df_balanced['label'].value_counts().sort_index()}")
train_df, temp_df = train_test_split(df_balanced, test_size=0.2, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)
print("-" * 90,f"\nBalanced Train Distribution:\n{train_df['label'].value_counts(normalize=True)}")

                                                text  label
0  Ich hatte damals Russisch anstatt Latein genom...      0
1  [user] sollen Sie dir die Adresse hinschreiben...      0
2  Jeder Abgeordnete, jeder Minister müßte eine F...      0
3  Hauptsache wir Deutschen werden lückenlos über...      0
4  Die ganze Bereicherung verdanken wir nur chris...      1
------------------------------------------------------------------------------------------ 
Balanced Distribution:
label
0    18108
1    13541
Name: count, dtype: int64
------------------------------------------------------------------------------------------ 
Balanced Train Distribution:
label
0    0.572771
1    0.427229
Name: proportion, dtype: float64


#### Quick Logistic Regression with n-gram(1,3) vectorization
S-Score of 0.7468997106112016 on the HOCON34k split_all testset

In [None]:
train_numpy = np.array(train_df)
m, n = train_numpy.shape # m=23018(texts) n=2(labels)
print(train_numpy[:,0]) # all rows from column 0
print(train_numpy[:,1]) # all rows from column 1
texts_train = train_numpy[:,0] # texts
labels_train = train_numpy[:,1] # labels
labels_train = labels_train.astype(int)

val_numpy = np.array(val_df)
m, n = train_numpy.shape
texts_val = val_numpy[:,0]
labels_val = val_numpy[:,1]
labels_val = labels_val.astype(int)

test_numpy = np.array(test_df)
m, n = test_numpy.shape
texts_test = test_numpy[:,0]
labels_test = test_numpy[:,1]
labels_test = labels_test.astype(int)

texts_train = texts_train.astype(str)
texts_val = texts_val.astype(str)
texts_test = texts_test.astype(str)

vectorizer = TfidfVectorizer(ngram_range=(1, 3))
vectors_train = vectorizer.fit_transform(texts_train)
vectors_val = vectorizer.transform(texts_val)
vectors_test = vectorizer.transform(texts_test)

def compute_metrics(labels, preds):
    accuracy = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    f2 = fbeta_score(labels, preds, beta=2, average='binary')
    mcc = matthews_corrcoef(labels, preds)
    mcc_normalized = (mcc + 1) / 2
    S = (f2 + mcc_normalized) / 2

    metrics = {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "f2": f2,
        "mcc": mcc,
        "mcc_normalized": mcc_normalized,
        "S": S,
    }

    return metrics

model = LogisticRegression(max_iter=10000,class_weight='balanced',multi_class='ovr')
model.fit(vectors_train, labels_train)
joblib.dump(model, 'balanced_superset_logreg.pkl')

y_pred = model.predict(vectors_val)
#print(classification_report(labels_val, y_pred))
metrics_val = compute_metrics(labels_val, y_pred)
print(metrics_val)
loaded_model = joblib.load('balanced_superset_logreg.pkl')
y_pred = loaded_model.predict(vectors_test)
metrics_test = compute_metrics(labels_test, y_pred)
print(metrics_test)


# ------- baseline test (on standard hocon34k test split_all) -------
baseline_test = df_hocon34k[df_hocon34k["split_all"].isin(["test"])]
baseline_test = baseline_test.drop(columns="split_all")
baseline_numpy = np.array(baseline_test)
texts_baseline = baseline_numpy[:,0]
labels_baseline = baseline_numpy[:,1]
labels_baseline = labels_baseline.astype(int)
vectors_baseline = vectorizer.transform(texts_baseline)
y_pred = loaded_model.predict(vectors_baseline)
metrics_test = compute_metrics(labels_baseline, y_pred)
print("-" * 90,f"\nBaseline Test:\n{metrics_test}")
satz = "Das Wetter ist heute wirklich schön."
satz_vector = vectorizer.transform([satz])  # needs to be wrapped inside of a list
print(loaded_model.predict(satz_vector))
prob = loaded_model.predict_proba(satz_vector)
print("Prediction:", loaded_model.predict(satz_vector)[0])
print("Probability:", prob[0])

['Kimmich ist aus meiner Sicht ein guter Durchschnittsspieler und einer der Probleme. Als AV zu langsam im Mittelfeld nichts besonderes. Warum soll der Kapitän sein? Weil er es selber will - für mich ein Wichtigtuer. Bayern ist kein Gramm schlechter wenn er nicht spielt. Kapitän wird man wegen der Persönlichkeit und Klasse , oder weil es der Trainer will.'
 'Geehrter HansJoachim "Unterlage" Elflein, Unterlagen sind kein Unfug.'
 'Kaffee kann nicht schaden. Pitis Geschwafel ist ziemlich ermüdend ;-)'
 ... 'Hindus sind eine unterdurchschnittliche Art'
 'Hab das heute morgen auch gehört und dachte schon, meine Nachbarin hat wieder mal einen AsthmaAnfall ...'
 'Ja man kann eben nicht in ein paar Wochen aufholen was man Monate lang vernachlässigt hat. Zudem was man aus Trainingsberichten von Fans hier so liest, scheint das Trainerteam bei den Übungen ja eh nur stumm daneben zu stehen.']
[0 0 0 ... 1 0 0]




{'accuracy': 0.70521327014218, 'precision': 0.6345332519829164, 'recall': 0.7569141193595342, 'f1': 0.6903418519747759, 'f2': 0.728801681850035, 'mcc': np.float64(0.41904842860666736), 'mcc_normalized': np.float64(0.7095242143033337), 'S': np.float64(0.7191629480766843)}
{'accuracy': 0.6900473933649289, 'precision': 0.6145251396648045, 'recall': 0.7333333333333333, 'f1': 0.668693009118541, 'f2': 0.7060333761232349, 'mcc': np.float64(0.387002384431295), 'mcc_normalized': np.float64(0.6935011922156475), 'S': np.float64(0.6997672841694412)}


NameError: name 'df_hocon34k' is not defined

#### Analysis of the logistic regression model

In [None]:
print(model.coef_) # (classes, weights) weights for every feature, in this case there are 305653 x terms and corresponding weights
print(model.coef_.shape)

feature_names = vectorizer.get_feature_names_out()
coefficients = model.coef_[0]
top_features = sorted(zip(coefficients, feature_names), key=lambda x: abs(x[0]), reverse=True)
for coef, feat in top_features[:20]:
    print(f"{feat}: {coef:.4f}")

print(model.intercept_)

print(model.classes_)

print(model.get_params())

[[-0.05437683 -0.06179214 -0.06179214 ...  0.07137101  0.07137101
   0.07137101]]
(1, 784176)
link: -10.9564
querdenker: 6.0836
frauen: 5.4967
sind: 4.5355
du: 4.1007
ausländer: 3.4787
juden: 3.4527
schwarze: 3.3897
menschen: 3.2251
ihr: 3.2123
muslime: 2.9844
schwarzen: 2.9596
fresse: 2.9505
alle: 2.9490
haltdiefressebild: 2.9409
nazi: 2.8984
baerbock: 2.8255
fckafd: 2.8194
sollten: 2.7924
nazis: 2.7714
[-0.38286147]
[0 1]
{'C': 1.0, 'class_weight': 'balanced', 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 10000, 'multi_class': 'ovr', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}


#### simple BERT

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import Dataset, DataLoader
import torch
from torch.optim import AdamW
from torch.nn import functional as F
from tqdm import tqdm
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, fbeta_score, matthews_corrcoef


MODEL_NAME = 'bert-base-german-cased'


# ------- Dataset -------
train_numpy = np.array(train_df)
m, n = train_numpy.shape
print(train_numpy[:,0]) # all rows from column 0
print(train_numpy[:,1]) # all rows from column 1
texts_train = train_numpy[:,0] # texts
labels_train = train_numpy[:,1] # labels
labels_train = labels_train.astype(int)

val_numpy = np.array(val_df)
m, n = train_numpy.shape
texts_val = val_numpy[:,0]
labels_val = val_numpy[:,1]
labels_val = labels_val.astype(int)

test_numpy = np.array(test_df)
m, n = test_numpy.shape
texts_test = test_numpy[:,0]
labels_test = test_numpy[:,1]
labels_test = labels_test.astype(int)

texts_train = texts_train.astype(str)
texts_val = texts_val.astype(str)
texts_test = texts_test.astype(str)

tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = int(self.labels[idx])
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
            return_attention_mask=True
        )

        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(label, dtype=torch.long)
        }

train_dataset = TextDataset(texts_train, labels_train, tokenizer)
val_dataset = TextDataset(texts_val, labels_val, tokenizer)
test_dataset = TextDataset(texts_test, labels_test, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)
test_loader = DataLoader(test_dataset, batch_size=16)


# ------- Training -------
model = BertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

def compute_metrics(labels, preds):
    accuracy = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    f2 = fbeta_score(labels, preds, beta=2, average='binary')
    mcc = matthews_corrcoef(labels, preds)
    mcc_normalized = (mcc + 1) / 2
    S = (f2 + mcc_normalized) / 2

    metrics = {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "f2": f2,
        "mcc": mcc,
        "mcc_normalized": mcc_normalized,
        "S": S,
    }

    return metrics

def evaluate(model, data_loader, device):
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    metrics = compute_metrics(all_labels, all_preds)
    return metrics

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
model.to(device)

optimizer = AdamW(model.parameters(), lr=2e-5)

epochs = 1
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in tqdm(train_loader, desc=f"Training Epoch {epoch+1}"):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    print(f"\nEpoch {epoch+1} | Train Loss: {avg_loss:.4f}")

    val_metrics = evaluate(model, val_loader, device)
    print(f"Validation Metrics: {val_metrics}")

# Create a directory to save the model
save_directory = "./bert-superset1.5-classification"

# Save model and tokenizer
model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)

print(f"Model saved to {save_directory}")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


cuda


Training Epoch 1: 100%|██████████| 1583/1583 [37:07<00:00,  1.41s/it]



Epoch 1 | Train Loss: 0.5185


Evaluating: 100%|██████████| 198/198 [01:32<00:00,  2.14it/s]


Validation Metrics: {'accuracy': 0.7605055292259084, 'precision': 0.735474006116208, 'recall': 0.7001455604075691, 'f1': 0.7173750932140194, 'f2': 0.7069370958259847, 'mcc': np.float64(0.5102732476479005), 'mcc_normalized': np.float64(0.7551366238239503), 'S': np.float64(0.7310368598249675)}
Model saved to ./bert-superset1.5-classification


### BERT-CNN

In [None]:
torch.cuda.empty_cache()

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

MODEL_NAME = 'bert-base-german-cased'
NUM_FOLDS = 5
EPOCHS = 1
BATCH_SIZE = 64
MAX_LEN = 512

# Tokenizer
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

# Model
class BERT_BiGRU(nn.Module):
    def __init__(self, bert_model_name, num_labels, gru_hidden_size=128, dropout_rate=0.3, class_weights=None):
        super(BERT_BiGRU, self).__init__()

        self.bert = BertModel.from_pretrained(bert_model_name)
        self.gru = nn.GRU(
            input_size=self.bert.config.hidden_size,
            hidden_size=gru_hidden_size,
            num_layers=1,
            batch_first=True,
            bidirectional=True
        )
        self.dropout = nn.Dropout(dropout_rate)
        self.classifier = nn.Linear(gru_hidden_size * 2, num_labels)
        self.class_weights = class_weights

    def forward(self, input_ids, attention_mask, labels=None):
        # Get BERT embeddings
        bert_outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = bert_outputs.last_hidden_state  # shape: (batch_size, seq_len, hidden_size)

        # GRU: shape → (batch_size, seq_len, hidden_size*2)
        gru_output, _ = self.gru(sequence_output)

        # Use the last token’s forward and backward GRU hidden states (or mean/max pooling instead)
        # Here we use the final timestep
        pooled_output = gru_output[:, -1, :]  # shape: (batch_size, hidden_size*2)

        x = self.dropout(pooled_output)
        logits = self.classifier(x)

        if labels is not None:
            loss_fct = nn.CrossEntropyLoss(weight=self.class_weights)
            loss = loss_fct(logits.view(-1, self.classifier.out_features), labels.view(-1))
            return loss, logits

        return logits

# Dataset class
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = int(self.labels[idx])
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
            return_attention_mask=True
        )

        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Metrics
def compute_metrics(labels, preds):
    accuracy = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    f2 = fbeta_score(labels, preds, beta=2, average='binary')
    mcc = matthews_corrcoef(labels, preds)
    mcc_normalized = (mcc + 1) / 2
    S = (f2 + mcc_normalized) / 2
    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "f2": f2,
        "mcc": mcc,
        "mcc_normalized": mcc_normalized,
        "S": S,
    }

# Evaluation function
def evaluate(model, data_loader, device):
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss, logits = outputs
            preds = torch.argmax(logits, dim=1)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    return compute_metrics(all_labels, all_preds)

# Cross-Validation
texts = df_balanced['text'].values.astype(str)
labels = df_balanced['label'].values.astype(int)

skf = StratifiedKFold(n_splits=NUM_FOLDS, shuffle=True, random_state=42)

for fold, (train_idx, val_idx) in enumerate(skf.split(texts, labels)):
    print(f"\n========== Fold {fold + 1}/{NUM_FOLDS} ==========")

    texts_train, texts_val = texts[train_idx], texts[val_idx]
    labels_train, labels_val = labels[train_idx], labels[val_idx]

    # Compute class weights for training data
    class_counts = np.bincount(labels_train)
    total_samples = len(labels_train)
    num_classes = len(class_counts)
    class_weights = total_samples / (num_classes * class_counts)
    class_weights = torch.tensor(class_weights, dtype=torch.float32).to(device)

    # Datasets and Dataloaders
    train_dataset = TextDataset(texts_train, labels_train, tokenizer, max_len=MAX_LEN)
    val_dataset = TextDataset(texts_val, labels_val, tokenizer, max_len=MAX_LEN)

    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

    # Initialize model
    model = BERT_BiGRU(
        bert_model_name=MODEL_NAME,
        num_labels=num_classes,
        class_weights=class_weights
    ).to(device)

    optimizer = AdamW(model.parameters(), lr=3.4e-5)

    # Training
    for epoch in range(EPOCHS):
        print(f"\n--- Training Epoch {epoch+1} ---")
        model.train()
        total_loss = 0
        for batch in tqdm(train_loader, desc=f"Training Fold {fold+1}, Epoch {epoch+1}"):
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels_batch = batch['labels'].to(device)

            loss, logits = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels_batch)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch+1} | Train Loss: {avg_loss:.4f}")

    # Evaluation
    val_metrics = evaluate(model, val_loader, device)
    print(f"Fold {fold+1} Validation Metrics: {val_metrics}")

    # Save model and tokenizer per fold
    output_dir = f"./BERT_BiGRU_fold{fold+1}"
    os.makedirs(output_dir, exist_ok=True)
    torch.save(model.state_dict(), os.path.join(output_dir, "model_state_dict.pt"))
    tokenizer.save_pretrained(output_dir)
    print(f"Model saved to {output_dir}")

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/255k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/485k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/433 [00:00<?, ?B/s]




model.safetensors:   0%|          | 0.00/439M [00:00<?, ?B/s]


--- Training Epoch 1 ---


Training Fold 1, Epoch 1: 100%|██████████| 396/396 [07:48<00:00,  1.18s/it]


Epoch 1 | Train Loss: 0.5449


Evaluating: 100%|██████████| 99/99 [00:40<00:00,  2.47it/s]


Fold 1 Validation Metrics: {'accuracy': 0.7687203791469195, 'precision': 0.7200990799716914, 'recall': 0.7514771048744461, 'f1': 0.7354535598120708, 'f2': 0.7449846243959584, 'mcc': np.float64(0.5305551476913255), 'mcc_normalized': np.float64(0.7652775738456628), 'S': np.float64(0.7551310991208106)}
Model saved to ./BERT_CNN_fold1


--- Training Epoch 1 ---


Training Fold 2, Epoch 1: 100%|██████████| 396/396 [07:47<00:00,  1.18s/it]


Epoch 1 | Train Loss: 0.5466


Evaluating: 100%|██████████| 99/99 [00:40<00:00,  2.47it/s]


Fold 2 Validation Metrics: {'accuracy': 0.7737756714060031, 'precision': 0.7027972027972028, 'recall': 0.8164697193500738, 'f1': 0.7553809361120601, 'f2': 0.7908856774932036, 'mcc': np.float64(0.5524840901349756), 'mcc_normalized': np.float64(0.7762420450674878), 'S': np.float64(0.7835638612803457)}
Model saved to ./BERT_CNN_fold2


--- Training Epoch 1 ---


Training Fold 3, Epoch 1: 100%|██████████| 396/396 [07:47<00:00,  1.18s/it]


Epoch 1 | Train Loss: 0.5475


Evaluating: 100%|██████████| 99/99 [00:40<00:00,  2.47it/s]


Fold 3 Validation Metrics: {'accuracy': 0.7410742496050553, 'precision': 0.6516312056737589, 'recall': 0.8482274741506647, 'f1': 0.7370447617519653, 'f2': 0.7999582085393885, 'mcc': np.float64(0.5071436001651695), 'mcc_normalized': np.float64(0.7535718000825847), 'S': np.float64(0.7767650043109866)}
Model saved to ./BERT_CNN_fold3


--- Training Epoch 1 ---


Training Fold 4, Epoch 1: 100%|██████████| 396/396 [07:47<00:00,  1.18s/it]


Epoch 1 | Train Loss: 0.5451


Evaluating: 100%|██████████| 99/99 [00:40<00:00,  2.46it/s]


Fold 4 Validation Metrics: {'accuracy': 0.7575039494470774, 'precision': 0.684359296482412, 'recall': 0.8043558508674787, 'f1': 0.7395214661462752, 'f2': 0.7771041369472182, 'mcc': np.float64(0.521321231544693), 'mcc_normalized': np.float64(0.7606606157723466), 'S': np.float64(0.7688823763597824)}
Model saved to ./BERT_CNN_fold4


--- Training Epoch 1 ---


Training Fold 5, Epoch 1: 100%|██████████| 396/396 [07:47<00:00,  1.18s/it]


Epoch 1 | Train Loss: 0.5425


Evaluating: 100%|██████████| 99/99 [00:40<00:00,  2.47it/s]


Fold 5 Validation Metrics: {'accuracy': 0.7560436087849581, 'precision': 0.690694626474443, 'recall': 0.7784342688330872, 'f1': 0.7319444444444444, 'f2': 0.7591472198213771, 'mcc': np.float64(0.5126415358257258), 'mcc_normalized': np.float64(0.756320767912863), 'S': np.float64(0.75773399386712)}
Model saved to ./BERT_CNN_fold5


### Evaluation on HOCON34k Test

In [None]:
MODEL_NAME = "bert-base-german-cased"
tokenizer = BertTokenizer.from_pretrained("/content/BERT_BiGRU")
NUM_LABELS = 2
model = BERT_BiGRU(
    bert_model_name=MODEL_NAME,
    num_labels=NUM_LABELS,
    class_weights=class_weights
)
model.load_state_dict(torch.load("/content/BERT_BiGRU/model_state_dict.pt", map_location=device))
model.to(device)
model.eval()

csv_path = "/content/hatespeech_hocon34k.csv"
df = pd.read_csv(csv_path)

df = df[["text", "label_hs", "split_all"]]
df = df.rename(columns={"label_hs": "labels"})
df["text"] = df["text"].astype(str)

df["text"] = df["text"].apply(clean_text)

train_df = df[df["split_all"] == "train"][["text", "labels"]].reset_index(drop=True)
test_df = df[df["split_all"] == "test"][["text", "labels"]].reset_index(drop=True)
val_df = df[df["split_all"] == "val"][["text", "labels"]].reset_index(drop=True)


print("Model and tokenizer loaded successfully!")

predictions = []
true_labels = []
if "test_df" not in locals():
    raise ValueError("test_df is not defined. Make sure to load your test dataset.")

for text, label in zip(test_df["text"], test_df["labels"]):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    if "token_type_ids" in inputs:
        del inputs["token_type_ids"]

    inputs = {key: value.to(device) for key, value in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs
    probs = torch.nn.functional.softmax(logits, dim=-1)

    predicted_class = torch.argmax(probs, dim=-1).item()

    predictions.append(predicted_class)
    true_labels.append(label)

results_df = pd.DataFrame({"Text": test_df["text"], "True Label": true_labels, "Predicted Label": predictions})
print(results_df.head())
#results_df.to_csv("projektstudium/results", index=False) # Pfad anpassen
print("Predictions gespeichert unter: /content/results") # Pfad anpassen


def compute_metrics_from_df(results_df, save_cm=False, cm_filename="confusion_matrix.png"):
    """
    Computes evaluation metrics for classification based on a Pandas DataFrame.
    """

    labels = results_df["True Label"].values
    preds = results_df["Predicted Label"].values

    accuracy = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    f2 = fbeta_score(labels, preds, beta=2, average='binary')
    mcc = matthews_corrcoef(labels, preds)
    mcc_normalized = (mcc + 1) / 2
    S = (f2 + mcc_normalized) / 2

    cm = confusion_matrix(labels, preds)
    class_labels = ["No Hate-Speech", "Hate-Speech"]

    plt.figure(figsize=(6, 5))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=class_labels, yticklabels=class_labels)
    plt.xlabel("Predicted Label")
    plt.ylabel("True Label")
    plt.title("Confusion Matrix")

    if save_cm:
        plt.savefig(cm_filename, bbox_inches="tight")
    plt.show()

    metrics = {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "f2": f2,
        "mcc": mcc,
        "mcc_normalized": mcc_normalized,
        "S": S,
    }

    return metrics

metrics = compute_metrics_from_df(results_df, save_cm=True, cm_filename="cm_results.png")
print("Evaluation Metrics:", metrics)


RuntimeError: PytorchStreamReader failed reading zip archive: failed finding central directory