In [None]:
# First, we install necessary libraries and tools.
!pip install transformers datasets evaluate
!pip install accelerate -U

In [None]:
# Import the helper that logs in to the Hugging Face Hub from inside a notebook.
from huggingface_hub import notebook_login

# Log in to the Hugging Face Hub.
# NOTE(review): presumably required because the training arguments later in
# this notebook set push_to_hub=True — confirm.
notebook_login()

In [None]:
from datasets import load_dataset

# Fetch the GermEval 2018 dataset by its Hub identifier.
germeval18 = load_dataset(path="philschmid/germeval18")

In [None]:
from transformers import AutoTokenizer

# Tokenizer belonging to the same German BERT checkpoint that the model cell
# of this notebook loads.
tokenizer = AutoTokenizer.from_pretrained(
    "bert-base-german-cased",
)

In [None]:
# Tokenization step that is applied to the dataset via `map`.
def preprocess_function(examples):
    """Tokenize the ``"text"`` field of ``examples`` with truncation enabled.

    Uses the module-level ``tokenizer`` and returns its output unchanged.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

In [None]:
# Tokenize the whole dataset; batched=True hands the function batches of
# examples instead of one example at a time.
tokenized_germeval18 = germeval18.map(
    function=preprocess_function,
    batched=True,
)

In [None]:
from transformers import DataCollatorWithPadding

# Collator built around the notebook's tokenizer; it pads the tokenized
# sequences when batches are assembled.
data_collator = DataCollatorWithPadding(tokenizer)

In [None]:
import evaluate

# Accuracy metric object; `compute_metrics` calls its `.compute(...)` method.
accuracy = evaluate.load(path="accuracy")

In [None]:
# Define a function to compute metrics, in this case, accuracy.
# numpy is imported here because no other cell in the notebook imports it;
# without it `np.argmax` below raises NameError on the first evaluation.
import numpy as np


def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: A pair ``(predictions, labels)``.
            Assumes ``predictions`` holds per-class scores with the class
            dimension on axis 1 — TODO confirm against the caller.

    Returns:
        Whatever ``accuracy.compute`` returns for the predicted class IDs and
        the reference labels.
    """
    predictions, labels = eval_pred
    # Pick the highest-scoring class along axis 1 as the predicted class ID.
    predicted_ids = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=labels)


In [None]:
# Two-way lookup between integer class IDs and label names; the reverse
# mapping is derived from the forward one so the two cannot drift apart.
id2label = dict(enumerate(("OTHER", "OFFENSE")))
label2id = {name: idx for idx, name in id2label.items()}

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Load the German BERT checkpoint for two-class sequence classification and
# attach the ID <-> label-name mappings to it.
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-german-cased",
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
)

In [None]:
# Drop the "multi" column, then rename "binary" to "label" (the column name
# that `label_to_id` reads and writes).
tokenized_germeval18 = (
    tokenized_germeval18
    .remove_columns(["multi"])
    .rename_column("binary", "label")
)

In [None]:
# Label names for the classification task. `label_to_id` uses each name's
# position in this list as its integer class ID.
class_names = [
    "OTHER",    # index 0
    "OFFENSE",  # index 1
]

In [None]:
# Per-example conversion from a label name to its integer class ID.
def label_to_id(example):
    """Replace ``example['label']`` with its index in ``class_names``.

    The example is modified in place and also returned. ``list.index`` raises
    ``ValueError`` if the current label is not one of ``class_names``.
    """
    label_name = example['label']
    example['label'] = class_names.index(label_name)
    return example

In [None]:
# Convert labels in every split to integer IDs and declare the column as a
# ClassLabel feature.
from datasets import ClassLabel

for split in tokenized_germeval18.keys():
    # Map label names to their integer IDs for this split.
    split_with_ids = tokenized_germeval18[split].map(label_to_id)
    # Use cast_column rather than assigning into `.features[...]`: the cast
    # produces a new dataset whose stored column and declared feature type
    # agree, instead of editing the feature metadata in place.
    tokenized_germeval18[split] = split_with_ids.cast_column(
        "label", ClassLabel(names=class_names)
    )

In [None]:
# Hyperparameters plus checkpointing and Hub settings for the fine-tuning run.
training_args = TrainingArguments(
    # Output directory name for this run.
    output_dir="bert-base-german-cased-hatespeech-GermEval18",
    # Optimisation settings.
    num_train_epochs=2,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluation and saving share the same "epoch" strategy, and the best
    # model is loaded once training ends.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Push the result to the Hugging Face Hub.
    push_to_hub=True,
)

In [None]:
# Wire the model, the data splits, and the metric function into a Trainer.
# NOTE(review): eval_dataset is the "test" split, and the training arguments
# in this notebook set load_best_model_at_end=True, so the test split appears
# to drive checkpoint selection — consider a separate validation split; confirm.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_germeval18["train"],
    eval_dataset=tokenized_germeval18["test"],
    compute_metrics=compute_metrics,
)

In [None]:
# Start fine-tuning with the Trainer configured in the previous cell.
trainer.train()