In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
import pandas as pd
import numpy as np
import torch
from datasets import load_dataset, Dataset, load_metric
from sklearn.model_selection import train_test_split
from transformers import pipeline

### Hyperparameters

In [None]:
# Tunable settings shared by the cells below.
batch_size = 5  # per-device training batch size; also used as the batch size for Dataset.map
seed       = 42  # random seed shared by the stochastic steps below
model_name = "bert-base-german-cased"  # checkpoint name used for both the tokenizer and the model
task = 'Sub3_FactClaiming'  # sub-task column to train on; one of: Sub1_Toxic, Sub2_Engaging, Sub3_FactClaiming
metric     = load_metric('accuracy')  # metric object consumed by compute_metrics
epochs     = 1  # number of training epochs passed to TrainingArguments

In [None]:
# Load the tokenizer and the sequence-classification model from the same
# checkpoint (model_name); num_labels=2 requests a two-label classifier.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

In [None]:

# Load the training data and expose the selected sub-task column as "labels",
# which is the one column the later cells keep when dropping the raw columns.
df = pd.read_csv('./SharedTask-main/Data Sets/GermEval21_TrainData.csv').rename(columns={task: "labels"})

# Both splits are seeded so that re-running the notebook yields the same
# train/dev/test partition; without random_state every run reshuffles the rows.
train_set, dev_test = train_test_split(df, test_size=0.001, random_state=seed) # Set to 0.25 for correct distribution
# The held-out part is divided evenly into a dev set and a test set.
dev_set, test_set = train_test_split(dev_test, test_size=0.5, random_state=seed)

In [None]:
# Convert each pandas split into a `datasets.Dataset`, in train/dev/test order.
train_data, dev_data, test_data = map(
    Dataset.from_pandas,
    (train_set, dev_set, test_set),
)

In [None]:
def tokenize_function(examples):
    """Tokenize the ``comment_text`` field of a batch of examples.

    The notebook-level ``tokenizer`` is called with truncation enabled and
    ``padding="max_length"``; its output is returned unchanged.
    """
    texts = examples["comment_text"]
    return tokenizer(texts, truncation=True, padding="max_length")

def compute_metrics(eval_pred):
    """Score a ``(logits, labels)`` pair with the notebook-level ``metric``.

    The predicted class of each row is the index of its largest logit along
    the last axis; those indices are compared against the reference labels.
    """
    scores, references = eval_pred
    predicted_classes = np.asarray(scores).argmax(axis=-1)
    return metric.compute(references=references, predictions=predicted_classes)


# Run the tokenizer over every split, `batch_size` rows per call.
train_processed, dev_processed, test_processed = [
    split.map(tokenize_function, batched=True, batch_size=batch_size)
    for split in (train_data, dev_data, test_data)
]

In [None]:
# Every column of the raw frame except the target column is dropped from the
# tokenized splits; columns added by the tokenizer are not in `df` and stay.
unused_columns = df.columns[df.columns != 'labels']

# Shuffle each split with the shared seed, then remove the raw columns.
train_full, dev_full, test_full = (
    split.shuffle(seed=seed).remove_columns(unused_columns)
    for split in (train_processed, dev_processed, test_processed)
)

In [None]:
# Training configuration: checkpoints go to a task-specific directory and the
# dev set is evaluated at the end of every epoch.
train_args = TrainingArguments(
    output_dir = 'base_testing_germ_eval_' + task,
    per_device_train_batch_size = batch_size,
    num_train_epochs = epochs,
    evaluation_strategy='epoch',
    # Forward the notebook's seed so that changing the hyperparameter also
    # changes (and pins) the randomness used during training, instead of
    # silently relying on the library default.
    seed = seed,
)

In [None]:
# Bundle the model, the training configuration, the prepared splits and the
# metric callback into a single Trainer.
trainer = Trainer(
    model=model,
    args=train_args,
    compute_metrics=compute_metrics,
    train_dataset=train_full,
    eval_dataset=dev_full,
)

In [None]:
# Fine-tune the model using the Trainer configured in the previous cells.
trainer.train()

In [None]:
# Evaluate on the Trainer's eval dataset (dev_full); the reported metrics
# include whatever compute_metrics returns.
trainer.evaluate()

In [None]:
# Load the rows to be labelled and wrap them in a Dataset for tokenization.
# NOTE(review): 'predictions.csv' is also the file the predictions are written
# back to later in this section, so it must already exist before this cell runs.
eval_csv = pd.read_csv('predictions.csv')
eval_set = Dataset.from_pandas(eval_csv)

In [None]:
# Tokenize the prediction inputs with the same function used for the training
# splits (tokenize_function reads the "comment_text" column).
eval_processed = eval_set.map(tokenize_function, batched=True, batch_size = batch_size)

In [None]:
# Run the fine-tuned model over the tokenized prediction inputs.
res = trainer.predict(eval_processed)
# The predicted class of each row is the position of its largest score.
predictions = np.argmax(res.predictions, axis=-1)

In [None]:
# Store the predicted class ids under the current sub-task's column name.
eval_csv[task] = predictions
# index=False: this file is read back with pd.read_csv at the start of the
# prediction step, so writing the row index would add one more "Unnamed: 0"
# column on every read/write round trip.
eval_csv.to_csv('predictions.csv', index=False) # Save the results

# Evaluation code

The following cell is a slightly adapted version of `SharedTask-main/evaluate.py`: it scores the single `predictions.csv` file directly.

In [None]:
#!/usr/bin/env python

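# The shared-task script this cell is adapted from works as follows:
# for each zip file in the current directory:
#     extract zip file
#     load answer.csv into dataframe
#     calculate score
#     save score
# save all scores to csv file
#
# This adaptation does no zip handling: it reads predictions.csv directly
# and scores that single file.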

import zipfile
import os
import pandas as pd

import sys
import os.path
from sklearn.metrics import f1_score, recall_score, precision_score, classification_report
import numpy as np

def _macro_scores(y_true, y_pred):
    """Return ``(f1, precision, recall)`` for one sub-task.

    Precision and recall are the "macro avg" entries of sklearn's
    ``classification_report``. F1 is the harmonic mean of those two macro
    values (not the macro average of per-class F1) and is 0 when both are 0.
    """
    report = classification_report(y_true, y_pred, output_dict=True)
    precision = report["macro avg"]["precision"]
    recall = report["macro avg"]["recall"]
    f1 = 0
    if precision + recall > 0:
        f1 = 2 * precision * recall / (precision + recall)
    return f1, precision, recall


df_truth = pd.read_csv("./SharedTask-main/Data Sets/GermEval21_TestData.csv")
df_answer = pd.read_csv('predictions.csv')

# Labels are compared position by position, so both files must contain the
# same number of rows; fail early with a readable message if they do not.
if len(df_truth) != len(df_answer):
    raise ValueError(
        "predictions.csv has %d rows but the gold test data has %d rows"
        % (len(df_answer), len(df_truth))
    )

# Score-column prefix -> label column shared by the gold file and the predictions.
subtask_columns = {
    "Sub1": "Sub1_Toxic",
    "Sub2": "Sub2_Engaging",
    "Sub3": "Sub3_FactClaiming",
}

# One result row for this predictions file; key order fixes the CSV column order.
entry = {"ID": 'predictions.csv'}
for prefix, column in subtask_columns.items():
    f1, precision, recall = _macro_scores(
        np.array(df_truth[column]),
        np.array(df_answer[column]),
    )
    entry[prefix + "_F1"] = f1
    entry[prefix + "_P"] = precision
    entry[prefix + "_R"] = recall

scores = [entry]
print(scores)

# Use a dedicated name for the scores table: rebinding `df` here would replace
# the training DataFrame that earlier cells read when they are re-run.
scores_df = pd.DataFrame(scores)
scores_df.to_csv("all_scores.csv", index=False)
