### HuggingFace dependencies

In [None]:
from datasets import *
import datasets

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer)

from peft import PeftModel, get_peft_model, LoraConfig
import evaluate


### Load gnad10 dataset and create datasets with only one class

In [None]:
# Fetch the GNAD10 corpus; later cells index the result by split name ("test").
dataset = load_dataset('community-datasets/gnad10')


def _only_label(label_id):
    """Return ``dataset`` restricted to the rows whose ``label`` equals ``label_id``."""
    return dataset.filter(lambda row: row['label'] == label_id)


# One filtered copy per class id (0-8); the variable suffix is the category
# name this notebook uses for that id.
dataset_onlyWeb = _only_label(0)
dataset_onlyPanorama = _only_label(1)
dataset_onlyInternational = _only_label(2)
dataset_onlyWirtschaft = _only_label(3)
dataset_onlySport = _only_label(4)
dataset_onlyInland = _only_label(5)
dataset_onlyEtat = _only_label(6)
dataset_onlyWissenschaft = _only_label(7)
dataset_onlyKultur = _only_label(8)

### Load Longformer model and merge LoRA adapter weights

In [None]:
# Names of the output directory, the base checkpoint and the LoRA adapter repo.
# NOTE(review): model_output is not referenced by any other cell shown here.
model_output = 'evaluation'
model_checkpoint = 'allenai/longformer-base-4096'
adapter_name = 'cyrp/longformer-base-4096-gnad10'

# Stage 1: base checkpoint with a 9-way sequence-classification head.
base_model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=9,
)
# Stage 2: attach the LoRA adapter weights to the base model.
peft_model = PeftModel.from_pretrained(base_model, adapter_name)
# Stage 3: fold the adapter into the base weights and drop the PEFT wrapper.
model = peft_model.merge_and_unload()

# Tokenizer belonging to the same base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)

### Preprocess data

In [None]:
# Guarantee that a padding token exists. When one has to be registered, the
# model's embedding matrix is resized to the new tokenizer length.
if tokenizer.pad_token is None:
    pad_token_spec = {'pad_token': '[PAD]'}
    tokenizer.add_special_tokens(pad_token_spec)
    model.resize_token_embeddings(len(tokenizer))

In [None]:
# create tokenize function
def tokenize(batch):
    """Tokenize the ``text`` field of a batch of examples.

    Args:
        batch: Mapping with a ``'text'`` entry holding the raw strings of the
            batch, as passed in by ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output for the batch, with every example truncated to
        at most 512 tokens.
    """
    # No padding at this stage: padding here would stretch every example to
    # the longest text of the whole map batch. The DataCollatorWithPadding
    # handed to the Trainer objects pads each evaluation batch dynamically.
    # The former debug print of all labels was removed because it flooded the
    # cell output.
    return tokenizer(batch['text'], truncation=True, max_length=512)


# tokenize each of the single-class datasets
def _tokenized(per_class_dataset):
    """Apply ``tokenize`` to ``per_class_dataset`` in batched mode."""
    return per_class_dataset.map(tokenize, batched=True)


dataset_onlyWeb = _tokenized(dataset_onlyWeb)
dataset_onlyPanorama = _tokenized(dataset_onlyPanorama)
dataset_onlyInternational = _tokenized(dataset_onlyInternational)
dataset_onlyWirtschaft = _tokenized(dataset_onlyWirtschaft)
dataset_onlySport = _tokenized(dataset_onlySport)
dataset_onlyInland = _tokenized(dataset_onlyInland)
dataset_onlyEtat = _tokenized(dataset_onlyEtat)
dataset_onlyWissenschaft = _tokenized(dataset_onlyWissenschaft)
dataset_onlyKultur = _tokenized(dataset_onlyKultur)

In [None]:
# Collator that pads the examples of a batch to a common length using the
# tokenizer's padding token (padding=True is the collator's default strategy).
data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
    padding=True,
)

In [None]:
# show metrics for f1 and accuracy
from sklearn.metrics import f1_score, accuracy_score

def compute_metrics(pred):
    """Compute weighted F1 and accuracy for one evaluation run.

    Args:
        pred: Object exposing ``label_ids`` (reference labels) and
            ``predictions`` (per-class scores; the arg-max over the last
            axis is taken as the predicted class).

    Returns:
        Dict with the keys ``'f1'`` and ``'accuracy'``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    return {
        'f1': f1_score(y_true, y_pred, average='weighted'),
        'accuracy': accuracy_score(y_true, y_pred),
    }

### Create Trainer Objects

In [None]:
def _build_eval_trainer(eval_split):
    """Create a ``Trainer`` whose evaluation dataset is ``eval_split``.

    All trainers in this notebook share the module-level ``model``,
    ``tokenizer``, ``data_collator`` and ``compute_metrics``; only the
    evaluation dataset differs, so it is the single parameter.

    Args:
        eval_split: Tokenized dataset split to evaluate on.

    Returns:
        A ``Trainer`` configured for evaluation on ``eval_split``.
    """
    return Trainer(
        model=model,
        eval_dataset=eval_split,
        processing_class=tokenizer,
        # this will dynamically pad examples in each batch to be equal length
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )


# One trainer per category, each bound to that category's "test" split.
trainer0 = _build_eval_trainer(dataset_onlyWeb["test"])
trainer1 = _build_eval_trainer(dataset_onlyPanorama["test"])
trainer2 = _build_eval_trainer(dataset_onlyInternational["test"])
trainer3 = _build_eval_trainer(dataset_onlyWirtschaft["test"])
trainer4 = _build_eval_trainer(dataset_onlySport["test"])
trainer5 = _build_eval_trainer(dataset_onlyInland["test"])
trainer6 = _build_eval_trainer(dataset_onlyEtat["test"])
trainer7 = _build_eval_trainer(dataset_onlyWissenschaft["test"])
trainer8 = _build_eval_trainer(dataset_onlyKultur["test"])

### Evaluate

In [None]:
# Evaluate on the test split that contains only label-0 ("Web") examples.
# The bare call is the last expression of the cell.
print("WEB")
trainer0.evaluate()

In [None]:
# Evaluate on the test split that contains only label-1 ("Panorama") examples.
# The bare call is the last expression of the cell.
print("PANORAMA")
trainer1.evaluate()

In [None]:
# Evaluate on the test split that contains only label-2 ("International") examples.
# The bare call is the last expression of the cell.
print("INTERNATIONAL")
trainer2.evaluate()

In [None]:
# Evaluate on the test split that contains only label-3 ("Wirtschaft") examples.
# The bare call is the last expression of the cell.
print("WIRTSCHAFT")
trainer3.evaluate()

In [None]:
# Evaluate on the test split that contains only label-4 ("Sport") examples.
# The bare call is the last expression of the cell.
print("SPORT")
trainer4.evaluate()

In [None]:
# Evaluate on the test split that contains only label-5 ("Inland") examples.
# The bare call is the last expression of the cell.
print("INLAND")
trainer5.evaluate()

In [None]:
# Evaluate on the test split that contains only label-6 ("Etat") examples.
# The bare call is the last expression of the cell.
print("ETAT")
trainer6.evaluate()

In [None]:
# Evaluate on the test split that contains only label-7 ("Wissenschaft") examples.
# The bare call is the last expression of the cell.
print("WISSENSCHAFT")
trainer7.evaluate()

In [None]:
# Evaluate on the test split that contains only label-8 ("Kultur") examples.
# The bare call is the last expression of the cell.
print("KULTUR")
trainer8.evaluate()