# Fine-tuning Test

In [101]:
from datasets import *
import datasets

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer)

from peft import PeftModel, get_peft_model, LoraConfig
import evaluate


### Load gnad10 dataset and create datasets with only one class

In [102]:
# load dataset
dataset = load_dataset('community-datasets/gnad10')

dataset_onlyWeb = dataset.filter(lambda x: x['label'] == 0)
dataset_onlyPanorama = dataset.filter(lambda x: x['label'] == 1)
dataset_onlyInternational = dataset.filter(lambda x: x['label'] == 2)
dataset_onlyWirtschaft = dataset.filter(lambda x: x['label'] == 3)
dataset_onlySport = dataset.filter(lambda x: x['label'] == 4)
dataset_onlyInland = dataset.filter(lambda x: x['label'] == 5)
dataset_onlyEtat = dataset.filter(lambda x: x['label'] == 6)
dataset_onlyWissenschaft = dataset.filter(lambda x: x['label'] == 7)
dataset_onlyKultur = dataset.filter(lambda x: x['label'] == 8)

print(dataset_onlyWeb)
print(dataset_onlyPanorama)
print(dataset_onlyInternational)
print(dataset_onlyWirtschaft)
print(dataset_onlySport)
print(dataset_onlyInland)
print(dataset_onlyEtat)
print(dataset_onlyWissenschaft)
print(dataset_onlyKultur)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 1509
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 168
    })
})
DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 1510
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 168
    })
})
DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 1360
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 151
    })
})
DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 1270
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 141
    })
})
DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 1081
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 120
    })
})
DatasetDict({
    train: Dataset({
        fe

### Load distilbert-base-uncased and connect with LoRA Adapter weights

In [103]:
model_output = 'evaluation'
model_checkpoint = 'distilbert/distilbert-base-uncased'
adapter_name = 'cyrp/distilbert-base-uncased-gnad10'

model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=9)
model = PeftModel.from_pretrained(model, adapter_name)
model = model.merge_and_unload()
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Tokenize datasets

In [104]:
# add pad token if none exists
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

In [105]:
# create tokenize function
def tokenize(batch):

    #tokenize and truncate text
    tokens = tokenizer(batch['text'], padding=True, truncation=True, max_length=512)
    print(batch['label'])
    return tokens


# tokenize training and validation datasets
dataset_onlyWeb = dataset_onlyWeb.map(tokenize, batched=True)
dataset_onlyPanorama = dataset_onlyPanorama.map(tokenize, batched=True)
dataset_onlyInternational = dataset_onlyInternational.map(tokenize, batched=True)
dataset_onlyWirtschaft = dataset_onlyWirtschaft.map(tokenize, batched=True)
dataset_onlySport = dataset_onlySport.map(tokenize, batched=True)
dataset_onlyInland = dataset_onlyInland.map(tokenize, batched=True)
dataset_onlyEtat = dataset_onlyEtat.map(tokenize, batched=True)
dataset_onlyWissenschaft = dataset_onlyWissenschaft.map(tokenize, batched=True)
dataset_onlyKultur = dataset_onlyKultur.map(tokenize, batched=True)

print(dataset_onlyWeb)
print(dataset_onlyPanorama)
print(dataset_onlyInternational)
print(dataset_onlyWirtschaft)
print(dataset_onlySport)
print(dataset_onlyInland)
print(dataset_onlyEtat)
print(dataset_onlyWissenschaft)
print(dataset_onlyKultur)

Map:   0%|          | 0/1510 [00:00<?, ? examples/s]

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [106]:
# create data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [107]:
# show metrics for f1 and accuracy
from sklearn.metrics import f1_score, accuracy_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average='weighted')
    accuracy = accuracy_score(labels, preds)
    return {
    'f1': f1,
    'accuracy': accuracy
    }

### Create Trainer Objects

In [108]:
trainer0 = Trainer(
    model=model,
    eval_dataset=dataset_onlyWeb["test"],
    processing_class=tokenizer,
    data_collator=data_collator, # this will dynamically pad examples in each batch to be equal length
    compute_metrics=compute_metrics
)

trainer1 = Trainer(
    model=model,
    eval_dataset=dataset_onlyPanorama["test"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

trainer2 = Trainer(
    model=model,
    eval_dataset=dataset_onlyInternational["test"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

trainer3 = Trainer(
    model=model,
    eval_dataset=dataset_onlyWirtschaft["test"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

trainer4 = Trainer(
    model=model,
    eval_dataset=dataset_onlySport["test"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

trainer5 = Trainer(
    model=model,
    eval_dataset=dataset_onlyInland["test"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

trainer6 = Trainer(
    model=model,
    eval_dataset=dataset_onlyEtat["test"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

trainer7 = Trainer(
    model=model,
    eval_dataset=dataset_onlyWissenschaft["test"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

trainer8 = Trainer(
    model=model,
    eval_dataset=dataset_onlyKultur["test"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

### Evaluate

In [109]:
print("WEB")
trainer0.evaluate()

WEB


{'eval_loss': 0.3095804750919342,
 'eval_model_preparation_time': 0.002,
 'eval_f1': 0.9597523219814241,
 'eval_accuracy': 0.9226190476190477,
 'eval_runtime': 2.8364,
 'eval_samples_per_second': 59.231,
 'eval_steps_per_second': 7.404}

In [110]:
print("PANORAMA")
trainer1.evaluate()

PANORAMA


{'eval_loss': 0.8499999046325684,
 'eval_model_preparation_time': 0.002,
 'eval_f1': 0.8493150684931505,
 'eval_accuracy': 0.7380952380952381,
 'eval_runtime': 2.6876,
 'eval_samples_per_second': 62.509,
 'eval_steps_per_second': 7.814}

In [111]:
print("INTERNATIONAL")
trainer2.evaluate()

INTERNATIONAL


{'eval_loss': 0.7888514399528503,
 'eval_model_preparation_time': 0.001,
 'eval_f1': 0.8731343283582089,
 'eval_accuracy': 0.7748344370860927,
 'eval_runtime': 2.4291,
 'eval_samples_per_second': 62.163,
 'eval_steps_per_second': 7.822}

In [112]:
print("WIRTSCHAFT")
trainer3.evaluate()

WIRTSCHAFT


{'eval_loss': 0.7450016736984253,
 'eval_model_preparation_time': 0.003,
 'eval_f1': 0.8629032258064516,
 'eval_accuracy': 0.7588652482269503,
 'eval_runtime': 2.2828,
 'eval_samples_per_second': 61.767,
 'eval_steps_per_second': 7.885}

In [113]:
print("SPORT")
trainer4.evaluate()

SPORT


{'eval_loss': 0.13413070142269135,
 'eval_model_preparation_time': 0.003,
 'eval_f1': 0.9915966386554622,
 'eval_accuracy': 0.9833333333333333,
 'eval_runtime': 1.9383,
 'eval_samples_per_second': 61.91,
 'eval_steps_per_second': 7.739}

In [114]:
print("INLAND")
trainer5.evaluate()

INLAND


{'eval_loss': 0.6003679037094116,
 'eval_model_preparation_time': 0.001,
 'eval_f1': 0.847457627118644,
 'eval_accuracy': 0.7352941176470589,
 'eval_runtime': 1.6484,
 'eval_samples_per_second': 61.877,
 'eval_steps_per_second': 7.886}

In [115]:
print("ETAT")
trainer6.evaluate()

ETAT


{'eval_loss': 1.184307336807251,
 'eval_model_preparation_time': 0.002,
 'eval_f1': 0.864406779661017,
 'eval_accuracy': 0.7611940298507462,
 'eval_runtime': 1.0839,
 'eval_samples_per_second': 61.814,
 'eval_steps_per_second': 8.303}

In [116]:
print("WISSENSCHAFT")
trainer7.evaluate()

WISSENSCHAFT


{'eval_loss': 0.6182005405426025,
 'eval_model_preparation_time': 0.001,
 'eval_f1': 0.9142857142857143,
 'eval_accuracy': 0.8421052631578947,
 'eval_runtime': 0.9162,
 'eval_samples_per_second': 62.213,
 'eval_steps_per_second': 8.732}

In [117]:
print("KULTUR")
trainer8.evaluate()

KULTUR


{'eval_loss': 0.6667774319648743,
 'eval_model_preparation_time': 0.0036,
 'eval_f1': 0.8865979381443299,
 'eval_accuracy': 0.7962962962962963,
 'eval_runtime': 0.8893,
 'eval_samples_per_second': 60.723,
 'eval_steps_per_second': 7.871}