In [1]:
import numpy as np
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from datasets import load_dataset
import warnings
warnings.filterwarnings('ignore')

import random
random.seed(42)


## Week 07

- Exercise 1: Read chapter *1.4 Applying BERT Models* in 
[Foundations of Large Language Models](https://arxiv.org/abs/2501.09223). How can you use a pre-trained BERT model for:

    - Text classification?
    - Named entity recognition?
    - Question answering?
    - Sequence Labeling? POS, NER?
    - Span-Prediction? Make an example of a span-prediction task.
    - What is catastrophic forgetting and how can it be "avoided"?
- Exercise 2: Go through the [Hugging Face tutorial](https://huggingface.co/blog/sentiment-analysis-python) on sequence classification and choose your own model from the link in the tutorial [here](https://huggingface.co/models?pipeline_tag=text-classification&sort=downloads&search=sentiment). Import the `movies` data, choose a pretrained model and try to do a prediction. Can you also finetune the your model? Does it get better?
    

## Exercise 1

**Text classification?**

Man fügt eine Klassifikationsschicht auf den [CLS] Token von BERT hinzu und trainiert diese für die spezifische Anzahl von Klassen.

**Named entity recognition?**

Man verwendet BERT mit einer Token-Klassifikationsschicht, die jedem Token im Text ein Label (z.B. B-PER, I-PER, O) zuweist.

**Question answering?**

BERT wird mit zwei Ausgabeköpfen versehen, die die Start- und Endposition der Antwort im gegebenen Kontext vorhersagen.

**Sequence Labeling? POS, NER?**

Man nutzt BERT's Token-Repräsentationen und fügt eine Klassifikationsschicht hinzu, die jedem Token ein Label aus einem vordefinierten Tagset zuweist.

**Span-Prediction? Make an example of a span-prediction task.**

Span-Prediction identifiziert zusammenhängende Textabschnitte (Spans) für Aufgaben wie Coreference Resolution, wo z.B. "John" und "he" als referenzgleiche Spans erkannt werden.

**What is catastrophic forgetting and how can it be "avoided"?**

Catastrophic Forgetting ist das Phänomen, dass ein Modell beim Training auf neue Aufgaben das Wissen aus vorherigen Aufgaben verliert - vermieden wird es durch Techniken wie niedrige Lernraten, Adapter-Module oder Elastic Weight Consolidation.

## Exercise 2


In [2]:
print("Lade IMDB Movie Reviews Datensatz...")
imdb_dataset = load_dataset("imdb")
print(f"Anzahl Trainingsbeispiele: {len(imdb_dataset['train'])}")
print(f"Anzahl Testbeispiele: {len(imdb_dataset['test'])}")

print("\nBeispiel aus dem Datensatz:")
print(f"Text: {imdb_dataset['train'][0]['text'][:200]}...")
print(f"Label: {imdb_dataset['train'][0]['label']} (0=negativ, 1=positiv)")


Lade IMDB Movie Reviews Datensatz...
Anzahl Trainingsbeispiele: 25000
Anzahl Testbeispiele: 25000

Beispiel aus dem Datensatz:
Text: I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ev...
Label: 0 (0=negativ, 1=positiv)


In [3]:
model_options = {
    "distilbert": "distilbert-base-uncased-finetuned-sst-2-english",
    "roberta": "cardiffnlp/twitter-roberta-base-sentiment-latest",
    "bert": "nlptown/bert-base-multilingual-uncased-sentiment",
    "xlm-roberta": "cardiffnlp/twitter-xlm-roberta-base-sentiment"
}

selected_model = model_options["distilbert"]
print(f"\nVerwende Modell: {selected_model}")

sentiment_pipeline = pipeline("sentiment-analysis", model=selected_model)

test_texts = [
    "This movie was absolutely fantastic! I loved every minute of it.",
    "Terrible film. Complete waste of time.",
    "The movie was okay, nothing special but not bad either.",
    "Best movie I've seen all year! Highly recommend!",
    "Boring and predictable. Would not watch again."
]

print("\n--- Vorhersagen mit dem vortrainierten Modell ---")
for text in test_texts:
    result = sentiment_pipeline(text[:512])
    print(f"\nText: {text}")
    print(f"Vorhersage: {result[0]['label']} (Konfidenz: {result[0]['score']:.4f})")



Verwende Modell: distilbert-base-uncased-finetuned-sst-2-english

--- Vorhersagen mit dem vortrainierten Modell ---

Text: This movie was absolutely fantastic! I loved every minute of it.
Vorhersage: POSITIVE (Konfidenz: 0.9999)

Text: Terrible film. Complete waste of time.
Vorhersage: NEGATIVE (Konfidenz: 0.9998)

Text: The movie was okay, nothing special but not bad either.
Vorhersage: POSITIVE (Konfidenz: 0.8738)

Text: Best movie I've seen all year! Highly recommend!
Vorhersage: POSITIVE (Konfidenz: 0.9999)

Text: Boring and predictable. Would not watch again.
Vorhersage: NEGATIVE (Konfidenz: 0.9995)


In [4]:
def evaluate_imdb(pipeline):
    print("\n--- Evaluation auf IMDB Testdaten ---")
    shuffled_test = imdb_dataset['test'].shuffle(seed=42)
    test_sample = shuffled_test.select(range(100))
    test_texts_sample = test_sample['text']
    test_labels_sample = test_sample['label']

    print(f"Label-Verteilung im Test-Sample:")
    print(f"Negative (0): {sum(1 for l in test_labels_sample if l == 0)}")
    print(f"Positive (1): {sum(1 for l in test_labels_sample if l == 1)}")

    predictions = []
    for i, text in enumerate(test_texts_sample):
        try:
            result = pipeline(text[:512])[0]
            pred_label = 1 if result['label'] == 'POSITIVE' else 0
            predictions.append(pred_label)
        except:
            predictions.append(0)

    print(f"\nVorhersage-Verteilung:")
    print(f"Negative (0): {sum(1 for p in predictions if p == 0)}")
    print(f"Positive (1): {sum(1 for p in predictions if p == 1)}")

    accuracy = accuracy_score(test_labels_sample, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(test_labels_sample, predictions, average='binary')

    print(f"\nGenauigkeit: {accuracy:.4f}")
    print(f"Präzision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-Score: {f1:.4f}")
    
evaluate_imdb(sentiment_pipeline)


--- Evaluation auf IMDB Testdaten ---
Label-Verteilung im Test-Sample:
Negative (0): 53
Positive (1): 47

Vorhersage-Verteilung:
Negative (0): 52
Positive (1): 48

Genauigkeit: 0.8300
Präzision: 0.8125
Recall: 0.8298
F1-Score: 0.8211


In [5]:
tokenizer = AutoTokenizer.from_pretrained(selected_model)
model = AutoModelForSequenceClassification.from_pretrained(selected_model)

def tokenize_function(examples):
    return tokenizer(examples['text'], padding=True, truncation=True, max_length=512)

small_train_dataset = imdb_dataset['train'].shuffle(seed=42).select(range(1000))
small_eval_dataset = imdb_dataset['test'].shuffle(seed=42).select(range(200))

tokenized_train = small_train_dataset.map(tokenize_function, batched=True)
tokenized_eval = small_eval_dataset.map(tokenize_function, batched=True)

training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='binary')
    acc = accuracy_score(labels, predictions)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

print("\nStarte Finetuning...")
trainer.train()

print("\n--- Evaluation nach Finetuning ---")
eval_results = trainer.evaluate()
print(f"Eval Results: {eval_results}")

trainer.save_model("./finetuned_model")


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]


Starte Finetuning...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.374533,0.895,0.886486,0.921348,0.854167
2,No log,0.46295,0.895,0.890052,0.894737,0.885417



--- Evaluation nach Finetuning ---


Eval Results: {'eval_loss': 0.3745329678058624, 'eval_accuracy': 0.895, 'eval_f1': 0.8864864864864865, 'eval_precision': 0.9213483146067416, 'eval_recall': 0.8541666666666666, 'eval_runtime': 8.5284, 'eval_samples_per_second': 23.451, 'eval_steps_per_second': 2.931, 'epoch': 2.0}


In [7]:
finetuned_pipeline = pipeline(
    "sentiment-analysis",
    model="./finetuned_model",
    tokenizer=tokenizer
)

print("\n--- Vorhersagen ohne finetuning ---")
evaluate_imdb(sentiment_pipeline)

print("\n--- Vorhersagen mit dem finetuned Modell ---")
evaluate_imdb(finetuned_pipeline)


--- Vorhersagen ohne finetuning ---

--- Evaluation auf IMDB Testdaten ---
Label-Verteilung im Test-Sample:
Negative (0): 53
Positive (1): 47

Vorhersage-Verteilung:
Negative (0): 52
Positive (1): 48

Genauigkeit: 0.8300
Präzision: 0.8125
Recall: 0.8298
F1-Score: 0.8211

--- Vorhersagen mit dem finetuned Modell ---

--- Evaluation auf IMDB Testdaten ---
Label-Verteilung im Test-Sample:
Negative (0): 53
Positive (1): 47

Vorhersage-Verteilung:
Negative (0): 53
Positive (1): 47

Genauigkeit: 0.8600
Präzision: 0.8511
Recall: 0.8511
F1-Score: 0.8511
