In [177]:
from transformers import AutoModelForSequenceClassification
from datasets import load_dataset
from transformers import AutoTokenizer, TrainingArguments, DataCollatorWithPadding, Trainer, EarlyStoppingCallback
import evaluate
import warnings
import torch

torch.manual_seed(42)
warnings.filterwarnings("ignore")

In [178]:
model_name = "google-bert/bert-base-german-cased"

In [179]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [180]:
data_files = {"train": "articlesTrain_10kGNAD_10perSegment.csv", "test": "articlesTest_10kGNAD_10perSegment.csv"}
dataset = load_dataset("csv", data_files=data_files)

def tokenize_function(set):
    return tokenizer(set["Article"], padding="max_length", truncation=True, max_length=128)

dataset = dataset.map(tokenize_function, batched=True);

def label_mapping(x):
    return [ "Etat", "Inland", "International", "Kultur", "Panorama", "Sport", "Web", "Wirtschaft", "Wissenschaft", ].index(x)

dataset = dataset.map(lambda x: {"label": label_mapping(x["Segment"])})

In [181]:
# Initialize a BERT model for binary classification
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=9)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [182]:
# Freeze all layers except the classifier
for param in model.bert.parameters():
    param.requires_grad = False

# Keep only the classification head trainable
for param in model.bert.parameters():
    param.requires_grad = True


In [183]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",           # Directory for saving model checkpoints
    # learning_rate=5e-5,              # Start with a small learning rate
    per_device_train_batch_size=16,  # Batch size per GPU
    per_device_eval_batch_size=16,
    use_mps_device=True,
    num_train_epochs=20,              # Number of epochs
    weight_decay=0.01,               # Regularization
    save_total_limit=2,              # Limit checkpoints to save space
    load_best_model_at_end=True,     # Automatically load the best checkpoint
    logging_dir="./logs",            # Directory for logs
    logging_steps=100,               # Log every 100 steps
    eval_strategy="epoch",
    save_strategy="epoch"
)

In [184]:
metric = evaluate.load("accuracy")

In [185]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [186]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = predictions.argmax(axis=-1)
    return metric.compute(predictions=predictions, references=labels)

trainer = Trainer(
    model=model,                        # Pre-trained BERT model
    args=training_args,                 # Training arguments
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    processing_class=tokenizer,
    data_collator=data_collator,        # Efficient batching
    compute_metrics=compute_metrics     # Custom metric function
)
trainer.add_callback(EarlyStoppingCallback(early_stopping_patience=2))


In [187]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.97864,0.454545
2,No log,1.753952,0.515152
3,No log,1.536484,0.585859
4,No log,1.378392,0.646465
5,No log,1.250689,0.626263
6,No log,1.266021,0.636364
7,No log,1.303836,0.616162


TrainOutput(global_step=49, training_loss=0.856272249805684, metrics={'train_runtime': 27.6444, 'train_samples_per_second': 71.624, 'train_steps_per_second': 5.064, 'total_flos': 45586855302912.0, 'train_loss': 0.856272249805684, 'epoch': 7.0})

In [188]:
trainer.evaluate()

{'eval_loss': 1.2506893873214722,
 'eval_accuracy': 0.6262626262626263,
 'eval_runtime': 0.5952,
 'eval_samples_per_second': 166.341,
 'eval_steps_per_second': 11.761,
 'epoch': 7.0}