In [152]:
from transformers import AutoModelForSequenceClassification
from datasets import load_dataset
from transformers import AutoTokenizer, TrainingArguments, DataCollatorWithPadding, Trainer, EarlyStoppingCallback, get_scheduler
import evaluate
import warnings
warnings.filterwarnings("ignore")

In [153]:
model_name = "google-bert/bert-base-german-cased"

In [154]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [155]:
data_files = {"train": "articlesTrain_10kGNAD_10perSegment.csv", "test": "articlesTest_10kGNAD_10perSegment.csv"}
dataset = load_dataset("csv", data_files=data_files)

def tokenize_function(set):
    return tokenizer(set["Article"], padding="max_length", truncation=True, max_length=128)

dataset = dataset.map(tokenize_function, batched=True);

def label_mapping(x):
    return [ "Etat", "Inland", "International", "Kultur", "Panorama", "Sport", "Web", "Wirtschaft", "Wissenschaft", ].index(x)

dataset = dataset.map(lambda x: {"label": label_mapping(x["Segment"])})

Map:   0%|          | 0/99 [00:00<?, ? examples/s]

Map:   0%|          | 0/99 [00:00<?, ? examples/s]

In [156]:
# Initialize a BERT model for binary classification
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=9)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [157]:
# Freeze all layers except the classifier
for param in model.bert.parameters():
    param.requires_grad = False

# Keep only the classification head trainable
for param in model.bert.parameters():
    param.requires_grad = True


In [158]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",           # Directory for saving model checkpoints
    # learning_rate=5e-5,              # Start with a small learning rate
    per_device_train_batch_size=16,  # Batch size per GPU
    per_device_eval_batch_size=16,
    use_mps_device=True,
    num_train_epochs=20,              # Number of epochs
    weight_decay=0.01,               # Regularization
    save_total_limit=2,              # Limit checkpoints to save space
    load_best_model_at_end=True,     # Automatically load the best checkpoint
    logging_dir="./logs",            # Directory for logs
    logging_steps=100,               # Log every 100 steps
    eval_strategy="epoch",
    save_strategy="epoch"
)

In [159]:
metric = evaluate.load("accuracy")

In [160]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = predictions.argmax(axis=-1)
    return metric.compute(predictions=predictions, references=labels)

trainer = Trainer(
    model=model,                        # Pre-trained BERT model
    args=training_args,                 # Training arguments
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    processing_class=tokenizer,
    data_collator=data_collator,        # Efficient batching
    compute_metrics=compute_metrics     # Custom metric function
)
trainer.add_callback(EarlyStoppingCallback(early_stopping_patience=2))


In [162]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.948194,0.414141
2,No log,1.717945,0.515152
3,No log,1.500844,0.555556
4,No log,1.384715,0.565657
5,No log,1.315216,0.606061
6,No log,1.405437,0.565657
7,No log,1.434353,0.565657


TrainOutput(global_step=49, training_loss=0.8057561601911273, metrics={'train_runtime': 27.6934, 'train_samples_per_second': 71.497, 'train_steps_per_second': 5.055, 'total_flos': 45586855302912.0, 'train_loss': 0.8057561601911273, 'epoch': 7.0})

In [163]:
trainer.evaluate()

{'eval_loss': 1.3152155876159668,
 'eval_accuracy': 0.6060606060606061,
 'eval_runtime': 0.6029,
 'eval_samples_per_second': 164.217,
 'eval_steps_per_second': 11.611,
 'epoch': 7.0}