# Fine-tune Biber LM

In [1]:
import os


os.environ["CUDA_VISIBLE_DEVICES"] = "2"

import pandas as pd
import numpy as np
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import f1_score, roc_auc_score, precision_score, recall_score
import wandb
from typing import Dict, List

RuntimeError: Failed to import transformers.trainer because of the following error (look up to see its traceback):
Failed to import transformers.integrations.integration_utils because of the following error (look up to see its traceback):
Failed to import transformers.modeling_tf_utils because of the following error (look up to see its traceback):
Your currently installed version of Keras is Keras 3, but this is not yet supported in Transformers. Please install the backwards-compatible tf-keras package with `pip install tf-keras`.

In [None]:
# Run configuration: what to fine-tune and for how long
MODEL_NAME = "roberta-base"
RUN_NAME = "finetune"
NUM_EPOCHS = 5

# Every artifact of a run lives under <BASE_DIR>/<MODEL_NAME>/<RUN_NAME>
BASE_DIR = "/shared/3/projects/hiatus/tagged_data/models"
RUN_DIR = os.path.join(BASE_DIR, MODEL_NAME, RUN_NAME)

# Derived locations and names used by the cells below
OUTPUT_DIR = os.path.join(RUN_DIR, "results")
MODEL_SAVE_PATH = os.path.join(RUN_DIR, "best_model")
TAG_PERFORMANCE_SUMMARY_PATH = os.path.join(RUN_DIR, "tag_level_performance_summary.csv")
WANDB_RUN_NAME = f"{MODEL_NAME}-{RUN_NAME}"

# Create the directory tree up front; exist_ok keeps the cell re-runnable
os.makedirs(RUN_DIR, exist_ok=True)
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(MODEL_SAVE_PATH, exist_ok=True)

**Set up the data**

In [None]:
text_column = 'text'

# Load the three tab-separated splits
train_df = pd.read_csv('/shared/3/projects/hiatus/tagged_data/binary_train.tsv', sep='\t')
dev_df = pd.read_csv('/shared/3/projects/hiatus/tagged_data/binary_dev.tsv',  sep='\t')
test_df = pd.read_csv('/shared/3/projects/hiatus/tagged_data/binary_test.tsv',  sep='\t')

# Every column after the first is treated as a label column
# NOTE(review): this assumes the first column is the text column -- confirm against the TSV header
label_columns = list(train_df.columns[1:])

# Wrap each DataFrame as a Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_df)
dev_dataset = Dataset.from_pandas(dev_df)
test_dataset = Dataset.from_pandas(test_df)

**Set up the model**

In [None]:
# Tokenizer matching the pretrained checkpoint
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Sequence-classification model with one output per label column,
# configured for multi-label classification and placed on the GPU
num_tags = len(label_columns)
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    problem_type="multi_label_classification",
    num_labels=num_tags,
).to('cuda')

In [None]:
def tokenize_function(examples):
    """Tokenize a batch of texts and attach the multi-hot label vector.

    Args:
        examples: A batch from ``Dataset.map(..., batched=True)``: a dict
            mapping each column name to a list of values.

    Returns:
        The tokenizer output (padded/truncated to 512 tokens) with an extra
        ``labels`` entry holding, for every example, the values of
        ``label_columns`` as a list of floats.
    """
    encoded = tokenizer(examples[text_column], padding="max_length", truncation=True, max_length=512)
    # Trainer only forwards columns the model's forward() accepts, so the
    # individual tag columns would be dropped and the model would never see a
    # target. Pack them into a single `labels` feature; the values must be
    # floats for the multi-label (BCE-with-logits) loss.
    encoded["labels"] = [
        [float(value) for value in row]
        for row in zip(*(examples[column] for column in label_columns))
    ]
    return encoded

tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_dev = dev_dataset.map(tokenize_function, batched=True)
tokenized_test = test_dataset.map(tokenize_function, batched=True)

In [None]:
def compute_metrics(eval_pred):
    """Compute multi-label F1 and ROC AUC for a Trainer evaluation pass.

    Args:
        eval_pred: ``(predictions, labels)`` pair. ``predictions`` holds the
            model's raw logits (or a tuple whose first element is the logits);
            ``labels`` is the multi-hot target array of the same shape.

    Returns:
        Dict with ``f1_micro``, ``f1_macro`` and ``roc_auc`` (macro average
        over the label columns that contain both classes; NaN if none do).
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    logits = np.asarray(predictions)
    labels = np.asarray(labels).astype(int)

    # The model outputs logits, not probabilities: sigmoid(z) >= 0.5 <=> z >= 0.
    # Thresholding the logits at 0.5 would correspond to a probability of ~0.62.
    binary_predictions = (logits >= 0.0).astype(int)

    f1_micro = f1_score(labels, binary_predictions, average='micro', zero_division=0)
    f1_macro = f1_score(labels, binary_predictions, average='macro', zero_division=0)

    # ROC AUC is rank-based, so it must be fed continuous scores (the logits
    # rank identically to the sigmoid probabilities), not hard 0/1 decisions.
    # Columns with a single class have undefined AUC (roc_auc_score raises),
    # so they are left out of the macro average.
    per_tag_auc = [
        roc_auc_score(labels[:, i], logits[:, i])
        for i in range(labels.shape[1])
        if len(np.unique(labels[:, i])) == 2
    ]
    roc_auc = float(np.mean(per_tag_auc)) if per_tag_auc else float('nan')

    return {
        'f1_micro': f1_micro,
        'f1_macro': f1_macro,
        'roc_auc': roc_auc
    }

**Train the model**

In [None]:
# Start the wandb run under the name built in the settings cell
# (WANDB_RUN_NAME = "<MODEL_NAME>-<RUN_NAME>"), so runs of different base
# models are distinguishable in the project instead of all being named RUN_NAME.
wandb.init(project="biber-multidimensional-register-analysis", name=WANDB_RUN_NAME)

In [None]:
# Training configuration: evaluate and checkpoint once per epoch.
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    evaluation_strategy="epoch",
    # load_best_model_at_end requires the save strategy to match the evaluation
    # strategy; the default step-based saving makes TrainingArguments raise.
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=NUM_EPOCHS,
    weight_decay=0.01,
    push_to_hub=False,
    # With no metric_for_best_model set, "best" means lowest evaluation loss.
    load_best_model_at_end=True,
    report_to="wandb",  # Report to wandb
)


# Trainer fine-tunes on the train split and evaluates on the dev split,
# scoring each evaluation with compute_metrics.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_dev,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [None]:
# Run fine-tuning with the Trainer configured above
trainer.train()

In [None]:
# Score the trained model on the held-out test split and show the metrics
test_results = trainer.evaluate(eval_dataset=tokenized_test)
print(f"Test results: {test_results}")

# Persist the trainer's current model under the run directory
trainer.save_model(output_dir=MODEL_SAVE_PATH)

**Full report on the test data**

In [None]:
def compute_tag_level_metrics(predictions: np.ndarray, labels: np.ndarray, tag_names: List[str], scores=None) -> Dict[str, Dict[str, float]]:
    """Compute precision, recall, F1 and ROC AUC separately for every tag.

    Args:
        predictions: 2-D array of hard 0/1 decisions, one column per tag.
        labels: 2-D array of gold 0/1 labels with the same layout.
        tag_names: Tag name for each column, in column order.
        scores: Optional 2-D array of continuous scores (logits or
            probabilities) used for ROC AUC. When omitted, ``predictions`` is
            used, which matches the previous behaviour.

    Returns:
        Mapping from tag name to a dict with keys ``precision``, ``recall``,
        ``f1`` and ``auc``. ``auc`` is NaN for a tag whose labels contain only
        one class, because ROC AUC is undefined there.
    """
    if scores is None:
        scores = predictions

    tag_metrics = {}
    for i, tag in enumerate(tag_names):
        tag_predictions = predictions[:, i]
        tag_labels = labels[:, i]
        # roc_auc_score raises when only one class is present in the labels
        if len(np.unique(tag_labels)) == 2:
            tag_auc = roc_auc_score(tag_labels, scores[:, i])
        else:
            tag_auc = float('nan')
        tag_metrics[tag] = {
            'precision': precision_score(tag_labels, tag_predictions, zero_division=0),
            'recall': recall_score(tag_labels, tag_predictions, zero_division=0),
            'f1': f1_score(tag_labels, tag_predictions, zero_division=0),
            'auc': tag_auc
        }
    return tag_metrics

# Get predictions on the test set
test_predictions = trainer.predict(tokenized_test)
test_logits = test_predictions.predictions
if isinstance(test_logits, tuple):
    test_logits = test_logits[0]
test_logits = np.asarray(test_logits)

# The model outputs logits: a probability threshold of 0.5 is a logit threshold of 0
test_predictions_binary = np.array(test_logits >= 0.0, dtype=float)

# Compute tag-level metrics (AUC is computed from the continuous logits)
tag_level_metrics = compute_tag_level_metrics(
    test_predictions_binary,
    test_predictions.label_ids,
    label_columns,
    scores=test_logits,
)

# Print and log tag-level metrics
print("\nTag-level performance on test set:")
for tag, metrics in tag_level_metrics.items():
    print(f"{tag}:")
    for metric_name, metric_value in metrics.items():
        print(f"  {metric_name}: {metric_value:.4f}")
    print()

    # Log to wandb
    wandb.log({f"{tag}_{metric_name}": metric_value for metric_name, metric_value in metrics.items()})

# Create a summary DataFrame (one row per tag) and save to CSV
summary_data = [
    {
        'Tag': tag,
        'Precision': metrics['precision'],
        'Recall': metrics['recall'],
        'F1 Score': metrics['f1'],
        'AUC': metrics['auc']
    }
    for tag, metrics in tag_level_metrics.items()
]

summary_df = pd.DataFrame(summary_data)
summary_df.to_csv(TAG_PERFORMANCE_SUMMARY_PATH, index=False)

In [None]:
# Close the wandb run opened by wandb.init earlier in the notebook
wandb.finish()