# BERT Authorship Attribution - Azure ML Training

Train BERT model with automatic experiment tracking in Azure ML.

**Requirements:**
- Azure ML Compute Instance with GPU
- Data uploaded to compute instance or Azure Blob Storage

**What this notebook does:**
1. Loads prepared datasets
2. Fine-tunes BERT for 7-author classification
3. Logs all metrics to Azure ML automatically
4. Saves model to Azure ML Model Registry
5. Evaluates on test set

**Time:** ~30 minutes on T4 GPU

## 1. Setup & Imports

In [None]:
# Install dependencies (first time only)
!pip install -q transformers datasets scikit-learn mlflow azureml-mlflow

print("‚úÖ Dependencies installed")

In [None]:
from pathlib import Path
import json
import numpy as np
import torch
import mlflow
import mlflow.pytorch

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    TrainerCallback
)
from datasets import load_from_disk
from sklearn.metrics import classification_report, f1_score

print("‚úÖ Imports successful")

## 2. Configuration

In [None]:
# Filesystem layout -- edit PROJECT_DIR if the project lives somewhere else.
PROJECT_DIR = Path.home().joinpath('cloudfiles', 'code', 'DigHums', 'burney-attribution')
DATA_DIR = PROJECT_DIR.joinpath('data', 'bert_data')
OUTPUT_DIR = PROJECT_DIR.joinpath('models', 'bert_authorship')

# Base checkpoint and the experiment name used for tracking.
MODEL_NAME, EXPERIMENT_NAME = 'bert-base-uncased', 'burney-attribution'

# Training hyperparameters.
EPOCHS, BATCH_SIZE, LEARNING_RATE = 3, 16, 2e-5

# Echo the effective configuration so it is visible in the cell output.
print(f"üìÇ Project dir: {PROJECT_DIR}")
print(f"üìä Data dir: {DATA_DIR}")
print(f"üíæ Output dir: {OUTPUT_DIR}")
print(f"\nü§ñ Model: {MODEL_NAME}")
print(f"üìà Epochs: {EPOCHS}, Batch size: {BATCH_SIZE}, LR: {LEARNING_RATE}")

## 3. Check GPU

In [None]:
# Report CUDA device 0, or warn that no GPU was detected.
if not torch.cuda.is_available():
    print("‚ö†Ô∏è  No GPU detected! Training will be slow.")
    print("   Check: Compute Instance ‚Üí GPU enabled?")
else:
    gpu_name = torch.cuda.get_device_name(0)
    # Scale the reported total memory by 1e9 for display in GB.
    gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f"‚úÖ GPU available: {gpu_name}")
    print(f"   Memory: {gpu_memory:.2f} GB")

## 4. Start Azure ML Experiment

In [None]:
# Start MLflow tracking
mlflow.set_experiment(EXPERIMENT_NAME)

# Notebook cells are routinely re-executed (for example after a failed
# training attempt). mlflow.start_run() raises if a run from a previous
# execution is still active, so close any such run before opening a new one.
if mlflow.active_run() is not None:
    mlflow.end_run()
run = mlflow.start_run()

print(f"‚úÖ Azure ML experiment started: {EXPERIMENT_NAME}")
print(f"   Run ID: {run.info.run_id}")
print(f"\nüìä View progress at: https://ml.azure.com")
print(f"   Navigate to: Experiments ‚Üí {EXPERIMENT_NAME}")

# Log the hyperparameters in a single batched call.
mlflow.log_params({
    "model_name": MODEL_NAME,
    "epochs": EPOCHS,
    "batch_size": BATCH_SIZE,
    "learning_rate": LEARNING_RATE,
})

## 5. Load Data

In [None]:
print("üìñ Loading datasets...")

# The three splits are stored side by side under DATA_DIR/chunked_datasets.
train_dataset = load_from_disk(str(DATA_DIR / 'chunked_datasets' / 'train'))
val_dataset = load_from_disk(str(DATA_DIR / 'chunked_datasets' / 'validation'))
test_dataset = load_from_disk(str(DATA_DIR / 'chunked_datasets' / 'test'))

# Load label mapping. The encoding is pinned to UTF-8 so that author names
# decode the same way regardless of the machine's locale default.
with open(DATA_DIR / 'label_mapping.json', 'r', encoding='utf-8') as f:
    label_info = json.load(f)

num_labels = len(label_info['author_to_id'])
# JSON object keys are always strings; convert the ids back to ints.
id_to_author = {int(k): v for k, v in label_info['id_to_author'].items()}

# Every id in range(num_labels) is looked up in id_to_author when the
# per-author report is built after training. Check the mapping now so that a
# malformed file fails before the long training run rather than after it.
_missing_ids = [i for i in range(num_labels) if i not in id_to_author]
if _missing_ids:
    raise ValueError(
        f"label_mapping.json has no 'id_to_author' entry for label ids {_missing_ids}"
    )

print(f"‚úÖ Datasets loaded:")
print(f"   Train: {len(train_dataset):,} samples")
print(f"   Validation: {len(val_dataset):,} samples")
print(f"   Test: {len(test_dataset):,} samples")
print(f"\nüìö Authors ({num_labels}): {', '.join(label_info['author_to_id'].keys())}")

# Log dataset info in a single batched call.
mlflow.log_params({
    "num_authors": num_labels,
    "train_size": len(train_dataset),
    "val_size": len(val_dataset),
    "test_size": len(test_dataset),
})

## 6. Load Model

In [None]:
print(f"ü§ñ Loading {MODEL_NAME}...")

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Store the author names in the model config so that the saved and registered
# model carries real author labels instead of auto-generated placeholder
# names. label2id is derived by inverting id_to_author, so the two maps cannot
# disagree.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=num_labels,
    id2label=id_to_author,
    label2id={author: label_id for label_id, author in id_to_author.items()},
)

print("‚úÖ Model loaded")
print(f"   Parameters: {model.num_parameters():,}")

## 7. Setup Training

In [None]:
# Metrics function
def compute_metrics(eval_pred):
    """Compute accuracy and F1 scores for one evaluation pass.

    Args:
        eval_pred: A pair ``(predictions, labels)``. ``predictions`` is
            reduced with ``argmax`` over axis 1 to obtain predicted class ids.

    Returns:
        dict with the keys ``accuracy``, ``f1_weighted`` and ``f1_macro``.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=1)

    # Element-wise hit/miss mask; its mean is the fraction predicted correctly.
    is_correct = predicted == gold

    return {
        'accuracy': is_correct.mean(),
        'f1_weighted': f1_score(gold, predicted, average='weighted'),
        'f1_macro': f1_score(gold, predicted, average='macro'),
    }

# Azure ML logging callback
class AzureMLCallback(TrainerCallback):
    """Trainer callback that forwards numeric log entries to MLflow."""

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Record every int/float value in ``logs`` as an MLflow metric.

        Entries whose value is not an ``int`` or ``float`` are skipped. Each
        metric is logged at step ``state.global_step``. Nothing happens when
        ``logs`` is ``None``.
        """
        if logs is not None:
            numeric_entries = {
                name: value
                for name, value in logs.items()
                if isinstance(value, (int, float))
            }
            for metric_name, metric_value in numeric_entries.items():
                mlflow.log_metric(metric_name, metric_value, step=state.global_step)

# Training arguments, grouped by purpose
training_args = TrainingArguments(
    # --- output and logging ---
    output_dir=str(OUTPUT_DIR),
    logging_dir=str(OUTPUT_DIR / 'logs'),
    logging_steps=100,
    report_to='none',  # metrics are sent to MLflow by AzureMLCallback instead
    # --- optimisation ---
    num_train_epochs=EPOCHS,
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,
    warmup_steps=500,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE * 2,
    fp16=torch.cuda.is_available(),  # enabled only when CUDA is available
    # --- evaluation and checkpointing: both run once per epoch ---
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model='f1_weighted',
)

# Create trainer: train on the training split, evaluate on the validation
# split, and mirror logged metrics to MLflow through the callback.
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[AzureMLCallback()],
)

print("‚úÖ Trainer configured")

## 8. Train Model

In [None]:
# Announce the run; the trailing empty item reproduces the blank line that
# separates this banner from the training output.
print(
    "üèÉ Starting training...",
    f"   This will take ~30 minutes on T4 GPU",
    f"\nüìä Watch progress in real-time:",
    f"   https://ml.azure.com ‚Üí Experiments ‚Üí {EXPERIMENT_NAME}",
    "",
    sep="\n",
)

train_result = trainer.train()

# Summarise wall-clock time and throughput from the returned metrics.
print(
    "\n‚úÖ Training complete!",
    f"   Time: {train_result.metrics['train_runtime']:.2f} seconds",
    f"   Samples/sec: {train_result.metrics['train_samples_per_second']:.2f}",
    sep="\n",
)

## 9. Evaluate on Test Set

In [None]:
print("üß™ Evaluating on test set...")

# Run inference on the test set exactly once. Trainer.predict() returns the
# raw predictions together with the compute_metrics() scores, so a separate
# trainer.evaluate() pass over the same data is unnecessary. evaluate() would
# also push the test scores through the logging callback under the same
# "eval_*" metric names used for the per-epoch validation results; predict()
# does not, so the test scores reach MLflow only under the explicit "test_*"
# names logged below.
# metric_key_prefix="eval" keeps the 'eval_*' keys that `test_results` is
# read with elsewhere in this notebook.
predictions = trainer.predict(test_dataset, metric_key_prefix="eval")
test_results = predictions.metrics

print("\n" + "="*70)
print("TEST SET RESULTS")
print("="*70)
print(f"Accuracy: {test_results['eval_accuracy']:.4f}")
print(f"F1 (weighted): {test_results['eval_f1_weighted']:.4f}")
print(f"F1 (macro): {test_results['eval_f1_macro']:.4f}")

# Detailed results
pred_labels = np.argmax(predictions.predictions, axis=1)
true_labels = predictions.label_ids

label_ids = list(range(num_labels))
target_names = [id_to_author[i] for i in label_ids]

# Pin the label set explicitly. Without `labels=`, classification_report
# infers the classes from the data and raises when an author is absent from
# both the true and predicted labels, because the number of inferred classes
# no longer matches len(target_names). zero_division=0 scores such authors as
# 0 without emitting a warning.
report_options = dict(
    labels=label_ids,
    target_names=target_names,
    digits=4,
    zero_division=0,
)

print("\n" + "="*70)
print("PER-AUTHOR PERFORMANCE")
print("="*70)
# Dict form for logging, text form for display.
report = classification_report(
    true_labels,
    pred_labels,
    output_dict=True,
    **report_options
)
print(classification_report(true_labels, pred_labels, **report_options))

# Log test metrics
mlflow.log_metric("test_accuracy", test_results['eval_accuracy'])
mlflow.log_metric("test_f1_weighted", test_results['eval_f1_weighted'])
mlflow.log_metric("test_f1_macro", test_results['eval_f1_macro'])

for author in target_names:
    mlflow.log_metric(f"test_{author}_f1", report[author]['f1-score'])

## 10. Save Model

In [None]:
# Local export directory for the final model, tokenizer and label mapping.
final_model_dir = OUTPUT_DIR.joinpath('final')
final_model_dir.mkdir(parents=True, exist_ok=True)

print(f"üíæ Saving model to {final_model_dir}...")
for _save in (trainer.save_model, tokenizer.save_pretrained):
    # Model weights first, then the tokenizer files, into the same directory.
    _save(str(final_model_dir))

# Save label mapping next to the model files
with (final_model_dir / 'label_mapping.json').open('w') as mapping_file:
    json.dump(label_info, mapping_file, indent=2)

print("‚úÖ Model saved locally")

# Log to Azure ML Model Registry under a fixed registered-model name
print("\nüì¶ Logging model to Azure ML Model Registry...")
mlflow.pytorch.log_model(
    model,
    "model",
    registered_model_name="burney-authorship-attribution",
)

print("‚úÖ Model logged to Azure ML", "   Access at: https://ml.azure.com ‚Üí Models", sep="\n")

## 11. Finish Experiment

In [None]:
# Close the active MLflow run before printing the summary.
mlflow.end_run()

_rule = "="*70

# Header
print("\n" + _rule, "TRAINING COMPLETE ‚úÖ", _rule, sep="\n")

# Where to find the tracked results
print(
    f"\nüìä View full results:",
    f"   https://ml.azure.com",
    f"   ‚Üí Experiments ‚Üí {EXPERIMENT_NAME}",
    sep="\n",
)

# Where the model was stored
print(
    f"\nüì¶ Model location:",
    f"   Local: {final_model_dir}",
    f"   Azure ML: Models ‚Üí burney-authorship-attribution",
    sep="\n",
)

# Headline number and closing rule
print(f"\nüéØ Test Accuracy: {test_results['eval_accuracy']:.2%}", _rule, sep="\n")

## Next Steps

**Test on anonymous works:**
```bash
# Run from a terminal (in a notebook cell, prefix the command with `!`):
python scripts/test_anonymous_attribution.py
```

**Compare experiments:**
- Go to Azure ML Studio
- Experiments → burney-attribution
- Select multiple runs
- Click "Compare"

**Deploy as API (optional):**
```python
# See deploy_model.py for full script
```