# BERT Authorship Attribution Training (Google Drive Integration)

This notebook trains the BERT model with data stored in Google Drive, avoiding slow upload/download cycles.

**Setup Requirements:**
1. Upload `burney_colab_data.zip` to your Google Drive root
2. Run this notebook in Google Colab with GPU enabled
3. The model is saved directly to Google Drive
4. Download the final model from Drive (faster than downloading directly from Colab)

**Time Estimate:** ~30–45 minutes on a T4 GPU, ~15 minutes on a V100

## 1. Mount Google Drive

In [None]:
from google.colab import drive

# Mount the user's Google Drive; the rest of the notebook reads its data from
# and writes its checkpoints to paths under /content/drive/MyDrive.
drive.mount('/content/drive')

print("✅ Google Drive mounted at /content/drive/MyDrive")

## 2. Extract Data from Drive (First Time Only)

In [None]:
import os
import shutil
import zipfile

# Unpack the dataset archive into Drive once; if the target folder already
# exists the extraction is skipped.
drive_data_path = '/content/drive/MyDrive/burney_data'

if os.path.exists(drive_data_path):
    print(f"✅ Data already exists at {drive_data_path}")
    print("   Skipping extraction. Delete folder to re-extract.")
else:
    print("📦 Extracting data from zip file...")
    zip_path = '/content/drive/MyDrive/burney_colab_data.zip'

    if not os.path.exists(zip_path):
        print(f"❌ ERROR: {zip_path} not found!")
        print("   Please upload burney_colab_data.zip to your Google Drive root.")
        # Stop here so "Run all" does not continue into cells that need the data.
        raise FileNotFoundError(zip_path)

    try:
        # Extract to Drive (persists between sessions)
        with zipfile.ZipFile(zip_path) as archive:
            archive.extractall(drive_data_path)
    except BaseException:
        # A half-extracted folder would make the existence check above skip
        # extraction on the next run, so remove it before re-raising.
        shutil.rmtree(drive_data_path, ignore_errors=True)
        raise

    print(f"✅ Data extracted to {drive_data_path}")
    print("   This will persist between Colab sessions!")

## 3. Install Dependencies

In [None]:
!pip install -q transformers datasets scikit-learn tqdm
print("‚úÖ Dependencies installed")

## 4. Setup Paths

In [None]:
from pathlib import Path

# All paths point to Google Drive
DRIVE_BASE = Path('/content/drive/MyDrive')
DATA_DIR = DRIVE_BASE / 'burney_data' / 'data' / 'bert_data'
OUTPUT_DIR = DRIVE_BASE / 'burney_models' / 'bert_authorship'

# Create output directory
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# DATA_DIR assumes the archive unpacks to burney_data/data/bert_data; flag a
# mismatch here instead of failing later inside the dataset loader.
if not DATA_DIR.is_dir():
    print(f"❌ Data directory not found: {DATA_DIR}")
    print("   Run the extraction cell first and check the folder layout inside the zip.")

print(f"📂 Data location: {DATA_DIR}")
print(f"💾 Model will save to: {OUTPUT_DIR}")
print("\n⚠️  IMPORTANT: Model saves directly to Drive (no download needed!)")
print("   Access it from any device via Google Drive.")

## 5. Check GPU

In [None]:
import torch

# Report which CUDA device (if any) the runtime exposes, with its total memory.
if torch.cuda.is_available():
    gpu_name = torch.cuda.get_device_name(0)
    print(f"✅ GPU available: {gpu_name}")
    print(f"   Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
else:
    print("❌ No GPU detected!")
    print("   Go to Runtime > Change runtime type > Select GPU")

## 6. Load Data from Drive

In [None]:
from datasets import load_from_disk
import json

# Load datasets from Drive
print("📖 Loading datasets from Google Drive...")
train_dataset = load_from_disk(str(DATA_DIR / 'chunked_datasets' / 'train'))
val_dataset = load_from_disk(str(DATA_DIR / 'chunked_datasets' / 'validation'))
test_dataset = load_from_disk(str(DATA_DIR / 'chunked_datasets' / 'test'))

# Load label mapping. The encoding is given explicitly so author names decode
# the same way regardless of the runtime's locale.
with open(DATA_DIR / 'label_mapping.json', 'r', encoding='utf-8') as f:
    label_info = json.load(f)

print("✅ Loaded datasets:")
print(f"   Train: {len(train_dataset):,} samples")
print(f"   Validation: {len(val_dataset):,} samples")
print(f"   Test: {len(test_dataset):,} samples")
print(f"\n📚 Authors: {', '.join(label_info['author_to_id'].keys())}")

## 7. Train Model (Saves to Drive)

In [None]:
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)
from sklearn.metrics import classification_report, f1_score
import numpy as np

# Load model and tokenizer
model_name = 'bert-base-uncased'
num_labels = len(label_info['author_to_id'])

# Store the author names in the model config so the saved model reports
# authors instead of generic LABEL_<n> names. JSON object keys are strings,
# hence the int() conversion.
id2label = {int(idx): author for idx, author in label_info['id_to_author'].items()}
label2id = {author: idx for idx, author in id2label.items()}

print(f"🤖 Loading {model_name}...")
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id
)

# Training arguments - saves directly to Google Drive!
training_args = TrainingArguments(
    output_dir=str(OUTPUT_DIR),  # ← Saves to Drive!
    eval_strategy="epoch",
    save_strategy="epoch",  # must match eval_strategy for load_best_model_at_end
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    warmup_steps=500,
    logging_dir=str(OUTPUT_DIR / 'logs'),
    logging_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model='f1_weighted',  # key returned by compute_metrics
    save_total_limit=2,  # Keep only best 2 checkpoints
    report_to='none'
)

# Metrics function
def compute_metrics(eval_pred):
    """Score one evaluation pass.

    `eval_pred` is unpacked into a score array and the reference labels. The
    predicted class is the argmax of the scores along axis 1.

    Returns a dict with keys 'accuracy', 'f1_weighted' and 'f1_macro'.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=1)

    metrics = {'accuracy': (predicted == gold).mean()}
    for averaging in ('weighted', 'macro'):
        metrics[f'f1_{averaging}'] = f1_score(gold, predicted, average=averaging)
    return metrics

# Create trainer: fine-tune on the train split and evaluate on the validation
# split with compute_metrics.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

print("\n🏃 Starting training...")
print(f"   Model saves to: {OUTPUT_DIR}")
print("   This may take 30-45 minutes on T4 GPU\n")

trainer.train()

print("\n✅ Training complete!")
print(f"   Model saved to Google Drive: {OUTPUT_DIR}")

## 8. Evaluate on Test Set

In [None]:
print("üß™ Evaluating on test set...")

# Evaluate
test_results = trainer.evaluate(test_dataset)

print("\n" + "="*70)
print("TEST SET RESULTS")
print("="*70)
print(f"Accuracy: {test_results['eval_accuracy']:.4f}")
print(f"F1 (weighted): {test_results['eval_f1_weighted']:.4f}")
print(f"F1 (macro): {test_results['eval_f1_macro']:.4f}")

# Detailed per-author results
predictions = trainer.predict(test_dataset)
pred_labels = np.argmax(predictions.predictions, axis=1)
true_labels = predictions.label_ids

id_to_author = {int(k): v for k, v in label_info['id_to_author'].items()}
target_names = [id_to_author[i] for i in range(num_labels)]

print("\n" + "="*70)
print("PER-AUTHOR PERFORMANCE")
print("="*70)
print(classification_report(
    true_labels,
    pred_labels,
    target_names=target_names,
    digits=4
))
print("="*70)

## 9. Save Final Model to Drive

In [None]:
# Save final model explicitly, together with the tokenizer, in one folder.
final_model_dir = OUTPUT_DIR / 'final'
# parents=True so this cell does not rely on OUTPUT_DIR having been created earlier.
final_model_dir.mkdir(parents=True, exist_ok=True)

print(f"💾 Saving final model to {final_model_dir}...")
trainer.save_model(str(final_model_dir))
tokenizer.save_pretrained(str(final_model_dir))

print("\n✅ COMPLETE!")
print("\n" + "="*70)
print("NEXT STEPS")
print("="*70)
print("1. Model is saved in your Google Drive:")
print(f"   {final_model_dir}")
print("\n2. To use this model:")
print("   - Access via Google Drive on any device")
print("   - Download to local: Right-click folder > Download")
print("   - Or mount Drive in another Colab notebook")
print("\n3. Files in the model directory:")
print("   - model.safetensors (~418 MB) - model weights")
print("   - config.json - model configuration")
print("   - tokenizer files - for text processing")
print("\n4. To test on anonymous works:")
print("   - Upload test_anonymous_attribution.py to Colab")
print("   - Point it to this Drive model directory")
print("   - Run tests directly in Colab (no download!)")
print("="*70)

## Optional: Download Model to Local (if needed)

In [None]:
# Only run this if you want to download via Colab
# (Usually faster to download directly from Google Drive)

# Uncomment to create zip and download:
# !zip -r final_model.zip "/content/drive/MyDrive/burney_models/bert_authorship/final"
# from google.colab import files
# files.download('final_model.zip')

print("⚠️  Recommendation: Download from Google Drive instead")
print("   It's faster and doesn't count against Colab usage limits")