# 🚀 LLaMA Fine-tuning with LoRA for Insurance Tasks

This notebook fine-tunes LLaMA with LoRA (Low-Rank Adaptation) for insurance-specific tasks.

## What this notebook does:
1. Load the base LLaMA model with quantization
2. Set up LoRA configuration for efficient fine-tuning
3. Load tokenized datasets
4. Configure training arguments and trainer
5. Monitor training progress with W&B
6. Save checkpoints and final model
7. Merge and export the fine-tuned model

**⚠️ Important: Make sure a GPU is enabled and that it has sufficient memory.**

## 1. Import Libraries and Check Environment

In [None]:
import os
import json
import torch
import gc
from pathlib import Path
from typing import Dict, List, Optional, Union
import warnings
from datetime import datetime
import time

# Core ML libraries
import transformers
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    EarlyStoppingCallback
)

# PEFT and LoRA
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training,
    TaskType,
    PeftModel
)

# Quantization
from transformers import BitsAndBytesConfig
import bitsandbytes as bnb

# Datasets
from datasets import Dataset, DatasetDict, load_from_disk

# Monitoring
import wandb
from tqdm.auto import tqdm

# Utilities
import numpy as np
import pandas as pd
from huggingface_hub import login

# Suppress all Python warnings for the rest of the session to keep cell output short.
warnings.filterwarnings('ignore')

# Report library versions and the accelerator this session is running on.
cuda_ready = torch.cuda.is_available()

print("✅ Libraries imported successfully")
print(f"PyTorch version: {torch.__version__}")
print(f"Transformers version: {transformers.__version__}")
print(f"CUDA available: {cuda_ready}")

if not cuda_ready:
    print("⚠️ No GPU detected - training will be very slow!")
else:
    # Device 0 is the only device inspected here.
    gpu_name = torch.cuda.get_device_name(0)
    gpu_total_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
    gpu_used_gb = torch.cuda.memory_allocated(0) / 1e9
    print(f"GPU: {gpu_name}")
    print(f"GPU Memory: {gpu_total_gb:.1f} GB")
    print(f"Current GPU memory usage: {gpu_used_gb:.2f} GB")

## 2. Configuration and Setup

In [None]:
# Model and data configuration
MODEL_NAME = "meta-llama/Llama-2-7b-chat-hf"
TOKENIZED_DATA_DIR = Path("data/tokenized")
OUTPUT_DIR = Path("outputs")
CHECKPOINT_DIR = OUTPUT_DIR / "checkpoints"
FINAL_MODEL_DIR = OUTPUT_DIR / "final_model"
LOGS_DIR = OUTPUT_DIR / "logs"

# Create output directories. parents=True keeps this working if OUTPUT_DIR is
# later pointed at a nested path whose parents do not exist yet.
for _out_dir in (OUTPUT_DIR, CHECKPOINT_DIR, FINAL_MODEL_DIR, LOGS_DIR):
    _out_dir.mkdir(parents=True, exist_ok=True)

# Load training configuration: a JSON file wins over the built-in defaults.
config_file = Path("config/training_args.json")
if config_file.exists():
    with open(config_file, 'r', encoding='utf-8') as f:
        TRAINING_CONFIG = json.load(f)
else:
    # Default configuration for Colab
    TRAINING_CONFIG = {
        "output_dir": str(CHECKPOINT_DIR),
        "num_train_epochs": 3,
        "per_device_train_batch_size": 2,
        "per_device_eval_batch_size": 2,
        "gradient_accumulation_steps": 8,
        "gradient_checkpointing": True,
        "learning_rate": 2e-4,
        "weight_decay": 0.01,
        "fp16": True,
        "max_grad_norm": 0.3,
        "warmup_ratio": 0.03,
        "lr_scheduler_type": "cosine",
        "save_steps": 500,
        "eval_steps": 500,
        "logging_steps": 50,
        "save_total_limit": 3,
        "load_best_model_at_end": True,
        "metric_for_best_model": "eval_loss",
        "greater_is_better": False,
        "evaluation_strategy": "steps",
        "save_strategy": "steps",
    }

# Load LoRA configuration: same file-over-defaults rule as above.
lora_config_file = Path("config/lora_config.json")
if lora_config_file.exists():
    with open(lora_config_file, 'r', encoding='utf-8') as f:
        LORA_CONFIG = json.load(f)
else:
    # Default LoRA configuration
    LORA_CONFIG = {
        "r": 8,
        "lora_alpha": 16,
        "lora_dropout": 0.05,
        "bias": "none",
        "task_type": "CAUSAL_LM",
        "target_modules": [
            "q_proj", "k_proj", "v_proj", "o_proj",
            "gate_proj", "up_proj", "down_proj",
        ],
    }

# Quantization configuration for 4-bit training.
# Only query bf16 support when CUDA is present: on a CPU-only runtime the bf16
# probe can raise with some torch versions instead of returning False.
_bf16_supported = torch.cuda.is_available() and torch.cuda.is_bf16_supported()
QUANTIZATION_CONFIG = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16 if _bf16_supported else torch.float16,
)

# Training parameters
USE_WANDB = True  # Set to False if you don't want W&B logging
WANDB_PROJECT = "llama-insurance-finetune"
RUN_NAME = f"llama-insurance-{datetime.now().strftime('%Y%m%d-%H%M%S')}"

print("Configuration loaded:")
print(f"- Model: {MODEL_NAME}")
print(f"- Output directory: {OUTPUT_DIR}")
print(f"- Batch size: {TRAINING_CONFIG['per_device_train_batch_size']}")
print(f"- Gradient accumulation: {TRAINING_CONFIG['gradient_accumulation_steps']}")
print(f"- Learning rate: {TRAINING_CONFIG['learning_rate']}")
print(f"- LoRA rank: {LORA_CONFIG['r']}")
print(f"- Quantization: 4-bit with {QUANTIZATION_CONFIG.bnb_4bit_compute_dtype}")
print(f"- W&B logging: {USE_WANDB}")

## 3. Load Tokenized Datasets

In [None]:
def _load_splits(dataset_root: Path) -> DatasetDict:
    """Load whichever of the train/validation/test splits exist under *dataset_root*."""
    splits = DatasetDict()
    for split in ('train', 'validation', 'test'):
        split_dir = dataset_root / split
        if split_dir.exists():
            splits[split] = load_from_disk(str(split_dir))
            print(f"  {split}: {len(splits[split])} examples")
    return splits


def load_tokenized_datasets() -> tuple[Optional[DatasetDict], Optional[AutoTokenizer]]:
    """Load the tokenizer and the tokenized dataset splits from TOKENIZED_DATA_DIR.

    The "combined" dataset is preferred. If it is absent, empty, or fails to
    load, the first task directory in sorted order is used instead, so the
    choice is the same on every run.

    Returns:
        ``(dataset_dict, tokenizer)`` on success, or ``(None, None)`` when the
        tokenizer is missing, no dataset could be loaded, or the loaded
        dataset has no ``train`` split.
    """

    print(f"Loading tokenized datasets from {TOKENIZED_DATA_DIR}...")

    # Load tokenizer
    tokenizer_dir = TOKENIZED_DATA_DIR / "tokenizer"
    if not tokenizer_dir.exists():
        print(f"❌ Tokenizer not found at {tokenizer_dir}")
        print("Please run 02_tokenization.ipynb first")
        return None, None

    tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir)
    print(f"✅ Tokenizer loaded from {tokenizer_dir}")

    dataset_dict = None

    # Try to load combined dataset first
    combined_dir = TOKENIZED_DATA_DIR / "combined"
    if combined_dir.exists():
        print(f"Loading combined dataset from {combined_dir}...")
        try:
            dataset_dict = _load_splits(combined_dir)
        except Exception as e:
            # Best effort: fall through to the per-task datasets below.
            print(f"❌ Error loading combined dataset: {e}")
            dataset_dict = None
        else:
            if dataset_dict:
                print("✅ Combined dataset loaded")

    # If no combined dataset, try individual task datasets
    if not dataset_dict:
        # iterdir() order is arbitrary; sort so the selected task is reproducible.
        task_dirs = sorted(
            d for d in TOKENIZED_DATA_DIR.iterdir()
            if d.is_dir() and d.name not in ('tokenizer', 'combined')
        )

        if task_dirs:
            task_dir = task_dirs[0]
            if len(task_dirs) > 1:
                ignored = ", ".join(d.name for d in task_dirs[1:])
                print(f"⚠️ Multiple task datasets found; using {task_dir.name} (ignoring: {ignored})")
            print(f"Loading {task_dir.name} dataset from {task_dir}...")

            try:
                dataset_dict = _load_splits(task_dir)
            except Exception as e:
                print(f"❌ Error loading {task_dir.name} dataset: {e}")
                return None, None

            if dataset_dict:
                print(f"✅ {task_dir.name} dataset loaded")

    if not dataset_dict:
        print(f"❌ No tokenized datasets found in {TOKENIZED_DATA_DIR}")
        return None, None

    # Training indexes dataset_dict['train'] later; fail here with a clear message.
    if 'train' not in dataset_dict:
        print(f"❌ No 'train' split found (available: {list(dataset_dict.keys())})")
        return None, None

    # Validate dataset: only the first example of each non-empty split is inspected.
    required_keys = ('input_ids', 'labels', 'attention_mask')
    for split_name, dataset in dataset_dict.items():
        if len(dataset) == 0:
            continue
        sample = dataset[0]
        missing_keys = [key for key in required_keys if key not in sample]
        if missing_keys:
            print(f"⚠️ Missing keys in {split_name}: {missing_keys}")
        else:
            print(f"✅ {split_name} dataset validated")

    return dataset_dict, tokenizer

# Load the tokenized splits and tokenizer, then print a short overview.
# NOTE: despite its name, train_dataset holds every split (a DatasetDict).
train_dataset, tokenizer = load_tokenized_datasets()

if train_dataset is not None:
    print("\n📊 Dataset Summary:")
    split_sizes = {name: len(split) for name, split in train_dataset.items()}
    for name, size in split_sizes.items():
        print(f"  {name}: {size:,} examples")

    total_examples = sum(split_sizes.values())
    print(f"  Total: {total_examples:,} examples")

    # Show the structure of the first training example, if there is one.
    if 'train' in train_dataset and len(train_dataset['train']) > 0:
        sample = train_dataset['train'][0]
        print("\nSample data structure:")
        print(f"  Keys: {list(sample.keys())}")
        print(f"  Input length: {len(sample['input_ids'])} tokens")
        print(f"  Label length: {len(sample['labels'])} tokens")
else:
    for message in (
        "❌ Failed to load datasets. Cannot proceed with training.",
        "Please run the previous notebooks first:",
        "1. 01_data_preprocessing.ipynb",
        "2. 02_tokenization.ipynb",
    ):
        print(message)

## 4. Load and Prepare Model

In [None]:
def load_base_model(model_name: str, tokenizer: AutoTokenizer) -> AutoModelForCausalLM:
    """Load the base causal-LM with the module-level quantization config.

    Args:
        model_name: Hugging Face model id or local path to load.
        tokenizer: Tokenizer that will be used for training. If it has more
            tokens than the model's vocabulary, the embeddings are resized.

    Returns:
        The loaded model with ``use_cache`` disabled.

    Raises:
        Exception: Any loading error is printed with a hint and re-raised.
    """

    # Describe the quantization mode in words; formatting the boolean flag
    # directly would print "True-bit".
    if QUANTIZATION_CONFIG.load_in_4bit:
        quant_label = "4-bit"
    elif QUANTIZATION_CONFIG.load_in_8bit:
        quant_label = "8-bit"
    else:
        quant_label = "disabled"

    print(f"Loading base model {model_name}...")
    print(f"Using quantization: {quant_label}")

    try:
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            quantization_config=QUANTIZATION_CONFIG,
            device_map="auto",
            # NOTE(review): trust_remote_code=True lets the model repository run
            # its own Python code. Confirm it is needed for the models used here.
            trust_remote_code=True,
            torch_dtype=torch.float16,
            use_cache=False  # Disable cache for training
        )

        # Resize token embeddings if tokenizer was modified
        if len(tokenizer) > model.config.vocab_size:
            print(f"Resizing token embeddings: {model.config.vocab_size} -> {len(tokenizer)}")
            model.resize_token_embeddings(len(tokenizer))

        # NOTE(review): for quantized layers numel() presumably counts packed
        # storage elements, so this figure may understate the real parameter
        # count - verify before relying on it.
        param_count = sum(p.numel() for p in model.parameters())

        print("✅ Model loaded successfully")
        print(f"  Model size: ~{param_count / 1e6:.0f}M parameters")
        print(f"  Vocab size: {model.config.vocab_size}")
        print(f"  Max position embeddings: {model.config.max_position_embeddings}")

        return model

    except Exception as e:
        print(f"❌ Error loading model: {e}")
        print("Make sure you're authenticated with Hugging Face and have access to LLaMA models")
        raise

def setup_lora_model(model: AutoModelForCausalLM, lora_config: dict) -> PeftModel:
    """Prepare a quantized model for k-bit training and attach LoRA adapters.

    Args:
        model: Base model to wrap.
        lora_config: Mapping with the keys ``r``, ``lora_alpha``,
            ``lora_dropout``, ``bias`` and ``target_modules``. The task type
            is always causal LM, whatever the mapping says.

    Returns:
        The PEFT-wrapped model.
    """

    print("Setting up LoRA with configuration:")
    for option in lora_config:
        print(f"  {option}: {lora_config[option]}")

    # Prepare model for k-bit training before the adapters are injected.
    kbit_ready = prepare_model_for_kbit_training(model)

    adapter_settings = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        r=lora_config['r'],
        lora_alpha=lora_config['lora_alpha'],
        lora_dropout=lora_config['lora_dropout'],
        bias=lora_config['bias'],
        target_modules=lora_config['target_modules'],
        inference_mode=False,
    )
    peft_model = get_peft_model(kbit_ready, adapter_settings)

    # Count trainable and total parameters in a single pass.
    trainable_params = 0
    total_params = 0
    for param in peft_model.parameters():
        size = param.numel()
        total_params += size
        if param.requires_grad:
            trainable_params += size

    print("\n✅ LoRA model ready")
    print(f"  Trainable parameters: {trainable_params:,} ({trainable_params/total_params:.2%})")
    print(f"  Total parameters: {total_params:,}")

    return peft_model

# Load model if datasets are available
if train_dataset is not None and tokenizer is not None:
    # Clear GPU memory first. Collect Python garbage before emptying the CUDA
    # cache: tensors kept alive only by reference cycles are freed by the
    # collector, and only then can their cached blocks be released.
    if torch.cuda.is_available():
        gc.collect()
        torch.cuda.empty_cache()

    # Load base model
    base_model = load_base_model(MODEL_NAME, tokenizer)

    # Set up LoRA
    model = setup_lora_model(base_model, LORA_CONFIG)

    # Check GPU memory usage
    if torch.cuda.is_available():
        memory_used = torch.cuda.memory_allocated(0) / 1e9
        memory_reserved = torch.cuda.memory_reserved(0) / 1e9
        print("\nGPU Memory Usage:")
        print(f"  Allocated: {memory_used:.2f} GB")
        print(f"  Reserved: {memory_reserved:.2f} GB")
else:
    print("❌ Skipping model loading due to missing datasets")
    # Later cells test `model is not None`, so define it on this path too.
    model = None

## 5. Setup Training Components

In [None]:
def setup_training_arguments(config: dict) -> TrainingArguments:
    """Build ``TrainingArguments`` from a configuration mapping.

    The input mapping is not modified. ``output_dir`` and ``logging_dir`` are
    always overridden with CHECKPOINT_DIR and LOGS_DIR, and ``report_to`` /
    ``run_name`` are set from the module-level W&B switches.

    Args:
        config: Keyword arguments for ``TrainingArguments``.

    Returns:
        The constructed ``TrainingArguments``.
    """
    import inspect

    # Update output directory to use our paths
    config = dict(config)
    config['output_dir'] = str(CHECKPOINT_DIR)
    config['logging_dir'] = str(LOGS_DIR)

    # Add W&B configuration if enabled
    if USE_WANDB:
        config['report_to'] = 'wandb'
        config['run_name'] = RUN_NAME
    else:
        config['report_to'] = []

    # Newer transformers releases renamed `evaluation_strategy` to
    # `eval_strategy`. Translate the key only when the installed version
    # no longer accepts the old name, so both old and new releases work.
    accepted = inspect.signature(TrainingArguments).parameters
    if (
        'evaluation_strategy' in config
        and 'evaluation_strategy' not in accepted
        and 'eval_strategy' in accepted
    ):
        config['eval_strategy'] = config.pop('evaluation_strategy')

    # Ensure directories exist (parents=True so nested paths also work)
    Path(config['output_dir']).mkdir(parents=True, exist_ok=True)
    Path(config['logging_dir']).mkdir(parents=True, exist_ok=True)

    training_args = TrainingArguments(**config)

    print("Training arguments configured:")
    print(f"  Output dir: {training_args.output_dir}")
    print(f"  Epochs: {training_args.num_train_epochs}")
    print(f"  Batch size: {training_args.per_device_train_batch_size}")
    print(f"  Gradient accumulation: {training_args.gradient_accumulation_steps}")
    print(f"  Learning rate: {training_args.learning_rate}")
    print(f"  Weight decay: {training_args.weight_decay}")
    print(f"  Warmup ratio: {training_args.warmup_ratio}")
    print(f"  Save steps: {training_args.save_steps}")
    print(f"  Eval steps: {training_args.eval_steps}")
    print(f"  FP16: {training_args.fp16}")
    print(f"  Gradient checkpointing: {training_args.gradient_checkpointing}")

    return training_args

def setup_data_collator(tokenizer: AutoTokenizer) -> DataCollatorForLanguageModeling:
    """Build the batch collator used for causal language modeling.

    Args:
        tokenizer: Tokenizer the collator uses when assembling batches.

    Returns:
        A ``DataCollatorForLanguageModeling`` with masked-LM disabled that
        pads to a multiple of 8 and returns PyTorch tensors.
    """
    # NOTE(review): with mlm=False this collator presumably derives labels from
    # input_ids itself - confirm that is compatible with the dataset's own
    # precomputed `labels` column.
    collator_options = {
        "tokenizer": tokenizer,
        "mlm": False,             # causal LM, not masked LM
        "pad_to_multiple_of": 8,  # for efficiency
        "return_tensors": "pt",
    }
    collator = DataCollatorForLanguageModeling(**collator_options)

    print("✅ Data collator configured for causal LM")
    return collator

class TrainingCallbacks:
    """Factory for the callbacks attached to the Trainer."""

    @staticmethod
    def get_callbacks():
        """Return a new list holding a single early-stopping callback.

        The callback is configured with a patience of 3 and a threshold
        of 0.01.
        """
        return [
            EarlyStoppingCallback(
                early_stopping_patience=3,
                early_stopping_threshold=0.01,
            ),
        ]

# Build the training arguments, collator and callbacks - only when both the
# model and the datasets were loaded by the earlier cells.
if model is None or train_dataset is None:
    print("❌ Skipping training setup due to missing model or datasets")
else:
    print("Setting up training components...")

    training_args = setup_training_arguments(TRAINING_CONFIG)
    data_collator = setup_data_collator(tokenizer)
    callbacks = TrainingCallbacks.get_callbacks()

    print("\n✅ Training components ready")
    print(f"  Callbacks: {len(callbacks)} configured")

## 6. Initialize Weights & Biases (Optional)

In [None]:
def initialize_wandb():
    """Start a Weights & Biases run for experiment tracking.

    Does nothing except print a notice when USE_WANDB is False. If
    initialization fails for any reason, the error is reported and the
    module-level USE_WANDB flag is set to False so later code can skip W&B.
    """
    # The global declaration has to come before the first use of the name in
    # this function; declaring it after the read below is a SyntaxError.
    global USE_WANDB

    if not USE_WANDB:
        print("W&B logging disabled")
        return

    try:
        # Initialize wandb
        wandb.init(
            project=WANDB_PROJECT,
            name=RUN_NAME,
            config={
                'model_name': MODEL_NAME,
                'lora_config': LORA_CONFIG,
                'training_config': TRAINING_CONFIG,
                'quantization': '4-bit',
                'dataset_size': {
                    split: len(dataset) for split, dataset in train_dataset.items()
                } if train_dataset else {}
            }
        )

        print("✅ W&B initialized")
        print(f"  Project: {WANDB_PROJECT}")
        print(f"  Run name: {RUN_NAME}")
        print(f"  Dashboard: {wandb.run.url}")

    except Exception as e:
        # Tracking is optional: report the problem and carry on without it.
        print(f"⚠️ W&B initialization failed: {e}")
        print("Training will continue without W&B logging")
        USE_WANDB = False

# Initialize W&B
initialize_wandb()

# The training arguments were built in section 5, before this cell ran, using
# the USE_WANDB value at that time. If initialization failed just now and the
# flag flipped to False, rebuild them so the Trainer is not still configured
# to report to W&B.
if not USE_WANDB and model is not None and train_dataset is not None:
    if "wandb" in (training_args.report_to or []):
        training_args = setup_training_arguments(TRAINING_CONFIG)

## 7. Create Trainer and Start Training

In [None]:
def create_trainer() -> Trainer:
    """Assemble a Hugging Face ``Trainer`` from the notebook's global state.

    Uses the module-level ``model``, ``training_args``, ``train_dataset``,
    ``tokenizer``, ``data_collator`` and ``callbacks``.

    Returns:
        The configured ``Trainer``.

    Raises:
        ValueError: If the model or the datasets have not been loaded.
    """
    if not (model is not None and train_dataset is not None):
        raise ValueError("Model and dataset must be loaded before creating trainer")

    train_split = train_dataset['train']
    # .get() yields None when there is no validation split.
    eval_split = train_dataset.get('validation')

    new_trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_split,
        eval_dataset=eval_split,
        tokenizer=tokenizer,
        data_collator=data_collator,
        callbacks=callbacks,
    )

    print("✅ Trainer created")
    print(f"  Training samples: {len(train_split)}")
    if 'validation' in train_dataset:
        print(f"  Validation samples: {len(train_dataset['validation'])}")

    return new_trainer

def calculate_training_time(trainer: Trainer) -> tuple:
    """Print and return a rough estimate of training length.

    The step count is in optimizer steps (batches divided by the gradient
    accumulation factor), so it is comparable with the ``global_step`` the
    Trainer reports. The time estimate is based on the number of batches
    processed, using a fixed guess of 2 seconds per batch.

    Args:
        trainer: Trainer whose train dataloader and ``args`` are inspected.

    Returns:
        ``(total_steps, estimated_total_seconds)``.
    """
    import math

    # Read the settings from the trainer that was passed in rather than from
    # a module-level variable, so the estimate always matches this trainer.
    args = trainer.args
    num_batches_per_epoch = len(trainer.get_train_dataloader())
    num_epochs = args.num_train_epochs
    accumulation = max(1, int(args.gradient_accumulation_steps))

    # One optimizer step consumes `accumulation` batches.
    updates_per_epoch = max(1, math.ceil(num_batches_per_epoch / accumulation))
    total_steps = math.ceil(updates_per_epoch * num_epochs)

    # Rough estimate: 1-3 seconds per batch on T4/V100
    estimated_seconds_per_batch = 2.0
    estimated_total_seconds = num_batches_per_epoch * num_epochs * estimated_seconds_per_batch

    hours, remainder = divmod(estimated_total_seconds, 3600)
    minutes = remainder // 60

    print("Training estimation:")
    print(f"  Batches per epoch: {num_batches_per_epoch}")
    print(f"  Gradient accumulation: {accumulation}")
    print(f"  Total epochs: {num_epochs}")
    print(f"  Total steps: {total_steps}")
    print(f"  Estimated time: {int(hours)}h {int(minutes)}m")

    return total_steps, estimated_total_seconds

# Build the trainer and print a time estimate, or record that no trainer is
# available so the following cells can skip training.
if model is None or train_dataset is None:
    print("❌ Cannot create trainer - missing model or datasets")
    trainer = None
else:
    print("Creating trainer...")
    trainer = create_trainer()

    total_steps, estimated_time = calculate_training_time(trainer)

    print("\n🚀 Ready to start training!")
    print("Training will begin in the next cell...")

## 8. Start Training 🚀

In [None]:
def start_training(trainer: Trainer):
    """Run ``trainer.train()`` with timing, memory reporting and a fallback save.

    Args:
        trainer: A fully configured Trainer.

    Returns:
        The value returned by ``trainer.train()``.

    Raises:
        Exception: Any training error is re-raised after an attempt to save
            an emergency checkpoint under CHECKPOINT_DIR.
    """

    print(f"🚀 Starting training at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print("=" * 60)

    # Clear GPU memory before training. Run the garbage collector first so
    # blocks owned by unreachable tensors can actually be released.
    if torch.cuda.is_available():
        gc.collect()
        torch.cuda.empty_cache()

        print("GPU memory before training:")
        print(f"  Allocated: {torch.cuda.memory_allocated(0) / 1e9:.2f} GB")
        print(f"  Reserved: {torch.cuda.memory_reserved(0) / 1e9:.2f} GB")

    try:
        # Start training
        start_time = time.time()

        training_result = trainer.train()

        training_duration = time.time() - start_time

        print("\n✅ Training completed successfully!")
        print(f"Training time: {training_duration/3600:.2f} hours")

        # Print training summary
        print("\nTraining Summary:")
        print(f"  Total steps: {training_result.global_step}")
        print(f"  Final train loss: {training_result.training_loss:.4f}")
        print(f"  Best model checkpoint: {trainer.state.best_model_checkpoint}")

        # Log final GPU memory
        if torch.cuda.is_available():
            print("\nGPU memory after training:")
            print(f"  Allocated: {torch.cuda.memory_allocated(0) / 1e9:.2f} GB")
            print(f"  Reserved: {torch.cuda.memory_reserved(0) / 1e9:.2f} GB")

        return training_result

    except Exception as e:
        print(f"❌ Training failed: {e}")
        print("This might be due to:")
        print("  - Insufficient GPU memory")
        print("  - Invalid configuration")
        print("  - Dataset issues")

        # Try to save current state. This is best effort: a failure here is
        # reported but must not mask the original training error.
        try:
            emergency_save_path = CHECKPOINT_DIR / "emergency_checkpoint"
            trainer.save_model(emergency_save_path)
            print(f"Emergency checkpoint saved to: {emergency_save_path}")
        except Exception as save_error:
            print(f"Could not save emergency checkpoint: {save_error}")

        raise

# Start training if everything is ready.
# Define training_result up front: if start_training raises, the later cells
# (saving, cleanup) can still test it instead of failing with NameError.
training_result = None

if trainer is not None:
    # Final confirmation
    print("⚠️ About to start training with the following configuration:")
    print(f"  Model: {MODEL_NAME}")
    print(f"  Training samples: {len(train_dataset['train'])}")
    print(f"  Epochs: {training_args.num_train_epochs}")
    print(f"  Batch size: {training_args.per_device_train_batch_size}")
    print(f"  Learning rate: {training_args.learning_rate}")
    print(f"  Output directory: {training_args.output_dir}")

    # Short pause so the run can still be interrupted after reading the summary.
    print("\nStarting training in 3 seconds...")
    time.sleep(3)

    # Start training
    training_result = start_training(trainer)

else:
    print("❌ Cannot start training - trainer not initialized")
    print("Please check the previous cells for errors")

## 9. Save and Export Final Model

In [None]:
def save_final_model(trainer: Trainer, model, tokenizer: AutoTokenizer):
    """Write the trained model, tokenizer and a metadata file under FINAL_MODEL_DIR.

    Args:
        trainer: Trainer used for the second save and for the best-checkpoint path.
        model: Model saved with ``save_pretrained``.
        tokenizer: Tokenizer saved next to the model.

    Returns:
        ``(lora_model_dir, trainer_model_dir)`` - the two output directories.

    Note:
        Loss and step count in the metadata come from the module-level
        ``training_result``; they are ``None`` when it is falsy.
    """
    print(f"Saving final model to {FINAL_MODEL_DIR}...")

    # First copy: model and tokenizer via save_pretrained.
    lora_model_dir = FINAL_MODEL_DIR / "lora_model"
    lora_model_dir.mkdir(exist_ok=True)
    for artifact in (model, tokenizer):
        artifact.save_pretrained(lora_model_dir)
    print(f"✅ LoRA model saved to: {lora_model_dir}")

    # Second copy: written through the Trainer's own save routine.
    trainer_model_dir = FINAL_MODEL_DIR / "trainer_model"
    trainer.save_model(trainer_model_dir)
    print(f"✅ Trainer model saved to: {trainer_model_dir}")

    # Gather the run metadata.
    if training_result:
        final_loss = training_result.training_loss
        total_steps = training_result.global_step
    else:
        final_loss = None
        total_steps = None

    model_info = {
        'base_model': MODEL_NAME,
        'model_type': 'LLaMA-2-7B with LoRA',
        'task': 'Insurance Domain Fine-tuning',
        'lora_config': LORA_CONFIG,
        'training_config': TRAINING_CONFIG,
        'training_completed': datetime.now().isoformat(),
        'final_loss': final_loss,
        'total_steps': total_steps,
        'best_checkpoint': getattr(trainer.state, 'best_model_checkpoint', None),
    }

    info_file = FINAL_MODEL_DIR / "model_info.json"
    with open(info_file, 'w') as handle:
        json.dump(model_info, handle, indent=2)
    print(f"✅ Model info saved to: {info_file}")

    return lora_model_dir, trainer_model_dir

def merge_and_save_model(model, tokenizer: AutoTokenizer, base_model_name: str):
    """Merge the LoRA adapter into a half-precision base model and save it (optional).

    The adapter weights of ``model`` are written to a temporary directory, a
    fresh fp16 copy of the base model is loaded, the adapter is attached to
    that copy and merged into it. This way the freshly loaded base model is
    the one that gets merged and saved, rather than being loaded and then
    left unused.

    Args:
        model: The trained PEFT model whose adapter should be merged.
        tokenizer: Tokenizer saved alongside the merged model.
        base_model_name: Hugging Face id or path of the base model.

    Returns:
        The directory containing the merged model, or ``None`` if merging
        failed (the failure is printed, not raised).
    """
    import tempfile

    print("\nOptional: Merging LoRA weights with base model...")
    print("⚠️ This will require additional GPU memory and time")

    base_model = None
    merged_model = None
    try:
        with tempfile.TemporaryDirectory() as adapter_dir:
            # Export the adapter so it can be re-attached to the new base model.
            model.save_pretrained(adapter_dir)

            # Load base model for merging
            base_model = AutoModelForCausalLM.from_pretrained(
                base_model_name,
                torch_dtype=torch.float16,
                device_map="auto"
            )

            # Mirror the vocabulary resize done when the training model was
            # loaded, so the adapter and base model shapes agree.
            if len(tokenizer) > base_model.config.vocab_size:
                base_model.resize_token_embeddings(len(tokenizer))

            # Merge LoRA weights
            merged_model = PeftModel.from_pretrained(base_model, adapter_dir).merge_and_unload()

        # Save merged model
        merged_model_dir = FINAL_MODEL_DIR / "merged_model"
        merged_model_dir.mkdir(parents=True, exist_ok=True)

        merged_model.save_pretrained(merged_model_dir)
        tokenizer.save_pretrained(merged_model_dir)

        print(f"✅ Merged model saved to: {merged_model_dir}")

        return merged_model_dir

    except Exception as e:
        print(f"⚠️ Model merging failed: {e}")
        print("This is optional - you can still use the LoRA model")
        return None

    finally:
        # Clean up on success and on failure: drop the references, collect
        # garbage, then hand cached GPU blocks back to the driver.
        base_model = None
        merged_model = None
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

def create_model_card(model_dir: Path):
    """Write a README.md model card into *model_dir*.

    The card is rendered from the module-level MODEL_NAME, LORA_CONFIG and
    TRAINING_CONFIG plus today's date, and embeds *model_dir* in its usage
    example.

    Args:
        model_dir: Directory that receives the README.md file.
    """
    training_date = datetime.now().strftime('%Y-%m-%d')
    target_modules = ', '.join(LORA_CONFIG['target_modules'])

    card_text = f"""
# LLaMA Insurance Fine-tuned Model

## Model Description
This model is a fine-tuned version of {MODEL_NAME} specifically trained for insurance domain tasks.

## Training Details
- **Base Model**: {MODEL_NAME}
- **Fine-tuning Method**: LoRA (Low-Rank Adaptation)
- **Training Date**: {training_date}
- **Training Framework**: Transformers + PEFT
- **Quantization**: 4-bit with BitsAndBytesConfig

## LoRA Configuration
- **Rank (r)**: {LORA_CONFIG['r']}
- **Alpha**: {LORA_CONFIG['lora_alpha']}
- **Dropout**: {LORA_CONFIG['lora_dropout']}
- **Target Modules**: {target_modules}

## Training Configuration
- **Learning Rate**: {TRAINING_CONFIG['learning_rate']}
- **Batch Size**: {TRAINING_CONFIG['per_device_train_batch_size']}
- **Epochs**: {TRAINING_CONFIG['num_train_epochs']}
- **Gradient Accumulation**: {TRAINING_CONFIG['gradient_accumulation_steps']}

## Usage
```python
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("{model_dir}")

# Load base model
base_model = AutoModelForCausalLM.from_pretrained("{MODEL_NAME}")

# Load LoRA model
model = PeftModel.from_pretrained(base_model, "{model_dir}")

# Generate text
inputs = tokenizer("[INST] Explain health insurance coverage [/INST]", return_tensors="pt")
outputs = model.generate(**inputs, max_length=200)
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
```

## Tasks Supported
- Insurance claim classification
- Policy document summarization
- FAQ generation
- Compliance checking
- Contract question-answering

## Limitations
- Specialized for insurance domain only
- Requires validation by subject matter experts
- Should not be used for final decision making without human oversight

## License
Subject to LLaMA 2 license terms and conditions.
"""

    # Leading and trailing blank lines of the template are stripped before writing.
    readme_path = model_dir / "README.md"
    readme_path.write_text(card_text.strip())

    print(f"✅ Model card created: {readme_path}")

# Persist the trained artifacts, but only after a successful training run.
if training_result is None or trainer is None:
    print("❌ Cannot save models - training did not complete successfully")
else:
    print("Saving trained models...")

    # Save both model copies, then describe the LoRA copy in a model card.
    lora_dir, trainer_dir = save_final_model(trainer, model, tokenizer)
    create_model_card(lora_dir)

    # Optional: merge and save full model (comment out if memory is limited)
    # merged_dir = merge_and_save_model(model, tokenizer, MODEL_NAME)

    print("\n🎉 Model saving complete!")
    print("\nSaved models:")
    print(f"  LoRA model: {lora_dir}")
    print(f"  Trainer model: {trainer_dir}")
    # if merged_dir:
    #     print(f"  Merged model: {merged_dir}")

    for line in (
        "\nNext steps:",
        "1. Run 04_evaluation.ipynb to evaluate model performance",
        "2. Run 05_inference_demo.ipynb to test the model",
        "3. Consider uploading to Hugging Face Hub for sharing",
    ):
        print(line)

## 10. Cleanup and Summary

In [None]:
def cleanup_and_summary():
    """Release model memory, close the W&B run and write a training summary.

    Removes the module-level ``model`` and ``base_model`` names, sets
    ``trainer`` to ``None``, frees cached GPU memory, finishes the W&B run
    when USE_WANDB is set, and writes ``training_summary.json`` to OUTPUT_DIR.

    Returns:
        The summary dictionary that was written to disk.
    """

    print("🧹 Cleaning up memory...")

    # Clean up models from memory
    global model, base_model, trainer

    if 'model' in globals():
        del model
    if 'base_model' in globals():
        del base_model
    if 'trainer' in globals():
        trainer = None

    # Clear GPU memory. Collect garbage first so the blocks owned by the
    # objects dropped above can actually be returned by empty_cache().
    if torch.cuda.is_available():
        gc.collect()
        torch.cuda.empty_cache()

        print("GPU memory after cleanup:")
        print(f"  Allocated: {torch.cuda.memory_allocated(0) / 1e9:.2f} GB")
        print(f"  Reserved: {torch.cuda.memory_reserved(0) / 1e9:.2f} GB")

    # Close W&B run. Failing to close must not abort the cleanup, but the
    # reason is reported instead of being silently dropped.
    if USE_WANDB:
        try:
            wandb.finish()
            print("✅ W&B run completed")
        except Exception as e:
            print(f"⚠️ Could not finish W&B run: {e}")

    # training_result does not exist if the training cell raised or was never
    # run; treat that the same as an unsuccessful run.
    result = globals().get('training_result')

    # Create training summary
    summary = {
        'training_completed': result is not None,
        'completion_time': datetime.now().isoformat(),
        'model_name': MODEL_NAME,
        'output_directory': str(FINAL_MODEL_DIR),
        'lora_config': LORA_CONFIG,
        'training_config': TRAINING_CONFIG
    }

    if result:
        summary.update({
            'final_loss': result.training_loss,
            'total_steps': result.global_step,
        })

    # Save summary
    summary_file = OUTPUT_DIR / "training_summary.json"
    with open(summary_file, 'w') as f:
        json.dump(summary, f, indent=2)

    print("\n📊 Training Summary:")
    print(json.dumps(summary, indent=2))
    print(f"\nSummary saved to: {summary_file}")

    return summary

# Release resources, write the summary file, then print the closing banner.
final_summary = cleanup_and_summary()

if training_result is None:
    outcome_lines = (
        "❌ Training was not completed successfully.",
        "Please check the error messages above and try again.",
        "Common issues: insufficient GPU memory, authentication problems, or data issues.",
    )
else:
    outcome_lines = (
        "🎉 LLaMA Insurance Fine-tuning COMPLETED SUCCESSFULLY! 🎉",
        "\nYour fine-tuned model is ready for evaluation and inference.",
        "Check the outputs/ directory for saved models and logs.",
    )

print("\n" + "=" * 60)
for line in outcome_lines:
    print(line)

for line in (
    "\nNext notebooks to run:",
    "1. 04_evaluation.ipynb - Evaluate model performance",
    "2. 05_inference_demo.ipynb - Test model inference",
    "\nThank you for using this LLaMA insurance fine-tuning notebook!",
):
    print(line)
print("=" * 60)