In [None]:
# Dependency installation cell.
# `%pip` installs into the environment of the running kernel.
# Version specifiers are quoted on purpose: unquoted, the shell parses
# ">=4.36.0" as an output redirection, silently dropping the version
# constraint and creating a stray file named "=4.36.0".
# NOTE(review): most packages are unpinned; the training code below uses
# keyword arguments (e.g. SFTTrainer's `dataset_text_field`) whose names have
# changed between trl releases - confirm and pin the versions you validate.

# Core PyTorch (CUDA 12.1 wheels, chosen for RTX 4090)
%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121

# Transformers ecosystem
%pip install "transformers>=4.36.0"
%pip install datasets
%pip install "peft>=0.7.0"
%pip install trl
%pip install accelerate

# Memory optimization
# bitsandbytes: still useful even without quantization
%pip install bitsandbytes
# flash-attn: critical for performance
%pip install flash-attn --no-build-isolation

# Monitoring
%pip install wandb
%pip install pynvml

# For TensorRT inference optimization
%pip install tensorrt

# For better performance monitoring
%pip install gpustat
# NOTE(review): nvidia-ml-py3 appears to ship the same `pynvml` module as the
# `pynvml` package installed above - confirm that both are really needed.
%pip install nvidia-ml-py3

In [None]:
#!/usr/bin/env python3
"""
Optimized Fine-tuning Pipeline for Meta Llama 3.1 8B (with a Llama 3.2 3B variant)
Using NVIDIA AI Software Stack (CUDA, cuDNN, TensorRT, PyTorch)
"""

import os
import gc
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.cuda.amp import autocast, GradScaler
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP

from transformers import (
    AutoTokenizer, 
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    BitsAndBytesConfig
)
from datasets import load_dataset, Dataset
from peft import LoraConfig, get_peft_model, TaskType, prepare_model_for_kbit_training
import wandb
from trl import SFTTrainer
import tensorrt as trt
import pynvml

In [None]:
# Process-wide runtime tuning for NVIDIA GPUs, applied once at notebook start.

# Environment variables: an allocator setting handed to PyTorch, and the
# Hugging Face tokenizers parallelism switch turned off.
os.environ.update(
    {
        "PYTORCH_CUDA_ALLOC_CONF": "max_split_size_mb:512",
        "TOKENIZERS_PARALLELISM": "false",
    }
)

# cuDNN flags: benchmarking on, determinism off.
torch.backends.cudnn.deterministic = False
torch.backends.cudnn.benchmark = True

In [None]:
class OptimizedLlamaFineTuner:
    """LoRA fine-tuning pipeline for causal language models on NVIDIA GPUs.

    Intended call order::

        tuner = OptimizedLlamaFineTuner(config)
        tuner.load_model_and_tokenizer()
        tuner.setup_lora()
        tuner.load_and_prepare_dataset()
        tuner.train()

    ``config`` is a plain dict. Keys read by this class (all optional):
    ``model_name``, ``trust_remote_code``, ``use_quantization``,
    ``use_flash_attention``, ``lora_r``, ``lora_alpha``, ``lora_dropout``,
    ``dataset_name``, ``dataset_config_name``, ``max_length``,
    ``max_samples``, ``batch_size``, ``gradient_accumulation_steps``,
    ``learning_rate``, ``num_epochs``, ``gradient_checkpointing``,
    ``dataloader_num_workers``, ``output_dir``, ``use_wandb``,
    ``wandb_project`` and ``experiment_name``.
    """

    def __init__(self, config):
        """Store ``config`` and verify the CUDA environment.

        Raises:
            RuntimeError: if CUDA is not available (raised by
                ``setup_nvidia_environment``).
        """
        self.config = config
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.setup_nvidia_environment()
        self.tokenizer = None
        self.model = None
        self.dataset = None
        self.is_multi_gpu = torch.cuda.device_count() > 1

    def setup_nvidia_environment(self):
        """Print a summary of the visible GPUs and enable TF32 where supported.

        Raises:
            RuntimeError: if CUDA is not available.
        """
        if not torch.cuda.is_available():
            raise RuntimeError("CUDA is not available!")

        # NVML is only needed for this report, so it is shut down again below.
        pynvml.nvmlInit()
        try:
            gpu_count = pynvml.nvmlDeviceGetCount()

            print(f"🚀 NVIDIA Setup:")
            print(f"   - CUDA Version: {torch.version.cuda}")
            print(f"   - cuDNN Version: {torch.backends.cudnn.version()}")
            print(f"   - Available GPUs: {gpu_count}")

            for i in range(gpu_count):
                handle = pynvml.nvmlDeviceGetHandleByIndex(i)
                name = pynvml.nvmlDeviceGetName(handle)
                # Depending on the installed NVML bindings the name is either
                # bytes or str; only bytes need decoding.
                if isinstance(name, bytes):
                    name = name.decode()
                memory = pynvml.nvmlDeviceGetMemoryInfo(handle)
                print(f"   - GPU {i}: {name} ({memory.total // 1024**3} GB)")
        finally:
            pynvml.nvmlShutdown()

        # Enable TensorFloat-32 on devices with compute capability >= 8.
        if torch.cuda.get_device_capability()[0] >= 8:
            torch.backends.cuda.matmul.allow_tf32 = True
            torch.backends.cudnn.allow_tf32 = True
            print("   - TensorFloat-32 enabled for Ampere+ GPUs")

    def load_model_and_tokenizer(self):
        """Load the tokenizer and the base model named by ``config["model_name"]``.

        The model is loaded in bfloat16. When ``config["use_quantization"]``
        is true it is loaded in 8-bit through bitsandbytes and prepared for
        k-bit training; otherwise no quantization is applied.
        """
        model_name = self.config.get("model_name", "meta-llama/Llama-3.1-8B-Instruct")
        # Allows model code shipped in the hub repository to run; kept on by
        # default to preserve existing behavior, but can now be disabled.
        trust_remote_code = self.config.get("trust_remote_code", True)

        # Only `load_in_8bit` is passed: the former `bnb_8bit_*` keyword
        # arguments were dropped (the config class defines `bnb_4bit_*`
        # options, which do not apply to 8-bit loading).
        bnb_config = (
            BitsAndBytesConfig(load_in_8bit=True)
            if self.config.get("use_quantization", False)
            else None
        )

        print(f"📥 Loading tokenizer for {model_name}...")
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_name,
            trust_remote_code=trust_remote_code,
            padding_side="right"
        )

        # Fall back to EOS as the padding token when none is defined.
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        print(f"📥 Loading {model_name} optimized for dual RTX 4090s...")

        use_flash_attention = self.config.get("use_flash_attention", True)
        self.model = AutoModelForCausalLM.from_pretrained(
            model_name,
            quantization_config=bnb_config,
            device_map="auto" if self.is_multi_gpu else None,
            trust_remote_code=trust_remote_code,
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2" if use_flash_attention else "eager",
            use_cache=False,  # Disable for training
        )

        # Prepare model for training (only if using quantization)
        if bnb_config:
            self.model = prepare_model_for_kbit_training(self.model)

        print(f"✅ Model loaded. Multi-GPU: {self.is_multi_gpu}")
        if self.is_multi_gpu:
            print(f"   Model distributed across {torch.cuda.device_count()} GPUs")

    def setup_lora(self):
        """Wrap ``self.model`` with LoRA adapters and report parameter counts.

        Raises:
            RuntimeError: if the model has not been loaded yet.
        """
        if self.model is None:
            raise RuntimeError("Model not loaded; call load_model_and_tokenizer() first.")

        # With gradient checkpointing and frozen base weights, the embedding
        # outputs must require grad, otherwise backward has nothing to
        # differentiate through the checkpointed segments.
        if self.config.get("gradient_checkpointing", False) and hasattr(
            self.model, "enable_input_require_grads"
        ):
            self.model.enable_input_require_grads()

        lora_config = LoraConfig(
            task_type=TaskType.CAUSAL_LM,
            inference_mode=False,
            r=self.config.get("lora_r", 16),
            lora_alpha=self.config.get("lora_alpha", 32),
            lora_dropout=self.config.get("lora_dropout", 0.1),
            target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
            bias="none",
        )

        self.model = get_peft_model(self.model, lora_config)

        # Print trainable parameters
        trainable_params = sum(p.numel() for p in self.model.parameters() if p.requires_grad)
        total_params = sum(p.numel() for p in self.model.parameters())

        print(f"🎯 LoRA Configuration:")
        print(f"   - Trainable parameters: {trainable_params:,}")
        print(f"   - Total parameters: {total_params:,}")
        print(f"   - Trainable %: {100 * trainable_params / total_params:.2f}%")

    @staticmethod
    def _clean_text(value):
        """Return ``value`` as a stripped string; ``None`` becomes ``""``."""
        return "" if value is None else str(value).strip()

    @staticmethod
    def _format_alpaca_batch(examples):
        """Turn a batch with instruction/[input]/output columns into prompt texts."""
        clean = OptimizedLlamaFineTuner._clean_text
        instructions = examples["instruction"]
        inputs = examples.get("input", [""] * len(instructions))
        texts = []
        for instruction, input_text, output in zip(instructions, inputs, examples["output"]):
            instruction, input_text, output = clean(instruction), clean(input_text), clean(output)
            if input_text:
                prompt = f"### Instruction:\n{instruction}\n\n### Input:\n{input_text}\n\n### Response:\n{output}"
            else:
                prompt = f"### Instruction:\n{instruction}\n\n### Response:\n{output}"
            texts.append(prompt)
        return {"text": texts}

    @staticmethod
    def _format_relation_batch(examples):
        """Turn a batch with concept1/concept2/relation_type columns into sentences."""
        clean = OptimizedLlamaFineTuner._clean_text
        texts = [
            f"The medical concept '{clean(c1)}' has a '{clean(rel_type)}' relationship with '{clean(c2)}'."
            for c1, c2, rel_type in zip(
                examples["concept1"], examples["concept2"], examples["relation_type"]
            )
        ]
        return {"text": texts}

    def load_and_prepare_dataset(self):
        """Load the dataset, normalize it to one ``text`` column, and filter it.

        ``config["dataset_name"]`` is either a hub dataset name (its ``train``
        split is loaded, using ``config["dataset_config_name"]`` when given)
        or an already loaded dataset object. Supported layouts are a ``text``
        column, Alpaca-style ``instruction``/``output`` columns (``input``
        optional), and ``concept1``/``concept2``/``relation_type`` columns.
        Samples whose token count exceeds ``config["max_length"]`` are
        dropped, then at most ``config["max_samples"]`` samples are kept.
        The result is stored in ``self.dataset``.

        Raises:
            RuntimeError: if the tokenizer has not been loaded yet.
            ValueError: if the dataset layout is not recognized.
        """
        dataset_name = self.config.get("dataset_name", "tatsu-lab/alpaca")
        # Optional configuration name, e.g. "ehr_rel_bigbio_pairs"
        dataset_config_name = self.config.get("dataset_config_name", None)

        print(f"📊 Loading dataset: {dataset_name} (config: {dataset_config_name or 'None'})")

        if isinstance(dataset_name, str):
            if dataset_config_name:
                dataset = load_dataset(dataset_name, dataset_config_name, split="train")
            else:
                dataset = load_dataset(dataset_name, split="train")
        else:
            # dataset_name is already a loaded dataset object
            dataset = dataset_name

        columns = dataset.column_names
        print(f"Dataset columns before preparation: {columns}")

        # --- Normalize to a single 'text' column; 'text' takes priority ---
        if "text" in columns:
            print("Dataset has a 'text' column. Using it directly.")
        elif "instruction" in columns and "output" in columns:
            print("Dataset detected as Alpaca format.")
            dataset = dataset.map(
                self._format_alpaca_batch, batched=True, remove_columns=dataset.column_names
            )
        elif all(col in columns for col in ["concept1", "concept2", "relation_type"]):
            print("Dataset detected as 'bigbio/ehr_rel' or similar relation extraction format.")
            dataset = dataset.map(
                self._format_relation_batch, batched=True, remove_columns=dataset.column_names
            )
        else:
            raise ValueError(
                "Dataset format not recognized. "
                "It must have a 'text' column, Alpaca format (instruction/output), "
                "or a specifically handled format like 'bigbio/ehr_rel'. "
                f"Available columns: {columns}"
            )

        print(f"Dataset columns after preparation: {dataset.column_names}")

        # The length filter below needs a tokenizer; fail with a clear message.
        if self.tokenizer is None:
            raise RuntimeError(
                "Tokenizer not loaded; call load_model_and_tokenizer() before "
                "load_and_prepare_dataset()."
            )

        # Filter by length to avoid OOM
        max_length = self.config.get("max_length", 2048)
        dataset = dataset.filter(lambda x: len(self.tokenizer.encode(x["text"])) <= max_length)

        # Take subset if specified
        if self.config.get("max_samples"):
            dataset = dataset.select(range(min(len(dataset), self.config["max_samples"])))

        self.dataset = dataset
        print(f"✅ Dataset prepared: {len(dataset)} samples")

    def setup_training_arguments(self):
        """Build the ``TrainingArguments`` used by ``train``.

        ``config["batch_size"]`` is treated as the total batch size and is
        divided by the number of visible GPUs, never dropping below 1 per
        device.

        NOTE(review): with ``device_map="auto"`` the model is sharded across
        GPUs inside a single process, in which case dividing the batch size
        by the GPU count may halve the effective batch - confirm intent.
        """
        device_count = max(1, torch.cuda.device_count())
        base_batch_size = self.config.get("batch_size", 16)
        # Guard against 0 when batch_size is smaller than the GPU count.
        per_device_batch_size = max(1, base_batch_size // device_count)

        # Same capability check as setup_nvidia_environment: only request
        # TF32 from the trainer where the hardware provides it.
        tf32_supported = (
            torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8
        )

        return TrainingArguments(
            output_dir=self.config.get("output_dir", "./llama-3.2-8b-finetuned"),

            # Training hyperparameters
            num_train_epochs=self.config.get("num_epochs", 3),
            per_device_train_batch_size=per_device_batch_size,
            gradient_accumulation_steps=self.config.get("gradient_accumulation_steps", 2),
            learning_rate=self.config.get("learning_rate", 2e-4),
            lr_scheduler_type="cosine",
            warmup_ratio=0.1,
            weight_decay=0.01,

            # Precision and data loading
            bf16=True,
            tf32=tf32_supported,
            dataloader_pin_memory=True,
            dataloader_num_workers=self.config.get("dataloader_num_workers", 8),

            # Memory optimizations
            gradient_checkpointing=self.config.get("gradient_checkpointing", False),
            optim="adamw_torch_fused",
            max_grad_norm=1.0,

            # Multi-GPU settings
            ddp_backend="nccl" if self.is_multi_gpu else None,
            ddp_find_unused_parameters=False,

            # Logging and saving. No evaluation is configured: the evaluation
            # strategy is left at its default ("no") rather than passed by
            # name, because that keyword has been renamed across versions.
            logging_steps=5,
            save_steps=250,
            save_total_limit=5,

            # "none" (the string) disables all reporting integrations;
            # passing None would instead let the library pick its default.
            report_to="wandb" if self.config.get("use_wandb", False) else "none",
            run_name=f"llama-3.2-8b-dual-4090-{self.config.get('experiment_name', 'default')}",

            remove_unused_columns=False,
            prediction_loss_only=True,
            disable_tqdm=False,
        )

    def train(self):
        """Run supervised fine-tuning and save the model and tokenizer.

        Raises:
            RuntimeError: if the model, tokenizer or dataset is not prepared.
        """
        missing = [
            name for name in ("model", "tokenizer", "dataset") if getattr(self, name) is None
        ]
        if missing:
            raise RuntimeError(
                f"Cannot train: {', '.join(missing)} not prepared. Call "
                "load_model_and_tokenizer(), setup_lora() and load_and_prepare_dataset() first."
            )

        print("🚀 Starting fine-tuning process...")

        # Initialize wandb if enabled
        if self.config.get("use_wandb", False):
            wandb.init(
                project=self.config.get("wandb_project", "llama-finetune"),
                name=f"llama-3.2-8b-{self.config.get('experiment_name', 'default')}"
            )

        training_args = self.setup_training_arguments()

        # Causal-LM collator (mlm=False)
        data_collator = DataCollatorForLanguageModeling(
            tokenizer=self.tokenizer,
            mlm=False,
        )

        # NOTE(review): these keyword names follow the SFTTrainer signature
        # this notebook was written against; confirm against the installed
        # trl version.
        trainer = SFTTrainer(
            model=self.model,
            train_dataset=self.dataset,
            data_collator=data_collator,
            args=training_args,
            tokenizer=self.tokenizer,
            dataset_text_field="text",
            max_seq_length=self.config.get("max_length", 2048),
            packing=False,  # Disable packing to avoid issues with instruction format
        )

        # Clear cache before training
        torch.cuda.empty_cache()
        gc.collect()

        print("🎯 Training started...")
        trainer.train()

        print("💾 Saving model...")
        trainer.save_model()
        self.tokenizer.save_pretrained(training_args.output_dir)

        print("✅ Fine-tuning completed!")

    def optimize_for_inference(self, model_path):
        """Export the model at ``model_path`` as a TorchScript trace.

        The trace is written to ``<model_path>_optimized/traced_model.pt``.
        No TensorRT conversion is performed here.

        Args:
            model_path: directory of the fine-tuned model to load.
        """
        print("⚡ Optimizing model for inference...")

        # torchscript=True makes the model return tuples, which tracing needs.
        model = AutoModelForCausalLM.from_pretrained(
            model_path, torch_dtype=torch.float16, torchscript=True
        )
        model.eval()

        # Trace with a dummy batch of token ids of shape (1, 512).
        example_input_ids = torch.randint(0, 1000, (1, 512))
        with torch.no_grad():
            traced_model = torch.jit.trace(model, example_inputs=(example_input_ids,))

        # The target directory must exist before saving into it.
        optimized_path = f"{model_path}_optimized"
        os.makedirs(optimized_path, exist_ok=True)
        traced_model.save(os.path.join(optimized_path, "traced_model.pt"))

        print(f"✅ Optimized model saved to: {optimized_path}")

    def monitor_gpu_usage(self):
        """Print allocated and reserved CUDA memory (in GB) for each GPU."""
        if torch.cuda.is_available():
            for i in range(torch.cuda.device_count()):
                memory_allocated = torch.cuda.memory_allocated(i) / 1024**3
                memory_reserved = torch.cuda.memory_reserved(i) / 1024**3
                print(f"GPU {i}: {memory_allocated:.2f}GB allocated, {memory_reserved:.2f}GB reserved")


In [None]:
# Configuration 1: Best Overall Performance
# NOTE(review): the identifier, experiment name and output directory still say
# "dolly", but the active dataset is bigbio/ehr_rel - confirm this is intended.
LLAMA_8B_DOLLY_CONFIG = dict(
    model_name="meta-llama/Llama-3.1-8B-Instruct",
    # Alternative dataset: "databricks/databricks-dolly-15k"
    dataset_name="bigbio/ehr_rel",  # Hugging Face dataset
    experiment_name="llama_8b_dolly_premium",
    # --- Training settings ---
    num_epochs=3,
    batch_size=24,
    gradient_accumulation_steps=2,
    learning_rate=5e-5,
    max_length=2048,
    # --- LoRA settings ---
    lora_r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    # --- Optimizations ---
    use_quantization=False,
    use_flash_attention=True,
    gradient_checkpointing=True,
    # --- Output (author's estimate: ~4-5 hours of training) ---
    output_dir="./llama-8b-dolly-finetuned",
)

In [None]:
# Configuration 2: Fastest Training (Good for testing)
LLAMA_3B_DOLLY_CONFIG = dict(
    model_name="meta-llama/Llama-3.2-3B-Instruct",
    dataset_name="databricks/databricks-dolly-15k",
    experiment_name="llama_3b_dolly_fast",
    # --- Aggressive settings for a quick run ---
    num_epochs=5,
    batch_size=32,
    gradient_accumulation_steps=1,
    learning_rate=2e-4,
    max_length=2048,
    # --- LoRA settings (higher rank than the 8B configuration) ---
    lora_r=32,
    lora_alpha=64,
    lora_dropout=0.05,
    # --- Optimizations ---
    use_quantization=False,
    use_flash_attention=True,
    gradient_checkpointing=False,
    # --- Output (author's estimate: ~2-3 hours of training) ---
    output_dir="./llama-3b-dolly-finetuned",
)

In [None]:
# Configuration 3: Code-Focused Fine-tuning
LLAMA_8B_CODE_CONFIG = dict(
    model_name="meta-llama/Llama-3.1-8B-Instruct",
    dataset_name="sahil2801/CodeAlpaca-20k",
    experiment_name="llama_8b_code_specialist",
    # --- Code-specific training settings (longer context than the others) ---
    num_epochs=3,
    batch_size=16,
    gradient_accumulation_steps=2,
    learning_rate=1e-4,
    max_length=4096,
    # --- LoRA settings for code tasks ---
    lora_r=32,
    lora_alpha=64,
    lora_dropout=0.1,
    # --- Optimizations ---
    use_quantization=False,
    use_flash_attention=True,
    gradient_checkpointing=True,
    # --- Output (author's estimate: ~3-4 hours of training) ---
    output_dir="./llama-8b-code-finetuned",
)

In [None]:
def set_config(config=None):
    """Select the active training configuration and print a summary of it.

    The chosen dict is published as the notebook-level global ``config`` and
    also returned, so it can be passed on explicitly.

    Args:
        config: configuration dict. When omitted, ``LLAMA_8B_DOLLY_CONFIG`` is
            looked up at call time, so re-running the configuration cell is
            picked up without re-defining this function.

    Returns:
        The configuration dict that is now active.
    """
    using_default = config is None
    if using_default:
        config = LLAMA_8B_DOLLY_CONFIG

    # `config` is a parameter name, so a `global config` statement is not
    # allowed here; publish it through the module namespace instead.
    globals()["config"] = config

    quantization = "Enabled" if config.get("use_quantization", False) else "Disabled"
    monitoring = (
        "Weights & Biases"
        if config.get("use_wandb", False)
        else "Local only (no external services)"
    )

    print("🚀 Dual RTX 4090 Fine-tuning Configuration (FREE VERSION):")
    print(f"   - Model: {config.get('model_name', 'meta-llama/Llama-3.1-8B-Instruct')}")
    print(f"   - Dataset: {config.get('dataset_name', 'tatsu-lab/alpaca')}")
    print(f"   - Total VRAM: 48GB")
    print(f"   - Batch size: {config.get('batch_size', 16)}")
    print(f"   - Context length: {config.get('max_length', 2048)}")
    print(f"   - LoRA rank: {config.get('lora_r', 16)}")
    print(f"   - Quantization: {quantization}")
    print(f"   - Monitoring: {monitoring}")
    if using_default:
        # Static estimate that only applies to the default configuration.
        print(f"   - Expected training time: 4-5 hours")
    print()

    return config

In [None]:
def train(config=None):
    """Run the full fine-tuning pipeline for one configuration.

    Steps: build the fine-tuner, load model and tokenizer, attach LoRA
    adapters, prepare the dataset, train, and report GPU memory before and
    after training.

    Args:
        config: configuration dict. When omitted, the notebook-level global
            ``config`` is used if it exists; otherwise
            ``LLAMA_8B_DOLLY_CONFIG`` is used.

    Raises:
        Exception: any error from the pipeline is re-raised after GPU memory
            usage has been printed.
    """
    # Resolve the configuration explicitly instead of relying on a global
    # that may never have been defined.
    if config is None:
        config = globals().get("config")
    if config is None:
        config = LLAMA_8B_DOLLY_CONFIG

    # Initialize fine-tuner
    fine_tuner = OptimizedLlamaFineTuner(config)

    try:
        fine_tuner.load_model_and_tokenizer()
        fine_tuner.setup_lora()
        fine_tuner.load_and_prepare_dataset()

        print("📊 Initial GPU Memory Usage:")
        fine_tuner.monitor_gpu_usage()
        print()

        fine_tuner.train()

        print("📊 Final GPU Memory Usage:")
        fine_tuner.monitor_gpu_usage()

        # Optional: Optimize for inference
        # fine_tuner.optimize_for_inference(config["output_dir"])

    except Exception as e:
        print(f"❌ Error during fine-tuning: {e}")
        # Print GPU memory for debugging
        fine_tuner.monitor_gpu_usage()
        raise
    finally:
        # Release cached GPU memory whether or not training succeeded.
        torch.cuda.empty_cache()
        gc.collect()

In [None]:
def main():
    """Notebook entry point: show the active configuration, then fine-tune.

    Calls ``set_config()`` and ``train()`` in that order, both without
    arguments.
    """
    # Step 1: select and print the configuration.
    set_config()
    # Step 2: run the fine-tuning pipeline.
    train()
    

In [None]:
# if __name__ == "__main__":
#     print("🚀 Dual RTX 4090 Testing Configurations")
#     print("=" * 50)
    
#     strategy = get_testing_strategy()
#     for phase, details in strategy.items():
#         print(f"\n{phase.upper()}:")
#         print(f"  Duration: {details['duration']}")
#         print(f"  Purpose: {details['purpose']}")
#         if 'config' in details:
#             config = details['config']
#             print(f"  Model: {config['model_name']}")
#             print(f"  Dataset: {config['dataset_name']}")
#             print(f"  Expected VRAM: {EXPECTED_METRICS.get(config['model_name'].split('/')[-1].lower().replace('-', '_').split('_')[1] + 'b', {}).get('vram_usage', 'N/A')}")