In [None]:
# Dependency installation cell.
# `%pip` (rather than bare `pip` / `!pip`) installs into the environment of the
# running kernel. Version specifiers are quoted: the command is executed through
# a shell, where an unquoted `>=4.36.0` is parsed as output redirection to a file
# named "=4.36.0" and the version constraint is silently dropped.

# Install PyTorch with CUDA support
%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

# Install transformers and related libraries
%pip install "transformers>=4.36.0"
%pip install datasets
%pip install "peft>=0.7.0"
%pip install trl
%pip install bitsandbytes
%pip install accelerate
%pip install flash-attn --no-build-isolation

# Optional: For monitoring
%pip install wandb
%pip install pynvml

# For TensorRT optimization (optional)
%pip install tensorrt

In [None]:
#!/usr/bin/env python3
"""
Optimized Fine-tuning Pipeline for Meta Llama 3.2 3B
Using NVIDIA AI Software Stack (CUDA, cuDNN, TensorRT, PyTorch)
"""

import os
import gc
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.cuda.amp import autocast, GradScaler
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP

from transformers import (
    AutoTokenizer, 
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    BitsAndBytesConfig
)
from datasets import load_dataset, Dataset
from peft import LoraConfig, get_peft_model, TaskType, prepare_model_for_kbit_training
import wandb
from trl import SFTTrainer
import tensorrt as trt
import pynvml

In [None]:
# Process-wide runtime tuning, applied once before any model is created.
# cuDNN: let the autotuner pick kernels and do not force deterministic ones.
torch.backends.cudnn.deterministic = False
torch.backends.cudnn.benchmark = True
# Environment knobs read by the PyTorch CUDA caching allocator and by the
# Hugging Face tokenizers library, respectively.
os.environ.update(
    {
        "PYTORCH_CUDA_ALLOC_CONF": "max_split_size_mb:512",
        "TOKENIZERS_PARALLELISM": "false",
    }
)

In [None]:
class OptimizedLlamaFineTuner:
    """QLoRA fine-tuning pipeline for Llama 3.2 3B Instruct on NVIDIA GPUs.

    Intended call order::

        tuner = OptimizedLlamaFineTuner(config)
        tuner.load_model_and_tokenizer()
        tuner.setup_lora()
        tuner.load_and_prepare_dataset()
        tuner.train()

    Args:
        config: dict of settings. Keys read by this class (all optional, read
            with ``config.get``): ``use_flash_attention``, ``trust_remote_code``,
            ``lora_r``, ``lora_alpha``, ``lora_dropout``, ``dataset_name``,
            ``max_length``, ``max_samples``, ``output_dir``, ``num_epochs``,
            ``batch_size``, ``gradient_accumulation_steps``, ``learning_rate``,
            ``use_wandb``, ``wandb_project``, ``experiment_name``.

    Raises:
        RuntimeError: from the constructor when CUDA is not available.
    """

    def __init__(self, config):
        self.config = config
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # Defined before the environment probe so the attribute always exists;
        # setup_nvidia_environment() overwrites it with the real capability.
        self.ampere_or_newer = False
        self.setup_nvidia_environment()
        self.tokenizer = None
        self.model = None
        self.dataset = None

    # ------------------------------------------------------------------
    # Small pure helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _gpu_name_to_str(raw_name):
        """Return an NVML device name as ``str``.

        Depending on the installed pynvml release, ``nvmlDeviceGetName``
        returns either ``bytes`` or ``str``; calling ``.decode()`` on a ``str``
        raises ``AttributeError``, so both forms are accepted here.
        """
        if isinstance(raw_name, bytes):
            return raw_name.decode("utf-8", errors="replace")
        return str(raw_name)

    @staticmethod
    def _format_alpaca_batch(examples):
        """Turn a batch of Alpaca-style columns into a single ``text`` column.

        Args:
            examples: mapping with ``instruction`` and ``output`` lists and an
                optional ``input`` list of the same length.

        Returns:
            ``{"text": [...]}`` with one prompt per example. The ``### Input:``
            section is emitted only when the example's input is non-empty.
        """
        instructions = examples["instruction"]
        inputs = examples.get("input", [""] * len(instructions))
        texts = []
        for instruction, input_text, output in zip(instructions, inputs, examples["output"]):
            if input_text:
                prompt = f"### Instruction:\n{instruction}\n\n### Input:\n{input_text}\n\n### Response:\n{output}"
            else:
                prompt = f"### Instruction:\n{instruction}\n\n### Response:\n{output}"
            texts.append(prompt)
        return {"text": texts}

    @staticmethod
    def _report_to(use_wandb):
        """Return the ``TrainingArguments.report_to`` value for the wandb flag.

        The string ``"none"`` is used to disable reporting: passing ``None``
        makes transformers 4.x fall back to every installed integration, which
        would enable wandb even when ``use_wandb`` is False.
        """
        return "wandb" if use_wandb else "none"

    def _compute_dtype(self):
        """Return bfloat16 on Ampere+ GPUs, float16 otherwise."""
        return torch.bfloat16 if self.ampere_or_newer else torch.float16

    # ------------------------------------------------------------------
    # Pipeline steps
    # ------------------------------------------------------------------
    def setup_nvidia_environment(self):
        """Print GPU information and enable TF32 where the hardware supports it.

        Sets ``self.ampere_or_newer`` (compute capability major >= 8), which
        later steps use to choose bf16/tf32/FlashAttention settings.

        Raises:
            RuntimeError: if CUDA is not available.
        """
        if not torch.cuda.is_available():
            raise RuntimeError("CUDA is not available!")

        # NVML is only needed for this report, so release it afterwards.
        pynvml.nvmlInit()
        try:
            gpu_count = pynvml.nvmlDeviceGetCount()

            print("🚀 NVIDIA Setup:")
            print(f"   - CUDA Version: {torch.version.cuda}")
            print(f"   - cuDNN Version: {torch.backends.cudnn.version()}")
            print(f"   - Available GPUs: {gpu_count}")

            for i in range(gpu_count):
                handle = pynvml.nvmlDeviceGetHandleByIndex(i)
                name = self._gpu_name_to_str(pynvml.nvmlDeviceGetName(handle))
                memory = pynvml.nvmlDeviceGetMemoryInfo(handle)
                print(f"   - GPU {i}: {name} ({memory.total // 1024**3} GB)")
        finally:
            pynvml.nvmlShutdown()

        # Enable TensorFloat-32 for A100/RTX 30xx series
        self.ampere_or_newer = torch.cuda.get_device_capability()[0] >= 8
        if self.ampere_or_newer:
            torch.backends.cuda.matmul.allow_tf32 = True
            torch.backends.cudnn.allow_tf32 = True
            print("   - TensorFloat-32 enabled for Ampere+ GPUs")

    def load_model_and_tokenizer(self):
        """Load the tokenizer and the 4-bit (NF4) quantized base model.

        Populates ``self.tokenizer`` and ``self.model``. The compute dtype and
        attention implementation follow the detected GPU generation: bf16 and
        FlashAttention-2 on Ampere+, fp16 and eager attention otherwise.
        """
        model_name = "meta-llama/Llama-3.2-3B-Instruct"
        compute_dtype = self._compute_dtype()
        # Llama is implemented natively in transformers, so executing code
        # from the hub repository is not needed; opt in via config if required.
        trust_remote_code = self.config.get("trust_remote_code", False)

        # BitsAndBytesConfig for 4-bit quantization
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=compute_dtype,
        )

        print("📥 Loading tokenizer...")
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_name,
            trust_remote_code=trust_remote_code,
            padding_side="right",
        )

        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        use_flash = self.config.get("use_flash_attention", True)
        if use_flash and not self.ampere_or_newer:
            print("   - FlashAttention-2 requires an Ampere+ GPU; using eager attention")
            use_flash = False

        print("📥 Loading model with 4-bit quantization...")
        self.model = AutoModelForCausalLM.from_pretrained(
            model_name,
            quantization_config=bnb_config,
            device_map="auto",
            trust_remote_code=trust_remote_code,
            torch_dtype=compute_dtype,
            attn_implementation="flash_attention_2" if use_flash else "eager",
        )

        # Prepare model for k-bit training
        self.model = prepare_model_for_kbit_training(self.model)

        print(f"✅ Model loaded on: {self.model.device}")

    def setup_lora(self):
        """Wrap ``self.model`` with LoRA adapters and print parameter counts.

        Raises:
            RuntimeError: if the base model has not been loaded yet.
        """
        if self.model is None:
            raise RuntimeError("Call load_model_and_tokenizer() before setup_lora().")

        lora_config = LoraConfig(
            task_type=TaskType.CAUSAL_LM,
            inference_mode=False,
            r=self.config.get("lora_r", 16),
            lora_alpha=self.config.get("lora_alpha", 32),
            lora_dropout=self.config.get("lora_dropout", 0.1),
            target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
            bias="none",
        )

        self.model = get_peft_model(self.model, lora_config)

        # Print trainable parameters
        trainable_params = sum(p.numel() for p in self.model.parameters() if p.requires_grad)
        total_params = sum(p.numel() for p in self.model.parameters())

        print("🎯 LoRA Configuration:")
        print(f"   - Trainable parameters: {trainable_params:,}")
        print(f"   - Total parameters: {total_params:,}")
        print(f"   - Trainable %: {100 * trainable_params / total_params:.2f}%")

    def load_and_prepare_dataset(self):
        """Load the dataset, normalise it to a ``text`` column and filter by length.

        ``config["dataset_name"]`` may be a hub dataset name (its ``train``
        split is loaded) or an already-constructed dataset object. Alpaca-style
        data (``instruction``/``output`` columns) is converted to prompts;
        otherwise a ``text`` column must already exist. Examples whose token
        count exceeds ``config["max_length"]`` are dropped, then the first
        ``config["max_samples"]`` rows are kept if that key is set.

        Raises:
            RuntimeError: if the tokenizer has not been loaded yet.
            ValueError: if the dataset has neither supported format.
        """
        if self.tokenizer is None:
            raise RuntimeError("Call load_model_and_tokenizer() before load_and_prepare_dataset().")

        dataset_name = self.config.get("dataset_name", "tatsu-lab/alpaca")

        print(f"📊 Loading dataset: {dataset_name}")

        # Load dataset
        if isinstance(dataset_name, str):
            dataset = load_dataset(dataset_name, split="train")
        else:
            # Handle custom dataset loading
            dataset = dataset_name

        # Prepare dataset based on format
        if "instruction" in dataset.column_names and "output" in dataset.column_names:
            # Alpaca format
            dataset = dataset.map(
                self._format_alpaca_batch,
                batched=True,
                remove_columns=dataset.column_names,
            )
        elif "text" not in dataset.column_names:
            raise ValueError("Dataset must have 'text' column or Alpaca format (instruction/output)")

        # Filter by length to avoid OOM
        max_length = self.config.get("max_length", 2048)
        dataset = dataset.filter(lambda x: len(self.tokenizer.encode(x["text"])) <= max_length)

        # Take subset if specified
        if self.config.get("max_samples"):
            dataset = dataset.select(range(min(len(dataset), self.config["max_samples"])))

        self.dataset = dataset
        print(f"✅ Dataset prepared: {len(dataset)} samples")

    def setup_training_arguments(self):
        """Build the ``TrainingArguments`` for the run.

        Returns:
            A ``TrainingArguments`` instance. Mixed precision follows the GPU
            generation detected in ``setup_nvidia_environment``: bf16 + tf32 on
            Ampere+, fp16 otherwise (requesting bf16/tf32 on unsupported
            hardware makes ``TrainingArguments`` raise).
        """
        use_bf16 = self.ampere_or_newer
        return TrainingArguments(
            output_dir=self.config.get("output_dir", "./llama-3.2-3b-finetuned"),

            # Training hyperparameters
            num_train_epochs=self.config.get("num_epochs", 3),
            per_device_train_batch_size=self.config.get("batch_size", 4),
            gradient_accumulation_steps=self.config.get("gradient_accumulation_steps", 4),
            learning_rate=self.config.get("learning_rate", 2e-4),
            lr_scheduler_type="cosine",
            warmup_ratio=0.1,

            # NVIDIA optimizations
            bf16=use_bf16,
            fp16=not use_bf16,
            tf32=self.ampere_or_newer,
            dataloader_pin_memory=True,
            dataloader_num_workers=4,

            # Memory optimizations
            gradient_checkpointing=True,
            optim="adamw_torch_fused",  # Fused optimizer for NVIDIA GPUs

            # Logging and saving
            logging_steps=10,
            save_steps=500,
            save_total_limit=3,
            # Evaluation stays at its default ("no"). The keyword is not passed
            # explicitly because it was renamed between transformers releases.

            # remove_unused_columns is left at its default (True): with False,
            # SFTTrainer keeps the raw "text" column next to the token ids and
            # the language-modeling collator cannot batch it.
            report_to=self._report_to(self.config.get("use_wandb", False)),
            run_name=f"llama-3.2-3b-finetune-{self.config.get('experiment_name', 'default')}",

            # DDP settings (if using multiple GPUs)
            ddp_find_unused_parameters=False,
        )

    def train(self):
        """Run supervised fine-tuning and save the adapter and tokenizer.

        Raises:
            RuntimeError: if the model, tokenizer or dataset is not prepared.
        """
        if self.model is None or self.tokenizer is None or self.dataset is None:
            raise RuntimeError(
                "Call load_model_and_tokenizer(), setup_lora() and "
                "load_and_prepare_dataset() before train()."
            )

        print("🚀 Starting fine-tuning process...")

        # Initialize wandb if enabled
        if self.config.get("use_wandb", False):
            wandb.init(
                project=self.config.get("wandb_project", "llama-finetune"),
                name=f"llama-3.2-3b-{self.config.get('experiment_name', 'default')}",
            )

        # Setup training arguments
        training_args = self.setup_training_arguments()

        # Data collator
        data_collator = DataCollatorForLanguageModeling(
            tokenizer=self.tokenizer,
            mlm=False,
        )

        # Initialize SFTTrainer (optimized for instruction tuning)
        # NOTE(review): these keyword arguments match the TRL API from before
        # the options moved to SFTConfig; confirm against the installed TRL.
        trainer = SFTTrainer(
            model=self.model,
            train_dataset=self.dataset,
            data_collator=data_collator,
            args=training_args,
            tokenizer=self.tokenizer,
            dataset_text_field="text",
            max_seq_length=self.config.get("max_length", 2048),
            packing=False,  # Disable packing to avoid issues with instruction format
        )

        # Clear cache before training
        torch.cuda.empty_cache()
        gc.collect()

        # Start training
        print("🎯 Training started...")
        trainer.train()

        # Save the final model
        print("💾 Saving model...")
        trainer.save_model()
        self.tokenizer.save_pretrained(training_args.output_dir)

        print("✅ Fine-tuning completed!")

    def optimize_for_inference(self, model_path):
        """Export the fine-tuned model as a TorchScript trace (optional).

        The trace is written to ``<model_path>_optimized/traced_model.pt``.
        No TensorRT engine is built here.

        Args:
            model_path: directory containing the saved fine-tuned model.
        """
        print("⚡ Optimizing model for inference...")

        # torchscript=True makes the model return tuples, which tracing needs.
        model = AutoModelForCausalLM.from_pretrained(
            model_path,
            torch_dtype=torch.float16,
            torchscript=True,
        )
        model.to(self.device)
        model.eval()

        # Convert to TorchScript
        example_ids = torch.randint(0, 1000, (1, 512), device=self.device)
        with torch.no_grad():
            traced_model = torch.jit.trace(model, example_inputs=(example_ids,))

        # Save optimized model; the target directory does not exist yet.
        optimized_path = f"{model_path}_optimized"
        os.makedirs(optimized_path, exist_ok=True)
        traced_model.save(os.path.join(optimized_path, "traced_model.pt"))

        print(f"✅ Optimized model saved to: {optimized_path}")

    def monitor_gpu_usage(self):
        """Print allocated and reserved CUDA memory (in GB) for every visible GPU."""
        if torch.cuda.is_available():
            for i in range(torch.cuda.device_count()):
                memory_allocated = torch.cuda.memory_allocated(i) / 1024**3
                memory_reserved = torch.cuda.memory_reserved(i) / 1024**3
                print(f"GPU {i}: {memory_allocated:.2f}GB allocated, {memory_reserved:.2f}GB reserved")


In [None]:
def main(config_overrides=None):
    """Run the full fine-tuning pipeline with the default configuration.

    Builds the configuration, constructs the fine-tuner, then loads the model,
    attaches LoRA adapters, prepares the dataset and trains. The configuration
    and the pipeline live in the same function so that ``config`` is in scope
    where the fine-tuner is constructed.

    Args:
        config_overrides: optional dict whose entries replace the defaults
            below (for example ``{"max_samples": 100}`` for a quick test run).

    Returns:
        The ``OptimizedLlamaFineTuner`` instance after training.

    Raises:
        Exception: any error raised by a pipeline step is printed and re-raised.
    """

    # Configuration
    config = {
        # Model and dataset
        "dataset_name": "tatsu-lab/alpaca",  # Change to your dataset
        "max_samples": None,  # Set to limit dataset size for testing
        "max_length": 2048,

        # LoRA configuration
        "lora_r": 16,
        "lora_alpha": 32,
        "lora_dropout": 0.1,

        # Training hyperparameters
        "num_epochs": 3,
        "batch_size": 4,  # Adjust based on your GPU memory
        "gradient_accumulation_steps": 4,
        "learning_rate": 2e-4,

        # Optimizations
        "use_flash_attention": True,
        "export_torchscript": False,  # Set to True to export a trace after training

        # Paths and logging
        "output_dir": "./llama-3.2-3b-finetuned",
        "experiment_name": "alpaca_finetune",
        "use_wandb": False,  # Set to True if you want to use Weights & Biases
        "wandb_project": "llama-finetune",
    }
    if config_overrides:
        config.update(config_overrides)

    # Initialize fine-tuner
    fine_tuner = OptimizedLlamaFineTuner(config)

    try:
        # Load model and tokenizer
        fine_tuner.load_model_and_tokenizer()

        # Setup LoRA
        fine_tuner.setup_lora()

        # Load and prepare dataset
        fine_tuner.load_and_prepare_dataset()

        # Start training
        fine_tuner.train()

        # Optional: Optimize for inference
        if config.get("export_torchscript", False):
            fine_tuner.optimize_for_inference(config["output_dir"])

    except Exception as e:
        print(f"❌ Error during fine-tuning: {e}")
        raise
    finally:
        # Cleanup
        torch.cuda.empty_cache()
        gc.collect()

    return fine_tuner

In [None]:
# Entry point. When this code runs as a script, or when this cell is executed
# in a notebook kernel (where __name__ is also "__main__"), main() is called.
if __name__ == "__main__":
    main()