## The finetuning failed due to memory and hardware issue
- moved to Google Colab

### Original Plan
- Running Sampler Datasets through Llama to Generate New Donor Profiles

Section 1: Setup and Imports 
- Imports
- Tokenizer initialization
- Path definitions

Section 2: Data Processing 
- Profile formatting functions
- Data processing and saving
- Verification checks

Section 3: Model Configuration 
- Model initialization
- LoRA configuration
- Training parameters

Section 4: Training 
- Training loop
- Progress tracking
- Model saving

Section 5: Generation and Evaluation
- Profile generation
- Comparison with other LLMs
- Analysis

In [1]:
## Verifying MPS availability

import torch
print("MPS available:", torch.backends.mps.is_available())
print("PyTorch version:", torch.__version__)
print("Using MPS device:", torch.device("mps"))

MPS available: True
PyTorch version: 2.5.1
Using MPS device: mps


## Step 1. Data Preprocessing

In [2]:
!pip install peft



In [3]:
!pip install sentencepiece
!pip install bitsandbytes
!pip install accelerate



In [1]:
from huggingface_hub import login
login("mytokenhere")  

# My real token is removed prior to submitting this file for hand-in for security purpose

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import json
import pandas as pd
import torch
from datasets import Dataset
from transformers import (
    LlamaTokenizer,
    LlamaForCausalLM,
    TrainingArguments,
    Trainer
)
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training,
    TaskType
)

### Load and Preprocess Data

In [3]:
# Load the data
def load_dataset(file_path):
    with open(file_path, 'r') as f:
        return json.load(f)

# Let's assume we're working with your original dataset first
donor_data = load_dataset('/Users/cindylinsf/Documents/CCI/THESIS/Msc_Thesis_Project_Files/data/sampled/sampled_datasets/original_sample_20241111_231039.json')

# Format each profile into instruction format
def prepare_donor_profile(profile):
    formatted_profile = f"""### Instruction:
Generate a detailed sperm donor profile based on these characteristics:
Height: {profile['height']}
Weight: {profile['weight']}
Eye Color: {profile['eye_color'] or 'Not specified'}
Hair: {profile['hair_color'] or 'Not specified'}
Education: {profile['education_level']} in {profile['education_field']}
Ethnic Background: {profile['ethnic_background']}

### Response:
{profile['donor_description']}

### End"""
    return formatted_profile

# Convert all profiles
formatted_data = [prepare_donor_profile(profile) for profile in donor_data]

### Tokenization

In [7]:
!pip install protobuf

Collecting protobuf
  Downloading protobuf-5.28.3-cp38-abi3-macosx_10_9_universal2.whl.metadata (592 bytes)
Downloading protobuf-5.28.3-cp38-abi3-macosx_10_9_universal2.whl (414 kB)
Installing collected packages: protobuf
Successfully installed protobuf-5.28.3


In [5]:
import json
import os
from transformers import LlamaTokenizer

In [6]:
# Initialize tokenizer
tokenizer = LlamaTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
tokenizer.pad_token = tokenizer.eos_token

In [7]:
# Define file paths
original_dataset_path = "/Users/cindylinsf/Documents/CCI/THESIS/Msc_Thesis_Project_Files/data/sampled/sampled_datasets/original_sample_20241111_231039.json"
augmented_dataset_path = "/Users/cindylinsf/Documents/CCI/THESIS/Msc_Thesis_Project_Files/data/sampled/sampled_datasets/augmented_sample_20241111_231039.json"

# Define output directory
processed_dir = "/Users/cindylinsf/Documents/CCI/THESIS/Msc_Thesis_Project_Files/data/processed/llama_training"
if not os.path.exists(processed_dir):
    os.makedirs(processed_dir)

In [8]:
# Define the profile preparation function
def prepare_donor_profile(profile):
    """Format a single donor profile into instruction format"""
    formatted_profile = f"""### Instruction:
Generate a detailed sperm donor profile based on these characteristics:
Height: {profile['height']}
Weight: {profile['weight']}
Eye Color: {profile['eye_color']}
Hair: {profile['hair_color']}
Education: {profile['education_level']} in {profile['education_field']}
Ethnic Background: {profile['ethnic_background']}

### Response:
{profile['donor_description']}

### End"""
    return formatted_profile


In [9]:
# Define the processing function
def load_and_process_dataset(input_file_path, dataset_type):
    """
    Load and process a dataset, saving the results
    dataset_type: 'original' or 'augmented'
    """
    # Load data
    with open(input_file_path, 'r') as f:
        donor_data = json.load(f)
    
    # Process each profile
    processed_data = []
    for profile in donor_data:
        formatted_profile = prepare_donor_profile(profile)
        
        # Tokenize the formatted profile
        tokenized = tokenizer(formatted_profile, truncation=True, max_length=512)
        
        processed_data.append({
            "text": formatted_profile,
            "input_ids": tokenized["input_ids"],
            "attention_mask": tokenized["attention_mask"],
            "original_data": profile  # keeping original data for reference
        })
    
    # Save processed data
    output_file = os.path.join(processed_dir, f'processed_{dataset_type}_dataset.json')
    with open(output_file, 'w') as f:
        json.dump(processed_data, f, indent=2)
    
    print(f"Processed {len(processed_data)} profiles from {dataset_type} dataset")
    print(f"Saved to: {output_file}")
    
    # Return first example for inspection
    return processed_data[0]



In [10]:
# Process both datasets
try:
    # Process original dataset
    print("\nProcessing original dataset...")
    original_example = load_and_process_dataset(original_dataset_path, 'original')
    
    print("\nProcessing augmented dataset...")
    augmented_example = load_and_process_dataset(augmented_dataset_path, 'augmented')
    
    # Show example of processed profile
    print("\nExample of processed profile (original dataset):")
    print(original_example['text'])
    
except Exception as e:
    print(f"Error processing data: {e}")


Processing original dataset...
Processed 250 profiles from original dataset
Saved to: /Users/cindylinsf/Documents/CCI/THESIS/Msc_Thesis_Project_Files/data/processed/llama_training/processed_original_dataset.json

Processing augmented dataset...
Processed 250 profiles from augmented dataset
Saved to: /Users/cindylinsf/Documents/CCI/THESIS/Msc_Thesis_Project_Files/data/processed/llama_training/processed_augmented_dataset.json

Example of processed profile (original dataset):
### Instruction:
Generate a detailed sperm donor profile based on these characteristics:
Height: 5'10 (178cm)
Weight: 162 lbs (73kg)
Eye Color: Black
Hair: Dark Brown
Education: Master in Architecture
Ethnic Background: East Indian

### Response:
Quadrilingual Architect. loves his career as an architect. Heâ€™s had a lifelong creative streak and drawing has always been a favorite pastime, from doodling to designing buildings! This smart cookie has an M.S. in architecture (3.7 GPA) and can even speak four languages f

In [11]:
# Verifying the output

import json

def inspect_processed_data(file_path):
    with open(file_path, 'r') as f:
        data = json.load(f)
    
    print(f"Total profiles in dataset: {len(data)}")
    
    # Look at first profile
    first_profile = data[0]
    print("\nKeys in each profile:")
    print(list(first_profile.keys()))
    
    # Decode a few tokens to show what they represent
    print("\nFirst few tokens decoded:")
    decoded = tokenizer.decode(first_profile['input_ids'][:10])
    print(decoded)
    
    # Verify format consistency
    format_check = all("### Instruction:" in item['text'] and 
                      "### Response:" in item['text'] and 
                      "### End" in item['text'] 
                      for item in data)
    print(f"\nAll profiles have correct formatting: {format_check}")

# Check the processed original dataset
print("Checking original dataset:")
inspect_processed_data("/Users/cindylinsf/Documents/CCI/THESIS/Msc_Thesis_Project_Files/data/processed/llama_training/processed_original_dataset.json")

Checking original dataset:
Total profiles in dataset: 250

Keys in each profile:
['text', 'input_ids', 'attention_mask', 'original_data']

First few tokens decoded:
<s> ### Instruction:
Generate a detailed

All profiles have correct formatting: True


## Model and training configuration

In [12]:
import torch
from transformers import LlamaForCausalLM, TrainingArguments
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training,
    TaskType
)

# Load the base model with quantization for memory efficiency
model = LlamaForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    torch_dtype=torch.float16,  # Use half precision
    device_map="auto"
)

print("Base model loaded successfully")

# Configure LoRA
lora_config = LoraConfig(
    r=16,                # Rank - number of LoRA pairs
    lora_alpha=32,       # Alpha parameter for LoRA scaling
    target_modules=["q_proj", "v_proj"], # Which modules to apply LoRA to
    lora_dropout=0.05,   # Dropout probability for LoRA layers
    bias="none",         # We don't train bias parameters
    task_type=TaskType.CAUSAL_LM # We're doing causal language modeling
)

# Prepare model for training
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)

# Set up training arguments
training_args = TrainingArguments(
    output_dir="/Users/cindylinsf/Documents/CCI/THESIS/Msc_Thesis_Project_Files/outputs/llama_fine_tuned",
    num_train_epochs=3,           # Number of training epochs
    per_device_train_batch_size=4, # Batch size per device during training
    gradient_accumulation_steps=4, # Number of updates steps to accumulate before performing a backward/update pass
    learning_rate=2e-4,          # Initial learning rate
    warmup_steps=100,            # Number of warmup steps for learning rate scheduler
    logging_steps=10,            # Log every X updates steps
    save_strategy="epoch",       # Save the model every epoch
    evaluation_strategy="epoch", # Evaluate the model every epoch
    report_to="none"            # Disable wandb logging
)

print("Model and training configuration completed")

Downloading shards: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 2/2 [07:50<00:00, 235.21s/it]
Loading checkpoint shards: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 2/2 [00:21<00:00, 10.83s/it]
Some parameters are on the meta device because they were offloaded to the disk.


Base model loaded successfully


  warn("The installed version of bitsandbytes was compiled without GPU support. "


'NoneType' object has no attribute 'cadam32bit_grad_fp32'
Model and training configuration completed




In [13]:
## Second iteration

import torch
from transformers import LlamaForCausalLM, TrainingArguments
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training,
    TaskType
)

# Check if MPS is available
mps_available = torch.backends.mps.is_available()
device = torch.device("mps" if mps_available else "cpu")
print(f"Using device: {device}")

# Load the base model
model = LlamaForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    torch_dtype=torch.float16,
    device_map="auto",
    use_cache=False  # Important for training
)

print("Base model loaded successfully")

# Configure LoRA
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)

# Prepare model for training
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()  # This will show us what's being trained

# Set up training arguments
training_args = TrainingArguments(
    output_dir="/Users/cindylinsf/Documents/CCI/THESIS/Msc_Thesis_Project_Files/outputs/llama_fine_tuned",
    num_train_epochs=3,
    per_device_train_batch_size=2,  # Reduced batch size
    gradient_accumulation_steps=8,  # Increased gradient accumulation
    learning_rate=2e-4,
    warmup_steps=100,
    logging_steps=10,
    save_strategy="epoch",
    evaluation_strategy="no",      # Changed from "epoch" to "no" to be more efficient and focus purely on training
    save_total_limit=3,           # Keep only the last 3 checkpoints
    fp16=True,                    # Enable mixed precision training
    optim="adamw_torch",          # Use PyTorch's AdamW optimizer
)

print("Model and training configuration completed")

# Print some configuration details for documentation
print("\nTraining Configuration Summary:")
print(f"Batch Size: {training_args.per_device_train_batch_size}")
print(f"Gradient Accumulation Steps: {training_args.gradient_accumulation_steps}")
print(f"Effective Batch Size: {training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps}")
print(f"Number of Epochs: {training_args.num_train_epochs}")
print(f"Learning Rate: {training_args.learning_rate}")

Using device: mps


Loading checkpoint shards: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 2/2 [01:14<00:00, 37.14s/it]
Some parameters are on the meta device because they were offloaded to the disk.


Base model loaded successfully
trainable params: 8,388,608 || all params: 6,746,804,224 || trainable%: 0.1243
Model and training configuration completed

Training Configuration Summary:
Batch Size: 2
Gradient Accumulation Steps: 8
Effective Batch Size: 16
Number of Epochs: 3
Learning Rate: 0.0002


The results reflected the advantage of using LoRA, making training efficient while maintaining model capabilities

### Model Parameters:

- Trainable params: 8,388,608 (about 8M)
- Total params: 6,746,804,224 (about 6.7B)
- Only training 0.1243% of parameters

### Training Configuration:

- Batch Size: 2 (per device)
- Gradient Accumulation: 8 steps
- Effective Batch Size: 16 (2 * 8)
- 3 epochs
- Learning Rate: 0.0002 (2e-4)

---
## Training
- The following section can be ignored
- I kept running into issues with compatibility so I moved the rest to a Google Colab notebook, which you will find in the `models` folder of the project file.

In [22]:
import json
import os
import time
from datasets import Dataset
from transformers import DataCollatorForLanguageModeling, Trainer, TrainingArguments
import numpy as np
import torch

In [41]:
# Added this to force PyTorch to not use MPS since I kept running into issues with the actual training step

import torch
import os
os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1'
os.environ['PYTORCH_MPS_HIGH_WATERMARK_RATIO'] = '0.0'

# Verify environment
print("DEBUG - Initial PyTorch setup:")
print(f"MPS available: {torch.backends.mps.is_available()}")
print(f"Using device: {torch.device('cpu')}")

DEBUG - Initial PyTorch setup:
MPS available: True
Using device: cpu


In [42]:
# 1. First, modify the prepare_dataset function to handle padding properly
def prepare_dataset(file_path):
    print(f"Loading data from {file_path}")
    with open(file_path, 'r') as f:
        data = json.load(f)
    
    print("Converting data format...")
    try:
        # Convert to format expected by trainer
        formatted_data = []
        for item in data:
            # Tokenize with padding and truncation
            tokenized = tokenizer(
                item['text'],
                padding='max_length',
                truncation=True,
                max_length=512,  # Adjust this value based on your data
                return_tensors=None  # Return lists instead of tensors
            )
            formatted_data.append({
                'input_ids': tokenized['input_ids'],
                'attention_mask': tokenized['attention_mask'],
                'labels': tokenized['input_ids'].copy()  # For causal language modeling
            })
        
        return Dataset.from_list(formatted_data)
    except Exception as e:
        print(f"Error in prepare_dataset: {e}")
        raise


In [43]:
# 2. Load and prepare the dataset
print("Loading original dataset...")
try:
    train_dataset_original = prepare_dataset("/Users/cindylinsf/Documents/CCI/THESIS/Msc_Thesis_Project_Files/data/processed/llama_training/processed_original_dataset.json")
    print(f"Original dataset size: {len(train_dataset_original)}")
except Exception as e:
    print(f"Error loading dataset: {e}")
    raise

Loading original dataset...
Loading data from /Users/cindylinsf/Documents/CCI/THESIS/Msc_Thesis_Project_Files/data/processed/llama_training/processed_original_dataset.json
Converting data format...
Original dataset size: 250


In [44]:
# 3. Split dataset into train/eval
train_test_split = train_dataset_original.train_test_split(test_size=0.1, seed=42)
train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']
print(f"Training set size: {len(train_dataset)}")
print(f"Evaluation set size: {len(eval_dataset)}")

Training set size: 225
Evaluation set size: 25


In [45]:
# 4. Initialize model on device
device = torch.device("cpu")
# Use to_empty() instead of to() for meta tensors
model = model.to_empty(device=device)
print("Model initialized on CPU")

Model initialized on CPU


In [46]:
# 5. Update training arguments with stricter CPU settings
training_args = TrainingArguments(
    output_dir="/Users/cindylinsf/Documents/CCI/THESIS/Msc_Thesis_Project_Files/outputs/llama_fine_tuned",
    num_train_epochs=3,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=16,
    learning_rate=2e-4,
    warmup_steps=100,
    logging_steps=10,
    
    # Matching save and evaluation strategies
    save_strategy="steps",
    save_steps=50,               
    eval_strategy="steps",      
    eval_steps=50,              
    
    save_total_limit=3,         
    optim="adamw_torch",
    
    # Stricter CPU settings
    use_cpu=True,
    no_cuda=True,
    use_mps_device=False,
    
    # Checkpointing settings
    load_best_model_at_end=True,
    metric_for_best_model="loss",
    greater_is_better=False,
    
    # Data processing settings
    remove_unused_columns=False,
    max_grad_norm=0.3,
    
    # Additional settings
    evaluation_strategy="steps",
    dataloader_pin_memory=False,
    gradient_checkpointing=True,
    
    # Add these device-specific settings
    hub_token=None,
    torch_compile=False,
    use_ipex=False,
)

# After creating the trainer, add this verification
print("\nDEBUG - Verifying trainer setup:")
print(f"Trainer device: {trainer.args.device}")
print(f"Data collator device: {data_collator.return_tensors}")
print(f"Model device map: {model.hf_device_map if hasattr(model, 'hf_device_map') else 'Not using device map'}")

# Before starting training, force model to CPU again
model = model.to('cpu')
for param in model.parameters():
    if param.device.type != 'cpu':
        param.data = param.data.to('cpu')


DEBUG - Verifying trainer setup:
Trainer device: cpu
Data collator device: pt
Model device map: {'model.embed_tokens': 'mps', 'model.layers.0': 'mps', 'model.layers.1': 'mps', 'model.layers.2': 'mps', 'model.layers.3': 'mps', 'model.layers.4': 'mps', 'model.layers.5': 'mps', 'model.layers.6': 'mps', 'model.layers.7': 'disk', 'model.layers.8': 'disk', 'model.layers.9': 'disk', 'model.layers.10': 'disk', 'model.layers.11': 'disk', 'model.layers.12': 'disk', 'model.layers.13': 'disk', 'model.layers.14': 'disk', 'model.layers.15': 'disk', 'model.layers.16': 'disk', 'model.layers.17': 'disk', 'model.layers.18': 'disk', 'model.layers.19': 'disk', 'model.layers.20': 'disk', 'model.layers.21': 'disk', 'model.layers.22': 'disk', 'model.layers.23': 'disk', 'model.layers.24': 'disk', 'model.layers.25': 'disk', 'model.layers.26': 'disk', 'model.layers.27': 'disk', 'model.layers.28': 'disk', 'model.layers.29': 'disk', 'model.layers.30': 'disk', 'model.layers.31': 'disk', 'model.norm': 'disk', 'mod

In [47]:
# 6. Initialize the data collator with explicit device setting
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
    pad_to_multiple_of=8,
    return_tensors="pt"
)

In [48]:
# 7. Make sure all model parameters are on CPU
for param in model.parameters():
    param.data = param.data.to('cpu')

In [49]:
# 8. Initialize the trainer with explicit device handling and debug prints
print("\nDEBUG - Before trainer initialization:")
print(f"Device settings from training args:")
print(f"use_cpu: {training_args.use_cpu}")
print(f"no_cuda: {training_args.no_cuda}")
print(f"use_mps_device: {training_args.use_mps_device}")

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
)

  0%|          | 0/42 [04:07<?, ?it/s]


DEBUG - Before trainer initialization:
Device settings from training args:
use_cpu: True
no_cuda: True
use_mps_device: False





In [51]:

print("Forcing all model components to CPU...")

# 1. Force model and all parameters to CPU explicitly
model = model.to('cpu')
model.device = torch.device('cpu')  # Explicitly set device attribute

# 2. Ensure all model parameters are on CPU
for param in model.parameters():
    if hasattr(param, 'data'):
        param.data = param.data.to('cpu')
    if hasattr(param, 'grad') and param.grad is not None:
        param.grad.data = param.grad.data.to('cpu')

# 3. Verify device placement
print("\nDevice check after forcing CPU:")
print(f"Model device: {next(model.parameters()).device}")
for name, param in model.named_parameters():
    if param.device.type != 'cpu':
        print(f"Warning: {name} is on {param.device}")

print("\nVerification complete. Ready to start training.")

Forcing all model components to CPU...

Device check after forcing CPU:
Model device: cpu

Verification complete. Ready to start training.


In [55]:

print("Reinitializing model with explicit CPU configuration...")

# 1. First, reset PEFT configuration
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    device='cpu'  # Explicitly set device
)

# 2. Reinitialize base model with explicit CPU config
model = LlamaForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    torch_dtype=torch.float32,  # Use float32 instead of float16 for CPU
    device_map=None,  # Disable device mapping
    load_in_8bit=False,  # Disable 8-bit loading
)

# 3. Force model to CPU before PEFT
model = model.to('cpu')

# 4. Create PEFT model with additional safety checks
def create_safe_peft_model(model, config):
    # Ensure model is on CPU
    model = model.to('cpu')
    
    # Create PEFT model
    peft_model = get_peft_model(model, config)
    
    # Force PEFT model to CPU
    peft_model = peft_model.to('cpu')
    
    # Verify all components are on CPU
    for name, module in peft_model.named_modules():
        if hasattr(module, 'to'):
            module.to('cpu')
    
    return peft_model

# 5. Create PEFT model with safety checks
model = create_safe_peft_model(model, lora_config)

# 6. Verify model state
print("\nVerifying model configuration:")
print(f"Base model device: {next(model.parameters()).device}")
print(f"Model class: {type(model)}")

# 7. Print some PEFT-specific information
if hasattr(model, 'active_adapter'):
    print(f"Active adapter: {model.active_adapter}")
if hasattr(model, 'peft_config'):
    print("PEFT config:", model.peft_config)

# 8. Reinitialize trainer with updated model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
)

print("\nModel and trainer reinitialization complete.")

Reinitializing model with explicit CPU configuration...


TypeError: __init__() got an unexpected keyword argument 'device'

In [54]:
# Step 9: Training execution with custom forward pass
print("\nDEBUG - Starting enhanced training process...")

# First, ensure everything is on CPU and MPS is disabled
if hasattr(torch.mps, 'empty_cache'):
    torch.mps.empty_cache()

# Verify and force CPU for all model components
def ensure_cpu_tensors(model):
    model = model.to('cpu')
    if hasattr(model, 'device'):
        model.device = torch.device('cpu')
    
    # Force all buffers to CPU
    for buffer in model.buffers():
        if buffer is not None:
            buffer.data = buffer.data.to('cpu')
    
    # Force all parameters to CPU
    for param in model.parameters():
        if param is not None:
            if hasattr(param, 'data'):
                param.data = param.data.to('cpu')
            if hasattr(param, 'grad') and param.grad is not None:
                param.grad.data = param.grad.data.to('cpu')
    
    return model

# Apply CPU enforcement
trainer.model = ensure_cpu_tensors(trainer.model)

# Custom forward function
def safe_forward(model, batch):
    # Ensure inputs are on CPU and properly formatted
    inputs = {
        k: v.to('cpu') if torch.is_tensor(v) else v
        for k, v in batch.items()
    }
    
    # Remove any None values
    inputs = {k: v for k, v in inputs.items() if v is not None}
    
    print("\nDEBUG - Input devices after processing:")
    for k, v in inputs.items():
        if torch.is_tensor(v):
            print(f"{k}: device={v.device}, shape={v.shape}")
    
    # Perform forward pass with error checking
    try:
        with torch.no_grad():
            outputs = model(**inputs)
        return outputs
    except Exception as e:
        print(f"\nError in forward pass: {str(e)}")
        print(f"Model device: {next(model.parameters()).device}")
        raise

try:
    print("\nDEBUG - Testing single batch before training:")
    sample_dataloader = trainer.get_train_dataloader()
    sample_batch = next(iter(sample_dataloader))
    
    print("Batch devices:")
    for key, value in sample_batch.items():
        if hasattr(value, 'device'):
            print(f"{key}: device={value.device}, dtype={value.dtype}")
    
    print("\nDEBUG - Attempting custom forward pass...")
    outputs = safe_forward(trainer.model, sample_batch)
    print("Single forward pass successful!")
    
    # If forward pass succeeds, proceed with training
    print("\nStarting full training...")
    trainer.train(resume_from_checkpoint=False)
    
    # Save the final model
    output_dir = "/Users/cindylinsf/Documents/CCI/THESIS/Msc_Thesis_Project_Files/outputs/llama_fine_tuned/final_model"
    os.makedirs(output_dir, exist_ok=True)
    trainer.save_model(output_dir)
    print(f"\nModel saved to {output_dir}")
    
except Exception as e:
    print("\nDEBUG - Error occurred:")
    print(f"Error type: {type(e)}")
    print(f"Error message: {str(e)}")
    
    # Additional debugging information
    print("\nModel state:")
    print(f"Model class: {type(trainer.model)}")
    print(f"Model device: {next(trainer.model.parameters()).device}")
    
    # Try to save current state
    try:
        save_dir = "/Users/cindylinsf/Documents/CCI/THESIS/Msc_Thesis_Project_Files/outputs/llama_fine_tuned/error_state"
        os.makedirs(save_dir, exist_ok=True)
        trainer.save_model(save_dir)
        print("Current state saved successfully.")
    except Exception as save_error:
        print(f"Could not save current state: {save_error}")

print("\nTraining process completed or interrupted")


DEBUG - Starting enhanced training process...

DEBUG - Testing single batch before training:
Batch devices:
input_ids: device=cpu, dtype=torch.int64
attention_mask: device=cpu, dtype=torch.int64
labels: device=cpu, dtype=torch.int64

DEBUG - Attempting custom forward pass...

DEBUG - Input devices after processing:
input_ids: device=cpu, shape=torch.Size([1, 512])
attention_mask: device=cpu, shape=torch.Size([1, 512])
labels: device=cpu, shape=torch.Size([1, 512])

Error in forward pass: Placeholder storage has not been allocated on MPS device!
Model device: cpu

DEBUG - Error occurred:
Error type: <class 'RuntimeError'>
Error message: Placeholder storage has not been allocated on MPS device!

Model state:
Model class: <class 'peft.peft_model.PeftModelForCausalLM'>
Model device: cpu
Current state saved successfully.

Training process completed or interrupted
