In [1]:
import os
from getpass import getpass

# SECURITY: never commit an access token to a notebook. The token is taken from
# the environment when already set; otherwise it is requested interactively
# without echo, so it never lands in the saved file or its outputs.
# NOTE(review): the token that was previously hardcoded on this line has been
# exposed to anyone with access to this file and should be revoked/rotated.
if not os.environ.get("HUGGING_FACE_HUB_TOKEN"):
    os.environ["HUGGING_FACE_HUB_TOKEN"] = getpass("Hugging Face access token: ")

In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from datasets import load_dataset

# Checkpoint that both the tokenizer and the model weights are loaded from
model_name = "meta-llama/Llama-3.2-1B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Reuse the end-of-sequence token for padding, on the model config and on the
# tokenizer, so both agree on the padding id
model.config.pad_token_id = tokenizer.eos_token_id
tokenizer.pad_token = tokenizer.eos_token

# Show the resulting configuration (includes the pad_token_id set above)
print(model.config)

# Read the recipe prompt/completion pairs from the local CSV file
dataset = load_dataset("csv", data_files="fusion_recipes_formatted.csv")

# Tokenize data
def tokenize_function(examples, max_length=256, tok=None):
    """Tokenize a batch of prompt/completion pairs for causal-LM fine-tuning.

    A causal language model shifts ``labels`` internally and compares them with
    the logits produced from ``input_ids``, so both must describe the *same*
    token sequence. The prompt and completion are therefore concatenated into
    one sequence, and ``labels`` is a copy of ``input_ids`` in which every
    prompt token and every padding position is replaced by -100 (the index the
    cross-entropy loss ignores). Loss is thus computed on the completion only.

    Args:
        examples: Batch mapping with "prompt" and "completion" lists, as
            passed by ``Dataset.map(..., batched=True)``.
        max_length: Fixed length every sequence is truncated/padded to.
        tok: Tokenizer to use; defaults to the notebook-level ``tokenizer``.

    Returns:
        The tokenizer output (``input_ids``, ``attention_mask``) with an added
        ``labels`` entry, all as plain Python lists.
    """
    tok = tokenizer if tok is None else tok

    # Missing CSV cells arrive as None; treat them as empty text.
    prompts = ["" if p is None else str(p) for p in examples["prompt"]]
    completions = ["" if c is None else str(c) for c in examples["completion"]]

    # Terminate each completion with EOS so the model can learn where to stop.
    eos = getattr(tok, "eos_token", None) or ""
    full_texts = [p + c + eos for p, c in zip(prompts, completions)]

    # No return_tensors: Dataset.map stores plain lists anyway.
    model_inputs = tok(
        full_texts,
        max_length=max_length,
        truncation=True,
        padding="max_length",
    )

    # Number of tokens that belong to the prompt (special tokens included).
    # NOTE: tokenizing the prompt alone can differ by a token at the
    # prompt/completion boundary for some tokenizers; this is an approximation.
    prompt_only = tok(prompts, max_length=max_length, truncation=True)
    prompt_lengths = [len(ids) for ids in prompt_only["input_ids"]]

    labels = []
    for ids, mask, n_prompt in zip(
        model_inputs["input_ids"], model_inputs["attention_mask"], prompt_lengths
    ):
        row = []
        seen = 0  # real (non-padding) tokens consumed so far; works for left or right padding
        for token_id, is_real in zip(ids, mask):
            if not is_real:
                row.append(-100)  # padding never contributes to the loss
                continue
            row.append(token_id if seen >= n_prompt else -100)
            seen += 1
        labels.append(row)

    model_inputs["labels"] = labels
    return model_inputs

# Tokenize every split in batches and drop the raw text columns so that only
# model inputs remain.
tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=dataset["train"].column_names)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_steps=1000,  # must stay a multiple of eval_steps for load_best_model_at_end
    eval_strategy="steps",  # replaces the deprecated `evaluation_strategy` name
    eval_steps=500,
    save_total_limit=3,
    load_best_model_at_end=True,
)

train_split = tokenized_datasets["train"]
# Clamp the evaluation subset so datasets with fewer than 100 rows do not fail.
# NOTE(review): the evaluation rows are taken from the training data, so the
# reported eval loss is not a held-out estimate.
eval_rows = min(100, len(train_split))

# Create Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_split,
    eval_dataset=train_split.select(range(eval_rows)),  # Small subset for evaluation
)

# Train the model
trainer.train()

# Save the fine-tuned model
trainer.save_model("./fine_tuned_culinaria_model")

# Optional: Push the model to Hugging Face Hub
# trainer.push_to_hub("your-username/fine-tuned-llama-3-2-1b-instruct-culinaria")

LlamaConfig {
  "_name_or_path": "meta-llama/Llama-3.2-1B-Instruct",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": [
    128001,
    128008,
    128009
  ],
  "head_dim": 64,
  "hidden_act": "silu",
  "hidden_size": 2048,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 131072,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 16,
  "num_key_value_heads": 8,
  "pad_token_id": 128009,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": {
    "factor": 32.0,
    "high_freq_factor": 4.0,
    "low_freq_factor": 1.0,
    "original_max_position_embeddings": 8192,
    "rope_type": "llama3"
  },
  "rope_theta": 500000.0,
  "tie_word_embeddings": true,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.45.1",
  "use_cache": true,
  "vocab_size": 128256
}



Error: command buffer exited with error status.
	The Metal Performance Shaders operations encoded on it may not have completed.
	Error: 
	(null)
	Insufficient Memory (00000008:kIOGPUCommandBufferCallbackErrorOutOfMemory)
	<AGXG15XFamilyCommandBuffer: 0x31fe38090>
    label = <none> 
    device = <AGXG15SDevice: 0x11af05400>
        name = Apple M3 Pro 
    commandQueue = <AGXG15XFamilyCommandQueue: 0x318871c00>
        label = <none> 
        device = <AGXG15SDevice: 0x11af05400>
            name = Apple M3 Pro 
    retainedReferences = 1


RuntimeError: MPS backend out of memory (MPS allocated: 19.09 GB, other allocations: 1.30 GB, max allowed: 20.40 GB). Tried to allocate 64.00 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

In [None]:
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from datasets import load_dataset
import gc

# Lift PyTorch's upper limit on MPS memory allocations. The out-of-memory
# message from the previous run suggests this value and warns that it may
# cause system failure, so use it with caution.
os.environ['PYTORCH_MPS_HIGH_WATERMARK_RATIO'] = '0.0'

# Fail early when the Metal (MPS) backend cannot be used
if not torch.backends.mps.is_available():
    raise EnvironmentError(
        "MPS is not available. "
        "Ensure you're running on an Apple Silicon Mac with the latest PyTorch version."
    )

# All model computation is placed on the MPS device
device = torch.device("mps")

# Checkpoint to fine-tune; torch_dtype is left at its default
model_name = "meta-llama/Llama-3.2-1B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
model.to(device)

# Enable gradient checkpointing on the model to save memory
model.gradient_checkpointing_enable()

# Reuse the end-of-sequence token for padding, on the model config and on the
# tokenizer, so both agree on the padding id
model.config.pad_token_id = tokenizer.eos_token_id
tokenizer.pad_token = tokenizer.eos_token

# Show the resulting configuration
print(model.config)

# Read the recipe prompt/completion pairs from the local CSV file
dataset = load_dataset("csv", data_files="fusion_recipes_formatted.csv")

# Tokenize data with reduced sequence length
def tokenize_function(examples, max_length=128, tok=None):
    """Tokenize a batch of prompt/completion pairs for causal-LM fine-tuning.

    A causal language model shifts ``labels`` internally and compares them with
    the logits produced from ``input_ids``, so both must describe the *same*
    token sequence. The prompt and completion are therefore concatenated into
    one sequence, and ``labels`` is a copy of ``input_ids`` in which every
    prompt token and every padding position is replaced by -100 (the index the
    cross-entropy loss ignores). Loss is thus computed on the completion only.

    Args:
        examples: Batch mapping with "prompt" and "completion" lists, as
            passed by ``Dataset.map(..., batched=True)``.
        max_length: Fixed length every sequence is truncated/padded to
            (128 here, reduced from 256 to lower memory use).
        tok: Tokenizer to use; defaults to the notebook-level ``tokenizer``.

    Returns:
        The tokenizer output (``input_ids``, ``attention_mask``) with an added
        ``labels`` entry, all as plain Python lists.
    """
    tok = tokenizer if tok is None else tok

    # Missing CSV cells arrive as None; treat them as empty text.
    prompts = ["" if p is None else str(p) for p in examples["prompt"]]
    completions = ["" if c is None else str(c) for c in examples["completion"]]

    # Terminate each completion with EOS so the model can learn where to stop.
    eos = getattr(tok, "eos_token", None) or ""
    full_texts = [p + c + eos for p, c in zip(prompts, completions)]

    # No return_tensors: Dataset.map stores plain lists anyway.
    model_inputs = tok(
        full_texts,
        max_length=max_length,
        truncation=True,
        padding="max_length",
    )

    # Number of tokens that belong to the prompt (special tokens included).
    # NOTE: tokenizing the prompt alone can differ by a token at the
    # prompt/completion boundary for some tokenizers; this is an approximation.
    prompt_only = tok(prompts, max_length=max_length, truncation=True)
    prompt_lengths = [len(ids) for ids in prompt_only["input_ids"]]

    labels = []
    for ids, mask, n_prompt in zip(
        model_inputs["input_ids"], model_inputs["attention_mask"], prompt_lengths
    ):
        row = []
        seen = 0  # real (non-padding) tokens consumed so far; works for left or right padding
        for token_id, is_real in zip(ids, mask):
            if not is_real:
                row.append(-100)  # padding never contributes to the loss
                continue
            row.append(token_id if seen >= n_prompt else -100)
            seen += 1
        labels.append(row)

    model_inputs["labels"] = labels
    return model_inputs

# Apply tokenization and persist the result so later runs can reuse it.
# `dataset` is a DatasetDict, whose map() takes a per-split `cache_file_names`
# mapping (the single-dataset `cache_file_name` keyword is not accepted).
# NOTE: an explicit cache file is reused as-is when it already exists; delete
# "tokenized_cache.arrow" after changing tokenize_function.
tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=dataset["train"].column_names,
    cache_file_names={"train": "tokenized_cache.arrow"},
)

# Define training arguments with memory-saving settings
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=1,  # Reduced from 2
    per_device_eval_batch_size=1,   # Reduced from 2
    gradient_accumulation_steps=4,  # Accumulate gradients to simulate larger batch
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_steps=1000,  # must stay a multiple of eval_steps for load_best_model_at_end
    eval_strategy="steps",  # replaces the deprecated `evaluation_strategy` name
    eval_steps=500,
    save_total_limit=3,
    load_best_model_at_end=True,
    # fp16 is intentionally left unset (disabled)
)

train_split = tokenized_datasets["train"]
# Clamp the evaluation subset so datasets with fewer than 100 rows do not fail.
# NOTE(review): the evaluation rows are taken from the training data, so the
# reported eval loss is not a held-out estimate.
eval_rows = min(100, len(train_split))

# Create Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_split,
    eval_dataset=train_split.select(range(eval_rows)),  # Small subset for evaluation
)

# Train, and save only when training actually completed. The save must happen
# before the cleanup below, which removes `trainer` from the namespace.
try:
    trainer.train()
except RuntimeError as e:
    if 'out of memory' not in str(e):
        raise  # unrelated failure: propagate with the original traceback
    print("Encountered out-of-memory error. Consider reducing batch size or gradient accumulation steps.")
else:
    # Save the fine-tuned model
    trainer.save_model("./fine_tuned_culinaria_model")

    # Optional: Push the model to Hugging Face Hub
    # trainer.push_to_hub("your-username/fine-tuned-llama-3-2-1b-instruct-culinaria")

# Release the large objects and return cached memory to the system
del model, trainer, tokenized_datasets, train_split
gc.collect()
if hasattr(torch, "mps") and hasattr(torch.mps, "empty_cache"):
    torch.mps.empty_cache()  # present in recent PyTorch releases; skipped otherwise


In [11]:
pip install psutil

python(9858) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



[notice] A new release of pip is available: 24.1.2 -> 24.2
[notice] To update, run: pip install --upgrade pip
Note: you may need to restart the kernel to use updated packages.


In [15]:
import psutil

def check_memory():
    """Print total, available and used system memory in GB, plus the percentage in use."""
    # Snapshot of system-wide virtual memory statistics
    vm = psutil.virtual_memory()

    # Build the report first, then emit it in one go (byte counts are scaled to GB)
    report = [
        f"Total memory: {vm.total / (1024**3):.2f} GB",
        f"Available memory: {vm.available / (1024**3):.2f} GB",
        f"Used memory: {vm.used / (1024**3):.2f} GB",
        f"Percentage used: {vm.percent}%",
    ]
    print("\n".join(report))

# Report memory headroom before starting a training run
check_memory()

Total memory: 18.00 GB
Available memory: 4.47 GB
Used memory: 6.68 GB
Percentage used: 75.1%
