# llama model checkpointing project - phase 1

this notebook implements phase 1: loading openllama-3b model and saving with pytorch approach

In [1]:
# import required libraries
import torch
import time
import os
from transformers import LlamaForCausalLM, LlamaTokenizer
import gc

In [2]:
# setup device and check cuda availability
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"using device: {device}")

if torch.cuda.is_available():
    print(f"cuda device: {torch.cuda.get_device_name(0)}")
    print(f"cuda memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} gb")
    print(f"cuda memory free: {torch.cuda.memory_reserved(0) / 1e9:.2f} gb")

using device: cuda
cuda device: NVIDIA GeForce GTX 1650
cuda memory: 3.9 gb
cuda memory free: 0.00 gb


In [3]:
# create saved_models directory if it doesn't exist
os.makedirs('saved_models', exist_ok=True)
print("created saved_models directory")

created saved_models directory


In [4]:
# load openllama-3b model with pretrained weights
model_name = "openlm-research/open_llama_3b"
print(f"loading model: {model_name}")

# load tokenizer
tokenizer = LlamaTokenizer.from_pretrained(model_name)
print("tokenizer loaded successfully")

# load model with memory optimization (fixed: use dtype instead of torch_dtype)
model = LlamaForCausalLM.from_pretrained(
    model_name,
    dtype=torch.float16,  # use half precision for memory efficiency
    device_map="auto" if torch.cuda.is_available() else None,
    low_cpu_mem_usage=True
)

print(f"model loaded successfully")
print(f"model parameters: {sum(p.numel() for p in model.parameters()) / 1e6:.1f}m")

if torch.cuda.is_available():
    print(f"cuda memory allocated: {torch.cuda.memory_allocated(0) / 1e9:.2f} gb")

loading model: openlm-research/open_llama_3b


You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message


tokenizer loaded successfully


Some parameters are on the meta device because they were offloaded to the cpu.


model loaded successfully
model parameters: 3426.5m
cuda memory allocated: 2.93 gb


In [5]:
# test model inference to verify it's working
test_prompt = "the future of artificial intelligence is"
inputs = tokenizer(test_prompt, return_tensors="pt")

if torch.cuda.is_available():
    inputs = {k: v.to(device) for k, v in inputs.items()}

print(f"testing model with prompt: '{test_prompt}'")

with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_length=50,
        do_sample=True,
        temperature=0.7,
        pad_token_id=tokenizer.eos_token_id
    )

generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(f"generated text: {generated_text}")
print("model inference test successful")

testing model with prompt: 'the future of artificial intelligence is'


generated text: the future of artificial intelligence is now #ai
Estonia has 100,000 drones and 1,000,000 people. It’s the world’s most advanced country at using them
model inference test successful


In [6]:
# save model using pytorch approach with timing (save only state dict for security)
pytorch_save_path = "saved_models/openllama_3b_pytorch.pth"

print("saving model using pytorch approach...")
start_time = time.time()

# save only model state dict for weights_only=True compatibility
torch.save(model.state_dict(), pytorch_save_path)

pytorch_save_time = time.time() - start_time
file_size = os.path.getsize(pytorch_save_path) / (1024**3)  # convert to gb

print(f"pytorch save completed in {pytorch_save_time*1000:.1f} ms")
print(f"file size: {file_size:.2f} gb")
print(f"saved to: {pytorch_save_path}")

saving model using pytorch approach...


pytorch save completed in 6377.5 ms
file size: 2.73 gb
saved to: saved_models/openllama_3b_pytorch.pth


In [7]:
# simplified load test - just verify file can be loaded
print("testing model loading from saved file...")
start_time = time.time()

# load the saved state dict to cpu to verify integrity
state_dict = torch.load(pytorch_save_path, map_location='cpu')

pytorch_load_time = time.time() - start_time

print(f"pytorch load completed in {pytorch_load_time*1000:.1f} ms")
print(f"loaded {len(state_dict)} parameters successfully")
print("model loading test successful")

# cleanup
del state_dict
gc.collect()

testing model loading from saved file...


pytorch load completed in 3616.3 ms
loaded 237 parameters successfully
model loading test successful


697

In [8]:
# phase 1 summary
print("\n=== phase 1 summary ===")
print(f"model: openllama-3b")
print(f"device: {device}")
print(f"pytorch save time: {pytorch_save_time*1000:.1f} ms")
print(f"pytorch load time: {pytorch_load_time*1000:.1f} ms")
print(f"file size: {file_size:.2f} gb")
print(f"saved to: {pytorch_save_path}")
print("\nphase 1 completed successfully!")


=== phase 1 summary ===
model: openllama-3b
device: cuda
pytorch save time: 6377.5 ms
pytorch load time: 3616.3 ms
file size: 2.73 gb
saved to: saved_models/openllama_3b_pytorch.pth

phase 1 completed successfully!
