In [2]:
import torch
import gc
from math import exp
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel

In [3]:
import gc
import time
import torch

def clear_memory(extra_names=()):
    """
    Aggressively free GPU memory across *all* CUDA devices.
    Pass any extra global var names via extra_names if needed.
    """
    # 0) Early exit if no CUDA
    if not torch.cuda.is_available():
        gc.collect()
        return print("CUDA not available. Collected CPU garbage only.")

    # 1) Try to move known models off GPU (if still around)
    for name in ("model", "base_model"):
        obj = globals().get(name, None)
        try:
            if obj is not None and hasattr(obj, "to"):
                obj.to("cpu")
        except Exception:
            pass

    # 2) Delete common globals (if present)
    for name in ("inputs", "base_model", "model", "tokenizer", *extra_names):
        globals().pop(name, None)

    # 3) Delete any stray CUDA tensors lingering in globals()
    for name, obj in list(globals().items()):
        try:
            if torch.is_tensor(obj) and obj.is_cuda:
                del globals()[name]
        except Exception:
            pass

    # 4) Full GC pass
    gc.collect()
    time.sleep(0.1)

    # 5) Clear *each* CUDA device
    for idx in range(torch.cuda.device_count()):
        try:
            with torch.cuda.device(idx):
                torch.cuda.synchronize()
                torch.cuda.empty_cache()
                # Collect interprocess memory (helps when using multiple processes / dataloaders)
                torch.cuda.ipc_collect()
                # Reset peak stats (optional, for cleaner diagnostics)
                try:
                    torch.cuda.reset_peak_memory_stats(idx)
                except Exception:
                    pass
        except Exception:
            # Keep going even if one device throws
            pass

    # 6) One more GC + sync
    gc.collect()
    for idx in range(torch.cuda.device_count()):
        try:
            with torch.cuda.device(idx):
                torch.cuda.synchronize()
        except Exception:
            pass

    # 7) Report per-device
    for idx in range(torch.cuda.device_count()):
        alloc = torch.cuda.memory_allocated(idx) / (1024 ** 3)
        reserv = torch.cuda.memory_reserved(idx) / (1024 ** 3)
        print(f"cuda:{idx} -> allocated: {alloc:.2f} GB | reserved: {reserv:.2f} GB")

# run it
clear_memory()


cuda:0 -> allocated: 0.00 GB | reserved: 0.00 GB


In [4]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

# Load model and tokenizer
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.2-3B-Instruct",
    device_map="cuda",
    torch_dtype="auto",
    trust_remote_code=False,
    quantization_config=bnb_config,
)
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B-Instruct")

Loading checkpoint shards: 100%|██████████| 2/2 [00:17<00:00,  8.81s/it]


In [9]:
prompt = "Write an email apologizing to Sarah for the tragic gardening mishap. Explain how it happened.<|assistant|>"

# Tokenize the input prompt
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to("cuda")
# Generate the text
generation_output = model.generate(
  input_ids=input_ids,
  max_new_tokens=20
)

# Print the output
print(tokenizer.decode(generation_output[0]))
print(input_ids)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


<|begin_of_text|>Write an email apologizing to Sarah for the tragic gardening mishap. Explain how it happened.<|assistant|>Here is an email apologizing to Sarah for the tragic gardening mishap:

Subject: A Sinc
tensor([[128000,   8144,    459,   2613,  21050,   4954,    311,  21077,    369,
            279,  35279,  60299,  64496,    391,     13,  83017,   1268,    433,
           7077,  16134,     91,  78191,     91,     29]], device='cuda:0')


In [8]:
for id in input_ids[0]:
   print(tokenizer.decode(id))

<|begin_of_text|>
Write
 an
 email
 apolog
izing
 to
 Sarah
 for
 the
 tragic
 gardening
 mish
ap
.
 Explain
 how
 it
 happened
.<
|
assistant
|
>


In [10]:
generation_output

tensor([[128000,   8144,    459,   2613,  21050,   4954,    311,  21077,    369,
            279,  35279,  60299,  64496,    391,     13,  83017,   1268,    433,
           7077,  16134,     91,  78191,     91,     29,   8586,    374,    459,
           2613,  21050,   4954,    311,  21077,    369,    279,  35279,  60299,
          64496,    391,   1473,  13317,     25,    362,    328,   2910]],
       device='cuda:0')