In [None]:
!pip install -U bitsandbytes

In [None]:
from google.colab import userdata
from huggingface_hub import login
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig
import torch
import gc

In [None]:
# Fetch the Hugging Face token stored under 'HF_TOKEN' via Colab's userdata
# and log in with it, also saving it as a git credential.
hf_token = userdata.get("HF_TOKEN")
login(token=hf_token, add_to_git_credential=True)

In [None]:
# Model ids, in "organisation/name" form, for the checkpoints this notebook
# works with. Each one can be passed as the `model` argument of `generate`.
LLAMA = "meta-llama/Llama-3.2-1B-Instruct"
PHI = "microsoft/Phi-4-mini-instruct"
GEMMA = "google/gemma-3-270m-it"
QWEN = "Qwen/Qwen3-4B-Instruct-2507"
DEEPSEEK = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"

In [None]:
# Single-turn conversation in the role/content format consumed by
# `tokenizer.apply_chat_template`.
messages = [
    {
        "role": "user",
        "content": "Tell a joke for a room of Data Scientists",
    },
]

In [None]:
# Quantization config: load the weights in 4-bit (NF4, with double
# quantization) so the model takes less memory, while computing in bfloat16.

quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

In [None]:
# Tokenizer

# tokenizer = AutoTokenizer.from_pretrained(LLAMA)
# tokenizer.pad_token = tokenizer.eos_token
# inputs = tokenizer.apply_chat_template(messages, return_tensors="pt").to("cuda")

In [None]:
# inputs

In [None]:
# The model

# model = AutoModelForCausalLM.from_pretrained(LLAMA, device_map="auto", quantization_config=quant_config)

In [None]:
# Inspect the loaded model. Kept disabled like the other exploratory cells:
# `model` is only defined when the commented-out `from_pretrained` cell is
# re-enabled, so a bare `model` here raises NameError on Restart & Run All.
# model

In [None]:
# memory = model.get_memory_footprint()/1e6
# print(f'Memory footprint:{memory:,.1f}MB')

In [None]:
# outputs = model.generate(inputs, max_new_tokens = 80)
# print(tokenizer.decode(outputs[0]))

In [None]:
# Delete Python references
# del inputs, outputs, model, tokenizer

# Clear GPU cache
# torch.cuda.empty_cache()

# Optional: trigger garbage collection to free CPU memory too
# import gc
# gc.collect()

In [None]:
# Wrap the whole load / generate / clean-up cycle in one function, with streamed output

def generate(model, messages, max_new_tokens=80):
  """Stream a chat completion from a quantized model, then release its memory.

  Args:
    model: Model id or path accepted by `from_pretrained` (e.g. the LLAMA constant).
    messages: Conversation as a list of {"role": ..., "content": ...} dicts.
    max_new_tokens: Upper bound on the number of tokens to generate.

  Returns:
    None. The text is emitted by the `TextStreamer` while it is being generated.
  """
  tokenizer = AutoTokenizer.from_pretrained(model)
  tokenizer.pad_token = tokenizer.eos_token

  # Keep the loaded model under its own name instead of rebinding the `model`
  # argument (a string id) to a different kind of object.
  llm = inputs = None
  try:
    llm = AutoModelForCausalLM.from_pretrained(model, device_map='auto',
                                               quantization_config=quant_config)
    # add_generation_prompt=True ends the prompt with the assistant-turn header so
    # the model answers the user instead of continuing the user's message.
    # return_dict=True also yields the attention mask, which generate() needs to
    # tell padding from content now that pad_token == eos_token.
    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_dict=True,
        return_tensors='pt',
    ).to(llm.device)  # follow the device chosen by device_map rather than hard-coding 'cuda'
    llm.generate(
        input_ids=inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_new_tokens=max_new_tokens,
        streamer=TextStreamer(tokenizer),
    )
  finally:
    # Runs even when loading or generation fails, so a crashed call does not
    # leave the weights on the GPU. References must be dropped and collected
    # before empty_cache() can hand the memory back.
    del llm, inputs
    gc.collect()
    torch.cuda.empty_cache()