# Install packages

In [2]:
!pip install -q fastapi uvicorn pyngrok nest_asyncio transformers accelerate bitsandbytes

In [3]:
!pip install git+https://github.com/huggingface/transformers@v4.49.0-Gemma-3

Collecting git+https://github.com/huggingface/transformers@v4.49.0-Gemma-3
  Cloning https://github.com/huggingface/transformers (to revision v4.49.0-Gemma-3) to /tmp/pip-req-build-dzdrdg1i
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers /tmp/pip-req-build-dzdrdg1i
  Running command git checkout -q 1c0f782fe5f983727ff245c4c1b3906f9b99eec2
  Resolved https://github.com/huggingface/transformers to commit 1c0f782fe5f983727ff245c4c1b3906f9b99eec2
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: transformers
  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone
  Created wheel for transformers: filename=transformers-4.50.0.dev0-py3-none-any.whl size=10936429 sha256=9410cdf29c25d4251d580dd6442f2d3679cb3c615c9f74ff904da8547962bbdf
  Stored in directory: /tmp/pip-eph

In [4]:
import os
from google.colab import drive
from transformers import AutoModelForCausalLM, AutoTokenizer

# 1. Mount Google Drive
drive.mount('/content/drive')

Mounted at /content/drive


# Load Model from Hugging Face

In [5]:
import os
from transformers import AutoTokenizer, BitsAndBytesConfig, Gemma3ForCausalLM
import torch

# 1. Model ID
model_id = "google/gemma-3-1b-it"

# 2. Quantization Configuration (Fix for deprecation warning)
quantization_config = BitsAndBytesConfig()

# quantization_config = BitsAndBytesConfig(
#     load_in_4bit=True,  # Enable 4-bit quantization
#     bnb_4bit_use_double_quant=True,  # Enable nested quantization for better memory efficiency
#     bnb_4bit_quant_type="nf4",  # Use 4-bit NormalFloat quantization
#     bnb_4bit_compute_dtype=torch.float16  # Use float16 for computation
# )

model = Gemma3ForCausalLM.from_pretrained(
    model_id, quantization_config=quantization_config
).eval()

config.json:   0%|          | 0.00/899 [00:00<?, ?B/s]

`low_cpu_mem_usage` was None, now default to True since model is quantized.


model.safetensors:   0%|          | 0.00/2.00G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/192 [00:00<?, ?B/s]

# Check model size

In [6]:
# 4. Save tokenizer and model to a local directory
model_dir = "gemma-3-1b-it"  # A local directory to save the model and tokenizer
os.makedirs(model_dir, exist_ok=True)

# Save tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.save_pretrained(model_dir)

# Save model weights
model.save_pretrained(model_dir, safe_serialization=True)

tokenizer_config.json:   0%|          | 0.00/1.16M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/35.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

In [7]:
import os

total_size = 0
for dirpath, dirnames, filenames in os.walk(model_dir):
    for f in filenames:
        fp = os.path.join(dirpath, f)
        # skip if it is symbolic link
        if not os.path.islink(fp):
            total_size += os.path.getsize(fp)

print(f"Size of the saved model: {total_size / (1024**3):.2f} GB")

Size of the saved model: 0.96 GB


In [8]:
import re
import torch
from transformers import pipeline, AutoTokenizer

# Set device to "cuda" if available, otherwise "cpu"
device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(model_dir)

# -----

# pipe = pipeline("text-generation", model=model_dir, tokenizer=tokenizer, torch_dtype=torch.float16, device_map="auto")

# messages = [
#     {
#         "role": "system",
#         "content": "You are a friendly chatbot, respond to each question accurately and concisely.",
#     },
#     {   "role": "user",
#         "content": "Can you provide a short description of colorado rocky mountains?"
#     },
# ]

# prompt = pipe.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
# outputs = pipe(prompt, max_new_tokens=1000, do_sample=True, temperature=0.7, top_k=50, top_p=0.95)
# print(outputs)

# ----

prompt = "Can you write me a concise yet effective cover letter for a job of Data Engineer that requires Azure Databricks expertise?"

messages = [
    [
        {
            "role": "system",
            "content": [{"type": "text", "text": "You are a helpful assistant."},]
        },
        {
            "role": "user",
            "content": [{"type": "text", "text": "What are the Capitals of major European nations?"},]
        },
    ],
]

inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

with torch.inference_mode():
    outputs = model.generate(**inputs, max_new_tokens=64)



In [9]:
response = tokenizer.batch_decode(outputs)
print(response)

['<bos><start_of_turn>user\nYou are a helpful assistant.\n\nWhat are the Capitals of major European nations?<end_of_turn>\n<start_of_turn>model\nOkay, here are the capitals of major European nations:\n\n*   **France:** Paris\n*   **Germany:** Berlin\n*   **Italy:** Rome\n*   **Spain:** Madrid\n*   **United Kingdom:** London\n*   **Belgium:** Brussels\n*   **Portugal:** Lisbon\n*   ']


In [10]:
response[0]

'<bos><start_of_turn>user\nYou are a helpful assistant.\n\nWhat are the Capitals of major European nations?<end_of_turn>\n<start_of_turn>model\nOkay, here are the capitals of major European nations:\n\n*   **France:** Paris\n*   **Germany:** Berlin\n*   **Italy:** Rome\n*   **Spain:** Madrid\n*   **United Kingdom:** London\n*   **Belgium:** Brussels\n*   **Portugal:** Lisbon\n*   '

In [29]:
# Input string
output = response[0]

# Remove special tokens and metadata
cleaned_output = output.split("<start_of_turn>model")[1].strip()

# Extract the list of capitals
# capitals_section = cleaned_output.split("Okay, here are the capitals of major European nations:")[1].strip()

# Split into individual lines and preserve markdown formatting
capitals_list = [line.strip() for line in cleaned_output.split("\n") if line.strip()]

# Print the cleaned list with markdown formatting
for item in capitals_list:
    print(item)

Okay, here are the capitals of major European nations:
*   **France:** Paris
*   **Germany:** Berlin
*   **Italy:** Rome
*   **Spain:** Madrid
*   **United Kingdom:** London
*   **Belgium:** Brussels
*   **Portugal:** Lisbon
*
