# Install packages

In [3]:
!pip install -q fastapi uvicorn pyngrok nest_asyncio transformers accelerate bitsandbytes

In [4]:
import os
from google.colab import drive
from transformers import AutoModelForCausalLM, AutoTokenizer

# 1. Mount Google Drive at /content/drive.
# NOTE(review): none of the cells visible here read from or write to the
# mount — presumably later cells use it; confirm, or drop this step.
drive.mount('/content/drive')

Mounted at /content/drive


# Load Model from Hugging Face

In [5]:
import os
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from google.cloud import storage

# 2. Hugging Face Hub identifier of the checkpoint to load
model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

# 3. Request 4-bit quantization through an explicit BitsAndBytesConfig object
#    (per the original author's note, this form avoids a deprecation warning)
quantization_config = BitsAndBytesConfig(load_in_4bit=True)

# 4. Fetch the tokenizer, then the model with the quantization config applied;
#    device_map="auto" leaves device placement to the library
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=quantization_config,
    device_map="auto",
)

tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

# Save the model locally and check its size

In [6]:
# 5. Save tokenizer and model to a local directory.
#    `model` is the instance loaded above with the 4-bit quantization config,
#    so that quantized model is what gets written to disk.
model_dir = "TinyLlama-1.1B-Chat-v1.0"  # Relative path: created under the current working directory
os.makedirs(model_dir, exist_ok=True)  # exist_ok=True makes re-running this cell safe

# Save tokenizer files into model_dir
tokenizer.save_pretrained(model_dir)

# Save model weights into the same directory, with safe serialization enabled
model.save_pretrained(model_dir, safe_serialization=True)

In [7]:
# prompt: what is the size of my saved model?

import os

model_dir = "TinyLlama-1.1B-Chat-v1.0"  # Same directory where you saved the model


def get_directory_size(path):
    """Return the combined size, in bytes, of the files found under ``path``.

    The directory tree is walked recursively with ``os.walk``. Symbolic links
    to files are skipped so that a linked file is not counted in addition to
    (or instead of) its target.

    Args:
        path: Directory to measure.

    Returns:
        Total size in bytes as an ``int``. A path that does not exist yields
        ``0`` because ``os.walk`` produces no entries for it.
    """
    size_in_bytes = 0
    for dirpath, _dirnames, filenames in os.walk(path):
        for filename in filenames:
            file_path = os.path.join(dirpath, filename)
            # skip if it is symbolic link
            if not os.path.islink(file_path):
                size_in_bytes += os.path.getsize(file_path)
    return size_in_bytes


total_size = get_directory_size(model_dir)

# The divisor is 1024**3, so the figure is in binary gigabytes (GiB) even
# though the label reads "GB".
print(f"Size of the saved model: {total_size / (1024**3):.2f} GB")

Size of the saved model: 0.76 GB


In [8]:
import re
import torch
from transformers import pipeline, AutoTokenizer, set_seed

# Generation below samples (do_sample=True), so seed the random number
# generators first; otherwise every run of this cell produces different text.
SEED = 42
set_seed(SEED)

# Build a text-generation pipeline from the locally saved model directory.
model_dir = "TinyLlama-1.1B-Chat-v1.0"
tokenizer = AutoTokenizer.from_pretrained(model_dir)
pipe = pipeline("text-generation", model=model_dir, tokenizer=tokenizer, torch_dtype=torch.float16, device_map="auto")

# Chat history: a system instruction followed by a single user question.
messages = [
    {
        "role": "system",
        "content": "You are a friendly chatbot, respond to each question accurately and concisely.",
    },
    {
        "role": "user",
        "content": "Can you provide a short description of colorado rocky mountains?"
    },
]

# Render the messages into one prompt string using the tokenizer's chat
# template; add_generation_prompt=True appends the opening of the assistant turn.
prompt = pipe.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

# Sample up to 1000 new tokens. The returned text contains the prompt followed
# by the model's reply, as the printed output shows.
outputs = pipe(prompt, max_new_tokens=1000, do_sample=True, temperature=0.7, top_k=50, top_p=0.95)
print(outputs)

Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
Device set to use cuda:0


[{'generated_text': "<|system|>\nYou are a friendly chatbot, respond to each question accurately and concisely.</s>\n<|user|>\nCan you provide a short description of colorado rocky mountains?</s>\n<|assistant|>\nColorado Rocky Mountains is a mountain range located in the western United States. The range stretches from the southern border of Colorado to the northwestern corner of Wyoming, covering a total of 1.7 million acres. The Rocky Mountains are characterized by their rugged, jagged formations and steep cliffs, including the iconic Grand Visionary (Victor), Pike's Peak, and Longs Peak. The range is home to a diverse array of wildlife, including elk, deer, and bison, as well as numerous species of flora, including conifers and pines. Visitors can hike, mountain bike, and drive through the mountains, with many trails and scenic drives offering stunning views of the rugged terrain."}]


In [9]:
# The pipeline returned a list of dicts; keep the text of the first entry
# (it still contains the prompt in front of the model's reply).
first_generation = outputs[0]
response = first_generation["generated_text"]

In [10]:
# Keep only what follows the first "<|assistant|>" marker in the response,
# i.e. drop the system/user part of the prompt that precedes it.
ASSISTANT_MARKER = "<|assistant|>"
assistant_start = response.find(ASSISTANT_MARKER)
if assistant_start == -1:
    # Marker absent: nothing to extract, so cleaned_output is not assigned.
    print("Could not find <|assistant|> in the response.")
else:
    reply_begin = assistant_start + len(ASSISTANT_MARKER)
    cleaned_output = response[reply_begin:].strip()
    print(cleaned_output)

Colorado Rocky Mountains is a mountain range located in the western United States. The range stretches from the southern border of Colorado to the northwestern corner of Wyoming, covering a total of 1.7 million acres. The Rocky Mountains are characterized by their rugged, jagged formations and steep cliffs, including the iconic Grand Visionary (Victor), Pike's Peak, and Longs Peak. The range is home to a diverse array of wildlife, including elk, deer, and bison, as well as numerous species of flora, including conifers and pines. Visitors can hike, mountain bike, and drive through the mountains, with many trails and scenic drives offering stunning views of the rugged terrain.
