### Install libraries 

First, install the libraries to be able to load fine-tuned model

In [1]:
import torch
print("CUDA Available:", torch.cuda.is_available())
print("Device Name:", torch.cuda.get_device_name(0))

CUDA Available: True
Device Name: NVIDIA L4


In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [3]:
torch.__version__

'2.6.0+cu124'

In [4]:
torch.version.cuda

'12.4'

In [5]:
import transformers
print(transformers.__version__)

  from .autonotebook import tqdm as notebook_tqdm


4.55.2


In [6]:
import sys
print(sys.executable)

/opt/conda/envs/gemma-env/bin/python


In [7]:
from transformers import pipeline

In [8]:
g3_plain_model = "gs://mlops-course-polar-pillar-461115-g2-week10/fine-tuning/output/gemma-3-1b-it-1755402334885-20250817091650/merged_model"
local_dir = "g3_plain_model"
model_name = "g3_plain_model"

In [9]:
!echo models/$local_dir

models/g3_plain_model


### Run the following cell only if you want to reimport the model

In [10]:
# !rm -rf models/$local_dir
# !mkdir -p models/$local_dir
# !gsutil -m cp -r $g3_plain_model models/$local_dir

### Vertex AI Workbench    

In [11]:
# Path for local VS code set up
import os
# local_dir = os.path.join(os.getcwd(), r"models\g3_plain_model")
local_dir = os.path.join(os.getcwd(), f"models/{model_name}")
local_dir

'/home/jupyter/iris_pipeline/llm_ops_demo/models/g3_plain_model'

In [12]:
# # Check local_dir path
import os
os.path.join(local_dir, "merged_model")

'/home/jupyter/iris_pipeline/llm_ops_demo/models/g3_plain_model/merged_model'

In [13]:
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer

local_path = os.path.join(local_dir, "merged_model")
tokenizer = AutoTokenizer.from_pretrained(local_path)
model = AutoModelForCausalLM.from_pretrained(local_path)
model

Gemma3ForCausalLM(
  (model): Gemma3TextModel(
    (embed_tokens): Gemma3TextScaledWordEmbedding(262144, 1152, padding_idx=0)
    (layers): ModuleList(
      (0-25): 26 x Gemma3DecoderLayer(
        (self_attn): Gemma3Attention(
          (q_proj): Linear(in_features=1152, out_features=1024, bias=False)
          (k_proj): Linear(in_features=1152, out_features=256, bias=False)
          (v_proj): Linear(in_features=1152, out_features=256, bias=False)
          (o_proj): Linear(in_features=1024, out_features=1152, bias=False)
          (q_norm): Gemma3RMSNorm((256,), eps=1e-06)
          (k_norm): Gemma3RMSNorm((256,), eps=1e-06)
        )
        (mlp): Gemma3MLP(
          (gate_proj): Linear(in_features=1152, out_features=6912, bias=False)
          (up_proj): Linear(in_features=1152, out_features=6912, bias=False)
          (down_proj): Linear(in_features=6912, out_features=1152, bias=False)
          (act_fn): PytorchGELUTanh()
        )
        (input_layernorm): Gemma3RMSNorm((11

In [14]:
def gemma3_prompt(text):
    prompt = (
    "<start_of_turn>system\n"
    "Classify the flower based on its measurements into one of the following species: [setosa, versicolor, virginica]\n"
    "<end_of_turn>\n"
    "<start_of_turn>user\n"
    +text+
    "<end_of_turn>\n"
    "<start_of_turn>assistant\n"
    )
    
    return prompt

In [23]:
# Text for g3_plain_model
text= (
    # "Sepal Length: 6.4, Sepal Width: 2.9, Petal Length: 4.3, Petal Width: 1.3" #versicolor
    "Sepal Length: 5.0, Sepal Width: 3.6, Petal Length: 1.4, Petal Width: 0.2" #setosa
    # "Sepal Length: 5.9, Sepal Width: 3.0, Petal Length: 5.1, Petal Width: 1.8" #virginica
)

In [16]:
# # Text for g3_binned_model
# text= (
#     "Sepal Length: medium, Sepal Width: small, Petal Length: medium, Petal Width: medium" #versicolor
#     # "Sepal Length: small, Sepal Width: medium, Petal Length: small, Petal Width: small" #setosa
#     # "Sepal Length: large, Sepal Width: medium, Petal Length: large, Petal Width: large" #virginica
# )

In [17]:
local_dir

'/home/jupyter/iris_pipeline/llm_ops_demo/models/g3_plain_model'

In [24]:
prompt = gemma3_prompt(text)

inputs = tokenizer(prompt, return_tensors="pt")

outputs = model.generate(
    **inputs,
    max_new_tokens=2,
    do_sample=False,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
)

# Decode only the new tokens
generated_text = tokenizer.decode(outputs[0][inputs['input_ids'].shape[-1]:], skip_special_tokens=True)
print("Predicted class:", generated_text.strip())
print(f"Model used: {local_dir.split('/')[-1]}")
print(f"\nPrompt used:\n{prompt}")

Predicted class: Versicolor
Model used: g3_plain_model

Prompt used:
<start_of_turn>system
Classify the flower based on its measurements into one of the following species: [setosa, versicolor, virginica]
<end_of_turn>
<start_of_turn>user
Sepal Length: 5.0, Sepal Width: 3.6, Petal Length: 1.4, Petal Width: 0.2<end_of_turn>
<start_of_turn>assistant



In [19]:
# Load iris_binned_v1_test.jsonl to prepare y_test
import json

file_path = "data/iris_binned_v1_test.jsonl"

labels = []
with open(file_path, "r") as f:
    for line in f:
        record = json.loads(line)
        for msg in record["messages"]:
            if msg["role"] == "assistant":
                labels.append(msg["content"].strip())

print("First 10 labels:", labels[:10])
print("Total labels:", len(labels))


First 10 labels: ['setosa', 'setosa', 'virginica', 'virginica', 'virginica', 'setosa', 'versicolor', 'setosa', 'virginica', 'setosa']
Total labels: 60


In [20]:
import json

def predict_jsonl(
    jsonl_path: str,
    tokenizer,
    model,
    gemma3_prompt,
    max_new_tokens: int = 2,
    do_sample: bool = False,
):
    """
    Iterates over a JSONL with records like:
      {"messages":[{"role":"system","content":"..."}, {"role":"user","content":"..."}, {"role":"assistant","content":"label"}]}
    Extracts the user content (fallback to system), builds prompt, generates prediction,
    and appends to `predictions`.
    """
    predictions = []

    with open(jsonl_path, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            obj = json.loads(line)

            # Prefer user content; fallback to system if user is missing
            user_text = next((m["content"] for m in obj.get("messages", []) if m.get("role") == "user"), None)
            if user_text is None:
                user_text = next((m["content"] for m in obj.get("messages", []) if m.get("role") == "system"), "")

            prompt = gemma3_prompt(user_text.strip())
            inputs = tokenizer(prompt, return_tensors="pt")

            outputs = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                do_sample=do_sample,
                eos_token_id=tokenizer.eos_token_id,
                pad_token_id=tokenizer.pad_token_id,
            )

            # Decode only the newly generated tokens
            new_tokens = outputs[0][inputs["input_ids"].shape[-1]:]
            pred = tokenizer.decode(new_tokens, skip_special_tokens=True).strip().lower()
            predictions.append(pred)

    return predictions

In [21]:
predictions = predict_jsonl(
    "data/iris_binned_v1_test.jsonl",
    tokenizer=tokenizer,
    model=model,
    gemma3_prompt=gemma3_prompt,
    max_new_tokens=2,
    do_sample=False,
)

print(predictions[:10])
print("Total predictions:", len(predictions))

['versicolor', 'versicolor', 'versicolor', 'versicolor', 'versicolor', 'versicolor', 'versicolor', 'versicolor', 'versicolor', 'versicolor']
Total predictions: 60


In [22]:
# Confusion matrix, classification report 
from sklearn.metrics import confusion_matrix, classification_report

print(confusion_matrix(labels, predictions))
print(classification_report(labels, predictions))

[[ 0 20  0]
 [ 0 20  0]
 [ 0 20  0]]
              precision    recall  f1-score   support

      setosa       0.00      0.00      0.00        20
  versicolor       0.33      1.00      0.50        20
   virginica       0.00      0.00      0.00        20

    accuracy                           0.33        60
   macro avg       0.11      0.33      0.17        60
weighted avg       0.11      0.33      0.17        60



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
