In [None]:
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Detect if a lamp is an Acuity brand lamp using a supervised fine-tuned Gemini model

Initialize the project

## Overview

This codelab walks you through the process of building a simple image classification system using a pre-trained and then fine-tuned Gemini model. The goal is to accurately identify whether a given lamp image belongs to the "Acuity" brand.

The process involves:

1.  **Loading the pre-trained Gemini model:** We start by initializing the generative AI model.
2.  **Preparing the dataset:** You'll need a dataset of lamp images, labeled as either "Acuity" or "Not Acuity". This dataset will be used for fine-tuning.
3.  **Fine-tuning the model:** The codelab will guide you through the steps of fine-tuning the Gemini model on your specific dataset. This process adapts the model's knowledge to better recognize Acuity lamps.
4.  **Evaluating the model:** After fine-tuning, we'll test the model's performance on unseen data to assess its accuracy.
5.  **Making predictions:** Finally, you'll learn how to use the fine-tuned model to predict whether a new lamp image is an Acuity brand lamp.

By the end of this codelab, you will have a working example of how to leverage the power of large language models for specific image classification tasks through supervised fine-tuning.

In [None]:
# ------------ Parameters ------------
# Google Cloud project and region that the cells below pass to gsutil,
# Vertex AI, the Gen AI client and BigQuery.
PROJECT_ID = "sarthaks-lab"  # @param {type:"string"}
LOCATION = "us-central1"  # @param {type:"string"}

In [None]:
# Provide a bucket name
BUCKET_NAME = "[your-bucket-name]"  # @param {type:"string"}
BUCKET_URI = f"gs://{BUCKET_NAME}"

# Create the bucket if it doesn't exist
!gsutil ls -b {BUCKET_URI} || gsutil mb -l {LOCATION} -p {PROJECT_ID} {BUCKET_URI}

In [None]:
# Upload the training data to your bucket
!gsutil cp "/Users/sarthakgy/Desktop/insights-samples/imagery_insights/notebooks/Supervised fine tuning/data/acquity_detector_negative_examples_v2.jsonl" {BUCKET_URI}/data/

In [None]:
# Install the necessary library
!pip install --upgrade --user --quiet google-genai google-cloud-aiplatform

# Import the library
import google.cloud.aiplatform as aiplatform
from google import genai
from google.genai import types
import time

# Point both SDKs (Vertex AI and Gen AI) at the configured project and region.
aiplatform.init(project=PROJECT_ID, location=LOCATION)
client = genai.Client(vertexai=True, project=PROJECT_ID, location=LOCATION)

# Tuning inputs: the base model, the display name given to the tuned model,
# and the JSONL training file previously uploaded to the bucket.
base_model = "gemini-2.5-flash"
tuned_model_display_name = "Aquity_model_detector_fine_tuned"
dataset_uri = f"{BUCKET_URI}/data/acquity_detector_negative_examples_v2.jsonl"
training_dataset = {"gcs_uri": dataset_uri}

# Submit the tuning job through the Gen AI client's `tune` method.
tuning_config = types.CreateTuningJobConfig(
    tuned_model_display_name=tuned_model_display_name,
)
sft_tuning_job = client.tunings.tune(
    base_model=base_model,
    training_dataset=training_dataset,
    config=tuning_config,
)

# Get the tuning job info.
tuning_job = client.tunings.get(name=sft_tuning_job.name)

# Status Check
print("Tuning job created. Waiting for completion...")

# States in which the job has not finished yet. A queued job is still waiting
# to run, so it must keep the loop alive instead of being reported as failed.
running_states = [
    "JOB_STATE_QUEUED",
    "JOB_STATE_PENDING",
    "JOB_STATE_RUNNING",
]

# Wait for job completion. Sleep first, then refresh: the state was fetched
# just above, and the loop exits as soon as a refreshed state is final
# instead of sleeping one more minute after the job is already done.
while tuning_job.state.name in running_states:
    print(".", end="")
    time.sleep(60)  # Check every minute
    tuning_job = client.tunings.get(name=tuning_job.name)

print()

if tuning_job.state.name == "JOB_STATE_SUCCEEDED":
    # Endpoint serving the tuned model; later cells load the model from it.
    MODEL_ENDPOINT = tuning_job.tuned_model.endpoint
    print(f"Model deployed to endpoint: {MODEL_ENDPOINT}")
else:
    print(f"Tuning job failed with state: {tuning_job.state.name}")
    job_error = getattr(tuning_job, 'error', None)
    if job_error:
        print(f"Error: {job_error}")
    # Fallback to backup endpoint from later in the notebook
    MODEL_ENDPOINT = "projects/635092392839/locations/us-central1/endpoints/3000095541712388096"
    print(f"Using backup endpoint: {MODEL_ENDPOINT}")

## Setup

### Enable APIs and set permissions

- Enable the Vertex AI API.
- Make sure you have been granted the following role on the GCP project you'll access from this notebook:
  - `roles/aiplatform.user`

In [None]:
from google.cloud import bigquery
import vertexai
from vertexai.generative_models import GenerativeModel, Part
import pandas as pd

In [None]:
# Query returning one row per street-light asset (`type='Street light'`).
# Rows with an empty `gcs_uris` array are skipped. For each remaining row only
# the FIRST URI of its array is taken (SAFE_OFFSET(0)), and those first URIs are
# collected per `asset_id` with ARRAY_AGG - so the resulting `gcs_uris` column
# does not hold every URI stored for a row.
# NOTE(review): LIMIT is applied without ORDER BY, so which 1000 assets come
# back is presumably not stable between runs - confirm whether that matters.
BIGQUERY_QUERY = """
    SELECT
        asset_id,
        ARRAY_AGG(gcs_uris[SAFE_OFFSET(0)]) AS gcs_uris
    FROM
        `sarthaks-lab.imagery_insights_analysis.utility_pole_evaluations`
    WHERE
        type='Street light'
        AND ARRAY_LENGTH(gcs_uris) > 0
    GROUP BY
        asset_id
    LIMIT 1000
"""

In [None]:
# Endpoint used when no tuned-model endpoint is available in this kernel.
BACKUP_MODEL_ENDPOINT = "projects/635092392839/locations/us-central1/endpoints/3000095541712388096"

# Keep the endpoint set by the tuning cell when it has already run; assigning
# the backup unconditionally would discard the freshly tuned model on Run All.
MODEL_ENDPOINT = globals().get("MODEL_ENDPOINT") or BACKUP_MODEL_ENDPOINT

In [None]:
# Instruction text sent to the model together with the images of one asset.
# It asks the model to derive an internal confidence score and map it to an
# answer: "Yes" (> 0.5), "Maybe" (0.25 to 0.5 inclusive) or "No" (< 0.25), and
# to include the model number on a "Yes" answer when one is visible.
# The summary cell later in the notebook parses these answer formats, so keep
# the two in sync when editing the output rules.
# NOTE(review): the text refers to an "Acuity Brand Identification Guide" and a
# "Lamp Input" that are not part of this string - presumably the fine-tuned
# model is expected to know the guide from its training data; confirm.
PROMPT = '''Follow these rules precisely to generate your answer:

1.  **Analyze the Input:** Carefully examine the provided Lamp Input. Compare its specific features, design elements, markings, and any visible model numbers against the information in the Acuity Brand Identification Guide.

2.  **Generate a Confidence Score:** Based on your analysis, internally generate a confidence score from 0.0 to 1.0 that represents your certainty that the lamp is an Acuity brand product.

3.  **Apply Strict Classification Logic:** Use your confidence score to determine your answer according to the following thresholds:
    *   **If the score is greater than 0.5:** Your answer is "Yes". This indicates a high degree of confidence, requiring a direct match of multiple key features, design language, or a model number consistent with the guide.
    *   **If the score is between 0.25 and 0.5 (inclusive):** Your answer is "Maybe". This indicates some features align with the guide, but there is not enough evidence for a confident "Yes" or "No".
    *   **If the score is less than 0.25:** Your answer is "No". This indicates the lamp has features that contradict the guide, is identifiable as a different brand, or lacks any resemblance to an Acuity product.

4.  **Format Your Output:** Your response must strictly follow one of the formats below, based on the answer you determined in the previous step. Do not add any extra text, explanations, or apologies.
    *   **For a "Yes" answer:**
        *   If a model number is visible or mentioned in the Lamp Input, respond with: `Yes. Model Number: [model number]`
        *   If no model number is available, respond with: `Yes.`

    *   **For a "Maybe" answer:**
        *   Respond with: `Maybe.`

    *   **For a "No" answer:**
        *   Respond with: `No.`'''

In [None]:
# Upper bound on how many query rows the analysis loop sends to the model.
NUM_ROWS_TO_PROCESS = 1_000

In [None]:
# ------------ 1. Fetch data from BigQuery ------------
# NOTE: `client` is rebound here to a BigQuery client; earlier in the notebook
# the same name holds the Gen AI client used for tuning.
client = bigquery.Client(project=PROJECT_ID)

# Run BIGQUERY_QUERY and materialise its result as a pandas DataFrame.
query_job = client.query(BIGQUERY_QUERY)
df = query_job.to_dataframe()

In [None]:
# ------------ 2. Initialize Vertex AI and Load Model ------------
# Configure the Vertex AI SDK for the selected project and region, then build
# the GenerativeModel from the endpoint named by MODEL_ENDPOINT.
vertexai.init(location=LOCATION, project=PROJECT_ID)
model = GenerativeModel(model_name=MODEL_ENDPOINT)

In [None]:
# ------------ 3. Analyze Images ------------
# Only the first NUM_ROWS_TO_PROCESS assets are sent to the model.
df_to_process = df.head(NUM_ROWS_TO_PROCESS)

# One entry per processed asset, in the same order as df_to_process: either the
# model's answer or a message explaining why there is none.
analysis_results = []

for asset_id, image_uris in zip(df_to_process['asset_id'], df_to_process['gcs_uris']):
    print(f"Processing asset: {asset_id}")

    # Drop missing/empty URIs up front so the model is never called with the
    # prompt alone when an asset has no usable image.
    valid_uris = [] if image_uris is None else [uri for uri in image_uris if uri]
    if not valid_uris:
        no_uri_message = "No GCS URIs found for this asset."
        print(f"  -> {no_uri_message}")
        analysis_results.append(no_uri_message)
        continue

    try:
        # Prepare the content for the model: prompt + all images of the asset.
        # NOTE(review): every image is declared as JPEG - confirm that holds
        # for all URIs in the table.
        content_parts = [PROMPT]
        content_parts.extend(
            Part.from_uri(uri, mime_type='image/jpeg') for uri in valid_uris
        )

        response = model.generate_content(content_parts)
        result_text = response.text.strip()
        print(f"  -> Result: {result_text}")
        analysis_results.append(result_text)

    except Exception as e:
        # Best effort: record the failure for this asset and move on, so one
        # failing request does not abort the whole run.
        error_message = f"Error processing asset: {e}"
        print(f"  -> {error_message}")
        analysis_results.append(error_message)

In [None]:
# ------------ 4. Display All Results ------------
# Pair each processed asset_id with the result recorded for it; the frame keeps
# the index of df_to_process and holds just these two columns.
results_df = df_to_process[['asset_id']].assign(analysis_result=analysis_results)

# Show every row, unfiltered.
print("\n--- Final Results --- Succeeded")
display(results_df)

# Task
Modify the script to add a print statement inside the main processing loop that prints the `asset_id` of each asset from the `sarthaks-lab.imagery_insights_analysis.utility_pole_evaluations` BigQuery table as it is being analyzed by the model.

# Task
Summarize the lamp detection results from the `results_df` DataFrame. The summary should be in a new pandas DataFrame and include the count of 'Yes', 'Maybe', 'No', and 'Error' detections, along with a few sample `asset_id`s for each category.

## Generate Summary of Analysis Results

### Subtask:
Add a new code cell that processes the `results_df` DataFrame. This code will categorize the results into 'Yes', 'Maybe', 'No', and 'Error', count the items in each category, and retrieve a few sample `asset_id`s. The summary will be displayed in a new, nicely formatted pandas DataFrame.

In [None]:
# Answer categories reported in the summary, in display order.
RESULT_CATEGORIES = ('Yes', 'Maybe', 'No', 'Error')


def categorize_result(result):
    """Map one raw analysis result to 'Yes', 'Maybe', 'No' or 'Error'.

    Matching ignores case, surrounding whitespace and trailing periods, so
    answers such as "Maybe", "maybe." or " No " are still recognised.

    Args:
        result: Value taken from the `analysis_result` column.

    Returns:
        'Yes' when the text starts with "yes", 'Maybe' / 'No' when the text is
        exactly that word, and 'Error' for everything else (non-strings, error
        messages, the "No GCS URIs found ..." message, unexpected answers).
    """
    if not isinstance(result, str):
        return 'Error'

    text = result.strip().lower()
    if text.startswith('yes'):
        return 'Yes'

    # Exact match only: a prefix test would count "No GCS URIs found ..." as a "No".
    bare = text.rstrip('.').strip()
    if bare == 'maybe':
        return 'Maybe'
    if bare == 'no':
        return 'No'
    return 'Error'


def summarize_categories(categorized, sample_size=3):
    """Build the per-category summary of a categorized results frame.

    Args:
        categorized: DataFrame with `asset_id` and `category` columns.
        sample_size: Maximum number of sample asset ids listed per category.

    Returns:
        DataFrame with columns `category`, `count` and `sample_assets`, holding
        one row for every entry of RESULT_CATEGORIES - categories without any
        rows are reported with a count of 0 and an empty sample list.
    """
    rows = []
    for category in RESULT_CATEGORIES:
        asset_ids = categorized.loc[categorized['category'] == category, 'asset_id']
        rows.append({
            'category': category,
            'count': len(asset_ids),
            'sample_assets': list(asset_ids.head(sample_size)),
        })
    return pd.DataFrame(rows, columns=['category', 'count', 'sample_assets'])


# Check for the input explicitly instead of catching NameError around the whole
# cell, which would also hide unrelated NameErrors raised by the code itself.
if 'results_df' not in globals():
    print("Error: The 'results_df' DataFrame is not defined. Please ensure the previous cell has been executed successfully.")
else:
    results_df['category'] = results_df['analysis_result'].apply(categorize_result)
    summary_df = summarize_categories(results_df)

    print("\n--- Analysis Summary ---")
    display(summary_df)

### Important Note on Cell Execution

The error `NameError: name 'results_df' is not defined` occurs because the DataFrame `results_df` is created in a previous cell but is not available when the summarization code is run. This can happen if the cells are not run in the correct top-to-bottom order, or if the notebook kernel has been restarted.

**To fix this, run the notebook cells in order. In particular, make sure the image-analysis cell (`3. Analyze Images`) and the cell that builds the `results_df` DataFrame (`4. Display All Results`) have both finished successfully *before* running the summary cell above.**