In [1]:
import os
import google.generativeai as genai
from google.colab import userdata
# Read the Gemini API key from Colab's user-data store instead of hard-coding it in the notebook.
GEMINI_API_KEY = userdata.get('GEMINI_API_KEY')

# Register the key with the google.generativeai client used by the cells below.
genai.configure(api_key=GEMINI_API_KEY)

In [2]:
model_info = genai.get_model(name="gemini-2.0-flash")
model_info

Model(name='models/gemini-2.0-flash',
      base_model_id='',
      version='2.0',
      display_name='Gemini 2.0 Flash',
      description='Gemini 2.0 Flash',
      input_token_limit=1048576,
      output_token_limit=8192,
      supported_generation_methods=['generateContent', 'countTokens', 'createCachedContent'],
      temperature=1.0,
      max_temperature=2.0,
      top_p=0.95,
      top_k=40)

In [3]:
system_prompt = "You are an expert wardrobe inventory logger and fashion reviewer. Your job is to review videos and photos and catalogue items in the most correct way possible for fashion needs and wardrobe inventory databases."
print(system_prompt)


You are an expert wardrobe inventory logger and fashion reviewer. Your job is to review videos and photos and catalogue items in the most correct way possible for fashion needs and wardrobe inventory databases.


In [4]:
# Sampling hyperparameters for the model configured below
temperature, top_k, top_p = 0.5, 40, 0.95

# Generation settings handed to the Gemini client
generation_config = dict(
    temperature=temperature,
    top_p=top_p,
    top_k=top_k,
    max_output_tokens=8192,
    response_mime_type="text/plain",
)

# Model handle that carries the system prompt and the settings above
model_gemini_flash = genai.GenerativeModel(
    model_name="models/gemini-2.0-flash",
    system_instruction=system_prompt,
    generation_config=generation_config,
)

# Create helper function for generating content
def generate_content(prompt, model):
  """Send `prompt` to `model` and hand back whatever the model produces.

  Args:
    prompt: Content forwarded unchanged to `model.generate_content`.
    model: Any object exposing a `generate_content(prompt)` method.

  Returns:
    The value returned by `model.generate_content(prompt)`.
  """
  response = model.generate_content(prompt)
  return response

In [5]:
# Set up path to video file

path_to_video_file = "/content/drive/MyDrive/StyleCloud/few-clothes"

In [None]:
import cv2
import random
import numpy as np
import matplotlib.pyplot as plt

from PIL import Image, ImageDraw, ImageFont

# Open the video file, print its metadata, then sample random frames and show/save them as a grid
video_capture = cv2.VideoCapture(path_to_video_file)

if not video_capture.isOpened():
    print("Error: Could not open video.")
else:
    # try/finally so the capture handle is released even if a step below raises
    try:
        frame_count = int(video_capture.get(cv2.CAP_PROP_FRAME_COUNT))
        fps = video_capture.get(cv2.CAP_PROP_FPS)
        width = int(video_capture.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(video_capture.get(cv2.CAP_PROP_FRAME_HEIGHT))
        # Guard the division: a reported FPS of 0 would raise ZeroDivisionError
        video_length_seconds = round(frame_count / fps, 2) if fps > 0 else 0.0
        video_length_minutes = round(video_length_seconds / 60, 2)

        print(f"[INFO] Video Metadata:")
        print(f" - Frame count: {frame_count}")
        print(f" - FPS: {fps}")
        print(f" - Video length: {video_length_seconds} seconds ({video_length_minutes} minutes)")
        print(f" - Width: {width}")
        print(f" - Height: {height}")

        # Choose random frames. random.sample raises ValueError when asked for more
        # items than the population holds, so cap the request at the reported frame count.
        available_frame_count = max(frame_count, 0)
        num_frames_to_display = min(10, available_frame_count)
        random_frames_indices = random.sample(range(available_frame_count), num_frames_to_display)
        frames = []

        for frame_index in random_frames_indices:
            # Seek to the chosen frame before decoding it
            video_capture.set(cv2.CAP_PROP_POS_FRAMES, frame_index)
            success, frame = video_capture.read()
            if success:
                frames.append(frame)
                print(f"[INFO] Captured Frame Number: {frame_index}")
            else:
                print(f"Error: Could not read frame {frame_index}.")

        if len(frames) > 0:
            # Resize frames to a consistent size for the grid
            resized_frames = [cv2.resize(frame, (200, 200)) for frame in frames]
            tile_height, tile_width = resized_frames[0].shape[:2]

            # Create a grid to display the frames
            grid_rows = 2
            grid_cols = 5  # 2 rows x 5 columns = 10 frames
            if len(resized_frames) < grid_rows * grid_cols:
                # Not enough frames for the full layout: fall back to a single row
                grid_rows = 1
                grid_cols = len(resized_frames)

            grid_height = grid_rows * tile_height
            grid_width = grid_cols * tile_width
            grid = np.zeros((grid_height, grid_width, 3), dtype=np.uint8)

            for i, frame in enumerate(resized_frames):
                row = i // grid_cols
                col = i % grid_cols
                grid[row * tile_height:(row + 1) * tile_height,
                     col * tile_width:(col + 1) * tile_width, :] = frame

            # Convert BGR to RGB for Matplotlib
            grid_rgb = cv2.cvtColor(grid, cv2.COLOR_BGR2RGB)

            # Display the grid using Matplotlib
            plt.figure(figsize=(10, 5))
            plt.imshow(grid_rgb)
            plt.axis('off')  # Hide axes for better visualization
            plt.title("Random Frames Grid")
            plt.show()

            # Save the grid to a file; only report success when the write actually succeeded
            grid_path = "random_frames_grid.jpg"
            if cv2.imwrite(grid_path, grid):
                print("[INFO] Saved Random Frames Grid as 'random_frames_grid.jpg'")
            else:
                print(f"Error: Could not save grid to '{grid_path}'.")

            # Display the saved image in the notebook
            # display(Image(filename=grid_path))
        else:
            print("No frames were successfully captured to display.")
    finally:
        video_capture.release()

In [7]:
# Upload the local video through the genai client; the returned handle is polled and reused by later cells.
# NOTE(review): the MIME type is hard-coded to QuickTime while `path_to_video_file` has no file extension — confirm it matches the actual file.
video_file_upload = genai.upload_file(path=path_to_video_file,mime_type="video/quicktime")

In [8]:
%%time
import time
# Check status of video file
print("[INFO] Video upload processing, please wait.", end="")

# Poll every 2 seconds until the file leaves the PROCESSING state
while video_file_upload.state.name == "PROCESSING":
  print(".", end="")
  time.sleep(2)
  video_file_upload = genai.get_file(video_file_upload.name)

# Terminate the progress line so later output (e.g. the %%time report) starts on a fresh line
print()

if video_file_upload.state.name == "FAILED":
  # Include the file name so the failure can be traced to a specific upload
  raise ValueError(
      f"Processing of uploaded file {video_file_upload.name} ended in state {video_file_upload.state.name}"
  )

[INFO] Video upload processing, please wait....CPU times: user 66.6 ms, sys: 10.5 ms, total: 77.1 ms
Wall time: 8.19 s


In [None]:
# Check the video file state name, if the state name is ACTIVE, ok to proceed
print(video_file_upload.state.name)
print(video_file_upload)

In [10]:
print("My files:")

for f in genai.list_files():
    print("->", f.name)

My files:
-> files/li9c88qq6r73


In [None]:
#genai.delete_file(name=video_file_upload.name)

Caching code. This currently doesn't work on the free tier of Gemini.

In [None]:
# from google.generativeai import caching

# for cached_object in caching.CachedContent.list():
#   print(cached_object)

In [None]:
# try:
#   video_file_cache = caching.CachedContent.get(name="cachedContents/db8qp5uvbw27") # note: this will error if the target cached object is not available
#   print(video_file_cache)
# except Exception as e:
#   video_file_cache = None
#   print(e)
#   print(video_file_cache)

In [None]:
# # Create a cache (this can be varied by the time you'd like it)
# # Note: caching = cost for storage but = 4x cheaper input, see pricing: https://ai.google.dev/pricing
# import datetime

# CACHE_MINUTES = 2 # Note: Can change this based on your use case. Beware caching storage costs, best to cache for the exact amount of time you need.
# CACHE_HOURS = CACHE_MINUTES / 60

# if video_file_cache == None:
#   video_file_cache = caching.CachedContent.create(
#       model="models/gemini-2.0-flash-001", # note: model cache name should be same as model used to generate outputs, model postfix is required, e.g. "-002"
#       display_name="wardrobe catalogue video",
#       system_instruction=system_prompt,
#       contents=[video_file_upload],
#       ttl=datetime.timedelta(minutes=CACHE_MINUTES) # Note: there are no minimum or maximum bounds on context caching, however you should consider the use case of your app because caching prices can ramp up if left unchecked.
# )

# video_file_cache

# Helper Functions

In [11]:
def count_tokens(input_prompt, model=model_gemini_flash):
  """Count how many tokens `input_prompt` amounts to for `model`.

  Token counting docs: https://ai.google.dev/gemini-api/docs/tokens?lang=python#text-tokens

  Args:
    input_prompt: Prompt content passed straight to `model.count_tokens`.
    model: Object exposing `count_tokens`; defaults to `model_gemini_flash`.

  Returns:
    The `total_tokens` attribute of the count response.
  """
  token_report = model.count_tokens(input_prompt)
  return token_report.total_tokens

def df_to_csv_string(df) -> str:
  """
  Converts a pandas DataFrame to a CSV string.

  Relies on `DataFrame.to_csv` returning the CSV text when no output target is
  given, so no intermediate buffer (and no `StringIO` import) is needed.

  Parameters:
    df (pd.DataFrame): The DataFrame to convert.

  Returns:
    str: The DataFrame as a CSV-formatted string (header row included, index omitted).
  """
  return df.to_csv(index=False)

def csv_string_to_df(csv_string):
    """
    Converts a CSV string to a pandas DataFrame.

    Parameters:
      csv_string (str): The CSV-formatted string, header row first.

    Returns:
      pd.DataFrame: The resulting DataFrame.
    """
    # Imported locally so the helper does not depend on `pd` / `StringIO`
    # having been brought into the notebook namespace by some other cell.
    import io
    import pandas as pd

    return pd.read_csv(io.StringIO(csv_string))

# CSV Schema

In [12]:
from typing import Dict, Any, Optional, List

# Ordered column definitions for the wardrobe inventory CSV.
# Each entry holds the column name ("field"), its expected type written as a string ("type"),
# a free-text "description" and an "example" value. The list order is the column order:
# the schema helper functions defined after this list iterate it as-is to build the
# numbered schema text and the CSV header.
csv_schema = [
    {
        "field": "item_number",
        "type": "int",
        "description": "Sequential integer starting from 1. Example: 1, 2, 3",
        "example": "1"
    },
    {
        "field": "item_type",
        "type": "str",
        "description": "Type of item in lowercase. Standard categories include: shirt, t-shirt, shorts, innerwear, jeans, pants.",
        "example": "shorts"
    },
    {
        "field": "item_description",
        "type": "str",
        "description": "Detailed description including color, material, and notable features. Should be a complete phrase starting with a capital letter. Do not use commas in this field. Just describe the item verbatim.",
        "example": "Beige striped linen full sleeve shirt ."
    },

    {
        "field": "occasion",
        "type": "str",
        "description": "Suitable ocassion to wear the piece of clothing. Standard categories include: gym, partywear, casual, formal. Use NA if item type is innerwear",
        "example": "gym"
    },

    {
        "field": "item_brand",
        "type": "str",
        "description": "Brand name with proper capitalization. Try to identify brand based on logos and use NA if not visible/mentioned. Examples: Zara, Puma, Nike, NA. ",
        "example": "Puma"
    },
    {
        "field": "item_condition",
        "type": "str",
        "description": "Condition in lowercase, using standard terms: no visible damage, slight wear, moderate wear, significant damage, as new.",
        "example": "no visible damage"
    },
    {
        "field": "number_of_items",
        "type": "int",
        "description": "Integer count of identical items. If there are multiple of the same item, enter their integer count. Example: Single item: 1, Multiple items: 4",
        "example": "1"
    },

    {
        "field": "timestamp",
        "type": "str",
        "description": "Time in HH:MM:SS format with leading zeros.",
        "example": "00:01:30"
    },

    {
        "field": "overall_certainty_flag",
        "type": "int",
        "description": "Overall confidence level 1-10 considering all fields: 1-3: Multiple uncertain fields, needs review. 4-6: Some uncertainty in key fields. 7-8: Minor uncertainty in non-critical fields. 9-10: All fields verified with high confidence.",
        "example": "8"
    },

    {
        "field": "is_similar_to",
        "type": "str",
        "description": "Reference to similar item by item_number, or NA. Use for: Matching cloth pieces, Items from same occasion.",
        "example": "NA"
    },

]

def get_schema_string() -> str:
  """Render `csv_schema` as numbered lines, one per field, joined by newlines."""
  rendered_lines = []
  for position, spec in enumerate(csv_schema, start=1):
    # Line layout: "<n>. <field> (<type>): <description> Example: <example>"
    rendered_lines.append(
        f"{position}. {spec['field']} ({spec['type']}): {spec['description']} Example: {spec['example']}"
    )
  return "\n".join(rendered_lines)

def get_field_names_as_string() -> str:
  """Build the CSV header line: every schema field name, comma-separated, in schema order."""
  header_columns = []
  for spec in csv_schema:
    header_columns.append(spec["field"])
  return ",".join(header_columns)

def get_field_names_as_list() -> List:
  """Collect every schema field name, in schema order, into a new list."""
  field_names = []
  for spec in csv_schema:
    field_names.append(spec["field"])
  return field_names

print(f"[INFO] CSV schema string:\n{get_schema_string()}")
print()
print(f"[INFO] CSV header:\n{get_field_names_as_list()}")

[INFO] CSV schema string:
1. item_number (int): Sequential integer starting from 1. Example: 1, 2, 3 Example: 1
2. item_type (str): Type of item in lowercase. Standard categories include: shirt, t-shirt, shorts, innerwear, jeans, pants. Example: shorts
3. item_description (str): Detailed description including color, material, and notable features. Should be a complete phrase starting with a capital letter. Do not use commas in this field. Just describe the item verbatim. Example: Beige striped linen full sleeve shirt .
4. occasion (str): Suitable ocassion to wear the piece of clothing. Standard categories include: gym, partywear, casual, formal. Use NA if item type is innerwear Example: gym
5. item_brand (str): Brand name with proper capitalization. Try to identify brand based on logos and use NA if not visible/mentioned. Examples: Zara, Puma, Nike, NA.  Example: Puma
6. item_condition (str): Condition in lowercase, using standard terms: no visible damage, slight wear, moderate wear, s

In [13]:
import io
import csv

# Extract the CSV content from the input string
def quick_check_csv(model_output,
                    ideal_number_of_fields=len(get_field_names_as_list()),
                    target_start_tag="<csv>",
                    target_end_tag="</csv>"):
  """
  Extracts a CSV from a model's output, checks each row's field count and re-serialises it.

  The CSV segment is the text between the first `target_start_tag` and the following
  `target_end_tag` in `model_output.text` (Markdown code fences are stripped first).
  It is parsed with Python's `csv` module; every row whose number of fields differs
  from `ideal_number_of_fields` is reported. All rows are then written back through
  `csv.writer`, which quotes fields containing commas, quotes or newlines on its own.

  Args:
      model_output: An object containing the model's output as a string in its `text` attribute.
      ideal_number_of_fields (int, optional): The expected number of fields in each CSV row.
          Defaults to the length of `get_field_names_as_list()`. If `None`, the field
          count of the first row is used instead.
      target_start_tag (str, optional): The starting tag indicating the CSV content. Defaults to "<csv>".
      target_end_tag (str, optional): The ending tag indicating the CSV content. Defaults to "</csv>".

  Returns:
      tuple: `(csv_text, fix_csv_required, fixes_required)` where
          csv_text (str) is the extracted, re-serialised CSV content,
          fix_csv_required (bool) is True if any row had an unexpected field count,
          fixes_required (list) holds one message per offending row.

  Raises:
      AssertionError: If either tag is missing from the model's output text.
  """

  # Get text output from model
  output_text = model_output.text

  # Quick assertions to make sure target start and end tags are available
  assert target_start_tag in output_text, f"target_start_tag: {target_start_tag} not in model's output text, is there an error?"
  assert target_end_tag in output_text, f"target_end_tag: {target_end_tag} not in model's output text, is there an error?"

  # Baseline filtering: drop Markdown code fences the model may have wrapped around the CSV
  output_text = output_text.replace("```csv", "").replace("```", "")

  # Extract CSV string from XML tags
  csv_string = output_text.split(target_start_tag)[1].split(target_end_tag)[0].strip()

  # Prepare StringIO objects for input and output
  input_csv = io.StringIO(csv_string)
  output_csv = io.StringIO()

  reader = csv.reader(input_csv)
  writer = csv.writer(output_csv)

  # Track rows whose field count does not match the expectation
  fixes_required = []
  fix_csv_required = False

  # Print ideal number of fields
  print(f"[INFO] Ideal number of fields: {ideal_number_of_fields}")

  for i, row in enumerate(reader):
    # Without an explicit expectation, the first row defines the field count
    if ideal_number_of_fields is None and i == 0:
      ideal_number_of_fields = len(row)
      print(f"[INFO] Ideal number of fields: {ideal_number_of_fields}")

    # Compare each row's field count to the expectation
    if len(row) != ideal_number_of_fields:
      error_string = f"[INFO] Row {i} has an unexpected number of fields: {len(row)} (expected: {ideal_number_of_fields})"
      print(error_string)
      fixes_required.append(f"Error: {error_string.replace('[INFO] ', '')} | Row to fix: {','.join(row)}")
      fix_csv_required = True

    # Write the parsed row unchanged: csv.writer already quotes and escapes fields that
    # contain commas, quotes or newlines. Wrapping such fields in quotes by hand first
    # makes the writer escape those added quotes too, which corrupts the field.
    writer.writerow(row)

  # Get the re-serialised CSV data as a string
  output_csv_extracted = output_csv.getvalue()

  # Print a summary of issues, if any
  if fix_csv_required:
    print("[INFO] Some rows required fixing.")
  else:
    print("[INFO] No issues detected in the CSV.")

  return output_csv_extracted, fix_csv_required, fixes_required

class ExampleModelOutput:
  """Minimal stand-in for a Gemini model output: it only carries a `text` attribute."""

  def __init__(self, text):
    """Store `text` as the instance's `text` attribute."""
    # Code written for real model outputs reads the string from `.text`
    self.text = text

example_model_output = ExampleModelOutput(text="""This is an example model output with CSV values from looking at a video.

Row 9 (item_number=9) of the CSV is broken due to having an extra comma in the item_name.

<csv>
'item_number', 'item_type', 'item_description', 'occasion', 'item_brand', 'item_condition', 'number_of_items', 'timestamp', 'overall_certainty_flag', 'is_similar_to'
1,Shirt,Blue striped linen full sleeve shirt,partywear,Zara,excellent,1,00:01:05,10,2
2,Shirt,Beige striped linen full sleeve shirt,partywear,bannaclub,excellent,2,00:06:20,10,1
3,T-Shirt, Light Dirty Green nylon half sleeve t-shirt,gym,puma,good,1,00:20:30,9,NA
</csv>
""")

# Run the checker on the example output and unpack its three results
(example_output_csv_extracted,
 example_fix_csv_required,
 example_fixes_required) = quick_check_csv(
    model_output=example_model_output,
    ideal_number_of_fields=len(get_field_names_as_list()),
    target_start_tag="<csv>",
    target_end_tag="</csv>",
)

# Report what the checker found
print(f"[INFO] CSV fix required? {example_fix_csv_required}")
print(f"[INFO] Example extracted CSV:\n{example_output_csv_extracted}")
print(f"[INFO] Fixes required:\n{example_fixes_required}")

[INFO] Ideal number of fields: 10
[INFO] No issues detected in the CSV.
[INFO] CSV fix required? False
[INFO] Example extracted CSV:
'item_number', 'item_type', 'item_description', 'occasion', 'item_brand', 'item_condition', 'number_of_items', 'timestamp', 'overall_certainty_flag', 'is_similar_to'
1,Shirt,Blue striped linen full sleeve shirt,partywear,Zara,excellent,1,00:01:05,10,2
2,Shirt,Beige striped linen full sleeve shirt,partywear,bannaclub,excellent,2,00:06:20,10,1
3,T-Shirt, Light Dirty Green nylon half sleeve t-shirt,gym,puma,good,1,00:20:30,9,NA

[INFO] Fixes required:
[]


In [14]:
import base64
from IPython.display import Image, display

def mermaid_graph(graph, scale=2):
    """Display a Mermaid diagram in the notebook via a mermaid.ink image URL.

    The diagram source is base64-encoded and appended to
    `https://mermaid.ink/img/`; the resulting URL is shown with
    `IPython.display.Image`.

    Args:
        graph (str): Mermaid diagram source.
        scale: Accepted for backward compatibility; it is not used when
            building the image URL.
    """
    # UTF-8 so that labels with non-ASCII characters do not raise UnicodeEncodeError
    graph_bytes = graph.encode("utf-8")
    # URL-safe alphabet: standard base64 can emit "+" and "/", which are not safe
    # inside a URL path segment
    base64_string = base64.urlsafe_b64encode(graph_bytes).decode("ascii")
    display(
        Image(
            url=f"https://mermaid.ink/img/{base64_string}"
        )
    )

mermaid_graph("""
graph LR;
    A[Start] --> B[Target Video - 165,000 tokens]

    subgraph "Step 1: Initial Extraction"
        B --> C1[Gemini Model Initial Prompt - 10,000 tokens]
        C1 --> D1{CSV Validation}
        D1 -->|Valid| E1[Output 1]
        D1 -->|Invalid| FX1[Gemini CSV Fixer Model]
        FX1 --> D1
    end

    subgraph "Step 2: Expand Extraction"
        B & E1 --> C2[Gemini Model Secondary Prompt - 2000-5000 tokens]
        C2 --> D2{CSV Validation}
        D2 -->|Valid| E2[Output 2]
        D2 -->|Invalid| FX2[Gemini CSV Fixer Model]
        FX2 --> D2
    end

    subgraph "Step 3: Finalize Extraction"
        B & E1 & E2 --> C3[Gemini Model Final Prompt - 2000-5000 tokens]
        C3 --> D3{CSV Validation}
        D3 -->|Valid| E3[Final Output]
        D3 -->|Invalid| FX3[Gemini CSV Fixer Model]
        FX3 --> D3
    end

    E3 --> F[Final Results]

    %% Styling
    style C1 fill:#b3d9ff,stroke:#333,stroke-width:1px
    style C2 fill:#b3d9ff,stroke:#333,stroke-width:1px
    style C3 fill:#b3d9ff,stroke:#333,stroke-width:1px
    style D1 fill:#f9f,stroke:#333,stroke-width:1px
    style D2 fill:#f9f,stroke:#333,stroke-width:1px
    style D3 fill:#f9f,stroke:#333,stroke-width:1px
    style FX1 fill:#ccffcc,stroke:#333,stroke-width:1px
    style FX2 fill:#ccffcc,stroke:#333,stroke-width:1px
    style FX3 fill:#ccffcc,stroke:#333,stroke-width:1px
""")

In [16]:
iniital_prompt = """Video Content Overview: The video appears to be a walkthrough of a clothes, showcasing the different clothing pieces. The owner moves methodically from one cloth to cloth, pointing out significant items.

Challenges: Potential challenges include obscured items, items in drawers/cupboards that aren't opened, fast camera movements, and the owner's potentially inaccurate estimations of current conditions. Lighting might also be an issue in some areas, making it difficult to ascertain the condition of items.

Strategy for Comprehensive Inventory: I will pause the video frequently to ensure all visible items are captured. Catalog items sequentially as they appear in the video, ensure no item is missed. I'll pay close attention to the owner's descriptions and any mentioned details like brand, condition, or purchase price. If an item is only partially visible, I will make a note of it in the overall_certainty_flag field. I will categorize items broadly (shirts, pants, shorts, etc) to aid in organization.

CSV Formatting Strategy:

Consistent Field Order: Adhere strictly to the specified column order to ensure compatibility.
No Commas in Descriptions: Use spaces or other delimiters if necessary to avoid CSV conflicts.
Complete Field Population: Ensure every column is filled, using "NA" where applicable.
Proper Escaping of Special Characters: Handle any internal quotes or special characters appropriately to maintain CSV integrity.

Have I logged all items in the video? No. There are more items to do in a second pass. I will set <item_logging_status> to MORETIMESTAMPSTODO.
</video_inventory_analysis>

<estimated_item_count> ESTIMATED_TOTAL_ITEM_COUNT: 75 </estimated_item_count>
the csv format is:

[INFO] CSV schema string:
1. item_number (int): Sequential integer starting from 1. Example: 1, 2, 3 Example: 1
2. item_type (str): Type of item in lowercase. Standard categories include: shirt, t-shirt, shorts, innerwear, jeans, pants. Example: shorts
3. item_description (str): Detailed description including color, material, and notable features. Should be a complete phrase starting with a capital letter. Do not use commas in this field. Just describe the item verbatim. Example: Beige striped linen full sleeve shirt .
4. occasion (str): Suitable ocassion to wear the piece of clothing. Standard categories include: gym, partywear, casual, formal. Use NA if item type is innerwear Example: gym
5. item_brand (str): Brand name with proper capitalization. Try to identify brand based on logos and use NA if not visible/mentioned. Examples: Zara, Puma, Nike, NA.  Example: Puma
6. item_condition (str): Condition in lowercase, using standard terms: no visible damage, slight wear, moderate wear, significant damage, as new. Example: no visible damage
7. number_of_items (int): Integer count of identical items. If there are multiple of the same item, enter their integer count. Example: Single item: 1, Multiple items: 4 Example: 1
8. timestamp (str): Time in HH:MM:SS format with leading zeros. Example: 00:01:30
9. overall_certainty_flag (int): Overall confidence level 1-10 considering all fields: 1-3: Multiple uncertain fields, needs review. 4-6: Some uncertainty in key fields. 7-8: Minor uncertainty in non-critical fields. 9-10: All fields verified with high confidence. Example: 8
10. is_similar_to (str): Reference to similar item by item_number, or NA. Use for: Matching cloth pieces, Items from same occasion. Example: NA
"""

In [None]:
# Record wall-clock time around the request so its latency can be reported below
start_time = time.time()
# Send the uploaded video handle together with the text prompt in a single request
model_response_1 = generate_content(prompt=[video_file_upload,
                                                     iniital_prompt],
                                             model=model_gemini_flash)

end_time = time.time()
model_response_1_time = end_time - start_time

# Report elapsed seconds (rounded to 2 decimals), then the response text
print(f"[INFO] Time taken for model_reponse_1 : {round(model_response_1_time, 2)} seconds")
print(model_response_1.text)

In [None]:
model_response_1.usage_metadata