## Data extraction and optimization use case

### Imports and variables

In [10]:
from google import genai
from google.genai import types
import base64
import IPython
import json

from config.schema import schema_work_package_basic, schema_work_package_advanced
from config.system_prompt import system_prompt

In [2]:
# Ask the gcloud CLI for the configured project via an IPython shell escape (`!`);
# the captured output is indexed below and its first entry is used as the project ID.
project = !gcloud config get-value project
PROJECT_ID = project[0]
# Bare expression so the notebook echoes the resolved project ID.
PROJECT_ID

'mg-ce-demos'

In [3]:
# Location handed to the Gen AI client as `location`.
REGION = "us-central1"

# Model identifiers: MODEL is the default for generate(); MODEL_FLASH is the
# alternative passed explicitly by the example call.
MODEL = "gemini-2.5-pro-preview-05-06"
MODEL_FLASH = "gemini-2.5-flash-preview-05-20"

### Set up the Gemini client

In [4]:
# Gen AI client in Vertex AI mode, bound to the project and location set in the cells above.
client = genai.Client(vertexai=True, project=PROJECT_ID, location=REGION)

### Functions

In [5]:
def generate(prompt, pdf_file_path=None, model=MODEL):
    """Send a prompt, optionally with a PDF, to Gemini and return the response.

    The request is configured for JSON output constrained by
    ``schema_work_package_advanced`` and uses ``system_prompt`` as the
    system instruction. The input token count is printed before generation.

    Args:
        prompt: Instruction text sent as the user turn.
        pdf_file_path: Optional URI of a PDF to attach ahead of the prompt
            (passed to ``types.Part.from_uri``). When falsy, only the text
            prompt is sent.
        model: Model name used for both token counting and generation.

    Returns:
        The response object returned by ``client.models.generate_content``.
    """
    # Build the user turn once: the PDF part (when given) precedes the text.
    parts = []
    if pdf_file_path:
        parts.append(
            types.Part.from_uri(
                file_uri=pdf_file_path,
                mime_type="application/pdf",
            )
        )
    parts.append(types.Part.from_text(text=prompt))
    contents = [types.Content(role="user", parts=parts)]

    # Count tokens with the model that will actually serve the request, not
    # the module-level default, so the printed figure matches the call below.
    token_count = client.models.count_tokens(
        model=model,
        contents=contents,
    )
    print(f"This prompt has {token_count.total_tokens} input tokens")

    # Every listed harm category has its filter switched off.
    safety_settings = [
        types.SafetySetting(category=category, threshold="OFF")
        for category in (
            "HARM_CATEGORY_HATE_SPEECH",
            "HARM_CATEGORY_DANGEROUS_CONTENT",
            "HARM_CATEGORY_SEXUALLY_EXPLICIT",
            "HARM_CATEGORY_HARASSMENT",
        )
    ]

    generate_content_config = types.GenerateContentConfig(
        temperature=0.2,
        top_p=1,
        seed=0,
        max_output_tokens=65535,
        response_modalities=["TEXT"],
        response_mime_type="application/json",
        system_instruction=system_prompt,
        response_schema=schema_work_package_advanced,
        safety_settings=safety_settings,
    )

    return client.models.generate_content(
        model=model,
        contents=contents,
        config=generate_content_config,
    )

### Output refinement

In [6]:
# Extraction instruction sent as the user turn; split across two literals for
# readability (the two spaces after "information." are part of the text).
prompt = (
    "Review this document, and extract key elements and information.  "
    "Respond ONLY with a valid JSON object strictly conforming to the required schema."
)

# Cloud Storage URI of the example work-package PDF.
ex_file_path = "gs://wec_demo_files/examples/idaho_nat_lab_work_package_ctc_example.pdf"

In [7]:
# Run the extraction on the example PDF, overriding generate()'s default model with MODEL_FLASH.
response = generate(prompt=prompt, pdf_file_path=ex_file_path, model=MODEL_FLASH)

This prompt has 5702 input tokens


In [11]:
# Parse the response's JSON text into Python objects; as the cell's final expression, the result is displayed.
json.loads(response.text)

{'document_metadata': {'document_id': 'INL/EXT-12-25847',
  'title': 'Work Breakdown Structure and Plant/Equipment Designation System Numbering Scheme for the High Temperature Gas-Cooled Reactor (HTGR) Component Test Capability (CTC)',
  'author': 'Jeffrey D. Bryan',
  'date': 'September 2009',
  'organization': 'Idaho National Laboratory'},
 'acronyms': [{'acronym': 'BEA',
   'definition': 'Battelle Energy Alliance, LLC'},
  {'acronym': 'BIM', 'definition': 'Building Information Modeling'},
  {'acronym': 'CMMS',
   'definition': 'Computerized Maintenance Management System'},
  {'acronym': 'CTC', 'definition': 'Component Test Capability'},
  {'acronym': 'DD&D',
   'definition': 'Deactivation, Decommissioning, and Dismantlement'},
  {'acronym': 'DOE', 'definition': 'Department of Energy'},
  {'acronym': 'DOE-ID',
   'definition': 'Department of Energy, Idaho Operations Office'},
  {'acronym': 'F&ORs',
   'definition': 'Functional and Operational Requirements'},
  {'acronym': 'HTGR', 'de