<a href="https://colab.research.google.com/github/chrloc313/Efficient-Geospatial-Reasoning/blob/main/kdca_gpt4_1_benchmarking.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [37]:
import os
import base64
from openai import AzureOpenAI
from google.colab import userdata

# Azure OpenAI connection settings. The endpoint and API key are read from
# Colab's secret store (google.colab.userdata) instead of being hard-coded.
endpoint = userdata.get("azure-endpoint")
model_name = "gpt-4.1"  # NOTE(review): not referenced elsewhere in the visible notebook
deployment = "gpt-4.1"  # passed as `model=` when requesting chat completions

subscription_key = userdata.get("azure-4.1-key")
api_version = "2024-12-01-preview"

# Shared client used for every chat-completion request in this notebook.
client = AzureOpenAI(
    api_version=api_version,
    azure_endpoint=endpoint,
    api_key=subscription_key,
)

In [38]:
# Show the kernel's working directory; later cells build data paths from os.getcwd().
!pwd
# NOTE(review): each `!` command runs in its own subshell, so this `cd` does not
# change the notebook's working directory (the `%cd` magic would).
!cd ..

/content/Efficient-Geospatial-Reasoning


In [39]:
import torch, json
from torch.utils.data import DataLoader

# Locate the held-out split relative to the notebook's working directory.
curr_dir = os.getcwd()
test_dataset_path = f"{curr_dir}/data/test_split_dataset.json"

# Parse the whole JSON file into memory; the DataLoader below wraps this object.
with open(test_dataset_path, mode="r", encoding="utf-8") as file:
    test_dataset = json.loads(file.read())

def custom_collate_fn(batch):
    """Turn a list of sample dicts into column-wise model inputs and targets.

    Args:
        batch: Sequence of dicts, each providing the keys 'Image_ID',
            'Question', 'Ground_Truth' and 'Question_Type'.

    Returns:
        A ``(data, target)`` tuple. ``data`` is a dict mapping 'Image_ID',
        'Question' and 'Question_Type' to lists of the per-sample values;
        ``target`` is the list of 'Ground_Truth' values in the same order.

    Raises:
        KeyError: If a sample lacks one of the four keys.
    """
    def column(key):
        # Gather one field across every sample, preserving batch order.
        return [sample[key] for sample in batch]

    ids = column('Image_ID')
    prompts = column('Question')
    labels = column('Ground_Truth')
    kinds = column('Question_Type')

    return {'Image_ID': ids, 'Question': prompts, 'Question_Type': kinds}, labels

# Number of samples grouped into each evaluation batch.
BATCH_SIZE = 16

# Shuffling stays off so the batch order follows the dataset order.
test_dataloader = DataLoader(
    dataset=test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=custom_collate_fn,
)

In [40]:
# Peek at the first collated batch only, as a sanity check on the loader output.
first_batch = next(iter(test_dataloader), None)
if first_batch is not None:
    data, ground_truth = first_batch
    print(f"Data: {data}")
    print(f"Answer: {ground_truth}")

Data: {'Image_ID': ['6640.JPG', '9103.JPG', '8931.JPG', '6964.JPG', '7147.JPG', '6976.JPG', '7720.JPG', '7362.JPG', '7601.JPG', '8404.JPG', '6710.JPG', '7263.JPG', '8149.JPG', '7357.JPG', '8264.JPG', '6348.JPG'], 'Question': ['How many buildings can be seen in this image?', 'Is the entire road flooded?', 'What is the condition of the road in this image?', 'Is the entire road flooded?', 'What is the condition of the road in this image?', 'Is the entire road flooded?', 'How many non flooded buildings can be seen in this image?', 'Is the entire road flooded?', 'What is the overall condition of the given image?', 'How many buildings are non flooded?', 'What is the overall condition of the given image?', 'How many buildings are flooded in this image?', 'What is the condition of the road in this image?', 'How many buildings are flooded?', 'What is the overall condition of the given image?', 'What is the overall condition of the given image?'], 'Question_Type': ['Simple_Counting', 'Yes_No', '

In [41]:
def encode_image(image_id, image_dir=None):
    """Read an image file and return its contents as a base64 string.

    Args:
        image_id: File name of the image inside ``image_dir`` (e.g. '6640.JPG').
        image_dir: Directory that holds the image. When omitted, defaults to
            ``data/images`` under the current working directory, which is
            where this function has always looked.

    Returns:
        The file's raw bytes, base64-encoded and decoded to a ``str``.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    if image_dir is None:
        # Resolved at call time so a later change of working directory is honoured.
        image_dir = os.path.join(os.getcwd(), 'data', 'images')
    image_path = os.path.join(image_dir, image_id)
    with open(image_path, 'rb') as image_file:
        return base64.b64encode(image_file.read()).decode('utf-8')

# System message sent with every request (see retrieve_response). It asks for a
# reply that is a JSON object with a single "answer" key, which is the shape
# answer_evalulation parses, and lists the allowed values per Question_Type.
SYSTEM_PROMPT = """You are an expert at flood damage assessment. Your task is to accurately answer questions based on provided image data from the FloodNet dataset. Your responses will be used for benchmarking.

For each question, you MUST respond with a JSON object containing a single key, "answer", and its corresponding value. Do NOT include any other text, explanations, or conversational filler outside of this JSON object.

Here are the rules for the "answer" value based on the 'Question_Type':

1.  **'Simple_Counting'**: The "answer" MUST be an integer value (e.g., `5`).
2.  **'Complex_Counting'**: The "answer" MUST be an integer value (e.g., `12`).
3.  **'Condition_Recognition'**:
    *   If the question pertains to the **road**: The "answer" MUST be one of 'flooded', 'non flooded', or 'flooded,non flooded'.
    *   If the question pertains to the **entire image**: The "answer" MUST be one of 'flooded' or 'non flooded'.
4.  **'Yes_No'**: The "answer" MUST be either 'Yes' or 'No'.

If you cannot confidently determine the answer based on the image, the "answer" MUST be 'Uncertain'.

Respond with the answer choice that you believe most correctly and confidently answers the presented question, strictly adhering to the specified format and possible values."""
import time

def retrieve_response(question, question_type, encoded_string):
    """Send one question plus its image to the chat model and time the call.

    Args:
        question: Question text to ask about the image.
        question_type: Question category label, included in the user message.
        encoded_string: Base64-encoded image bytes, embedded as a JPEG data URL.

    Returns:
        A dict with two keys: "Response" holds the message content of the
        first returned choice, and "Elapsed_Time" holds the seconds measured
        with ``time.perf_counter`` from function entry until the content was
        read.
    """
    # Start the clock first so the measurement spans the whole request.
    started_at = time.perf_counter()

    system_message = {"role": "system", "content": SYSTEM_PROMPT}
    text_part = {
        "type": "text",
        "text": "Question Type: {}; Question: {}".format(question_type, question),
    }
    image_part = {
        "type": "image_url",
        "image_url": {"url": f"data:image/jpeg;base64,{encoded_string}"},
    }
    user_message = {"role": "user", "content": [text_part, image_part]}

    completion = client.chat.completions.create(
        model=deployment,
        messages=[system_message, user_message],
        max_completion_tokens=13107,
        temperature=1.0,
        top_p=1.0,
        frequency_penalty=0.0,
        presence_penalty=0.0,
    )

    content = completion.choices[0].message.content
    elapsed = time.perf_counter() - started_at
    return {"Response": content, "Elapsed_Time": elapsed}

def answer_evalulation(model_response, ground_truth):
    """Score one model reply against its ground truth with exact matching.

    The reply is expected to be a JSON object with an "answer" key, as
    requested by the system prompt.

    Args:
        model_response: Raw reply text from the model. May be ``None`` or
            another non-string when the API returned no text content.
        ground_truth: Expected answer for the question.

    Returns:
        ``1`` if the reply parses to a JSON object whose "answer" value equals
        ``ground_truth``; ``0`` otherwise. Unparseable or missing replies
        score ``0`` and print a warning instead of raising.
    """
    try:
        answer = json.loads(model_response)
    except (json.JSONDecodeError, TypeError):
        # TypeError covers a None / non-string response; without it a single
        # empty reply would abort the entire benchmark run.
        print(f"Warning: Model returned non-JSON response: {model_response}")
        return 0

    if not (isinstance(answer, dict) and 'answer' in answer):
        return 0

    predicted = answer['answer']
    # bool is a subclass of int in Python, so JSON `true` would otherwise
    # compare equal to a ground truth of 1 (and `false` to 0). Require both
    # sides to agree on "boolean-ness" before comparing values.
    if isinstance(predicted, bool) != isinstance(ground_truth, bool):
        return 0

    # NOTE(review): this is a strict match; if the dataset stores counts as
    # strings, an integer answer will not match them - confirm the label types.
    return 1 if predicted == ground_truth else 0

In [None]:
import pandas as pd
import numpy as np

# Results table: starts as a copy of the test samples, with three extra columns
# that the loop below fills in row by row.
df = pd.DataFrame(test_dataset)
df['Model_Response'] = pd.Series(dtype='object')
df['Correct?'] = pd.Series(dtype='bool')
df['Model_Response_Time'] = pd.Series(dtype='float64')
rolling_score = 0

# Create the output directory itself, and do so before the expensive API loop so
# a filesystem problem surfaces immediately. Calling os.path.dirname() on this
# path would name only its parent ('data'), leaving 'benchmarks' missing and
# making the final to_csv() fail after every request had already been made.
benchmark_output_path = os.getcwd() + '/data/benchmarks'
os.makedirs(benchmark_output_path, exist_ok=True)

for i, (batch_data, batch_answers) in enumerate(test_dataloader):
    current_batch_size = len(batch_data['Image_ID'])
    encoded_images = [encode_image(image_id) for image_id in batch_data['Image_ID']]
    model_responses = [
        retrieve_response(question, question_type, encoded_image)
        for question, question_type, encoded_image in zip(
            batch_data["Question"], batch_data["Question_Type"], encoded_images
        )
    ]
    results = [
        answer_evalulation(model_response["Response"], expected)
        for model_response, expected in zip(model_responses, batch_answers)
    ]

    for j in range(current_batch_size):
        # The loader does not shuffle, so batch i / position j maps directly
        # onto the DataFrame's default integer index.
        row = i * BATCH_SIZE + j
        df.at[row, 'Model_Response'] = model_responses[j]["Response"]
        df.at[row, 'Correct?'] = bool(results[j])
        df.at[row, 'Model_Response_Time'] = model_responses[j]["Elapsed_Time"]
    rolling_score += sum(results)
    print(f"Batch {i}/{len(test_dataloader)} finished")

new_df_order = ['Question', 'Question_Type', 'Correct?', 'Ground_Truth', 'Model_Response', 'Model_Response_Time', 'Image_ID']
df = df.reindex(columns=new_df_order)

# Write to the first unused numbered file so earlier benchmark runs are kept.
file_index = 0
while True:
    file_path = benchmark_output_path + '/gpt4.1-benchmark-output_{}.csv'.format(file_index)
    if not os.path.exists(file_path):
        df.to_csv(file_path, index=False)
        break
    file_index += 1

print(f"Final Accuracy: {rolling_score/len(test_dataset)}")

Batch 0/43 finished
Batch 1/43 finished
Batch 2/43 finished
Batch 3/43 finished
Batch 4/43 finished
Batch 5/43 finished
Batch 6/43 finished
