In [21]:
import pandas as pd
import numpy as np
import json
from pathlib import Path
from google.colab import drive
import statsmodels.api as sm
import statsmodels.formula.api as smf
drive.mount('/content/drive')
ROOT_DRIVE_PATH = Path("/content/drive/MyDrive/DSGA1011/Project")
OUTPUT_DIR = ROOT_DRIVE_PATH / 'spatial_mm_outputs'
models_display = ["gpt4o", "gpt5_mini", "gemini_flash", "gemini_flash_lite"]

def extract_answer(text):
    if not text: return ""
    text = str(text)
    lines = text.split('\n')
    for line in lines[::-1]:
        if "Answer:" in line:
            return line.split("Answer:")[1].strip().lower()
    return text.strip().lower()

def get_perspective(question):
    return "Camera" if "camera" in question.lower() else "Human"

def calculate_accuracy(file_path):
    if not file_path.exists():
        return None
    with open(file_path, 'r') as f:
        data = json.load(f)
    results = []
    for item in data:
        pred = extract_answer(item.get('raw_response', ''))
        gold = item['answer'].lower().strip()
        is_correct = (gold in pred) or (pred == gold)
        results.append({
            'perspective': get_perspective(item['question']),
            'hops': item.get('hop', 2),
            'correct': is_correct
        })
    return pd.DataFrame(results)

def get_acc(df, persp=None):
    if df is None or df.empty: return "-"
    sub = df[df['perspective'] == persp] if persp else df
    if len(sub) == 0: return "-"
    return round(sub['correct'].mean() * 100, 1)

def combine_dfs(df1, df2):
    if df1 is None and df2 is None: return None
    if df1 is None: return df2
    if df2 is None: return df1
    return pd.concat([df1, df2], ignore_index=True)



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [23]:
print("TABLE 1: STANDARD PROMPT RESULTS\n")
print("--- Spatial-Obj (Standard) ---\n")
obj_baseline_data = []
for model in models_display:
    path1 = OUTPUT_DIR / f"spatial_mm_one_obj_spatial_obj_baseline_{model}.json"
    path2 = OUTPUT_DIR / f"spatial_mm_two_obj_spatial_obj_baseline_{model}.json"
    df1 = calculate_accuracy(path1)
    df2 = calculate_accuracy(path2)
    df_combined = combine_dfs(df1, df2)

    obj_baseline_data.append({
        "Model": model,
        "Human Pers.": get_acc(df_combined, "Human"),
        "Camera Pers.": get_acc(df_combined, "Camera"),
        "Overall": get_acc(df_combined)
    })

df_obj_baseline = pd.DataFrame(obj_baseline_data)
print(df_obj_baseline.to_markdown(index=False))


print("\n--- Spatial-CoT (Standard) ---\n")
cot_baseline_data = []
for model in models_display:
    path = OUTPUT_DIR / f"spatial_cot_multihop_baseline_{model}.json"
    df = calculate_accuracy(path)

    cot_baseline_data.append({
        "Model": model,
        "Human Pers.": get_acc(df, "Human"),
        "Camera Pers.": get_acc(df, "Camera"),
        "Overall": get_acc(df)
    })

df_cot_baseline = pd.DataFrame(cot_baseline_data)
print(df_cot_baseline.to_markdown(index=False))


print("\nTABLE 2: ZERO-SHOT COT PROMPT RESULTS")
print("\n--- Spatial-Obj (Zero-shot CoT) ---\n")
obj_cot_data = []
for model in models_display:
    path1 = OUTPUT_DIR / f"spatial_mm_one_obj_spatial_obj_zeroshot_cot_{model}.json"
    path2 = OUTPUT_DIR / f"spatial_mm_two_obj_spatial_obj_zeroshot_cot_{model}.json"
    df1 = calculate_accuracy(path1)
    df2 = calculate_accuracy(path2)
    df_combined = combine_dfs(df1, df2)

    obj_cot_data.append({
        "Model": model,
        "Human Pers.": get_acc(df_combined, "Human"),
        "Camera Pers.": get_acc(df_combined, "Camera"),
        "Overall": get_acc(df_combined)
    })

df_obj_cot = pd.DataFrame(obj_cot_data)
print(df_obj_cot.to_markdown(index=False))
print("\n--- Spatial-CoT (Zero-shot CoT) ---\n")
cot_cot_data = []
for model in models_display:
    path = OUTPUT_DIR / f"spatial_cot_multihop_zeroshot_cot_{model}.json"
    df = calculate_accuracy(path)

    cot_cot_data.append({
        "Model": model,
        "Human Pers.": get_acc(df, "Human"),
        "Camera Pers.": get_acc(df, "Camera"),
        "Overall": get_acc(df)
    })

df_cot_cot = pd.DataFrame(cot_cot_data)
print(df_cot_cot.to_markdown(index=False))


TABLE 1: STANDARD PROMPT RESULTS

--- Spatial-Obj (Standard) ---

| Model             |   Human Pers. |   Camera Pers. |   Overall |
|:------------------|--------------:|---------------:|----------:|
| gpt4o             |          47.8 |           73.9 |      65.9 |
| gpt5_mini         |          52.2 |           88.1 |      77.2 |
| gemini_flash      |          51.8 |           87.3 |      76.6 |
| gemini_flash_lite |          45.2 |           62.8 |      57.5 |

--- Spatial-CoT (Standard) ---

| Model             |   Human Pers. |   Camera Pers. |   Overall |
|:------------------|--------------:|---------------:|----------:|
| gpt4o             |          61   |           75.3 |      64.4 |
| gpt5_mini         |          63.6 |           76.7 |      66.7 |
| gemini_flash      |          64.8 |           84.9 |      69.6 |
| gemini_flash_lite |          57.4 |           63   |      58.8 |

TABLE 2: ZERO-SHOT COT PROMPT RESULTS

--- Spatial-Obj (Zero-shot CoT) ---

| Model             

In [24]:
print("TABLE 3: SPATIAL-COT ACCURACY BY NUMBER OF HOPS\n")

hop_data = []
for model in models_display:
    df_base = calculate_accuracy(OUTPUT_DIR / f"spatial_cot_multihop_baseline_{model}.json")
    df_cot = calculate_accuracy(OUTPUT_DIR / f"spatial_cot_multihop_zeroshot_cot_{model}.json")
    if df_base is not None and df_cot is not None:
        hop_data.append({ "Model": f"{model} (Standard)",
            "2 Hops": round(df_base[df_base['hops'] == 2]['correct'].mean() * 100, 1),
            "3 Hops": round(df_base[df_base['hops'] == 3]['correct'].mean() * 100, 1),
            "≥4 Hops": round(df_base[df_base['hops'] >= 4]['correct'].mean() * 100, 1),
            "All": round(df_base['correct'].mean() * 100, 1)})
        hop_data.append({ "Model": f"{model} (CoT)",
            "2 Hops": round(df_cot[df_cot['hops'] == 2]['correct'].mean() * 100, 1),
            "3 Hops": round(df_cot[df_cot['hops'] == 3]['correct'].mean() * 100, 1),
            "≥4 Hops": round(df_cot[df_cot['hops'] >= 4]['correct'].mean() * 100, 1),
            "All": round(df_cot['correct'].mean() * 100, 1)})
print(pd.DataFrame(hop_data).to_markdown(index=False))

TABLE 3: SPATIAL-COT ACCURACY BY NUMBER OF HOPS

| Model                        |   2 Hops |   3 Hops |   ≥4 Hops |   All |
|:-----------------------------|---------:|---------:|----------:|------:|
| gpt4o (Standard)             |     82.2 |     70.5 |      55.3 |  64.4 |
| gpt4o (CoT)                  |     73.3 |     78.1 |      55.3 |  65.7 |
| gpt5_mini (Standard)         |     77.8 |     78.1 |      56   |  66.7 |
| gpt5_mini (CoT)              |     80   |     82.9 |      59.7 |  70.6 |
| gemini_flash (Standard)      |     86.7 |     78.1 |      59.1 |  69.6 |
| gemini_flash (CoT)           |     88.9 |     74.3 |      58.6 |  68.4 |
| gemini_flash_lite (Standard) |     80   |     65.7 |      48.1 |  58.8 |
| gemini_flash_lite (CoT)      |     77.8 |     75.2 |      54.7 |  65   |


In [None]:
def calculate_accuracy_for_stats(file_path):
    if not file_path.exists():
        return pd.DataFrame()
    with open(file_path, 'r') as f:
        data = json.load(f)
    results = []
    for item in data:
        pred = extract_answer(item.get('raw_response', ''))
        gold = item['answer'].lower().strip()
        is_correct = 1 if (gold in pred) or (pred == gold) else 0
        results.append({
            'image_name': item.get('image_name'),
            'hops': item.get('hop', 2),
            'correct': is_correct
        })
    return pd.DataFrame(results)

def run_hop_level_gee_regression(df_paired):
    all_hop_bins = {"2 Hops": 2, "3 Hops": 3, "≥4 Hops": 4}
    analysis_results = []

    for hop_label, hop_val in all_hop_bins.items():
        if hop_val < 4:
            df_hop = df_paired[df_paired['hops'] == hop_val].copy()
        else:
            df_hop = df_paired[df_paired['hops'] >= hop_val].copy()
        n_questions = len(df_hop)
        if n_questions == 0:
            analysis_results.append({"Hops": hop_label, "N": 0, "Standard Acc": "-", "CoT Acc": "-", "P-Value": "-", "Sig": ""})
            continue
        df_long = pd.melt(df_hop, id_vars=['image_name'], value_vars=['standard', 'cot'],
                          var_name='prompt', value_name='correct')


        formula = "correct ~ C(prompt, Treatment('standard'))"

        model = smf.gee(formula,
                        data=df_long,
                        groups=df_long['image_name'],
                        family=sm.families.Binomial(),
                        cov_struct=sm.cov_struct.Exchangeable())

        result = model.fit()

        z_score = result.tvalues["C(prompt, Treatment('standard'))[T.cot]"]
        p_two_sided = result.pvalues["C(prompt, Treatment('standard'))[T.cot]"]


        if z_score > 0:
            p_one_sided = p_two_sided / 2
        else:
            p_one_sided = 1 - (p_two_sided / 2)

        p_val_str = f"{p_one_sided:.4f}"

        if p_one_sided < 0.001: sig_str = "***"
        elif p_one_sided < 0.01: sig_str = "**"
        elif p_one_sided < 0.05: sig_str = "*"
        else: sig_str = ""

        analysis_results.append({
            "Hops": hop_label,
            "N": n_questions,
            "Standard Acc": f"{df_hop['standard'].mean():.1%}",
            "CoT Acc": f"{df_hop['cot'].mean():.1%}",
            "P-Value": p_val_str,
            "Sig": sig_str
        })
    return pd.DataFrame(analysis_results)


print("PAIRED LOGISTIC REGRESSION (GEE) PER HOP\n")
print("Hypothesis: CoT increases accuracy compared to Standard.\n")

for model in models_display:
    print(f"\n\n--- MODEL: {model.upper()} ---")

    df_base = calculate_accuracy_for_stats(OUTPUT_DIR / f"spatial_cot_multihop_baseline_{model}.json").rename(columns={'correct':'standard'})
    df_cot  = calculate_accuracy_for_stats(OUTPUT_DIR / f"spatial_cot_multihop_zeroshot_cot_{model}.json").rename(columns={'correct':'cot'})



    df_paired = pd.merge(df_base, df_cot, on=['image_name', 'hops'], how='inner')

    results = run_hop_level_gee_regression(df_paired)
    print(results.to_markdown(index=False))

PAIRED LOGISTIC REGRESSION (GEE) PER HOP

Hypothesis: CoT increases accuracy compared to Standard.



--- MODEL: GPT4O ---
| Hops    |   N | Standard Acc   | CoT Acc   |   P-Value | Sig   |
|:--------|----:|:---------------|:----------|----------:|:------|
| 2 Hops  |  51 | 84.3%          | 76.5%     |    0.9327 |       |
| 3 Hops  | 113 | 69.0%          | 77.9%     |    0.0204 | *     |
| ≥4 Hops | 181 | 54.1%          | 52.5%     |    0.6378 |       |


--- MODEL: GPT5_MINI ---
| Hops    |   N | Standard Acc   | CoT Acc   |   P-Value | Sig   |
|:--------|----:|:---------------|:----------|----------:|:------|
| 2 Hops  |  51 | 80.4%          | 82.4%     |    0.1441 |       |
| 3 Hops  | 113 | 77.0%          | 83.2%     |    0.0545 |       |
| ≥4 Hops | 181 | 54.1%          | 56.4%     |    0.2074 |       |


--- MODEL: GEMINI_FLASH ---
| Hops    |   N | Standard Acc   | CoT Acc   |   P-Value | Sig   |
|:--------|----:|:---------------|:----------|----------:|:------|
| 2 Hops  |  51 

In [28]:
def apply_pure_reasoning_filter(df):
    mask = (df['standard'] == 0) & (df['bbox'] == 0) & (df['sg'] == 0)
    return df[mask].copy()


all_pure_reasoning_results = []

for model in models_display:
    df_base = calculate_accuracy_for_stats(OUTPUT_DIR / f"spatial_cot_multihop_baseline_{model}.json").rename(columns={'correct':'standard'})
    df_cot  = calculate_accuracy_for_stats(OUTPUT_DIR / f"spatial_cot_multihop_zeroshot_cot_{model}.json").rename(columns={'correct':'cot'})
    df_bbox = calculate_accuracy_for_stats(OUTPUT_DIR / f"spatial_cot_multihop_bbox_{model}.json").rename(columns={'correct':'bbox'})
    df_sg   = calculate_accuracy_for_stats(OUTPUT_DIR / f"spatial_cot_multihop_scenegraph_{model}.json").rename(columns={'correct':'sg'})

    if df_base.empty or df_cot.empty or df_bbox.empty or df_sg.empty:
        for hop_label in ["2 Hops", "3 Hops", "≥4 Hops"]:
             all_pure_reasoning_results.append({
                "Model": model, "Hop Bin": hop_label,
                "N (Pure Reasoning)": "N/A", "CoT Accuracy on Subset": "N/A"})
        continue

    df_full = pd.merge(df_base, df_cot, on=['image_name', 'hops'], how='inner')
    df_full = pd.merge(df_full, df_bbox, on=['image_name', 'hops'], how='left')
    df_full = pd.merge(df_full, df_sg, on=['image_name', 'hops'], how='left')
    df_full.fillna({'bbox': 0, 'sg': 0}, inplace=True)

    df_pure = apply_pure_reasoning_filter(df_full)

    for hop_label, hop_value in [("2 Hops", 2), ("3 Hops", 3), ("≥4 Hops", 4)]:
        if hop_value < 4:
            subset = df_pure[df_pure['hops'] == hop_value]
        else:
            subset = df_pure[df_pure['hops'] >= hop_value]

        n_questions = len(subset)
        cot_acc = subset['cot'].mean() if n_questions > 0 else 0

        all_pure_reasoning_results.append({
            "Model": model,
            "Hop Bin": hop_label,
            "N (Pure Reasoning)": n_questions,
            "CoT Accuracy on Subset": f"{cot_acc:.1%}"
        })

summary_df = pd.DataFrame(all_pure_reasoning_results)
print("TABLE 4: PURE REASONING ACCURACY OF COT BY NUMBER OF HOPS\n")
print("\n" + summary_df.to_markdown(index=False))

TABLE 4: PURE REASONING ACCURACY OF COT BY NUMBER OF HOPS


| Model             | Hop Bin   |   N (Pure Reasoning) | CoT Accuracy on Subset   |
|:------------------|:----------|---------------------:|:-------------------------|
| gpt4o             | 2 Hops    |                    0 | 0.0%                     |
| gpt4o             | 3 Hops    |                   35 | 48.6%                    |
| gpt4o             | ≥4 Hops   |                   83 | 20.5%                    |
| gpt5_mini         | 2 Hops    |                    0 | 0.0%                     |
| gpt5_mini         | 3 Hops    |                   26 | 34.6%                    |
| gpt5_mini         | ≥4 Hops   |                   83 | 16.9%                    |
| gemini_flash      | 2 Hops    |                    3 | 0.0%                     |
| gemini_flash      | 3 Hops    |                   27 | 18.5%                    |
| gemini_flash      | ≥4 Hops   |                   78 | 16.7%                    |
| gemini_flash_l