In [1]:
import pathlib
import glob
import os
import sys
import json
import seaborn as sns

from matplotlib import pyplot as plt

from analysis import Run, RunAnalyser, CriterionPlotter, load_all_runs_from_a_dir

In [2]:
# Print one LaTeX \definecolor line per seaborn palette entry and keep the
# generated colour names (color0, color1, ...) for the plot snippets below.
palette = sns.color_palette()
COLORS = [f'color{position}' for position in range(len(palette))]
for color_name, rgb in zip(COLORS, palette):
    # Palette channels are floats; scale to 0-255 and truncate to an integer.
    rgb_triplet = ','.join(str(int(channel * 255)) for channel in rgb)
    print(f"\\definecolor{{{color_name}}}{{RGB}}{{{rgb_triplet}}}")

\definecolor{color0}{RGB}{31,119,180}
\definecolor{color1}{RGB}{255,127,14}
\definecolor{color2}{RGB}{44,160,44}
\definecolor{color3}{RGB}{214,39,40}
\definecolor{color4}{RGB}{148,103,189}
\definecolor{color5}{RGB}{140,86,75}
\definecolor{color6}{RGB}{227,119,194}
\definecolor{color7}{RGB}{127,127,127}
\definecolor{color8}{RGB}{188,189,34}
\definecolor{color9}{RGB}{23,190,207}


In [3]:
# Object categories searched for in each environment.
CITY_CATEGORIES = [
    'road_construction_site',
    'crowd',
    'large_trash_pile',
    'fire',
    'car',
]
FOREST_CATEGORIES = [
    'campsite',
    'trash_pile',
    'person',
    'forest_fire',
    'building',
]
# Union of both environments, in the order used for per-category output.
ALL_CATEGORIES = [
    'large_trash_pile',
    'fire',
    'car',
    'trash_pile',
    'person',
    'building',
    'campsite',
    'forest_fire',
    'road_construction_site',
    'crowd',
]

# (log-directory key, display name) for every evaluated model. Keeping the two
# values side by side guarantees MODELS and MODEL_NAMES stay aligned.
_MODEL_TABLE = [
    ('GPT4o', 'GPT-4o'),
    ('Sonnet', 'Claude 3.5 Sonnet'),
    ('Gemini', 'Gemini 2.0 flash'),
    ('Phi', 'Phi 3.5 vision'),
    ('InternVL', 'InternVL-2.5 8B MPO'),
    ('llava-interleave-7b', 'Llava-Interleave-7b'),
    ('Qwen2-VL-72B', 'Qwen2-VL-72B'),
    ('llava-onevision', 'Llava-Onevision 72b'),
    ('Pixtral-Large-Instruct-2411', 'Pixtral-Large'),
]
MODELS = [model_key for model_key, _display_name in _MODEL_TABLE]
MODEL_NAMES = [display_name for _model_key, display_name in _MODEL_TABLE]
# Lookup tables keyed by the model's log-directory key. zip() stops at the
# shorter input, so any palette colours beyond the model count stay unused.
colors_map = dict(zip(MODELS, COLORS))
model_name_map = dict(zip(MODELS, MODEL_NAMES))

In [4]:
# Show the model key -> LaTeX colour name mapping as this cell's output.
colors_map

{'GPT4o': 'color0',
 'Sonnet': 'color1',
 'Gemini': 'color2',
 'Phi': 'color3',
 'InternVL': 'color4',
 'llava-interleave-7b': 'color5',
 'Qwen2-VL-72B': 'color6',
 'llava-onevision': 'color7',
 'Pixtral-Large-Instruct-2411': 'color8'}

In [5]:
# Can be used for forest as well
def class_aggregation_function(run: Run):
    """Return the class label under which ``run`` is aggregated.

    The run's ``object_type`` is converted to a string and lower-cased. Any
    value containing "car", "pickup" or "truck" is reported as ``"car"``;
    every other value is returned as the lower-cased string itself.
    """
    label = str(run.object_type).lower()
    vehicle_markers = ("car", "pickup", "truck")
    if any(marker in label for marker in vehicle_markers):
        return "car"
    return label

def scenario_aggregation_function(run: Run):
    """Return the scenario bucket under which ``run`` is aggregated.

    The run is first mapped to its class label via
    ``class_aggregation_function``. The result is:

    * ``"city"`` if the label is in ``CITY_CATEGORIES``;
    * ``"forest"`` if the label is in ``FOREST_CATEGORIES``;
    * ``"anomaly-city"`` / ``"anomaly-forest"`` if the label contains
      "anomaly" and ``str(run.path)`` contains "City" / "Forest"
      (case-sensitive, "City" checked first).

    Raises:
        AssertionError: if the run fits none of the buckets. The error is
            raised explicitly (rather than with ``assert False``) so the
            check still fires when Python runs with assertions disabled.
    """
    object_type = class_aggregation_function(run)

    if object_type in CITY_CATEGORIES:
        return "city"
    if object_type in FOREST_CATEGORIES:
        return "forest"
    if 'anomaly' in object_type:
        # Anomaly runs carry no environment in their label; take it from the log path.
        run_path = str(run.path)
        if 'City' in run_path:
            return 'anomaly-city'
        if 'Forest' in run_path:
            return 'anomaly-forest'
        raise AssertionError(
            f"anomaly run {object_type!r} has neither 'City' nor 'Forest' in its path: {run_path!r}"
        )
    raise AssertionError(f"unrecognised object type: {object_type!r}")

def anomaly_aggregation_function(run: Run):
    """Split runs into anomaly runs and regular ("main") runs.

    Returns ``"anomaly"`` when the lower-cased string form of
    ``run.object_type`` contains "anomaly", otherwise ``"main"``.
    """
    is_anomaly = "anomaly" in str(run.object_type).lower()
    return "anomaly" if is_anomaly else 'main'

def success_criterion(run):
    """Success criterion that additionally requires the model's own claim.

    If ``run.model_claimed`` is falsy, that falsy value is returned as-is and
    ``RunAnalyser`` is never consulted. Otherwise the result of
    ``RunAnalyser(run).success_criterion_satisfied(10)`` is returned.
    """
    claimed = run.model_claimed
    if not claimed:
        # Same result as `claimed and ...`: hand back the falsy claim itself.
        return claimed
    return RunAnalyser(run).success_criterion_satisfied(10)

## Settings

In [6]:
# Root directory of the logs and the directory-name suffix of the run family.
base_path = "../all_logs"
suffix = '-CityNew'
# When True, the loading cell also reads each model's matching Forest directory.
add_forest = True
# Category order used when printing per-category values.
categories = ALL_CATEGORIES

# (model key, log directory) pairs, one per model in MODELS.
log_dirs = [os.path.join(base_path, f"{model_key}{suffix}") for model_key in MODELS]
all_exp = list(zip(MODELS, log_dirs))
print(all_exp)

[('GPT4o', '../all_logs/GPT4o-CityNew'), ('Sonnet', '../all_logs/Sonnet-CityNew'), ('Gemini', '../all_logs/Gemini-CityNew'), ('Phi', '../all_logs/Phi-CityNew'), ('InternVL', '../all_logs/InternVL-CityNew'), ('llava-interleave-7b', '../all_logs/llava-interleave-7b-CityNew'), ('Qwen2-VL-72B', '../all_logs/Qwen2-VL-72B-CityNew'), ('llava-onevision', '../all_logs/llava-onevision-CityNew'), ('Pixtral-Large-Instruct-2411', '../all_logs/Pixtral-Large-Instruct-2411-CityNew')]


In [7]:
# Per-model results, one (model_name, claimed_stats, unclaimed_stats) tuple per
# model, for each of the three aggregation levels.
main_plots, scenario_plots, class_plots = [], [], []

def get_stats(plotter, agg_fn):
    """Compute accuracy statistics for runs grouped by ``agg_fn``.

    Args:
        plotter: object exposing ``aggregate_runs_per_function`` and
            ``plot_accuracy_in_aggregated_runs`` (a ``CriterionPlotter`` in
            this notebook).
        agg_fn: function mapping a run to its group label.

    Returns:
        ``(claimed_stats, unclaimed_stats)``: the first is computed with
        ``success_criterion`` (model claim required), the second with the
        ``RunAnalyser`` criterion alone.
    """
    def criterion_only(run):
        # Same threshold as success_criterion, without requiring the model's claim.
        return RunAnalyser(run).success_criterion_satisfied(10)

    grouped_runs = plotter.aggregate_runs_per_function(agg_fn)
    # The second positional argument is passed as None in both calls.
    # NOTE(review): its meaning is defined by CriterionPlotter - confirm there.
    with_claim = plotter.plot_accuracy_in_aggregated_runs(
        grouped_runs, None, success_criterion=success_criterion
    )
    without_claim = plotter.plot_accuracy_in_aggregated_runs(
        grouped_runs, None, success_criterion=criterion_only
    )
    return with_claim, without_claim

# Each aggregation function is paired with the list that collects its results.
# The tuple order fixes the evaluation order per model: main, scenario, class.
aggregations = (
    (anomaly_aggregation_function, main_plots),
    (scenario_aggregation_function, scenario_plots),
    (class_aggregation_function, class_plots),
)

for model_name, exp_path in all_exp:
    runs = load_all_runs_from_a_dir(pathlib.Path(exp_path))
    if add_forest:
        # The Forest directory name is the City one with 'City' replaced by 'Forest'.
        forest_path = exp_path.replace('City', 'Forest')
        runs += load_all_runs_from_a_dir(pathlib.Path(forest_path))
    plotter = CriterionPlotter(runs)

    for agg_fn, collected in aggregations:
        claimed_stats, unclaimed_stats = get_stats(plotter, agg_fn)
        collected.append((model_name, claimed_stats, unclaimed_stats))

## Radar plot

In [9]:
# One \tkzKiviatLine per model: claimed accuracy (in percent) for every entry
# of `categories`, with an empty slot for categories the model has no stats for.
for name, stat, _ in class_plots:
    values = []
    for category in categories:
        values.append(f"{stat[category]['mean'] * 100:.3f}" if category in stat else '')
    print(f"\\tkzKiviatLine[thick,color={colors_map[name]}]({','.join(values)})")

\tkzKiviatLine[thick,color=color0](19.570,41.300,31.110,43.400,19.050,51.110,68.750,53.570,30.000,56.520)
\tkzKiviatLine[thick,color=color1](23.910,23.910,37.780,37.740,40.480,68.890,59.380,60.710,17.500,65.220)
\tkzKiviatLine[thick,color=color2](39.130,43.480,42.220,41.510,23.810,48.890,40.620,64.290,30.000,60.870)
\tkzKiviatLine[thick,color=color3](0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000)
\tkzKiviatLine[thick,color=color4](0.000,2.170,0.000,3.770,2.380,4.440,0.000,7.140,0.000,8.700)
\tkzKiviatLine[thick,color=color5](0.000,6.520,0.000,0.000,0.000,0.000,0.000,10.710,0.000,0.000)
\tkzKiviatLine[thick,color=color6](13.040,21.740,17.780,9.430,4.760,28.890,6.250,35.710,12.500,30.430)
\tkzKiviatLine[thick,color=color7](2.170,30.430,4.440,9.430,9.520,15.560,25.000,42.860,12.500,4.350)
\tkzKiviatLine[thick,color=color8](21.740,21.740,20.000,26.420,16.670,62.220,43.750,46.430,10.000,43.480)


In [11]:
# Two legend lines per model: the colour swatch, then the entry text (model key).
for model_key, color_name in colors_map.items():
    legend_lines = (
        f'\\addlegendimage{{{color_name},ultra thick}}',
        f'\\addlegendentry{{{model_key}}};',
    )
    print('\n'.join(legend_lines))
    

\addlegendimage{color0,ultra thick}
\addlegendentry{GPT4o};
\addlegendimage{color1,ultra thick}
\addlegendentry{Sonnet};
\addlegendimage{color2,ultra thick}
\addlegendentry{Gemini};
\addlegendimage{color3,ultra thick}
\addlegendentry{Phi};
\addlegendimage{color4,ultra thick}
\addlegendentry{InternVL};
\addlegendimage{color5,ultra thick}
\addlegendentry{llava-interleave-7b};
\addlegendimage{color6,ultra thick}
\addlegendentry{Qwen2-VL-72B};
\addlegendimage{color7,ultra thick}
\addlegendentry{llava-onevision};
\addlegendimage{color8,ultra thick}
\addlegendentry{Pixtral-Large-Instruct-2411};


## Main plot

In [12]:
# One \addplot bar per model for the 'main' group: the claimed mean in percent,
# with asymmetric error bars taken from the confidence interval.
for position, (model_key, claimed, _) in enumerate(main_plots, start=1):
    stats = claimed['main']
    mean = stats['mean']
    ci_low, ci_high = stats['conf_int'][0], stats['conf_int'][1]
    lower = mean - ci_low    # distance from the mean down to the lower bound
    upper = ci_high - mean   # distance from the mean up to the upper bound
    print(
        f'\\addplot[style={{fill={colors_map[model_key]}}},error bars/.cd, y dir=both, y explicit] '
        f'coordinates {{({position}, {mean*100:.3f}) += (0,{upper*100:.3f}) -= (0,{lower*100:.3f})}};'
    )



\addplot[style={fill=color0},error bars/.cd, y dir=both, y explicit] coordinates {(1, 39.500) += (0,4.980) -= (0,4.820)};
\addplot[style={fill=color1},error bars/.cd, y dir=both, y explicit] coordinates {(2, 41.250) += (0,5.000) -= (0,4.870)};
\addplot[style={fill=color2},error bars/.cd, y dir=both, y explicit] coordinates {(3, 42.000) += (0,5.010) -= (0,4.890)};
\addplot[style={fill=color3},error bars/.cd, y dir=both, y explicit] coordinates {(4, 0.000) += (0,0.920) -= (0,0.000)};
\addplot[style={fill=color4},error bars/.cd, y dir=both, y explicit] coordinates {(5, 2.500) += (0,2.050) -= (0,1.290)};
\addplot[style={fill=color5},error bars/.cd, y dir=both, y explicit] coordinates {(6, 1.500) += (0,1.740) -= (0,0.950)};
\addplot[style={fill=color6},error bars/.cd, y dir=both, y explicit] coordinates {(7, 17.000) += (0,4.050) -= (0,3.550)};
\addplot[style={fill=color7},error bars/.cd, y dir=both, y explicit] coordinates {(8, 14.750) += (0,3.860) -= (0,3.330)};
\addplot[style={fill=color8

## Anomaly

In [13]:
# One \addplot bar per model for the 'anomaly' group: the claimed mean in
# percent, with asymmetric error bars taken from the confidence interval.
for position, (model_key, claimed, _) in enumerate(main_plots, start=1):
    stats = claimed['anomaly']
    mean = stats['mean']
    ci_low, ci_high = stats['conf_int'][0], stats['conf_int'][1]
    lower = mean - ci_low    # distance from the mean down to the lower bound
    upper = ci_high - mean   # distance from the mean up to the upper bound
    print(
        f'\\addplot[style={{fill={colors_map[model_key]}}},error bars/.cd, y dir=both, y explicit] '
        f'coordinates {{({position}, {mean*100:.3f}) += (0,{upper*100:.3f}) -= (0,{lower*100:.3f})}};'
    )



\addplot[style={fill=color0},error bars/.cd, y dir=both, y explicit] coordinates {(1, 27.000) += (0,6.720) -= (0,6.020)};
\addplot[style={fill=color1},error bars/.cd, y dir=both, y explicit] coordinates {(2, 27.500) += (0,6.740) -= (0,6.060)};
\addplot[style={fill=color2},error bars/.cd, y dir=both, y explicit] coordinates {(3, 35.500) += (0,7.060) -= (0,6.620)};
\addplot[style={fill=color3},error bars/.cd, y dir=both, y explicit] coordinates {(4, 0.000) += (0,1.830) -= (0,0.000)};
\addplot[style={fill=color4},error bars/.cd, y dir=both, y explicit] coordinates {(5, 3.500) += (0,3.580) -= (0,2.080)};
\addplot[style={fill=color5},error bars/.cd, y dir=both, y explicit] coordinates {(6, 0.000) += (0,1.830) -= (0,0.000)};
\addplot[style={fill=color6},error bars/.cd, y dir=both, y explicit] coordinates {(7, 7.500) += (0,4.570) -= (0,3.240)};
\addplot[style={fill=color7},error bars/.cd, y dir=both, y explicit] coordinates {(8, 8.500) += (0,4.760) -= (0,3.470)};
\addplot[style={fill=color8},

## Non-claim: success rate when the model's claim is not required

In [14]:
# Two stacked-bar series over the 'main' group: the claimed success rate, and
# the extra success rate gained when the model's claim is not required.
claimed_coords = []
gap_coords = []
for position, (_model_key, claimed, unclaimed) in enumerate(main_plots, start=1):
    claimed_mean = claimed['main']['mean']
    unclaimed_mean = unclaimed['main']['mean']
    claimed_coords.append(f'({position},{claimed_mean*100:.3f})')
    gap_coords.append(f'({position},{(unclaimed_mean - claimed_mean)*100:.3f})')

for coords in (claimed_coords, gap_coords):
    print(r'\addplot+[ybar] plot coordinates {' + ''.join(coords) + '};')



    

\addplot+[ybar] plot coordinates {(1,39.500)(2,41.250)(3,42.000)(4,0.000)(5,2.500)(6,1.500)(7,17.000)(8,14.750)(9,29.750)};
\addplot+[ybar] plot coordinates {(1,0.250)(2,0.250)(3,1.500)(4,4.250)(5,4.750)(6,7.750)(7,3.250)(8,1.500)(9,0.250)};


## Main v2: LaTeX table rows (mean ± SEM per scenario)

In [25]:
def _latex_stat_cell(stats):
    """Render mean and SEM (both scaled to percent, one decimal) as one LaTeX math cell."""
    return f"$ {stats['mean']*100:.1f}\\% \\pm {stats['sem']*100:.1f} $"


# One LaTeX table row per model: overall / forest / city for regular runs,
# followed by the same three columns for anomaly runs.
for overall_entry, scenario_entry in zip(main_plots, scenario_plots):
    overall, per_scenario = overall_entry[1], scenario_entry[1]
    regular_cells = [
        model_name_map[scenario_entry[0]],
        _latex_stat_cell(overall['main']),
        _latex_stat_cell(per_scenario['forest']),
        _latex_stat_cell(per_scenario['city']),
    ]
    anomaly_cells = [
        _latex_stat_cell(overall['anomaly']),
        _latex_stat_cell(per_scenario['anomaly-forest']),
        _latex_stat_cell(per_scenario['anomaly-city']),
    ]
    # The row has no space before the fourth '&': the regular half ends right after '$'.
    print(' & '.join(regular_cells) + '& ' + ' & '.join(anomaly_cells) + ' \\\\')



    

GPT-4o & $ 39.5\% \pm 2.4 $ & $ 45.5\% \pm 3.5 $ & $ 33.5\% \pm 3.3 $& $ 27.0\% \pm 3.1 $ & $ 39.0\% \pm 4.9 $ & $ 15.0\% \pm 3.6 $ \\
Claude 3.5 Sonnet & $ 41.2\% \pm 2.5 $ & $ 52.0\% \pm 3.5 $ & $ 30.5\% \pm 3.3 $& $ 27.5\% \pm 3.2 $ & $ 37.0\% \pm 4.9 $ & $ 18.0\% \pm 3.9 $ \\
Gemini 2.0 flash & $ 42.0\% \pm 2.5 $ & $ 42.5\% \pm 3.5 $ & $ 41.5\% \pm 3.5 $& $ 35.5\% \pm 3.4 $ & $ 46.0\% \pm 5.0 $ & $ 25.0\% \pm 4.4 $ \\
Phi 3.5 vision & $ 0.0\% \pm 0.0 $ & $ 0.0\% \pm 0.0 $ & $ 0.0\% \pm 0.0 $& $ 0.0\% \pm 0.0 $ & $ 0.0\% \pm 0.0 $ & $ 0.0\% \pm 0.0 $ \\
InternVL-2.5 8B MPO & $ 2.5\% \pm 0.8 $ & $ 3.5\% \pm 1.3 $ & $ 1.5\% \pm 0.9 $& $ 3.5\% \pm 1.3 $ & $ 6.0\% \pm 2.4 $ & $ 1.0\% \pm 1.0 $ \\
Llava-Interleave-7b & $ 1.5\% \pm 0.6 $ & $ 1.5\% \pm 0.9 $ & $ 1.5\% \pm 0.9 $& $ 0.0\% \pm 0.0 $ & $ 0.0\% \pm 0.0 $ & $ 0.0\% \pm 0.0 $ \\
Qwen2-VL-72B & $ 17.0\% \pm 1.9 $ & $ 16.0\% \pm 2.6 $ & $ 18.0\% \pm 2.7 $& $ 7.5\% \pm 1.9 $ & $ 10.0\% \pm 3.0 $ & $ 5.0\% \pm 2.2 $ \\
Llava-Onevisio

In [21]:
# Inspect the claimed-success statistics of the first model to see which
# fields each group ('main', 'anomaly') provides.
main_plots[0][1]

{'main': {'mean': np.float64(0.395),
  'std': np.float64(0.4889),
  'sem': np.float64(0.024473145222727952),
  'conf_int': (0.3468, 0.4448),
  'n': 400,
  'total_successes': 158},
 'anomaly': {'mean': np.float64(0.27),
  'std': np.float64(0.444),
  'sem': np.float64(0.03147145152843339),
  'conf_int': (0.2098, 0.3372),
  'n': 200,
  'total_successes': 54}}

In [26]:
# Print one example task prompt (the "large trash pile" search task) so that
# its "\n" escapes are expanded and the text can be read as plain lines.
print("\n<Context>\n    You are in command of a UAV, tasked with finding a large trash pile.\n</Context>\n\n<Objective>\n    You should fly BELOW 10 meters above the object and then reply with \"FOUND\". Being lower (closer to the object) (like 9, 8, or less meters) is good, being higher than that (like 11, 12, or more meters) is bad. \n\n    You may not be able to see the object in the first image, so you need to perform a careful search. Your performance will be evaluated based on whether the object was at most 10 meters below the drone when you replied with \"FOUND\". The object MUST be in your field of view when you reply with \"FOUND\". You must be centered on the object. \n</Objective>\n\n<Coordinates>\n    There is a grid overlaid on each image you are presented with. It is meant to (roughly) communicate which point will be in drone's center of vision if you move in that direction. Note that height of the drone is not represented in the grid.\n</Coordinates>\n\n<Controls>\n    <Action space>\n        To move the drone in a certain direction, use the following format: <Action>(x, y, z)</Action>. For example, if you want to fly to the place denoted as (10, 10) on the grid without changing the altitude, you should reply with <Action>(10, 10, 0)</Action>.\n\n        x and y are the coordinates on the grid, and z is the altitude difference. For example, <Action>(0, 0, -10)</Action> means that you are moving 10 meters down. This is especially important, since you need to get close to the object in question.\n\n    </Action space>\n\n    <Formatting>\n\n        Your each response should contain XML <Reasoning> tag and <Action> tag.\n        <Reasoning> tag should contain your reasoning for the move you are making.\n        <Action> tag should contain the move you are making.\n\n        If you find the object, fly below 10 meters relative to it and reply with \"FOUND\". 
Remember, it must be in your field of view when you reply with \"FOUND\" and you must be 10 meters above it or closer. Being too far away is not acceptable.\n\n        For example:\n\n        <Reasoning>This yellow point might be the object in question. I need to go lower to check for that. If it's not the object in question, I will continue the search. I will also slightly go to the north.</Reasoning>\n        <Action>(5, 0, -30)</Action>\n\n    </Formatting>\n\n    <Limitations>\n        You shouldn't move into coordinates that are outside of your view. Otherwise, you may hit something which is not ideal.\n        You can make at most 9 moves. Your altitude cannot exceed 120 meters. Your search area is 400x400m from the drone's starting position. \n    </Limitations>\n</Controls>\n")


<Context>
    You are in command of a UAV, tasked with finding a large trash pile.
</Context>

<Objective>
    You should fly BELOW 10 meters above the object and then reply with "FOUND". Being lower (closer to the object) (like 9, 8, or less meters) is good, being higher than that (like 11, 12, or more meters) is bad. 

    You may not be able to see the object in the first image, so you need to perform a careful search. Your performance will be evaluated based on whether the object was at most 10 meters below the drone when you replied with "FOUND". The object MUST be in your field of view when you reply with "FOUND". You must be centered on the object. 
</Objective>

<Coordinates>
    There is a grid overlaid on each image you are presented with. It is meant to (roughly) communicate which point will be in drone's center of vision if you move in that direction. Note that height of the drone is not represented in the grid.
</Coordinates>

<Controls>
    <Action space>
        To move