## Processed responses

In [9]:
import pandas as pd
import numpy as np
from tacl_analysis.evaluation_metrics import EvaluationMetrics
evaluation_metric = EvaluationMetrics()
from numpy.random import Generator, PCG64
from joblib import Parallel, delayed
import json

Could not import SentenceTransformer. Please install the library using 'pip install sentence-transformers'


In [289]:
ggr_questions = pd.read_csv("https://data-visualization-benchmark.s3.us-west-2.amazonaws.com/ggr/questions.csv")
vlat_questions = pd.read_csv("https://data-visualization-benchmark.s3.us-west-2.amazonaws.com/vlat/questions.csv")
holf_questions = pd.read_csv("https://data-visualization-benchmark.s3.us-west-2.amazonaws.com/holf/questions.csv")

In [283]:
ggr = pd.read_csv("https://data-visualization-benchmark.s3.us-west-2.amazonaws.com/ggr/responses/indist_instructions_question/p04/t1/processed_extracted_responses.csv")
holf = pd.read_csv("https://data-visualization-benchmark.s3.us-west-2.amazonaws.com/holf/responses/indist_instructions_question/p04/t1/processed_extracted_responses.csv")
vlat = pd.read_csv('https://data-visualization-benchmark.s3.us-west-2.amazonaws.com/vlat/responses/indist_instructions_question/p04/t1/processed_extracted_responses.csv')

In [360]:
ggr_human = pd.read_csv("https://data-visualization-benchmark.s3.us-west-2.amazonaws.com/ggr/responses/indist_instructions_question/p04/t1/model_responses.csv")
ggr_human = ggr_human[(ggr_human["agentType"] == "Human/Math-2-1") | 
                        (ggr_human["agentType"] == "Human/Math-3") | 
                        (ggr_human["agentType"] == "Human")]
vlat_human = pd.read_csv("https://data-visualization-benchmark.s3.us-west-2.amazonaws.com/vlat/responses/indist_instructions_question/p04/t1/model_responses.csv")
vlat_human = vlat_human[(vlat_human["agentType"] == "Human/Math-2-1") | 
                        (vlat_human["agentType"] == "Human/Math-3") | 
                        (vlat_human["agentType"] == "Human")]
holf_human = pd.read_csv("https://data-visualization-benchmark.s3.us-west-2.amazonaws.com/holf/responses/indist_instructions_question/p04/t1/model_responses.csv")
holf_human = holf_human[(holf_human["agentType"] == "Human/Math-2-1") | 
                        (holf_human["agentType"] == "Human/Math-3") | 
                        (holf_human["agentType"] == "Human")]

ggr = pd.concat([ggr, ggr_human])
vlat = pd.concat([vlat, vlat_human])
holf = pd.concat([holf, holf_human])

In [373]:
# holf.drop(columns=["min_label", "max_label"])

In [374]:
ggr = ggr[ggr["testType"] == "ggr"]
vlat = vlat[vlat["testType"] == "vlat"]
holf = holf[holf["testType"] == "holf"]

ggr["correct_answer"] = ggr["correctAnswer"]
ggr["a_in_b"] = ggr.apply(
    lambda r: int(evaluation_metric.a_in_b(r['correct_answer'], r['agent_response'])), axis=1
)

vlat["correct_answer"] = vlat["correctAnswer"]
vlat["a_in_b"] = vlat.apply(
    lambda r: int(evaluation_metric.a_in_b(r['correct_answer'], r['agent_response'])), axis=1
)

holf = holf.drop(columns=["min_label", "max_label"])
holf["correct_answer"] = holf["correctAnswer"]
holf["agent_response"] = holf["agent_response"].astype(float)
holf["error"] = holf.apply(
    lambda r: evaluation_metric.get_absolute_error(r['agent_response'], r['correct_answer']), axis=1
)
holf = holf.dropna(subset=["error"])
merged_response = holf_questions[["question", "image_file", "min_label", "max_label"]]
holf = holf.merge(merged_response, on=["question", "image_file"])

holf["minmax_axis_normalized_error"] = holf.apply(
    lambda r: evaluation_metric.minmax_normalized_error(r['error'], r['min_label'], r['max_label']), axis=1
)

In [375]:
def bootstrap_ci(
        raw_data, 
        n_iterations=1000,
        statistic=np.mean,
        units_of_measure="a_in_b"):
    data = raw_data.copy()
    data["question_image"] = data["question"] + " & " + data["image_file"]
    rng = Generator(PCG64())
    questions = list(data["question_image"].unique())
    n_size = len(questions)
    df = data.copy()

    # sample within the data
    df = df.sample(frac=1, replace=True, random_state=1)

    def bootstrap_iteration(data, chosen_qs):
        filter_df = data[data["question_image"].isin(chosen_qs)] # Filter based on chosen questions
        bs_mean = statistic(filter_df[units_of_measure]) #.mean() # Calculate mean of the filtered data
        return bs_mean
    means = Parallel(n_jobs=-1)(
        delayed(bootstrap_iteration)(df, rng.choice(questions, n_size,  replace=True)) for _ in range(n_iterations)
    )
    
    # 95% confidence interval
    lower = np.percentile(means, 2.5)
    upper = np.percentile(means, 97.5)
    
    return lower, upper


def create_confidence_interval_df(data, statistic=np.mean, units_of_measure="a_in_b"):
    data_list = []
    # num_questions = len(self.questions[["question", "image_file"]])

    for agent in data["agentType"].unique():
        agent_res = data[data["agentType"] == agent]

        lower, upper = bootstrap_ci(agent_res, statistic=statistic, units_of_measure=units_of_measure)

        data_list.append({
            "agentType": agent,
            "ci_upper": upper, 
            "ci_lower": lower,
            # "value_count": len(agent_res[['question', 'image_file']].value_counts()) / num_questions,
            "mean": statistic(agent_res[units_of_measure])
        })

    return pd.DataFrame(data_list)

In [328]:
def test(in_df,
        statistic=np.mean,
        units_of_measure="a_in_b"
        ):
    agent_map = {
        "llava-hf/llava-1.5-7b-hf": "model",
        'Salesforce/blip2-flan-t5-xl': "model",
        'Salesforce/blip2-flan-t5-xxl': "model",
        'GPT-4V': "GPT-4V",
    }
    
    df = in_df.replace(agent_map)
    return create_confidence_interval_df(
        df[df["agentType"] == "model"], 
        statistic=statistic,
        units_of_measure=units_of_measure
    )

In [330]:
test(ggr)

Unnamed: 0,agentType,ci_upper,ci_lower,mean
0,model,0.07489,0.013495,0.051282


In [331]:
test(vlat)

Unnamed: 0,agentType,ci_upper,ci_lower,mean
0,model,0.397253,0.255589,0.320126


In [329]:
test(holf, statistic=np.median, units_of_measure="minmax_axis_normalized_error")

Unnamed: 0,agentType,ci_upper,ci_lower,mean
0,model,0.330317,0.296,0.3


### ACC difference of means

In [376]:
def dom_bootstrap_ci(
    raw_data,
    col1,
    col2,
    n_iterations=1000,
    statistic=np.mean,
    units_of_measure="a_in_b",
):
    
    data = raw_data.copy()
    data["question_image"] = data["question"] + " & " + data["image_file"]
    rng = Generator(PCG64())
    questions = list(data["question_image"].unique())
    n_size = len(questions)
    df = data.copy()

    # sample within the data
    df = df.sample(frac=1, replace=True, random_state=1)

    def bootstrap_iteration(data, chosen_qs):
        filter_df = data[data["question_image"].isin(chosen_qs)] # Filter based on chosen questions
        col1_res = filter_df[(filter_df["agentType"] == col1)][units_of_measure]
        col2_res = filter_df[(filter_df["agentType"] == col2)][units_of_measure]
        
        bs_mean = statistic(col1_res) - statistic(col2_res)
        return bs_mean
        
    means = Parallel(n_jobs=-1)(
        delayed(bootstrap_iteration)(df, rng.choice(questions, n_size,  replace=True)) for _ in range(n_iterations)
    )
    
    # 95% confidence interval
    lower = np.percentile(means, 2.5)
    upper = np.percentile(means, 97.5)
    
    return lower, upper

def create_confidence_interval_dom_df(data, col1, col2, statistic=np.mean, units_of_measure="a_in_b"):
    data_list = []
    lower, upper = dom_bootstrap_ci(
        data, 
        col1,
        col2,
        statistic=statistic, 
        units_of_measure=units_of_measure
    )

    data_list.append({
        # "agentType": agent,
        "ci_upper": upper, 
        "ci_lower": lower,
        "mean": (
            statistic(data[data["agentType"] == col1][units_of_measure]) - 
            statistic(data[data["agentType"] == col2][units_of_measure]))
    })

    return pd.DataFrame(data_list)

In [380]:
create_confidence_interval_dom_df(ggr, "Human/Math-2-1", "GPT-4V")

Unnamed: 0,ci_upper,ci_lower,mean
0,0.602054,0.260507,0.439888


In [379]:
create_confidence_interval_dom_df(vlat, "Human/Math-2-1", "GPT-4V")

Unnamed: 0,ci_upper,ci_lower,mean
0,0.272629,0.081865,0.133108


In [381]:
create_confidence_interval_dom_df(holf, "Human/Math-2-1", "GPT-4V", 
                                  statistic=np.median, units_of_measure="minmax_axis_normalized_error")

Unnamed: 0,ci_upper,ci_lower,mean
0,-0.033715,-0.056118,-0.040556


In [161]:
vlat.dropna(subset=["agent_response"]).groupby("agentType")['a_in_b'].mean()

agentType
GPT-4V                          0.620424
Salesforce/blip2-flan-t5-xl     0.305882
Salesforce/blip2-flan-t5-xxl    0.297456
llava-hf/llava-1.5-7b-hf        0.379245
Name: a_in_b, dtype: float64

In [146]:
vlat.dropna(subset=["agent_response"]).groupby(["agentType"]).apply(lambda g : len(g) / 45).reset_index()

agentType
GPT-4V                          0.330357
Salesforce/blip2-flan-t5-xl     0.019802
Salesforce/blip2-flan-t5-xxl    0.025862
llava-hf/llava-1.5-7b-hf        0.500000
Name: answer_in_response, dtype: float64

## Heatmap Error

In [427]:
def bootstrap_pairwise_ci( 
        raw_data, 
        n_iterations=1000,
        statistic=np.mean,
        units_of_measure="jaccard_similarity"
    ):
    
    data = raw_data.copy()
    data["question_image"] = data["question"] + " & " + data["image_file"]
    rng = Generator(PCG64())
    questions = list(data["question_image"].unique())
    n_size = len(questions)
    df = data.copy()

    # sample within the data
    df = df.sample(frac=1, replace=True, random_state=1)

    def bootstrap_iteration(data, chosen_qs):
        filter_df = data[data["question_image"].isin(chosen_qs)] # Filter based on chosen questions
        bs_mean = statistic(filter_df[units_of_measure]) # Calculate mean of the filtered data
        return bs_mean
    means = Parallel(n_jobs=-1)(
        delayed(bootstrap_iteration)(df, rng.choice(questions, n_size,  replace=True)) for _ in range(n_iterations)
    )
    
    # 95% confidence interval
    lower = np.percentile(means, 2.5)
    upper = np.percentile(means, 97.5)
    
    return lower, upper


def create_pairwise_confidence_interval_df(agent_res, units_of_measure="jaccard_similarity", statistic=np.mean):
    
    lower, upper = bootstrap_pairwise_ci(agent_res, statistic=statistic, units_of_measure=units_of_measure)
    stats = {
        "ci_upper": upper, 
        "ci_lower": lower,
        "mean": statistic(agent_res[units_of_measure])
    }
    return stats
    # return pd.DataFrame(stats)

In [8]:
ggr_csv = pd.read_csv("/Users/arnav/Desktop/contextvis/vlm-datavis-benchmark/analysis/tacl_analysis/heatmap/ggr_all_pairwise.csv", low_memory=True)
vlat_csv = pd.read_csv("/Users/arnav/Desktop/contextvis/vlm-datavis-benchmark/analysis/tacl_analysis/heatmap/vlat_all_pairwise.csv", low_memory=True)
holf_csv = pd.read_csv("/Users/arnav/Desktop/contextvis/vlm-datavis-benchmark/analysis/tacl_analysis/heatmap/holf_all_pairwise.csv", low_memory=True)

  ggr_csv = pd.read_csv("/Users/arnav/Desktop/contextvis/vlm-datavis-benchmark/analysis/tacl_analysis/heatmap/ggr_all_pairwise.csv", low_memory=True)
  vlat_csv = pd.read_csv("/Users/arnav/Desktop/contextvis/vlm-datavis-benchmark/analysis/tacl_analysis/heatmap/vlat_all_pairwise.csv", low_memory=True)


### Between Human / Model

In [11]:
# vlat_csv

In [434]:
def between_human_model_comparison(data, test_type):
    agent_map = {
        "llava-hf/llava-1.5-7b-hf": "model",
        'Salesforce/blip2-flan-t5-xl': "model",
        'Salesforce/blip2-flan-t5-xxl': "model",
        'GPT-4V': "model",
        'Human/Math-2-1': 'human',
        'Human/Math-3': 'human'
    }
    
    df = data.replace(agent_map)
    df = df[
        (df["agentType_A"] == "human") &
        (df["agentType_B"] == "model")
    ]

    metric="jaccard_similarity"
    if (test_type == "holf"):
        # df = test_df.groupby(["agentType_B", "agentType_A"])[metric].median().reset_index()
        return create_pairwise_confidence_interval_df(df, statistic=np.median)
    else:
        return create_pairwise_confidence_interval_df(df)
        # df = test_df.replace(agent_map).groupby(["agentType_B", "agentType_A"])[metric].mean().reset_index()
    # return df[(df["agentType_A"] == "human") & (df["agentType_B"] == "model")][metric].iloc[0]

between_human_model_comparison(ggr_csv, "ggr")


KeyboardInterrupt



In [207]:
between_human_model_comparison(ggr_csv, "ggr")

0.18405284010842363

In [13]:
# ggr_csv

In [None]:
between_human_model_comparison(holf_csv, "holf")

In [6]:
holf_csv = pd.read_csv("/Users/arnav/Desktop/contextvis/vlm-datavis-benchmark/analysis/tacl_analysis/heatmap/holf_all_pairwise.csv", low_memory=False)

### Between Humans

In [240]:
def between_human_comparison(curr_df, test_type):
    agent_map = {
        "llava-hf/llava-1.5-7b-hf": np.nan,
        'Salesforce/blip2-flan-t5-xl': np.nan,
        'Salesforce/blip2-flan-t5-xxl': np.nan,
        'GPT-4V': np.nan,
        'Human/Math-2-1': 'Human/Math-2-1',
        'Human/Math-3': 'Human/Math-3'
    }
    test_df = curr_df.replace(agent_map).dropna(subset=["agentType_B", "agentType_A"])

    metric="jaccard_similarity"
    if (test_type == "holf"):
        df = test_df.groupby(["agentType_B", "agentType_A"])[metric].median().reset_index()
    else:
        df = test_df.groupby(["agentType_B", "agentType_A"])[metric].mean().reset_index()

    return df[(df["agentType_A"] == "Human/Math-2-1") & (df["agentType_B"] == "Human/Math-3")][metric].iloc[0]

In [241]:
between_human_comparison(ggr_csv, "ggr")

0.6828717036231868

In [242]:
between_human_comparison(vlat_csv, "vlat")

0.6804695119137334

In [243]:
between_human_comparison(holf_csv, "holf")

0.0475

In [15]:
# holf_csv

### Between Models

In [244]:
def between_model_comparison(curr_df, test_type):
    agent_map = {
        "llava-hf/llava-1.5-7b-hf": "model",
        'Salesforce/blip2-flan-t5-xl': "model",
        'Salesforce/blip2-flan-t5-xxl': "model",
        'GPT-4V': "model",
    }
    test_df = curr_df.replace(agent_map).dropna(subset=["agentType_B", "agentType_A"])

    metric="jaccard_similarity"
    if (test_type == "holf"):
        df = test_df.groupby(["agentType_B", "agentType_A"])[metric].median().reset_index()
    else:
        df = test_df.groupby(["agentType_B", "agentType_A"])[metric].mean().reset_index()

    return df[(df["agentType_A"] == "model") & (df["agentType_B"] == "model")][metric].iloc[0]

In [245]:
between_model_comparison(ggr_csv, "ggr")

0.2624092888243832

In [246]:
between_model_comparison(vlat_csv, "vlat")

0.6186843590163636

In [247]:
between_model_comparison(holf_csv, "holf")

0.25