In [28]:
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import accuracy_score, f1_score

### Surprise/Different experiments

In [2]:
# Read the JSON-lines annotation file (one record per line) and report the row count.
anno_path = "../data/annotated/vision/misalign_sample_100_pixtral_anno.jsonl"
df = pd.read_json(anno_path, orient="records", lines=True)
len(df)

100

In [3]:
# Split the annotation columns by name: decision/surprise columns hold the predicted
# labels, columns containing "conf" hold the confidence values.
label_cols = [c for c in df.columns if any(key in c for key in ('decision', 'surprise'))]
conf_cols = list(filter(lambda c: 'conf' in c, df.columns))

In [38]:
# Print how often each value occurs in every label column.
for _label_name, label_series in df[label_cols].items():
    label_counts = label_series.value_counts()
    print(label_counts)

qsur_dcat_ccat_surprise
not-surprising    100
Name: count, dtype: int64
tasksur_dbin_cscore_decision
False    93
True      7
Name: count, dtype: int64
tasksur_dcat_ccat_decision
not-surprising     98
very-surprising     1
surprising          1
Name: count, dtype: int64
taskdiff_dcat_ccat_decision
not-different     51
different         38
very-different    11
Name: count, dtype: int64
taskdiffeg_dbin_cscore_decision
False    85
True     15
Name: count, dtype: int64
verbose_taskdiff_dbin_cscore_decision
True     92
False     6
Name: count, dtype: int64
verbose_taskdiffeg_dcat_ccat_decision
different         85
not-different      9
very-different     2
Name: count, dtype: int64
verbose_task_eg_frames_dbin_cscore_decision
True     78
False    11
Name: count, dtype: int64


In [39]:
# Print how often each value occurs in every confidence column.
for _conf_name, conf_series in df[conf_cols].items():
    conf_counts = conf_series.value_counts()
    print(conf_counts)

qsur_dcat_ccat_confidence
high-confidence    100
Name: count, dtype: int64
tasksur_dbin_cscore_confidence-score
9     47
10    41
8      9
7      3
Name: count, dtype: int64
tasksur_dcat_ccat_confidence
high-confidence      96
medium-confidence     4
Name: count, dtype: int64
taskdiff_dcat_ccat_confidence
high-confidence      64
medium-confidence    36
Name: count, dtype: int64
taskdiffeg_dbin_cscore_confidence-score
10    46
9     35
8     15
7      4
Name: count, dtype: int64
verbose_taskdiff_dbin_cscore_confidence-score
8.0     70
9.0     17
7.0     10
10.0     1
Name: count, dtype: int64
verbose_taskdiffeg_dcat_ccat_confidence
medium-confidence    73
high-confidence      23
Name: count, dtype: int64
verbose_task_eg_frames_dbin_cscore_confidence-score
8.0     73
10.0     6
7.0      6
9.0      4
Name: count, dtype: int64


In [40]:
# Do not truncate long cell contents when pandas renders a frame.
pd.options.display.max_colwidth = None

In [41]:
# Columns to read side by side: decision, explanation/reasoning and confidence of the
# selected prompt settings, followed by the image path and the title.
cols_to_show = [
    'qsur_dcat_ccat_surprise',
    'qsur_dcat_ccat_confidence',
    'qsur_dcat_ccat_reasoning-and-evidence',
    'tasksur_dcat_ccat_decision',
    'tasksur_dcat_ccat_explanation',
    'tasksur_dcat_ccat_confidence',
    'taskdiff_dcat_ccat_decision',
    'taskdiff_dcat_ccat_explanation',
    'taskdiff_dcat_ccat_confidence',
    'verbose_taskdiffeg_dcat_ccat_decision',
    'verbose_taskdiffeg_dcat_ccat_explanation',
    'verbose_taskdiffeg_dcat_ccat_confidence',
    'image_url',
    'title',
]

# Keep `df` complete and take the subset under its own name. The accuracy cell further
# down indexes `df` with `label_cols`, which contains columns that are not part of
# `cols_to_show`; rebinding `df` to the subset makes that cell raise a KeyError when
# the notebook is run top to bottom.
df_to_show = df[cols_to_show].copy()

In [42]:
# for i, row in pos_df.iterrows():
#     plot_image_title(row['image_url'], "Decision: "+row['verbose-task-eg-frames-sbin-csore_decision']+", Explanation:"+row['verbose-task-eg-frames-sbin-csore_explanation'], row['title'])
#     plt.show()

In [43]:
# df = pd.read_csv("../data/human/sample_data.csv", index_col=[0])
# df.sample(100, random_state=11).to_csv("../data/human/sample_100.csv")

##### Display all as HTML

In [44]:
from IPython.display import display, HTML
from PIL import Image
from io import BytesIO
import base64

def image_base64(im):
    """Return the image encoded as a base64 JPEG string.

    Parameters
    ----------
    im : str or image object
        Either a file path, which is opened with ``Image.open``, or an already
        loaded image that exposes ``mode``, ``convert`` and ``save`` the way a
        PIL image does.

    Returns
    -------
    str
        Base64 text of the JPEG-encoded image bytes.
    """
    opened_here = isinstance(im, str)
    if opened_here:
        im = Image.open(im)
    source = im
    try:
        # The JPEG writer only accepts a handful of modes; images with an alpha
        # channel or a palette (RGBA, LA, P, ...) must be converted first or
        # save() raises an error.
        if im.mode not in ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr"):
            im = im.convert("RGB")
        with BytesIO() as buffer:
            im.save(buffer, 'jpeg')
            return base64.b64encode(buffer.getvalue()).decode()
    finally:
        # Only close what this function opened; caller-owned images stay usable.
        if opened_here:
            source.close()

In [45]:
from matplotlib.backends.backend_pdf import PdfPages

In [46]:
# Image paths are stored without the leading "../" this notebook needs to reach them.
img_paths = ["../" + rel_path for rel_path in df['image_url']]
imgStrs = [image_base64(img_path) for img_path in img_paths]

# image_base64 produces JPEG bytes, so the data URI has to declare image/jpeg.
img_tags = [f'<img width=400 src="data:image/jpeg;base64,{imgStr}">' for imgStr in imgStrs]

# assign() returns a new frame, so the (large) inline images never end up in `df`
# and no column is written into a slice of another frame.
html_df = df.assign(Image=img_tags)[['Image', 'title'] + cols_to_show[:-1]]

# to_html() truncates long strings according to display.max_colwidth; lift the limit
# locally so the export does not depend on an option set in another cell.
# NOTE(review): escape=False is needed for the <img> tags but also leaves the text
# columns unescaped - confirm they cannot contain markup.
with pd.option_context('display.max_colwidth', None):
    html_df.to_html("../data/annotated/vision/misalign_sample_100_pixtral_anno.html", escape=False)

In [47]:
# fig, ax =plt.subplots(figsize=(12,4))
# ax.axis('tight')
# ax.axis('off')
# the_table = ax.table(cellText=df.values,colLabels=df.columns,loc='center')

# #https://stackoverflow.com/questions/4042192/reduce-left-and-right-margins-in-matplotlib-plot
# pp = PdfPages("foo.pdf")
# pp.savefig(fig, bbox_inches='tight')
# pp.close()

#### Accuracy

In [4]:
# Load the human annotations; the first CSV column is used as the index.
human_anno_path = "../data/human/surprise_annotations_srishti.csv"
human_anno_df = pd.read_csv(human_anno_path, index_col=[0])

In [None]:
# Reference labels for the accuracy computation: the human 'surprising' column.
labels = human_anno_df.loc[:, 'surprising']

In [16]:
# Class balance of the human labels.
surprising_counts = human_anno_df['surprising'].value_counts()
surprising_counts

surprising
False    66
True     34
Name: count, dtype: int64

In [None]:
def _decisions_to_bool(prompt_setting, decisions):
    """Turn one prompt setting's raw decisions into a list of booleans.

    Parameters
    ----------
    prompt_setting : str
        Column name; 'cat' marks categorical decisions, 'bin' marks binary ones.
    decisions : iterable
        Raw decision values of that column, possibly containing missing entries.

    Returns
    -------
    list of bool
        True for a positive decision, False for a negative or missing one.

    Raises
    ------
    ValueError
        If the column name contains neither 'cat' nor 'bin'.
    """
    if 'cat' in prompt_setting:
        # Categorical: any label containing "not" is the negative class.
        return [not (pd.isna(x) or 'not' in x) for x in decisions]
    if 'bin' in prompt_setting:
        # Binary: negative when the value is False, whether stored as text or as a bool.
        return [not (pd.isna(x) or str(x).strip().lower() == 'false') for x in decisions]
    # Without this, predictions from the previous column would be scored again.
    raise ValueError(f"cannot tell decision type from column name: {prompt_setting!r}")


# Score every prompt setting against the human labels. A missing prediction counts as
# negative for both decision types.
# NOTE(review): labels and predictions are matched by position - confirm that the rows
# of human_anno_df are in the same order as the rows of df.
results = {}
for prompt_setting, col in df[label_cols].items():
    bool_preds = _decisions_to_bool(prompt_setting, col)
    results[prompt_setting] = {
        'accuracy': accuracy_score(labels, bool_preds),
        # round() works whether f1_score hands back a numpy scalar or a plain float.
        'f1_macro': round(float(f1_score(labels, bool_preds, average='macro')), 2),
    }

In [40]:
# One row per prompt setting, one column per metric.
metrics_by_setting = pd.DataFrame(results).transpose()
metrics_by_setting

Unnamed: 0,accuracy,f1_macro
qsur_dcat_ccat_surprise,0.66,0.4
tasksur_dbin_cscore_decision,0.61,0.4
tasksur_dcat_ccat_decision,0.64,0.39
taskdiff_dcat_ccat_decision,0.55,0.54
taskdiffeg_dbin_cscore_decision,0.59,0.45
verbose_taskdiff_dbin_cscore_decision,0.4,0.36
verbose_taskdiffeg_dcat_ccat_decision,0.41,0.37
verbose_task_eg_frames_dbin_cscore_decision,0.44,0.43
