In [2]:
import json
import prettytable
import random
from collections import defaultdict 
import numpy as np

# Fixed seed so the random tie-break below (used only when all three
# annotators give different answers) is reproducible across runs.
random.seed(53)

# Per-category counts of correct answers: one tally per annotator, one for the
# majority vote, and one for items where all three agree AND are correct.
category_correct_1 = defaultdict(int)
category_correct_2 = defaultdict(int)
category_correct_3 = defaultdict(int)
category_correct_majority = defaultdict(int)
category_correct_absolute = defaultdict(int)
# Number of items seen per category (the denominator for every accuracy).
category_total = defaultdict(int)


# Explicit encoding: without it open() decodes with the platform's locale
# encoding. Assumes the results file is UTF-8, the standard JSON encoding.
with open('results/illusionvqa_comprehension_human_annotator_results.json', encoding='utf-8') as f:
    dataset = json.load(f)

# Per-item HUMAN_TIME_* values, one list per annotator.
time1 = []
time2 = []
time3 = []
for item in dataset:
    time1.append(item['HUMAN_TIME_1'])
    time2.append(item['HUMAN_TIME_2'])
    time3.append(item['HUMAN_TIME_3'])

    human_answer_1 = item['HUMAN_ANSWER_1']
    human_answer_2 = item['HUMAN_ANSWER_2']
    human_answer_3 = item['HUMAN_ANSWER_3']
    answer = item['answer']
    category = item['category']

    # Individual accuracy: credit each annotator whose answer matches the key.
    if human_answer_1 == answer:
        category_correct_1[category] += 1
    if human_answer_2 == answer:
        category_correct_2[category] += 1
    if human_answer_3 == answer:
        category_correct_3[category] += 1

    # Majority vote: any answer shared by at least two annotators wins.
    if human_answer_1 == human_answer_2 or human_answer_1 == human_answer_3:
        majority_vote = human_answer_1
    elif human_answer_2 == human_answer_3:
        majority_vote = human_answer_2
    else:
        # Three-way disagreement: no majority exists, so pick one answer at
        # random (deterministic thanks to the seed above).
        majority_vote = random.choice([human_answer_1, human_answer_2, human_answer_3])

    if majority_vote == answer:
        category_correct_majority[category] += 1

    # Absolute consensus: counted only when all three agree and are right.
    if human_answer_1 == human_answer_2 == human_answer_3 and human_answer_1 == answer:
        category_correct_absolute[category] += 1

    category_total[category] += 1


In [3]:
# Collapse the per-category tallies into dataset-wide counts. Every tally is
# indexed by the categories present in category_total.
total_correct_1 = sum(category_correct_1[category] for category in category_total)
total_correct_2 = sum(category_correct_2[category] for category in category_total)
total_correct_3 = sum(category_correct_3[category] for category in category_total)
total_correct_majority = sum(category_correct_majority[category] for category in category_total)
total_correct_absolute = sum(category_correct_absolute[category] for category in category_total)
total = sum(category_total[category] for category in category_total)

# Overall accuracy, in order: annotator 1, annotator 2, annotator 3,
# majority vote, unanimous-and-correct.
overall_counts = (
    total_correct_1,
    total_correct_2,
    total_correct_3,
    total_correct_majority,
    total_correct_absolute,
)
print(*(count/total for count in overall_counts))

0.871264367816092 0.8666666666666667 0.9287356321839081 0.9103448275862069 0.7839080459770115


In [4]:
# Mean of each annotator's HUMAN_TIME_* values, followed by the mean of those
# three per-annotator means.
mean_time1 = np.mean(time1)
mean_time2 = np.mean(time2)
mean_time3 = np.mean(time3)
mean_time1, mean_time2, mean_time3, np.mean([mean_time1, mean_time2, mean_time3])

(16.461253224296133,
 12.945708226061415,
 15.569797342125026,
 14.992252930827526)

In [5]:

# Per-category summary table: item count and majority-vote accuracy.
table = prettytable.PrettyTable()
table.field_names = ["Category", "Total", "Accuracy Majority"]
# The count is bound to `n_items` rather than `total`, so this loop does not
# overwrite the dataset-wide `total` computed earlier in the notebook.
for category, n_items in category_total.items():
    table.add_row([category, n_items, category_correct_majority[category] / n_items])

In [6]:
# Show the largest categories first: order rows by the "Total" column,
# descending, then render the table as text.
table.reversesort = True
table.sortby = "Total"
print(table.get_string())



+-------------------------+-------+--------------------+
|         Category        | Total | Accuracy Majority  |
+-------------------------+-------+--------------------+
|    impossible object    |  134  | 0.9850746268656716 |
|        real-scene       |   64  |      0.984375      |
|           size          |   46  | 0.6304347826086957 |
|          hidden         |   45  |        1.0         |
|     deceptive design    |   37  | 0.9459459459459459 |
|      angle illusion     |   26  | 0.8461538461538461 |
|          color          |   23  | 0.6086956521739131 |
|       edited-scene      |   21  |        1.0         |
|         counting        |   11  |        1.0         |
|       upside-down       |   7   |        1.0         |
| positive-negative space |   7   |        1.0         |
|      circle-spiral      |   6   | 0.6666666666666666 |
|    repeating pattern    |   2   |        1.0         |
|       perspective       |   2   |        1.0         |
|        occlusion        |   2

In [7]:
# Pool five categories into a single "misc" bucket and report the
# majority-vote accuracy over the pooled items.
misc_categories = ("counting", "repeating pattern", "perspective", "occlusion", "angle constancy")
misc_correct = sum(category_correct_majority[name] for name in misc_categories)
misc_total = sum(category_total[name] for name in misc_categories)

print(misc_correct/misc_total)

0.8947368421052632


In [8]:
# Raw agreement between annotators: how often each pair gives the same answer,
# how often all three agree, and how often all three differ. Only the
# annotators' answers are compared here; the ground-truth answer is not used.
h1_h2 = 0
h1_h3 = 0
h2_h3 = 0
h1_h2_h3 = 0
none = 0

for item in dataset:
    human_answer_1 = item['HUMAN_ANSWER_1']
    human_answer_2 = item['HUMAN_ANSWER_2']
    human_answer_3 = item['HUMAN_ANSWER_3']

    # Evaluate each pairwise comparison once and reuse it below.
    same_1_2 = human_answer_1 == human_answer_2
    same_1_3 = human_answer_1 == human_answer_3
    same_2_3 = human_answer_2 == human_answer_3

    if same_1_2:
        h1_h2 += 1
    if same_1_3:
        h1_h3 += 1
    if same_2_3:
        h2_h3 += 1

    # Unanimous: 1 == 2 and 2 == 3.
    if same_1_2 and same_2_3:
        h1_h2_h3 += 1

    # Complete disagreement: no pair matches.
    if not (same_1_2 or same_1_3 or same_2_3):
        none += 1

n_items = len(dataset)

print("evaluating agreement between human answers")
print("human 1 and human 2: ", h1_h2/n_items)
print("human 1 and human 3: ", h1_h3/n_items)
print("human 2 and human 3: ", h2_h3/n_items)
print("all:  ", h1_h2_h3/n_items)
print("none: ", none/n_items)


evaluating agreement between human answers
human 1 and human 2:  0.8574712643678161
human 1 and human 3:  0.8482758620689655
human 2 and human 3:  0.8298850574712644
all:   0.7862068965517242
none:  0.0367816091954023


In [9]:
# Expand every question into one row per answer option. Each row holds three
# booleans: whether annotator 1, 2 and 3 (in that order) picked that option.
binary_qa = []

for item in dataset:
    picks = (item['HUMAN_ANSWER_1'], item['HUMAN_ANSWER_2'], item['HUMAN_ANSWER_3'])
    binary_qa.extend([pick == option for pick in picks] for option in item["options"])

# Prints the complete list (one entry per option of every item).
print(binary_qa)

[[False, False, False], [True, True, True], [False, False, False], [False, False, False], [False, False, False], [False, False, False], [True, True, True], [False, False, False], [True, True, True], [False, False, False], [False, False, False], [False, False, False], [True, True, True], [False, False, False], [True, True, False], [False, False, False], [False, False, True], [True, True, True], [False, False, False], [False, False, False], [False, False, False], [True, True, True], [False, False, False], [False, False, False], [False, False, False], [False, False, False], [False, False, False], [True, True, True], [False, False, False], [True, True, True], [False, False, False], [False, False, False], [False, False, False], [False, False, False], [False, False, False], [True, True, True], [False, False, False], [False, False, False], [False, False, False], [False, True, True], [True, False, False], [False, False, False], [True, True, True], [False, False, False], [False, False, False], 

In [10]:

# Turn the rows of booleans into an integer array: True/False become 1/0.
binary_qa = np.array(binary_qa).astype(int)


In [11]:
from sklearn.metrics import cohen_kappa_score

# Each column of binary_qa is one annotator's per-option 0/1 indicators;
# transposing lets the three columns be unpacked in one step.
evaluator1_answers, evaluator2_answers, evaluator3_answers = binary_qa.T

# Cohen's kappa for every pair of annotators.
kappa_1_2 = cohen_kappa_score(evaluator1_answers, evaluator2_answers)
kappa_1_3 = cohen_kappa_score(evaluator1_answers, evaluator3_answers)
kappa_2_3 = cohen_kappa_score(evaluator2_answers, evaluator3_answers)

print(kappa_1_2, kappa_1_3, kappa_2_3)


0.808130982582557 0.7958513654678406 0.7726538123049844
