### This file contains example using tools to evaluate the performances for color recognition category for VizWiz data.

Evaluation tools from https://github.com/GT-Vision-Lab/VQA.

Codes in eval_util.py from Yanan Wang

In [1]:
import pandas as pd
import numpy as np
from eval_util import *

In [2]:
train_features = pd.read_csv("./azure_features_images/data/vizwiz_train_color_recognition.csv", delimiter=";", engine="python")
train_features['answer'] = train_features['descriptions'].astype(str).apply(lambda x: x.lower())
train_features['QID'] = train_features['qid']

In [3]:
train_targets = pd.read_csv("../vizwiz_skill_typ_train.csv", engine="python")

In [4]:
train = pd.merge(train_features, train_targets, how='left', on='QID')

In [5]:
set(train.values[0, 15:25])

{'basil', 'basil leaves'}

In [6]:
# current visual question answers' accuracy (agreement)
ans_accuracy  = []
# accuracy within each given answer type
type_accuracy = {}

mat = train.values

for i in range(len(mat)):
    resAns = mat[i,5]
    # clean up str
    ans_original    = re.sub('[\n\t\s]', ' ', resAns).strip()
    ans_punctuation = processPunctuation(ans_original)
    ans_final       = str(processDigitArticle(ans_punctuation))
    accuracy_scores = []
    # evaluate 10 different crowd answers
    crowd_answers = mat[i, 15:25]
    if len(set(crowd_answers)) > 1:
        for a in crowd_answers:
            a = processPunctuation(a)
    for curr in crowd_answers:
        diff_ans = [ans for ans in crowd_answers if ans != curr]
        # find agreement to measure accuracy
        matches = []
        for ans in diff_ans:
            ans = str(ans)
            if ans.isdigit() or ans in ["no", "yes"]:
                ans = ans.center(len(ans)+2)
            if ans in ans_final:
                matches.append(ans)
        # get VQA eval score
        score = min(1, float(len(matches)/3))
        accuracy_scores.append(score)
    # record accuracy for agreement & answer type 
    ans_type = mat[i,9]
    ans_accuracy.append(np.mean(accuracy_scores))
    if ans_type not in type_accuracy:
        type_accuracy[ans_type] = []
    type_accuracy[ans_type].append(score)
    qid = mat[i,0]
    setEvalQA(qid, score)
    setEvalAnsType(qid, ans_type, score)

In [7]:
setAccuracy(ans_accuracy, type_accuracy)

In [9]:
len(ans_accuracy)

14257

In [11]:
# mean accuracy dropped from 7% to 5%
print(np.mean(ans_accuracy))

0.05637464637254214


In [20]:
type_accuracy.keys()

dict_keys(['unanswerable', 'yes/no', 'other', 'number'])

In [13]:
print(accuracy['overall'])         
print(accuracy['perAnswerType'])

5.64
{'unanswerable': 3.81, 'yes/no': 0.87, 'other': 6.37, 'number': 0.3}


In [22]:
train.columns

Index(['qid', 'question', 'descriptions', 'tags', 'dominant_colors', 'answer',
       'QID', 'IMG', 'QSN', 'ANS_TYP', 'TXT', 'OBJ', 'COL', 'CNT', 'OTH',
       'ANS1', 'ANS2', 'ANS3', 'ANS4', 'ANS5', 'ANS6', 'ANS7', 'ANS8', 'ANS9',
       'ANS10'],
      dtype='object')

In [23]:
accuracy

{'overall': 5.64,
 'perAnswerType': {'number': 0.3,
  'other': 6.37,
  'unanswerable': 3.81,
  'yes/no': 0.87}}