# Exploring the results

This notebook provides code to explore the annotation results and compute precision and recall for the error analysis.

In [1]:
# display() and HTML() are needed to render the images inline in the notebook.
# They are imported from the public IPython.display module: importing `display`
# from the internal IPython.core.display path is deprecated and is not available
# in newer IPython releases.
from IPython.display import display, HTML

# The annotations are stored in JSON files.
import json
from collections import Counter
from tabulate import tabulate

In [2]:
# Image file names, one per line. Keys are the zero-based line numbers as strings,
# so the same key can be used for `images`, `descriptions` and `incongruent`.
# The lines are stored as read from the file (not stripped).
with open('./val_images.txt') as image_list:
    images = dict((str(idx), name) for idx, name in enumerate(image_list))

# Error-category annotations, loaded as-is from JSON.
with open('./annotations_emiel/incongruent_final.json') as annotation_file:
    incongruent = json.load(annotation_file)

# Descriptions, one per line, keyed the same way as `images`.
with open('./satyrid/19-sept-2016-error-analysis.dev.txt') as description_file:
    descriptions = dict((str(idx), text) for idx, text in enumerate(description_file))

def categories_html(i):
    """Return an HTML snippet listing the error categories annotated for key `i`.

    The categories are looked up in the module-level `incongruent` mapping and
    joined with ', '. An empty string is returned when `i` has no entry there.
    """
    if i not in incongruent:
        return ""
    joined = ', '.join(incongruent[i])
    return "<b>categories</b>:" + joined

    
# Keys of the incongruent annotations in ascending numeric order (as strings).
# The position of a key in this list makes it possible to exclude the first 100
# incongruent examples, which were used in the guidelines; including them would
# make the inter-annotator agreement unreliable.

incongruent_keys = [str(number) for number in sorted(int(key) for key in incongruent)]
def incongruent_index(i):
    """Get the index for the subset of incongruent descriptions, if we sort by general index.

    Parameters
    ----------
    i : str
        Key of a description, in the same string form as the entries of the
        module-level `incongruent_keys` list.

    Returns
    -------
    int or str
        Position of `i` in `incongruent_keys`, or the string 'NA' when `i` is
        not part of the incongruent subset.
    """
    try:
        return incongruent_keys.index(i)
    except ValueError:
        # list.index signals a missing element with ValueError (not KeyError),
        # so this is the exception that must be caught for the 'NA' fallback.
        return 'NA'

def display_image(i):
    """Render one image in the notebook together with its description and categories.

    `i` is the string key shared by `images` and `descriptions`. The output shows
    the key, the value reported by `incongruent_index(i)`, the image file name,
    the image itself (loaded from './static/images/'), the description and the
    category line produced by `categories_html(i)`.
    """
    filename = images[i]
    subset_position = incongruent_index(i)
    caption = descriptions[i]
    category_line = categories_html(i)
    source = './static/images/' + filename
    template = """<center>
              {} ({}) <b>image</b>: {}<br/>
              <img src={} width="300px"><br/>
              {}<br/>
              {}
              </center>"""
    markup = template.format(i, subset_position, filename, source, caption, category_line)
    display(HTML(markup))

In [3]:
# Display every incongruent description that passes the active filter,
# then print how many were shown.
#
# Alternative filters that can be swapped in for the condition below:
#   - exact match on the category list:   cats == ['other']
#   - inclusion, limited to low indices:  'number' in cats and int(i) < 100
#
# Active filter: descriptions annotated with exactly three error categories.
counter = 0
for i, cats in incongruent.items():
    if len(cats) != 3:
        continue
    display_image(i)
    counter += 1

print(counter)

83


In [16]:
# Categorisation of the incongruent descriptions by the second annotator (Desmond).
with open('./annotations_desmond/incongruent_categorized.json') as desmond_file:
    incongruent_desmond = json.load(desmond_file)

# Image "82" was used in the guidelines, so it is dropped from the comparison.
# Desmond annotated an additional image instead, to keep the annotation honest.
incongruent_desmond.pop("82")

def precision_recall():
    """Compute and print precision, recall and F1 for the double annotation.

    The categories in the module-level `incongruent` mapping are treated as the
    relevant set and the categories in `incongruent_desmond` as the retrieved
    set. Only images that occur in both mappings are counted, and the counts
    are summed over all of those images before the ratios are taken.

    F1 is the harmonic mean of precision and recall, 2PR / (P + R). Any ratio
    whose denominator is zero (no shared images, or no categories on one side)
    is reported as 0.0 instead of raising ZeroDivisionError.

    Returns None; the result is printed as a sentence.
    """
    total_relevant = 0
    total_retrieved = 0
    total_relevant_retrieved = 0
    for i, desmond_cats in incongruent_desmond.items():
        if i not in incongruent:
            continue
        retrieved = set(desmond_cats)
        relevant = set(incongruent[i])
        total_relevant += len(relevant)
        total_retrieved += len(retrieved)
        total_relevant_retrieved += len(relevant & retrieved)

    precision = total_relevant_retrieved / total_retrieved if total_retrieved else 0.0
    recall = total_relevant_retrieved / total_relevant if total_relevant else 0.0
    # Harmonic mean, not the arithmetic mean: the latter overstates the score
    # whenever precision and recall differ.
    if precision + recall:
        f1 = 2 * precision * recall / (precision + recall)
    else:
        f1 = 0.0

    results = ('We treat the double annotation for the second task as a retrieval problem: '
               'how many error categories are also found by the second annotator? In our evaluation, '
               'we achieved a precision of {:.2f}, with a recall of {:.2f}, resulting in an F1-score '
               'of {:.2f}.'.format(precision, recall, f1))
    print(results)

def print_table():
    """Print summary statistics and LaTeX frequency tables for the incongruent items.

    Printed, in order: the total number of annotated errors, the mean number of
    errors per incongruent image, the number of incongruent items relative to
    the number of images (absolute and as a percentage), a booktabs table of
    category frequencies, and a booktabs table of how many images have each
    number of errors. Both tables are ordered by `Counter.most_common()`.
    """
    # How often each error category was annotated.
    category_counts = Counter()
    for cats in incongruent.values():
        category_counts.update(cats)

    num_errors = sum(category_counts.values())
    print(num_errors)
    num_images = len(incongruent)
    errors_per_image = num_errors / num_images
    print("Errors per image: {:.2f}".format(errors_per_image))
    print(len(incongruent), "Incongruent items, out of", len(images), "images")
    percentage = (len(incongruent) / len(images)) * 100
    print("{:.2f} percent".format(percentage))
    print()

    category_table = tabulate(category_counts.most_common(),
                              headers=['Category', 'Count'],
                              tablefmt="latex_booktabs")
    print(category_table)
    print()

    # How many images carry one, two, three, ... errors.
    size_counts = Counter(len(cats) for cats in incongruent.values())
    sizes, frequencies = zip(*size_counts.most_common())
    size_row = ['Number of errors']
    size_row.extend(sizes)
    frequency_row = ['Count']
    frequency_row.extend(frequencies)
    size_table = tabulate([size_row, frequency_row], tablefmt="latex_booktabs")
    print(size_table)
    print()

In [17]:
print_table()
precision_recall()

1265
Errors per image: 1.56
812 Incongruent items, out of 1014 images
80.08 percent

\begin{tabular}{lr}
\toprule
 Category             &   Count \\
\midrule
 generally unrelated  &     264 \\
 color of clothing    &     195 \\
 activity             &     168 \\
 type of clothing     &     104 \\
 gender               &      98 \\
 scene/event/location &      91 \\
 number               &      61 \\
 inexistent-object    &      47 \\
 age                  &      40 \\
 stance               &      38 \\
 position             &      37 \\
 extra subject        &      34 \\
 similar-object       &      31 \\
 other                &      20 \\
 color                &      14 \\
 inexistent-subject   &      11 \\
 wrong-object         &       7 \\
 similar-subject      &       3 \\
 extra object         &       1 \\
 wrong-subject        &       1 \\
\bottomrule
\end{tabular}

\begin{tabular}{lrrrr}
\toprule
 Number of errors &   1 &   2 &  3 &  4 \\
 Count            & 486 & 221 & 83 & 22 

In [15]:
display_image("82")