# Results analysis for ANN recognition

This notebook analyzes results for for the ANN recognition experiments described in "Section 6: Validation Experiment: Human visual selectivity Boosts ANN Recognition."

To begin, follow the instructions in the notebook `create_ANN_recognition_data_human_masks.ipynb` to generate the data, then run the script `run_ann_recognition.sh` to produce outputs.

In [1]:
import numpy as np
from os.path import dirname, join as pjoin
import scipy.io as sio
import matplotlib.pyplot as plt
import fnmatch
import os
import itertools
import pandas as pd
import math

Read in the data.

In [2]:
results_dir = dirname('ANN_recognition_outputs_human_masks/')
results_files = os.listdir(results_dir)
assert results_files
results_files

['ImageNet_attention-branch-network_ResNet-101.txt',
 'ImageNet_baseline-cnns_ViT-small.txt',
 'ImageNet_baseline-cnns_AlexNet.txt',
 'Places365_baseline-cnns_AlexNet.txt',
 'CIFAR-100_baseline-cnns_VGG-19-BN.txt',
 'CIFAR-100_attention-branch-network_DenseNet-BC.txt',
 'CIFAR-100_baseline-cnns_ResNet-110.txt',
 'ImageNet_baseline-cnns_VGG-16-BN.txt',
 'Places365_baseline-cnns_ResNet50.txt',
 'ImageNet_baseline-cnns_EfficientNet-B0.txt',
 'CIFAR-100_attention-branch-network_ResNet-110.txt',
 'ImageNet_baseline-cnns_ResNet-101.txt',
 'CIFAR-100_baseline-cnns_AlexNet.txt',
 'CIFAR-100_learn-to-pay-attention_VGG.txt']

In [3]:
def combine_dicts(d1, d2):
    return {**d1, **d2}

        
def confs_to_ranking(confs):
    order = confs.argsort()[::-1]
    ranks = order.argsort()
    return ranks


def read_outputs(fname, fdir):
    with open(os.path.join(fdir, fname), 'r') as f:
        lines = f.readlines()

    def parse_line(line):
        line = line.strip('\n').split('\t')
        (_, condition, img_mask), confs = line[0].split('/'), line[1:]
        
        img, mask = img_mask.split('_')
        img = int(img[3:])
        if 'nomask' in mask:
            mask = -1
        else:
            mask = mask[:-4]
            mask = int(mask[4:])

        
        hparams = dict(h.split('=') for h in condition.split('_'))
        hparams['rotation'] = int(hparams['rotation'])
        hparams['exp'] = float(hparams['exp'])
        
        confs = np.array(confs, dtype=float)
             
        return combine_dicts({
            "img_id": img,
            "mask_id": mask,
            "confs": confs,
            "ranks": confs_to_ranking(confs),     
        }, hparams)

    parsed_lines = list(map(parse_line, lines))
    
    dataset, attention_type, model = fname[:-4].split('_')
    run_metadata = {
        'dataset': dataset, 
        'attention_type': attention_type, 
        'model': model,
    }
    parsed_lines = [combine_dicts(run_metadata, s) for s in parsed_lines]
    
    return parsed_lines

In [4]:
all_results = []
for results_file in results_files:
    all_results += read_outputs(results_file, results_dir)
data = pd.DataFrame(all_results)
data

Unnamed: 0,dataset,attention_type,model,img_id,mask_id,confs,ranks,mask,null,exp,threshold,rotation
0,ImageNet,attention-branch-network,ResNet-101,10,10,"[2.2482002e-06, 9.736867e-05, 2.4229846e-06, 9...","[844, 155, 835, 953, 614, 761, 972, 296, 488, ...",KDE,black,1.0,,0
1,ImageNet,attention-branch-network,ResNet-101,10,115,"[1.7172952e-05, 0.0002940889, 1.34856455e-05, ...","[814, 236, 852, 895, 563, 801, 939, 293, 555, ...",KDE,black,1.0,,0
2,ImageNet,attention-branch-network,ResNet-101,10,125,"[5.1419697e-06, 0.00030278185, 1.306828e-05, 9...","[913, 218, 774, 831, 567, 732, 834, 417, 717, ...",KDE,black,1.0,,0
3,ImageNet,attention-branch-network,ResNet-101,10,126,"[1.6124724e-05, 0.00025059324, 2.3895567e-05, ...","[841, 300, 764, 717, 437, 646, 696, 439, 728, ...",KDE,black,1.0,,0
4,ImageNet,attention-branch-network,ResNet-101,10,134,"[3.9168767e-06, 5.959208e-05, 3.6086508e-06, 9...","[608, 131, 624, 894, 508, 761, 965, 217, 726, ...",KDE,black,1.0,,0
...,...,...,...,...,...,...,...,...,...,...,...,...
63695,CIFAR-100,learn-to-pay-attention,VGG,95,790,"[0.026474101, 0.0038573723, 0.00171658, 0.0049...","[8, 50, 94, 39, 84, 70, 12, 40, 36, 0, 21, 42,...",seye,black,1.0,,0
63696,CIFAR-100,learn-to-pay-attention,VGG,95,81,"[0.0735066, 0.005536854, 0.0031309163, 0.06116...","[3, 39, 67, 4, 21, 72, 89, 38, 63, 6, 15, 52, ...",seye,black,1.0,,0
63697,CIFAR-100,learn-to-pay-attention,VGG,95,90,"[0.015662797, 0.0056659468, 0.0026137717, 0.00...","[8, 44, 91, 66, 25, 76, 52, 48, 54, 2, 13, 31,...",seye,black,1.0,,0
63698,CIFAR-100,learn-to-pay-attention,VGG,95,95,"[0.0071014804, 0.005383474, 0.0057327305, 0.00...","[26, 37, 35, 36, 25, 79, 66, 76, 83, 19, 22, 7...",seye,black,1.0,,0


Analyze the results.

In [5]:
def ranking_distance(unmasked_ranks, correctly_masked_ranks, incorrectly_masked_ranks, k=10):
    # Get (unsorted) indices of top k. 
    idx = np.argpartition(unmasked_ranks, k)[:k]

    # Get the rankings at those locations.
    unmasked_top_k_ranks = unmasked_ranks[idx]
    correct_top_k_ranks = correctly_masked_ranks[idx]
    incorrect_top_k_ranks = incorrectly_masked_ranks[idx]

    # Do not normalize because we report fixed k.
    # binomial_coeff = math.comb(k, 2)
    
    return (
        sum(correct_top_k_ranks - unmasked_top_k_ranks),
        sum(incorrect_top_k_ranks - unmasked_top_k_ranks),
    )

In [6]:
# Unmasked data is duplicated across hparams and masks, but this makes it easier to iterate over.
unmasked_data = data[(data['mask_id'] == -1) & (data['rotation'] == 0)]
unmasked_data

Unnamed: 0,dataset,attention_type,model,img_id,mask_id,confs,ranks,mask,null,exp,threshold,rotation
25,ImageNet,attention-branch-network,ResNet-101,10,-1,"[5.0242652e-05, 2.3426071e-05, 3.1531763e-05, ...","[401, 605, 529, 968, 577, 896, 997, 432, 917, ...",KDE,black,1.0,,0
51,ImageNet,attention-branch-network,ResNet-101,115,-1,"[3.0585632e-06, 7.050344e-07, 9.1366826e-07, 1...","[84, 275, 223, 38, 122, 24, 37, 906, 984, 652,...",KDE,black,1.0,,0
77,ImageNet,attention-branch-network,ResNet-101,125,-1,"[1.4614342e-07, 2.5749837e-06, 2.9733073e-07, ...","[965, 599, 912, 836, 738, 556, 929, 734, 881, ...",KDE,black,1.0,,0
103,ImageNet,attention-branch-network,ResNet-101,126,-1,"[3.1096151e-06, 2.2654494e-06, 6.363943e-08, 2...","[365, 422, 974, 840, 957, 669, 983, 560, 905, ...",KDE,black,1.0,,0
129,ImageNet,attention-branch-network,ResNet-101,134,-1,"[2.2633388e-09, 1.5657108e-08, 6.1017985e-10, ...","[846, 575, 962, 944, 950, 347, 983, 760, 866, ...",KDE,black,1.0,,0
...,...,...,...,...,...,...,...,...,...,...,...,...
63595,CIFAR-100,learn-to-pay-attention,VGG,76,-1,"[0.006354826, 0.013585589, 0.0051908703, 0.005...","[53, 14, 67, 64, 22, 7, 79, 48, 11, 18, 84, 59...",seye,black,1.0,,0
63621,CIFAR-100,learn-to-pay-attention,VGG,790,-1,"[0.00040577984, 0.00068223744, 0.0022077095, 0...","[87, 49, 15, 44, 98, 4, 51, 65, 25, 32, 93, 50...",seye,black,1.0,,0
63647,CIFAR-100,learn-to-pay-attention,VGG,81,-1,"[0.0002182761, 0.00020017404, 5.1607694e-05, 0...","[39, 42, 92, 14, 22, 2, 67, 87, 33, 4, 93, 7, ...",seye,black,1.0,,0
63673,CIFAR-100,learn-to-pay-attention,VGG,90,-1,"[0.0011341472, 0.0017544234, 0.0013007402, 0.0...","[85, 49, 74, 68, 35, 15, 79, 51, 48, 30, 50, 7...",seye,black,1.0,,0


In [7]:
correctly_masked_idx = (
    (data['img_id'] == data['mask_id']) & 
    (data['rotation'] == 0)
)
data[correctly_masked_idx]

Unnamed: 0,dataset,attention_type,model,img_id,mask_id,confs,ranks,mask,null,exp,threshold,rotation
0,ImageNet,attention-branch-network,ResNet-101,10,10,"[2.2482002e-06, 9.736867e-05, 2.4229846e-06, 9...","[844, 155, 835, 953, 614, 761, 972, 296, 488, ...",KDE,black,1.0,,0
27,ImageNet,attention-branch-network,ResNet-101,115,115,"[4.424352e-05, 0.0032245403, 0.00015235787, 0....","[862, 57, 607, 524, 250, 589, 778, 280, 388, 4...",KDE,black,1.0,,0
54,ImageNet,attention-branch-network,ResNet-101,125,125,"[1.5943998e-06, 4.793991e-05, 1.1526313e-05, 1...","[972, 478, 720, 659, 699, 868, 878, 376, 629, ...",KDE,black,1.0,,0
81,ImageNet,attention-branch-network,ResNet-101,126,126,"[4.6545036e-05, 0.0009142669, 3.4490185e-05, 1...","[799, 175, 857, 968, 681, 870, 938, 279, 471, ...",KDE,black,1.0,,0
108,ImageNet,attention-branch-network,ResNet-101,134,134,"[1.4265131e-05, 0.00095442415, 5.933848e-05, 1...","[924, 171, 696, 910, 773, 815, 954, 371, 617, ...",KDE,black,1.0,,0
...,...,...,...,...,...,...,...,...,...,...,...,...
63590,CIFAR-100,learn-to-pay-attention,VGG,76,76,"[0.0055289054, 0.005709153, 0.010979016, 0.018...","[75, 69, 21, 7, 16, 94, 17, 83, 54, 28, 80, 9,...",seye,black,1.0,,0
63617,CIFAR-100,learn-to-pay-attention,VGG,790,790,"[0.0038734658, 0.004203082, 0.0018982192, 0.01...","[56, 54, 95, 5, 80, 66, 18, 97, 90, 10, 16, 55...",seye,black,1.0,,0
63644,CIFAR-100,learn-to-pay-attention,VGG,81,81,"[0.008888817, 0.0094863, 0.009009125, 0.017068...","[56, 38, 54, 5, 65, 48, 30, 33, 86, 28, 14, 95...",seye,black,1.0,,0
63671,CIFAR-100,learn-to-pay-attention,VGG,90,90,"[0.004992286, 0.0046671503, 0.001602864, 0.002...","[43, 46, 89, 76, 49, 75, 33, 37, 80, 18, 21, 3...",seye,black,1.0,,0


In [8]:
incorrectly_masked_idx = (
    (data['mask_id'] != -1) &
    (data['img_id'] != data['mask_id'])
)
data[incorrectly_masked_idx]

Unnamed: 0,dataset,attention_type,model,img_id,mask_id,confs,ranks,mask,null,exp,threshold,rotation
1,ImageNet,attention-branch-network,ResNet-101,10,115,"[1.7172952e-05, 0.0002940889, 1.34856455e-05, ...","[814, 236, 852, 895, 563, 801, 939, 293, 555, ...",KDE,black,1.0,,0
2,ImageNet,attention-branch-network,ResNet-101,10,125,"[5.1419697e-06, 0.00030278185, 1.306828e-05, 9...","[913, 218, 774, 831, 567, 732, 834, 417, 717, ...",KDE,black,1.0,,0
3,ImageNet,attention-branch-network,ResNet-101,10,126,"[1.6124724e-05, 0.00025059324, 2.3895567e-05, ...","[841, 300, 764, 717, 437, 646, 696, 439, 728, ...",KDE,black,1.0,,0
4,ImageNet,attention-branch-network,ResNet-101,10,134,"[3.9168767e-06, 5.959208e-05, 3.6086508e-06, 9...","[608, 131, 624, 894, 508, 761, 965, 217, 726, ...",KDE,black,1.0,,0
5,ImageNet,attention-branch-network,ResNet-101,10,146,"[7.4736577e-06, 0.0013359549, 3.2349308e-05, 2...","[972, 99, 785, 849, 630, 685, 929, 198, 332, 6...",KDE,black,1.0,,0
...,...,...,...,...,...,...,...,...,...,...,...,...
63693,CIFAR-100,learn-to-pay-attention,VGG,95,695,"[0.016376944, 0.0060382634, 0.006355504, 0.005...","[8, 59, 54, 61, 23, 71, 38, 50, 52, 24, 79, 5,...",seye,black,1.0,,0
63694,CIFAR-100,learn-to-pay-attention,VGG,95,76,"[0.011385047, 0.0066990685, 0.010248411, 0.009...","[11, 42, 16, 19, 17, 47, 66, 91, 34, 10, 81, 1...",seye,black,1.0,,0
63695,CIFAR-100,learn-to-pay-attention,VGG,95,790,"[0.026474101, 0.0038573723, 0.00171658, 0.0049...","[8, 50, 94, 39, 84, 70, 12, 40, 36, 0, 21, 42,...",seye,black,1.0,,0
63696,CIFAR-100,learn-to-pay-attention,VGG,95,81,"[0.0735066, 0.005536854, 0.0031309163, 0.06116...","[3, 39, 67, 4, 21, 72, 89, 38, 63, 6, 15, 52, ...",seye,black,1.0,,0


In [9]:
results = []

for unmasked_row in unmasked_data.itertuples():
    
    img_id = getattr(unmasked_row, 'img_id')
    dataset = getattr(unmasked_row, 'dataset')
    attention_type = getattr(unmasked_row, 'attention_type')
    model = getattr(unmasked_row, 'model')
    mask = getattr(unmasked_row, 'mask')

    hparam_match_idx = (
        (data['dataset'] == dataset) &
        (data['attention_type'] == attention_type) &
        (data['model'] == model) &
        (data['mask'] == mask) &
        (data['null'] == getattr(unmasked_row, 'null')) &
        (data['exp'] == getattr(unmasked_row, 'exp')) &
        (data['threshold'] == getattr(unmasked_row, 'threshold')) &     
        # Average only over the null rotation.
        (data['rotation'] == 0)
    )
    
    correctly_masked_row = data[(data['img_id'] == img_id) & correctly_masked_idx & hparam_match_idx]
    assert len(correctly_masked_row) == 1
    correctly_masked_row = correctly_masked_row.iloc[0]
    
    incorrectly_masked_rows = data[(data['img_id'] == img_id) & incorrectly_masked_idx & hparam_match_idx]

    k_dists = {}
    for k in range(1, 5+1):
        cumu_dist = np.array([0. ,0.])
        count = 0
        for incorrectly_masked_row in incorrectly_masked_rows.itertuples():
            dist = ranking_distance(
                getattr(unmasked_row, 'ranks'),
                getattr(correctly_masked_row, 'ranks'), 
                getattr(incorrectly_masked_row, 'ranks'),
                k=k)
            cumu_dist += np.array(dist)
            count += 1
        k_dists[k] = cumu_dist/count

    results += [{**{
        'dataset': dataset,
        'attention_type': attention_type,
        'model': model,
        'img_id': img_id,
        'mask': mask,
        }, **{'{}-rank distance (correct mask)'.format(k): k_dists[k][0] for k in k_dists}, 
           **{'{}-rank distance (incorrect mask)'.format(k): k_dists[k][1] for k in k_dists}
    }]

results = pd.DataFrame(results)
results.to_csv("ANN_recognition_human_masks_results.csv", index=False)
results

Unnamed: 0,dataset,attention_type,model,img_id,mask,1-rank distance (correct mask),2-rank distance (correct mask),3-rank distance (correct mask),4-rank distance (correct mask),5-rank distance (correct mask),1-rank distance (incorrect mask),2-rank distance (incorrect mask),3-rank distance (incorrect mask),4-rank distance (incorrect mask),5-rank distance (incorrect mask)
0,ImageNet,attention-branch-network,ResNet-101,10,KDE,0.0,0.0,0.0,28.0,70.0,13.250000,48.458333,51.041667,251.375000,356.208333
1,ImageNet,attention-branch-network,ResNet-101,115,KDE,662.0,1646.0,2546.0,3283.0,4212.0,545.708333,1492.833333,2338.916667,2954.333333,3722.208333
2,ImageNet,attention-branch-network,ResNet-101,125,KDE,502.0,537.0,716.0,734.0,880.0,566.958333,596.583333,842.916667,900.166667,1095.208333
3,ImageNet,attention-branch-network,ResNet-101,126,KDE,495.0,836.0,838.0,906.0,1063.0,486.375000,705.375000,706.708333,822.416667,1040.250000
4,ImageNet,attention-branch-network,ResNet-101,134,KDE,1.0,15.0,21.0,530.0,945.0,16.666667,70.875000,129.291667,550.500000,977.083333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2445,CIFAR-100,learn-to-pay-attention,VGG,76,seye,81.0,143.0,215.0,227.0,287.0,58.916667,91.291667,131.250000,149.666667,203.875000
2446,CIFAR-100,learn-to-pay-attention,VGG,790,seye,36.0,117.0,192.0,215.0,277.0,64.000000,137.041667,188.083333,225.291667,293.958333
2447,CIFAR-100,learn-to-pay-attention,VGG,81,seye,42.0,47.0,93.0,150.0,174.0,22.291667,59.750000,109.208333,164.125000,167.125000
2448,CIFAR-100,learn-to-pay-attention,VGG,90,seye,22.0,113.0,146.0,197.0,287.0,33.666667,125.083333,160.750000,208.916667,293.500000


In [10]:
ir = results
ir = ir.drop(columns=(f"{k}-rank distance (correct mask)" for k in range(2, 5+1)))
ir = ir.drop(columns=(f"{k}-rank distance (incorrect mask)" for k in range(2, 5+1)))

def inverse_rank(row):
    
    N = {
        "CIFAR-100": 100,
        "ImageNet": 1000,
        "Places365": 434,
    }[row["dataset"]]
    
    def invert(r):
        return N / (N + r)
    
    return pd.Series([invert(row["1-rank distance (correct mask)"]), invert(row["1-rank distance (incorrect mask)"])], 
                     index=['correct_inv_rank', 'incorrect_inv_rank'])

add = ir.apply(inverse_rank, axis=1)
ir = ir.drop(columns=("1-rank distance (correct mask)"))
ir = ir.drop(columns=("1-rank distance (incorrect mask)"))
ir = pd.concat((ir, add), axis=1)

ir.to_csv("ANN_recognition_human_masks_just_inverse_rank.csv", index=False)
ir

Unnamed: 0,dataset,attention_type,model,img_id,mask,correct_inv_rank,incorrect_inv_rank
0,ImageNet,attention-branch-network,ResNet-101,10,KDE,1.000000,0.986923
1,ImageNet,attention-branch-network,ResNet-101,115,KDE,0.601685,0.646953
2,ImageNet,attention-branch-network,ResNet-101,125,KDE,0.665779,0.638179
3,ImageNet,attention-branch-network,ResNet-101,126,KDE,0.668896,0.672778
4,ImageNet,attention-branch-network,ResNet-101,134,KDE,0.999001,0.983607
...,...,...,...,...,...,...,...
2445,CIFAR-100,learn-to-pay-attention,VGG,76,seye,0.552486,0.629261
2446,CIFAR-100,learn-to-pay-attention,VGG,790,seye,0.735294,0.609756
2447,CIFAR-100,learn-to-pay-attention,VGG,81,seye,0.704225,0.817717
2448,CIFAR-100,learn-to-pay-attention,VGG,90,seye,0.819672,0.748130
