In [1]:
# These imports enhance Python2/3 compatibility.
from __future__ import print_function, absolute_import, division, unicode_literals, with_statement

In [4]:
import cleanlab
import numpy as np
import pandas as pd
import torch

# For visualizing images of label errors
from PIL import Image
from torchvision import datasets
from matplotlib import pyplot as plt

# urllib2 for python2 and python3
try:
    # For Python 3.0 and later
    from urllib.request import urlopen
except ImportError:
    # Fall back to Python 2's urllib2
    from urllib2 import urlopen
    
# where imagenet dataset is located
# NOTE(review): despite its name, this points at the validation split ('val/').
# It is the directory passed to datasets.ImageFolder when the class-name mapping
# is built, and `train_dir` is reassigned to the 'train/' directory further down
# — confirm which split is intended here.
train_dir = '/datasets/datasets/imagenet/val/'

In [5]:
# Set-up name mapping for ImageNet train data
# The mapping file has whitespace-separated fields per line; field 0 is used as
# the nid and field 2 as the human-readable class name.
url = 'https://gist.githubusercontent.com/aaronpolhamus/964a4411c0906315deb9f4a3723aac57/'
url += 'raw/aa66dd9dbf6b56649fa3fab83659b2acbf3cbfd1/map_clsloc.txt'
# The Python 2 fallback (urllib2.urlopen) does not return a context manager, so
# the response is closed explicitly instead of with `with urlopen(url) as f`.
f = urlopen(url)
try:
    lines = [x.decode('utf-8') for x in f.readlines()]
finally:
    f.close()
# split() without arguments already discards the trailing newline, so a final
# line that lacks one does not lose the last character of its class name.
# Blank lines are skipped.
nid2name = dict((l.split()[0], l.split()[2]) for l in lines if l.strip())

# class_to_idx maps each class folder name (the nid) to its integer class index.
dataset = datasets.ImageFolder(train_dir)
nid2idx = dataset.class_to_idx
idx2nid = {v: k for k, v in nid2idx.items()}
# NOTE: class names are not unique (e.g. two different nids are both named
# 'maillot'), so this inverse mapping keeps only one nid for such names.
name2nid = {v: k for k, v in nid2name.items()}
idx2name = {k: nid2name[v] for k, v in idx2nid.items()}

## Analyze the train set on ImageNet

In [6]:
# CHANGE THIS TO CHANGE EXPERIMENT
# pyx_file = 'imagenet_val_out.npy' # NO FINE TUNING
pyx_file = 'imagenet__train__model_resnet50__pyx.npy' # trained from scratch with 10fold cv

# Directory containing the imagenet training images
train_dir = '/datasets/datasets/imagenet/train/'
# Directory containing the stored results
pyx_dir = '/datasets/cgn/pyx/imagenet/'

# Read the stored array, then split the image folder's (path, class index)
# pairs into a list of paths and an integer array of labels.
pyx = np.load(''.join([pyx_dir, pyx_file]))
train_samples = datasets.ImageFolder(train_dir).imgs
imgs = [sample[0] for sample in train_samples]
labels = np.array([sample[1] for sample in train_samples], dtype=int)

In [38]:
# Estimate the confident joint from the given labels and the predicted
# probabilities `pyx`. Later cells index it as cj[given label, true label].
cj = cleanlab.latent_estimation.estimate_confident_joint_from_probabilities(labels, pyx)

# A bad way to approach this problem would be to just look at the correlation between every pair of columns in the probability matrix. The problem is that correlation is symmetric, and it will also correlate every class that has small counts.

In [143]:
# Correlation between every pair of columns of pyx: after the transpose each
# column is a row, and np.corrcoef treats each row as one variable by default.
corr = np.corrcoef(pyx.T)

In [220]:
# Zero out the diagonal so self-correlations are excluded from the ranking.
corr_non_diag = corr - np.eye(len(corr)) * corr.diagonal()
# Flat indices ordered from largest to smallest off-diagonal correlation,
# converted back into (row, column) index pairs.
corr_largest_non_diag_raveled = np.argsort(corr_non_diag.ravel())[::-1]
corr_largest_non_diag = np.unravel_index(corr_largest_non_diag_raveled, corr_non_diag.shape)
corr_largest_non_diag = list(zip(*(list(z) for z in corr_largest_non_diag)))

# Slice before mapping to names: only the top 5 pairs are printed, so there is
# no reason to look up names for every one of the len(corr)**2 pairs.
print([(nid2name[idx2nid[z[0]]], nid2name[idx2nid[z[1]]]) for z in corr_largest_non_diag[:5]])
# NOTE(review): the diagonal of a correlation matrix is 1 for every class (up to
# floating point error), so this ordering is presumably arbitrary — confirm intent.
print([nid2name[idx2nid[z]] for z in corr.diagonal().argsort()[:10]])

[('projectile', 'missile'), ('missile', 'projectile'), ('Eskimo_dog', 'Siberian_husky'), ('Siberian_husky', 'Eskimo_dog'), ('sunglass', 'sunglasses')]
['cleaver', 'Australian_terrier', 'miniature_poodle', 'weasel', 'titi', 'ballpoint', 'bow', 'breastplate', 'can_opener', 'CD_player']


# Using confident joint

In [47]:
# Joint estimated from the confident joint, the labels and the probabilities.
joint = cleanlab.latent_estimation.estimate_joint(cj, labels, pyx)
# Off-diagonal part only: subtract a matrix that holds just the diagonal.
joint_non_diag = joint - np.diag(joint.diagonal().astype(float))

In [8]:
# Off-diagonal part of cj: subtract a matrix that holds just the diagonal.
cj_non_diag = cj - np.diag(cj.diagonal().astype(float))
# Positions in the flattened matrix, from largest to smallest entry.
largest_non_diag_raveled = cj_non_diag.ravel().argsort()[::-1]
# Turn the flat positions back into (row, column) tuples, in the same order.
largest_non_diag = list(zip(*np.unravel_index(largest_non_diag_raveled, cj_non_diag.shape)))

In [69]:
# Checks that joint correctly has rows that are p(s): each row of the joint
# must sum to the fraction of examples that carry that label.
# The deviation is compared in absolute value; a one-sided `diff < 1e-4` would
# also accept rows whose sum is far too small. minlength keeps the shapes
# aligned even if the highest-numbered classes never occur in `labels`.
label_fractions = np.bincount(labels, minlength = len(joint)) / len(labels)
assert np.all(np.abs(joint.sum(axis = 1) - label_fractions) < 1e-4)

In [123]:
class_name = 'bighorn'

# Position of `class_name` within several orderings of the classes. Each entry
# pairs the text printed for an ordering with the class indices in that order.
# NOTE(review): cj/joint are indexed [given label, true label] elsewhere in this
# notebook, so axis=0 sums are per true class and axis=1 sums are per given
# label; the "most noisy classes" / "most noisy true classes" wording may be
# swapped — confirm before relying on it.
orderings = [
    ("sorted diagonal of cj", cj.diagonal().argsort()),
    ("sorted diagonal of joint", joint.diagonal().argsort()),
    ("sorted most noisy classes in cj", np.argsort(cj_non_diag.sum(axis = 0))[::-1]),
    ("sorted most noisy classes in joint", np.argsort(joint_non_diag.sum(axis = 0))[::-1]),
    ("sorted most noisy true classes in cj", np.argsort(cj_non_diag.sum(axis = 1))[::-1]),
    ("sorted most noisy true classes in joint", np.argsort(joint_non_diag.sum(axis = 1))[::-1]),
]
for description, ordering in orderings:
    print("Index of '{}' in {}: ".format(class_name, description), end = "")
    print([nid2name[idx2nid[i]] for i in ordering].index(class_name))

# Smallest diagonal entry of each matrix.
idx = cj.diagonal().argmin()
print("Least confident class by diagonal of cj:", nid2name[idx2nid[idx]], idx)
idx = joint.diagonal().argmin()
print("Least confident class by diagonal of joint:", nid2name[idx2nid[idx]], idx)
# sum(axis = 0) adds up the rows, giving one total per column.
idx = cj_non_diag.sum(axis = 0).argmax()
print("Least confident class by max sum of column of non-diagonal elements of cj:", nid2name[idx2nid[idx]], idx)
# sum(axis = 1) gives one total per row; note this one uses joint, not cj.
idx = joint_non_diag.sum(axis = 1).argmax()
print("Least confident class by max sum of row of non-diagonal elements of joint:", nid2name[idx2nid[idx]], idx)
# The first entry of largest_non_diag is the (row, column) pair ranked first.
print('Largest noise rate:', [(nid2name[idx2nid[z]], z) for z in largest_non_diag[0]])

Index of 'bighorn' in sorted diagonal of cj: 374
Index of 'bighorn' in sorted diagonal of joint: 374
Index of 'bighorn' in sorted most noisy classes in cj: 150
Index of 'bighorn' in sorted most noisy classes in joint: 150
Index of 'bighorn' in sorted most noisy true classes in cj: 325
Index of 'bighorn' in sorted most noisy true classes in joint: 325
Least confident class by diagonal of cj: English_foxhound 167
Least confident class by diagonal of joint: English_foxhound 167
Least confident class by max sum of row of non-diagonal elements of cj: water_jug 899
Least confident class by max sum of column of non-diagonal elements of cj: maillot 638
Largest noise rate: [('projectile', 744), ('missile', 657)]


In [221]:
# Display the confident joint (numpy abbreviates large arrays when printing).
cj

array([[1238.9       ,    2.6       ,    0.        , ...,    0.        ,
           0.        ,    0.        ],
       [   1.27326151, 1264.34867777,    0.        , ...,    0.        ,
           0.        ,    0.        ],
       [   0.        ,    0.        , 1150.16465423, ...,    0.        ,
           0.        ,    0.        ],
       ...,
       [   0.        ,    0.        ,    0.        , ..., 1136.59621802,
           0.        ,    0.        ],
       [   0.        ,    0.        ,    0.        , ...,    0.        ,
         874.04809619,    0.        ],
       [   0.        ,    0.        ,    0.        , ...,    0.        ,
           0.        ,  861.52019002]])

In [251]:
def _display_name(class_idx):
    """Return the class name for `class_idx`, with two long names shortened."""
    shortened = idx2name[class_idx].replace('American_chameleon', 'chameleon')
    return shortened.replace('typewriter_keyboard', 'keyboard')

# One tuple per (i, j) pair among the first 30 entries of largest_non_diag:
# (name of i, name of j, nid of i, nid of j, cj[i, j] rounded to an int,
#  joint[i, j] rounded to 6 decimals).
top_pairs = largest_non_diag[:30]
edges = [
    (
        _display_name(i),
        _display_name(j),
        idx2nid[i],
        idx2nid[j],
        int(round(cj[i, j])),
        joint[i, j].round(6),
    )
    for i, j in top_pairs
]
# nodes = list({z for i,j in largest_non_diag[:30] for z in (idx2name[i], idx2name[j])})

In [268]:
# NOTE(review): `df` is only created in the following cell, so on a fresh
# top-to-bottom run this display raises NameError — consider moving it below.
df

Unnamed: 0,$\tilde{y}$ name,$y^*$ name,$\tilde{y}$ nid,$y^*$ nid,"$C(\tilde{y},y^*)$","$P(\tilde{y},y^*)$",Rank
0,projectile,missile,n04008634,n03773504,645,0.000503,1
1,tub,bathtub,n04493381,n02808440,539,0.000421,2
2,breastplate,cuirass,n02895154,n03146219,476,0.000371,3
3,green_lizard,chameleon,n01693334,n01682714,437,0.000341,4
4,chameleon,green_lizard,n01682714,n01693334,435,0.00034,5
5,missile,projectile,n03773504,n04008634,433,0.000338,6
6,maillot,maillot,n03710637,n03710721,417,0.000326,7
7,horned_viper,sidewinder,n01753488,n01756291,416,0.000324,8
8,corn,ear,n12144580,n13133613,410,0.00032,9
9,keyboard,space_bar,n04505470,n04264628,406,0.000317,10


In [270]:
# Table of the first 10 entries of `edges`, with a 1-based Rank column in front.
name_columns = [r"$\tilde{y}$ name", r"$y^*$ name"]
columns = name_columns + [r"$\tilde{y}$ nid", r"$y^*$ nid", r"$C(\tilde{y},y^*)$", r"$P(\tilde{y},y^*)$"]
df = pd.DataFrame(edges[:10], columns = columns)
df.insert(loc = 0, column = 'Rank', value = df.index + 1)

# LaTeX export. The column headers are already valid LaTeX math, so automatic
# escaping is switched off rather than undone afterwards with an exact-match
# string replacement (which silently does nothing if the spacing produced by
# to_latex ever changes). Underscores in the class names are escaped by hand;
# they are presumably the only LaTeX-special characters in the cell values.
df_tex = df.copy()
for column in name_columns:
    df_tex[column] = df_tex[column].map(lambda name: name.replace('_', '\\_'))
tex = df_tex.to_latex(index = False, escape = False)
print(tex)
# Rich display of the table with a fixed width for the count column.
df.style.set_properties(subset=[r"$C(\tilde{y},y^*)$"], **{'width': '50px'})

\begin{tabular}{rllllrr}
\toprule
 Rank & $\tilde{y}$ name   &   $y^*$ name   &   $\tilde{y}$ nid   &   $y^*$ nid   &   $C(\tilde{y},y^*)$   &   $P(\tilde{y},y^*)$  \\
\midrule
    1 &       projectile &       missile &       n04008634 &  n03773504 &                 645 &            0.000503 \\
    2 &              tub &       bathtub &       n04493381 &  n02808440 &                 539 &            0.000421 \\
    3 &      breastplate &       cuirass &       n02895154 &  n03146219 &                 476 &            0.000371 \\
    4 &     green\_lizard &     chameleon &       n01693334 &  n01682714 &                 437 &            0.000341 \\
    5 &        chameleon &  green\_lizard &       n01682714 &  n01693334 &                 435 &            0.000340 \\
    6 &          missile &    projectile &       n03773504 &  n04008634 &                 433 &            0.000338 \\
    7 &          maillot &       maillot &       n03710637 &  n03710721 &                 417 &            

Unnamed: 0,Rank,$\tilde{y}$ name,$y^*$ name,$\tilde{y}$ nid,$y^*$ nid,"$C(\tilde{y},y^*)$","$P(\tilde{y},y^*)$"
0,1,projectile,missile,n04008634,n03773504,645,0.000503
1,2,tub,bathtub,n04493381,n02808440,539,0.000421
2,3,breastplate,cuirass,n02895154,n03146219,476,0.000371
3,4,green_lizard,chameleon,n01693334,n01682714,437,0.000341
4,5,chameleon,green_lizard,n01682714,n01693334,435,0.00034
5,6,missile,projectile,n03773504,n04008634,433,0.000338
6,7,maillot,maillot,n03710637,n03710721,417,0.000326
7,8,horned_viper,sidewinder,n01753488,n01756291,416,0.000324
8,9,corn,ear,n12144580,n13133613,410,0.00032
9,10,keyboard,space_bar,n04505470,n04264628,406,0.000317


In [257]:
# Show the LaTeX source as a raw string (escape sequences visible).
tex

'\\begin{tabular}{lllllrr}\n\\toprule\n{} & \\$\\textbackslash tilde\\{y\\}\\$ name &    \\$y\\textasciicircum *\\$ name & \\$\\textbackslash tilde\\{y\\}\\$ nid &  \\$y\\textasciicircum *\\$ nid &  \\$C(\\textbackslash tilde\\{y\\},y\\textasciicircum *)\\$ &  \\$P(\\textbackslash tilde\\{y\\},y\\textasciicircum *)\\$ \\\\\n\\midrule\n0 &       projectile &       missile &       n04008634 &  n03773504 &                 645 &            0.000503 \\\\\n1 &              tub &       bathtub &       n04493381 &  n02808440 &                 539 &            0.000421 \\\\\n2 &      breastplate &       cuirass &       n02895154 &  n03146219 &                 476 &            0.000371 \\\\\n3 &     green\\_lizard &     chameleon &       n01693334 &  n01682714 &                 437 &            0.000341 \\\\\n4 &        chameleon &  green\\_lizard &       n01682714 &  n01693334 &                 435 &            0.000340 \\\\\n5 &          missile &    projectile &       n03773504 &  n04008634 &

In [250]:
# First 30 pairs as text: rounded cj entry | nid and name of the row class |
# nid and name of the column class.
for i, j in largest_non_diag[:30]:
    count = int(round(cj[i, j]))
    print("{} | {} {} | {} {}".format(count, idx2nid[i], idx2name[i], idx2nid[j], idx2name[j]))

645 | n04008634 projectile | n03773504 missile
539 | n04493381 tub | n02808440 bathtub
476 | n02895154 breastplate | n03146219 cuirass
437 | n01693334 green_lizard | n01682714 American_chameleon
435 | n01682714 American_chameleon | n01693334 green_lizard
433 | n03773504 missile | n04008634 projectile
417 | n03710637 maillot | n03710721 maillot
416 | n01753488 horned_viper | n01756291 sidewinder
410 | n12144580 corn | n13133613 ear
406 | n04505470 typewriter_keyboard | n04264628 space_bar
399 | n04355933 sunglass | n04356056 sunglasses
389 | n02110185 Siberian_husky | n02109961 Eskimo_dog
370 | n02979186 cassette_player | n04392985 tape_player
369 | n04356056 sunglasses | n04355933 sunglass
367 | n02109961 Eskimo_dog | n02110185 Siberian_husky
364 | n01797886 ruffed_grouse | n01807496 partridge
360 | n02412080 ram | n02415577 bighorn
360 | n13133613 ear | n12144580 corn
354 | n02395406 hog | n02396427 wild_boar
354 | n03642806 laptop | n03832673 notebook
351 | n03146219 cuirass | n02895

In [129]:
# sum(axis = 0) adds up the rows, so each total belongs to one *column* of the
# off-diagonal confident joint; the heading says "column sums" to match.
print("Top 30 column sums in confident joint (most noisy classes):\n")
[(idx2nid[i], idx2name[i]) for i in np.argsort(cj_non_diag.sum(axis = 0))[::-1][:30]]

Top 30 row sums in confident joint (most noisy classes):



[('n04560804', 'water_jug'),
 ('n03710637', 'maillot'),
 ('n04392985', 'tape_player'),
 ('n09332890', 'lakeside'),
 ('n03180011', 'desktop_computer'),
 ('n02808440', 'bathtub'),
 ('n03832673', 'notebook'),
 ('n04041544', 'radio'),
 ('n09428293', 'seashore'),
 ('n03179701', 'desk'),
 ('n03773504', 'missile'),
 ('n02123045', 'tabby'),
 ('n01756291', 'sidewinder'),
 ('n04026417', 'purse'),
 ('n07579787', 'plate'),
 ('n04356056', 'sunglasses'),
 ('n02109961', 'Eskimo_dog'),
 ('n03976657', 'pole'),
 ('n03782006', 'monitor'),
 ('n04008634', 'projectile'),
 ('n03866082', 'overskirt'),
 ('n03871628', 'packet'),
 ('n03532672', 'hook'),
 ('n04152593', 'screen'),
 ('n01693334', 'green_lizard'),
 ('n01740131', 'night_snake'),
 ('n04355933', 'sunglass'),
 ('n03216828', 'dock'),
 ('n07930864', 'cup'),
 ('n02988304', 'CD_player')]

# Analyze the validation set on ImageNet

In [5]:
# CHANGE THIS TO CHANGE EXPERIMENT
# pyx_file = 'imagenet_val_out.npy' # NO FINE TUNING
pyx_file = 'imagenet_val_out_cv_10fold.npy' # fine tuned with 10fold cv

# where imagenet dataset is located
val_dir = '/datasets/datasets/imagenet/val/'
# Stored results directory
pyx_dir = '/datasets/cgn/pyx/imagenet/'

# Load in data. The model outputs are read from `pyx_file`, so changing the
# experiment above actually changes which file is loaded.
with open(pyx_dir + pyx_file, 'rb') as f:
    out = np.load(f)
with open(pyx_dir + 'imagenet_val_labels.npy', 'rb') as f:
    labels = np.load(f)
# Softmax over dim 1 normalizes each row of `out` into probabilities.
pyx = torch.nn.functional.softmax(torch.from_numpy(out), dim = 1).numpy()

In [7]:
# set up mapping for imagenet validation data
# The mapping file has whitespace-separated fields per line; field 0 is used as
# the nid and field 2 as the human-readable class name.
url = 'https://gist.githubusercontent.com/aaronpolhamus/964a4411c0906315deb9f4a3723aac57/'
url += 'raw/aa66dd9dbf6b56649fa3fab83659b2acbf3cbfd1/map_clsloc.txt'
# The Python 2 fallback (urllib2.urlopen) does not return a context manager, so
# the response is closed explicitly instead of with `with urlopen(url) as f`.
f = urlopen(url)
try:
    lines = [x.decode('utf-8') for x in f.readlines()]
finally:
    f.close()
# split() without arguments already discards the trailing newline, so a final
# line that lacks one does not lose the last character of its class name.
# Blank lines are skipped.
nid2name = dict((l.split()[0], l.split()[2]) for l in lines if l.strip())

# class_to_idx maps each class folder name (the nid) to its integer class index.
dataset = datasets.ImageFolder(val_dir)
nid2idx = dataset.class_to_idx
idx2nid = {v: k for k, v in nid2idx.items()}
# NOTE: class names are not unique (e.g. two different nids are both named
# 'maillot'), so this inverse mapping keeps only one nid for such names.
name2nid = {v: k for k, v in nid2name.items()}
idx2name = {k: nid2name[v] for k, v in idx2nid.items()}

In [8]:
# Confident joint estimated from the validation labels and probabilities.
cj = cleanlab.latent_estimation.estimate_confident_joint_from_probabilities(labels, pyx)
# NOTE(review): presumably (prior of the true labels, noise matrix, inverse
# noise matrix) — confirm against the cleanlab `estimate_latent` documentation.
py, nm, inv = cleanlab.latent_estimation.estimate_latent(cj, labels)

In [9]:
# Remove the diagonal of cj by subtracting a diagonal-only matrix built from it.
cj_diagonal_only = np.diag(cj.diagonal().astype(float))
cj_non_diag = cj - cj_diagonal_only
# Flat positions ordered from the largest off-diagonal entry to the smallest.
largest_non_diag_raveled = cj_non_diag.ravel().argsort()[::-1]
# Recover the matching row and column indices and pair them up.
row_indices, col_indices = np.unravel_index(largest_non_diag_raveled, cj_non_diag.shape)
largest_non_diag = list(zip(row_indices, col_indices))

In [39]:
print("Top 30 row sums in confident joint (most noisy classes):\n")
# One total per row of the off-diagonal confident joint, largest first.
off_diag_row_sums = cj_non_diag.sum(axis = 1)
noisiest_first = off_diag_row_sums.argsort()[::-1]
[(idx2nid[k], idx2name[k]) for k in noisiest_first[:30]]

Top 30 row sums in confident joint (most noisy classes):



[('n02123159', 'tiger_cat'),
 ('n04355933', 'sunglass'),
 ('n04152593', 'screen'),
 ('n02113624', 'toy_poodle'),
 ('n02979186', 'cassette_player'),
 ('n02808440', 'bathtub'),
 ('n03642806', 'laptop'),
 ('n03637318', 'lampshade'),
 ('n02988304', 'CD_player'),
 ('n02107908', 'Appenzeller'),
 ('n02871525', 'bookshop'),
 ('n01753488', 'horned_viper'),
 ('n01682714', 'American_chameleon'),
 ('n03180011', 'desktop_computer'),
 ('n02110185', 'Siberian_husky'),
 ('n02412080', 'ram'),
 ('n03710637', 'maillot'),
 ('n12144580', 'corn'),
 ('n02441942', 'weasel'),
 ('n07734744', 'mushroom'),
 ('n03179701', 'desk'),
 ('n01740131', 'night_snake'),
 ('n02106030', 'collie'),
 ('n01667778', 'terrapin'),
 ('n04008634', 'projectile'),
 ('n03950228', 'pitcher'),
 ('n02119022', 'red_fox'),
 ('n03710721', 'maillot'),
 ('n03782006', 'monitor'),
 ('n04560804', 'water_jug')]

In [127]:
# First 30 pairs as text: raw cj entry | nid and name of the row class |
# nid and name of the column class.
for i, j in largest_non_diag[:30]:
    fields = [cj[i, j], "|", idx2nid[i], idx2name[i], "|", idx2nid[j], idx2name[j]]
    print(*fields)

644.983461962514 | n04008634 projectile | n03773504 missile
538.9486260454004 | n04493381 tub | n02808440 bathtub
475.6818181818183 | n02895154 breastplate | n03146219 cuirass
436.84210526315803 | n01693334 green_lizard | n01682714 American_chameleon
435.0343473994112 | n01682714 American_chameleon | n01693334 green_lizard
432.89205702647666 | n03773504 missile | n04008634 projectile
417.0893054024257 | n03710637 maillot | n03710721 maillot
415.62819203268646 | n01753488 horned_viper | n01756291 sidewinder
410.09463722397487 | n12144580 corn | n13133613 ear
406.25000000000006 | n04505470 typewriter_keyboard | n04264628 space_bar
399.1764705882354 | n04355933 sunglass | n04356056 sunglasses
389.451476793249 | n02110185 Siberian_husky | n02109961 Eskimo_dog
369.8744769874478 | n02979186 cassette_player | n04392985 tape_player
368.79432624113485 | n04356056 sunglasses | n04355933 sunglass
366.8049792531121 | n02109961 Eskimo_dog | n02110185 Siberian_husky
363.6563876651983 | n01797886 ruf