In [1]:
# These imports enhance Python2/3 compatibility.
from __future__ import print_function, absolute_import, division, unicode_literals, with_statement

In [2]:
from contextlib import closing

import cleanlab
import numpy as np
import torch

# For visualizing images of label errors
from PIL import Image
from torchvision import datasets
from matplotlib import pyplot as plt

# urllib2 for python2 and python3
try:
    # For Python 3.0 and later
    from urllib.request import urlopen
except ImportError:
    # Fall back to Python 2's urllib2
    from urllib2 import urlopen

In [22]:
# Set-up name mapping for ImageNet
url = 'https://gist.githubusercontent.com/aaronpolhamus/964a4411c0906315deb9f4a3723aac57/'
url += 'raw/aa66dd9dbf6b56649fa3fab83659b2acbf3cbfd1/map_clsloc.txt'
# closing() is used because the object returned by Python 2's urllib2.urlopen()
# is not a context manager; closing() works with both the py2 and py3 imports.
with closing(urlopen(url)) as f:
    lines = [x.decode('utf-8') for x in f.readlines()]

# Each line is whitespace-separated: the first field is the class id (nid) and
# the third field is the class name. split() with no argument also discards the
# line ending, so a final line without a trailing newline keeps its full name.
# Lines with fewer than three fields (e.g. blank lines) are skipped.
nid2name = {}
for line in lines:
    fields = line.split()
    if len(fields) >= 3:
        nid2name[fields[0]] = fields[2]

# NOTE(review): `data_dir` is only assigned in a later cell (the validation
# section), so this cell fails on a fresh top-to-bottom run — confirm which
# directory is intended here.
dataset = datasets.ImageFolder(data_dir)
# nid -> integer class index, as assigned by ImageFolder
nid2idx = dataset.class_to_idx
# integer class index -> nid
idx2nid = {v: k for k, v in nid2idx.items()}
# Class names are not unique (two different nids are both named "maillot" in
# the printed results), so this reverse map keeps only one nid per name.
name2nid = {v: k for k, v in nid2name.items()}
# integer class index -> class name
idx2name = {k: nid2name[v] for k, v in idx2nid.items()}

## Analyze the train set on ImageNet

In [15]:
# --- Experiment selection (train split) ---
# Point `pyx_file` at a different saved array to analyze another experiment.
# Alternative without fine-tuning: 'imagenet_val_out.npy'
pyx_file = 'imagenet__train__model_resnet50__pyx.npy'  # fine tuned with 10fold cv

# Root folder of the ImageNet training images
train_dir = '/datasets/datasets/imagenet/train/'
# Folder that holds the stored results
pyx_dir = '/datasets/cgn/pyx/imagenet/'

# Read the stored array, then split the dataset's (path, class index) pairs
# into a list of image paths and an integer array of labels.
pyx = np.load(pyx_dir + pyx_file)
train_samples = datasets.ImageFolder(train_dir).imgs
imgs = [path for path, target in train_samples]
labels = np.array([target for path, target in train_samples], dtype=int)

In [19]:
# Estimate the confident joint from the train labels and the loaded `pyx` array.
estimate_joint = cleanlab.latent_estimation.estimate_confident_joint_from_probabilities
cj = estimate_joint(labels, pyx)

In [21]:
# Work on a float copy of the confident joint with its diagonal zeroed, so that
# only off-diagonal entries compete in the ranking below.
cj_non_diag = np.array(cj, dtype=float)
np.fill_diagonal(cj_non_diag, 0)

# argsort is ascending; reversing it puts the largest entries first.
largest_non_diag_raveled = np.argsort(cj_non_diag.ravel())[::-1]

# Convert flat positions back to (row, column) index pairs, largest entry first.
row_ids, col_ids = np.unravel_index(largest_non_diag_raveled, cj_non_diag.shape)
largest_non_diag = list(zip(row_ids, col_ids))

In [26]:
# Print the ten largest off-diagonal entries as:
#   rounded count | row class id and name | column class id and name
row_template = "{} | {} {} | {} {}"
for row_idx, col_idx in largest_non_diag[:10]:
    count = int(round(cj[row_idx, col_idx]))
    print(row_template.format(count,
                              idx2nid[row_idx], idx2name[row_idx],
                              idx2nid[col_idx], idx2name[col_idx]))

645 | n04008634 projectile | n03773504 missile
539 | n04493381 tub | n02808440 bathtub
476 | n02895154 breastplate | n03146219 cuirass
437 | n01693334 green_lizard | n01682714 American_chameleon
435 | n01682714 American_chameleon | n01693334 green_lizard
433 | n03773504 missile | n04008634 projectile
417 | n03710637 maillot | n03710721 maillot
416 | n01753488 horned_viper | n01756291 sidewinder
410 | n12144580 corn | n13133613 ear
406 | n04505470 typewriter_keyboard | n04264628 space_bar


# Analyze the validation set on ImageNet

In [3]:
# CHANGE THIS TO CHANGE EXPERIMENT
# Alternative without fine-tuning: 'imagenet_val_out.npy'
pyx_file = 'imagenet_val_out_cv_10fold.npy'  # fine tuned with 10fold cv

# where imagenet dataset is located
data_dir = '/media/ssd/datasets/datasets/imagenet/val/'
# Stored results directory
pyx_dir = '/media/ssd/datasets/pyx/imagenet/'

# Load in data.
# The model outputs are read from `pyx_file`, so the experiment switch above
# actually selects which file is loaded.
with open(pyx_dir + pyx_file, 'rb') as f:
    out = np.load(f)
with open(pyx_dir + 'imagenet_val_labels.npy', 'rb') as f:
    labels = np.load(f)

# Softmax along dim 1 converts the stored outputs into probabilities
# (presumably one row per example — confirm against how `out` was saved).
pyx = torch.nn.functional.softmax(torch.from_numpy(out), dim=1).numpy()

In [4]:
# Download the ImageNet class-id -> class-name table.
url = 'https://gist.githubusercontent.com/aaronpolhamus/964a4411c0906315deb9f4a3723aac57/'
url += 'raw/aa66dd9dbf6b56649fa3fab83659b2acbf3cbfd1/map_clsloc.txt'
# closing() is used because the object returned by Python 2's urllib2.urlopen()
# is not a context manager; closing() works with both the py2 and py3 imports.
with closing(urlopen(url)) as f:
    lines = [x.decode('utf-8') for x in f.readlines()]

# Each line is whitespace-separated: the first field is the class id (nid) and
# the third field is the class name. split() with no argument also discards the
# line ending, so a final line without a trailing newline keeps its full name.
# Lines with fewer than three fields (e.g. blank lines) are skipped.
nid2name = {}
for line in lines:
    fields = line.split()
    if len(fields) >= 3:
        nid2name[fields[0]] = fields[2]

# Build the lookup tables between nid, integer class index and class name.
dataset = datasets.ImageFolder(data_dir)
# nid -> integer class index, as assigned by ImageFolder
nid2idx = dataset.class_to_idx
# integer class index -> nid
idx2nid = {v: k for k, v in nid2idx.items()}
# Class names are not unique (two different nids are both named "maillot" in
# the printed results), so this reverse map keeps only one nid per name.
name2nid = {v: k for k, v in nid2name.items()}
# integer class index -> class name
idx2name = {k: nid2name[v] for k, v in idx2nid.items()}

In [5]:
# Estimate the confident joint from the validation labels and probabilities,
# then derive cleanlab's latent estimates from it.
latent = cleanlab.latent_estimation
cj = latent.estimate_confident_joint_from_probabilities(labels, pyx)
# NOTE(review): presumably the class prior, noise matrix and inverse noise
# matrix, in that order — confirm against the cleanlab documentation.
py, nm, inv = latent.estimate_latent(cj, labels)

In [6]:
# Work on a float copy of the confident joint with its diagonal zeroed, so that
# only off-diagonal entries compete in the ranking below.
cj_non_diag = np.array(cj, dtype=float)
np.fill_diagonal(cj_non_diag, 0)

# argsort is ascending; reversing it puts the largest entries first.
largest_non_diag_raveled = np.argsort(cj_non_diag.ravel())[::-1]

# Convert flat positions back to (row, column) index pairs, largest entry first.
row_ids, col_ids = np.unravel_index(largest_non_diag_raveled, cj_non_diag.shape)
largest_non_diag = list(zip(row_ids, col_ids))

In [7]:
# Print the hundred largest off-diagonal entries as:
#   count | row class id and name | column class id and name
row_template = "{} | {} {} | {} {}"
for row_idx, col_idx in largest_non_diag[:100]:
    print(row_template.format(cj[row_idx, col_idx],
                              idx2nid[row_idx], idx2name[row_idx],
                              idx2nid[col_idx], idx2name[col_idx]))

26 | n02979186 cassette_player | n04392985 tape_player
23 | n03773504 missile | n04008634 projectile
23 | n03642806 laptop | n03832673 notebook
23 | n02808440 bathtub | n04493381 tub
23 | n13133613 ear | n12144580 corn
22 | n03710721 maillot | n03710637 maillot
22 | n01682714 American_chameleon | n01693334 green_lizard
21 | n02895154 breastplate | n03146219 cuirass
20 | n02412080 ram | n02415577 bighorn
19 | n04008634 projectile | n03773504 missile
18 | n01753488 horned_viper | n01756291 sidewinder
18 | n02107908 Appenzeller | n02107574 Greater_Swiss_Mountain_dog
18 | n12144580 corn | n13133613 ear
17 | n03146219 cuirass | n02895154 breastplate
17 | n02113624 toy_poodle | n02113712 miniature_poodle
16 | n03710637 maillot | n03710721 maillot
16 | n01693334 green_lizard | n01682714 American_chameleon
16 | n02123159 tiger_cat | n02123045 tabby
16 | n02110185 Siberian_husky | n02109961 Eskimo_dog
15 | n04355933 sunglass | n04356056 sunglasses
15 | n04152593 screen | n03782006 monitor
14 | 