In [None]:
import json
import os
from collections import defaultdict

import numpy as np
import torch
import torchvision.transforms as transforms
from IPython.display import display
from PIL import Image
from torchvision.datasets import CelebA, CIFAR10, CIFAR100, MNIST
from tqdm import tqdm

from main import DatasetLoader
import util

# Parameters
# Overrides handed to util.get_config; add entries such as
# 'runtime.dataset_dir=./' to change where the datasets are loaded from.
params = []
args = util.get_config(params)


# Dataset classes that the cells of this notebook iterate over.
DATASETS = (CIFAR100, CelebA)

# Class names per dataset class; lname() indexes each list with the numeric label.
label_names = {
    CIFAR100: [
        'apple', 'aquarium_fish', 'baby', 'bear', 'beaver', 'bed', 'bee', 'beetle', 'bicycle', 'bottle',
        'bowl', 'boy', 'bridge', 'bus', 'butterfly', 'camel', 'can', 'castle', 'caterpillar', 'cattle',
        'chair', 'chimpanzee', 'clock', 'cloud', 'cockroach', 'couch', 'crab', 'crocodile', 'cup', 'dinosaur',
        'dolphin', 'elephant', 'flatfish', 'forest', 'fox', 'girl', 'hamster', 'house', 'kangaroo', 'keyboard',
        'lamp', 'lawn_mower', 'leopard', 'lion', 'lizard', 'lobster', 'man', 'maple_tree', 'motorcycle', 'mountain',
        'mouse', 'mushroom', 'oak_tree', 'orange', 'orchid', 'otter', 'palm_tree', 'pear', 'pickup_truck', 'pine_tree',
        'plain', 'plate', 'poppy', 'porcupine', 'possum', 'rabbit', 'raccoon', 'ray', 'road', 'rocket',
        'rose', 'sea', 'seal', 'shark', 'shrew', 'skunk', 'skyscraper', 'snail', 'snake', 'spider',
        'squirrel', 'streetcar', 'sunflower', 'sweet_pepper', 'table', 'tank', 'telephone', 'television', 'tiger', 'tractor',
        'train', 'trout', 'tulip', 'turtle', 'wardrobe', 'whale', 'willow_tree', 'wolf', 'woman', 'worm',
    ],
}

def lname(dataset, l):
    """Return a printable name for label `l` of `dataset`.

    When `dataset` has an entry in the module-level `label_names` mapping,
    the entry is indexed with `l` and that element is returned. Otherwise
    the label itself is returned as a string via `str(l)`.
    """
    try:
        names = label_names[dataset]
    except KeyError:
        # No name table registered for this dataset: fall back to the raw label.
        return str(l)
    return names[l]


In [None]:
# Duplicate Detection
#
# You may skip this cell. Results can be found in duplicates.json and are loaded subsequently.

# duplicates[<dataset name>] collects three entries per dataset:
#   'training_duplicates' / 'testing_duplicates': lists of index groups whose images hash identically
#   'test_in_train': test index -> list of training indices with the same image hash
duplicates = defaultdict(dict)


def _image_key(image):
    """Return a hash of the raw pixel bytes of `image`.

    Images with identical pixel data get identical keys. The value comes from
    Python's built-in `hash`, so it is only compared within this run and is
    never written to disk. Distinct images could in principle collide; this
    cell does not re-check pixel equality.
    """
    return hash(np.array(image).tobytes())


for dataset in DATASETS:
    print(f"Doing {dataset.__name__} ...")

    # CelebA is constructed with a named split; the other datasets take a boolean train flag.
    if dataset in (CelebA,):
        ds_train = dataset(args.runtime.dataset_dir, split='train', target_type='identity')
        ds_test = dataset(args.runtime.dataset_dir, split='test', target_type='identity')
    else:
        ds_train = dataset(args.runtime.dataset_dir, train=True)
        ds_test = dataset(args.runtime.dataset_dir, train=False)

    # Bucket the training indices by image key.
    ids_train = defaultdict(list)
    print("  training:")
    for i, sample in enumerate(tqdm(ds_train)):
        ids_train[_image_key(sample[0])].append(i)

    # Bucket the test indices the same way and remember every test image
    # whose key also occurs in the training split.
    ids_test = defaultdict(list)
    dups_test_in_train = dict()
    print("  testing:")
    for i, sample in enumerate(tqdm(ds_test)):
        key = _image_key(sample[0])  # hash each image only once
        ids_test[key].append(i)
        if key in ids_train:
            dups_test_in_train[i] = ids_train[key]

    # A bucket with more than one index is a group of duplicates within one split.
    for ids, part in ((ids_train, 'training'), (ids_test, 'testing')):
        dups = [v for v in ids.values() if len(v) > 1]
        print(f"Found {len(dups)} {part} duplicates")
        duplicates[dataset.__name__][f"{part}_duplicates"] = dups

    print(f"Found {len(dups_test_in_train)} duplicates in training and testing")
    duplicates[dataset.__name__]["test_in_train"] = dups_test_in_train

    # The file is rewritten after every dataset, so it always holds all results so far.
    # Note: json turns the integer keys of 'test_in_train' into strings.
    with open('duplicates.json', 'w') as fp:
        json.dump(duplicates, fp, indent=1)

    



In [None]:
# Print LaTeX Table with duplicates and save PNG files to ./imgs


# Detection
with open('duplicates.json', 'r') as fp:
    duplicates = json.load(fp)

# The PNG files referenced by the table are written to ./imgs; create it if missing.
os.makedirs('./imgs', exist_ok=True)

# Prefix / suffix wrapped around every image path in the first table column.
include_cmd = (r"\includegraphics[align=c,width=.8cm]{", r"}\vspace{1px}")

for dataset in DATASETS:
    print(f"% {dataset.__name__}:")
    print(r"""\begin{tabular}{cll}
  \toprule
  Image & Training (ID:Label) & Testing (ID:Label) \\
  \midrule""")
    train_dups = duplicates[dataset.__name__]['training_duplicates']
    test_dups = duplicates[dataset.__name__]['testing_duplicates']
    test_in_train = duplicates[dataset.__name__]['test_in_train']

    # CelebA is constructed with a named split; the other datasets take a boolean train flag.
    if dataset in (CelebA,):
        ds_train = dataset(args.runtime.dataset_dir, split='train', target_type='identity')
        ds_test = dataset(args.runtime.dataset_dir, split='test', target_type='identity')
    else:
        ds_train = dataset(args.runtime.dataset_dir, train=True)
        ds_test = dataset(args.runtime.dataset_dir, train=False)

    # Duplicates inside one split: one table row per group of identical images.
    for dups, ds, part in zip((train_dups, test_dups), (ds_train, ds_test), ('TRAIN', 'TEST')):
        print(f"  % {part}")
        i_dups = 0
        i_dups_wrong_label = 0
        for v in dups:
            x = np.array([np.array(ds[i][0]) for i in v])
            y = np.array([ds[i][1] for i in v])

            # Every image of the group must be pixel-identical to the first one.
            assert (x == x[0:1]).all(), f"indices {v} of {part} are not pixel-identical"
            i_dups += 1
            # Compare element-wise: summing signed label differences can cancel to zero
            # (e.g. labels 5, 3, 7) and would hide a mismatch.
            if (y != y[0]).any():
                i_dups_wrong_label += 1

            # Underscores must be escaped for LaTeX.
            names = [lname(dataset, label).replace("_", "\\_") for label in y]

            img_name = f'./imgs/{dataset.__name__!s}_{part}_{v[0]}.png'
            img_txt = ", ".join([f"{i}:\\texttt{{{name}}}" for name, i in zip(names, v)])
            ds[v[0]][0].save(img_name)
            if part == "TRAIN":
                print(f'  {include_cmd[0]}{img_name}{include_cmd[1]} & {img_txt} & -- \\\\')
            if part == "TEST":
                print(f'  {include_cmd[0]}{img_name}{include_cmd[1]} & -- & {img_txt} \\\\')

        print(f"  % {part}: {i_dups} duplicates with {i_dups_wrong_label} label mismatch")

    # Test images that also occur in the training split.
    print("  % TEST and TRAIN")
    i_dups = 0
    i_dups_wrong_label = 0
    # All test indices that already belong to a test-internal duplicate group.
    test_dup_indices = {i for group in test_dups for i in group}
    for k, v in test_in_train.items():
        k = int(k)  # JSON object keys are strings
        assert k not in test_dup_indices
        y0 = int(ds_test[k][1])
        y1 = np.array([ds_train[i][1] for i in v])
        i_dups += 1
        # Any training label differing from the test label counts as a mismatch.
        if (y1 != y0).any():
            i_dups_wrong_label += 1
        name0 = lname(dataset, y0).replace("_", "\\_")
        names1 = [lname(dataset, label).replace("_", "\\_") for label in y1]
        img_name = f'./imgs/{dataset.__name__!s}_TEST_{k}.png'
        img_txt1 = ", ".join([f"{i}:\\texttt{{{name}}}" for name, i in zip(names1, v)])
        img_txt0 = f"{k}:\\texttt{{{name0}}}"
        ds_test[k][0].save(img_name)
        print(f'  {include_cmd[0]}{img_name}{include_cmd[1]} & {img_txt1} & {img_txt0} \\\\')
    print(f"  % TRAIN in TEST: {i_dups} duplicates with {i_dups_wrong_label} label mismatch")
    print(r"\bottomrule")
    print(r"\end{tabular}")

In [None]:
# Display Duplicates
#
# Uses the `duplicates` mapping left in the kernel by an earlier cell
# (dataset name -> 'training_duplicates' / 'testing_duplicates' / 'test_in_train').
for dataset in DATASETS:
    print(f"\n\n{dataset.__name__}:")

    # CelebA is constructed with a named split; the other datasets take a boolean train flag.
    if dataset in (CelebA,):
        ds_train = dataset(args.runtime.dataset_dir, split='train', target_type='identity')
        ds_test = dataset(args.runtime.dataset_dir, split='test', target_type='identity')
    else:
        ds_train = dataset(args.runtime.dataset_dir, train=True)
        ds_test = dataset(args.runtime.dataset_dir, train=False)

    # Duplicates inside one split: show the first image of each group and list its members.
    for ds, part in ((ds_train, 'training'), (ds_test, 'testing')):
        dups = duplicates[dataset.__name__][f"{part}_duplicates"]

        print(f"\n  {part}")

        i_dups = 0
        i_dups_wrong_label = 0
        for v in dups:
            x = np.array([np.array(ds[i][0]) for i in v])
            y = np.array([ds[i][1] for i in v])

            # Every image of the group must be pixel-identical to the first one.
            assert (x == x[0:1]).all(), f"indices {v} of {part} are not pixel-identical"

            i_dups += 1
            # Compare element-wise: summing signed label differences can cancel to zero
            # (e.g. labels 5, 3, 7) and would hide a mismatch.
            if (y != y[0]).any():
                i_dups_wrong_label += 1

            names = [lname(dataset, label) for label in y]

            img_txt = ", ".join([f"{i}:{name}" for name, i in zip(names, v)])
            display(ds[v[0]][0])
            print(f'    {img_txt}')

        print(f"{part} had {i_dups} duplicates with {i_dups_wrong_label} label mismatches")

    # Test images that also occur in the training split.
    i_dups = 0
    i_dups_wrong_label = 0
    for i, v in duplicates[dataset.__name__]["test_in_train"].items():
        i = int(i)  # keys are strings when the mapping was loaded from JSON
        img1 = ds_test[i][0]
        y0 = int(ds_test[i][1])
        y1 = np.array([ds_train[j][1] for j in v])
        i_dups += 1
        # Any training label differing from the test label counts as a mismatch.
        if (y1 != y0).any():
            i_dups_wrong_label += 1
        name0 = lname(dataset, y0)
        names1 = [lname(dataset, label) for label in y1]
        display(img1)
        img_txt = f"testing:: {i}:{name0} -- training:: " + ",".join([f"{j}:{name}" for name, j in zip(names1, v)])
        print(f'    {img_txt}')
    print(f"{i_dups} duplicates in training and testing with {i_dups_wrong_label} label mismatches")


