In [None]:
import matplotlib.pyplot as plt 
import numpy as np

In [None]:
##############################################
## Load, partition, and resize CIFAR10 Data ##
##############################################
def loadData(data_dir="/scratch/gpfs/eysu/src_data/cifar-10-batches-py",
             n_valid=5000, permutation_path=None):
    """Load the pickled CIFAR-10 python batches and split them into train/valid/test.

    The five ``data_batch_*`` files are concatenated in order, the first
    ``n_valid`` rows become the validation set and the rest the training set.
    Every image is reshaped from a flat 3072-vector to ``(32, 32, 3)``
    (channels last), converted to float32 and scaled by 1/255.

    Parameters
    ----------
    data_dir : str
        Directory containing ``data_batch_1`` ... ``data_batch_5`` and
        ``test_batch``.
    n_valid : int
        Number of leading training rows to hold out for validation.
    permutation_path : str or None
        Optional CSV file with one integer per training row. When given, row
        ``k`` of the concatenated training data is moved to position
        ``permutation[k]`` before the train/valid split. ``None`` (default)
        keeps the on-disk order.

    Returns
    -------
    x_train, x_valid, x_test : float32 arrays of shape (N, 32, 32, 3)
    y_train, y_valid, y_test : 1-D integer label arrays
    labels : list of the 10 class names, indexed by class id

    Raises
    ------
    ValueError
        If ``n_valid`` is outside ``[0, number of training rows]`` or the
        permutation length does not match the number of training rows.
    """
    import os
    import pickle

    # unpickle one binary batch file
    def unpickle(file):
        # NOTE: pickle can execute arbitrary code on load; only point this at trusted files.
        with open(file, 'rb') as fo:
            return pickle.load(fo, encoding='bytes')

    # flat (N, 3072) rows -> (N, 32, 32, 3) float32 images scaled by 1/255
    def to_images(flat_rows):
        flat_rows = np.asarray(flat_rows)
        images = flat_rows.reshape(-1, 3, 32, 32).transpose(0, 2, 3, 1)
        return images.astype('float32') / 255

    labels = ['airplane',  # index 0
              'automobile',  # index 1
              'bird',  # index 2
              'cat',  # index 3
              'deer',  # index 4
              'dog',  # index 5
              'frog',  # index 6
              'horse',  # index 7
              'ship',  # index 8
              'truck']  # index 9

    # batches.meta is not read: the class names are hard-coded above
    train_batches = [unpickle(os.path.join(data_dir, "data_batch_%d" % k))
                     for k in range(1, 6)]
    test = unpickle(os.path.join(data_dir, "test_batch"))

    # concatenate the five batches into one big training array
    x_train = np.concatenate([batch[b'data'] for batch in train_batches], axis=0)
    y_train = np.concatenate([batch[b'labels'] for batch in train_batches])

    if permutation_path is not None:
        # shuffle images and labels in unison: old row k goes to permutation[k]
        permutation = np.loadtxt(permutation_path, delimiter=',').astype(np.int64)
        if permutation.shape != (x_train.shape[0],):
            raise ValueError("permutation must have one entry per training row")
        shuffled_x = np.empty_like(x_train)
        shuffled_y = np.empty_like(y_train)
        shuffled_x[permutation] = x_train
        shuffled_y[permutation] = y_train
        x_train, y_train = shuffled_x, shuffled_y

    if not 0 <= n_valid <= x_train.shape[0]:
        raise ValueError("n_valid must be between 0 and the number of training rows")

    # first n_valid rows -> validation, remaining rows -> train
    x_valid, x_train = x_train[:n_valid], x_train[n_valid:]
    y_valid, y_train = y_train[:n_valid], y_train[n_valid:]

    return (to_images(x_train), to_images(x_valid), to_images(test[b'data']),
            np.array(y_train), np.array(y_valid), np.array(test[b'labels']),
            labels)

# Load all splits once at notebook scope; `labels` (the class-name list) is used
# by the plotting cells below for legend entries and titles.
x_train, x_valid, x_test, y_train, y_valid, y_test, labels = loadData()

# Analysis 1: Divisions of classes

In [None]:
########################################################
## Examine the divisions in classes across iterations ##
########################################################
# Count how many images carry each class label after every iteration of the
# untrained-model run, then plot one curve per class.

# hard labels sampled after every iteration for every image;
# indexed below as all_labels[image, iteration]
all_labels = np.load("/scratch/gpfs/eysu/Sampling/CIFAR_1000/labels.npy")

n_iters = all_labels.shape[1]
n_classes = 10

# divisions[i, c] = number of images labeled as class c at iteration i
divisions = np.zeros((n_iters, n_classes))
for i in range(n_iters):
    # bincount(minlength=...) keeps a 0 for classes that received no images at
    # this iteration; np.unique(..., return_counts=True) would return fewer
    # than 10 counts in that case and the row assignment would raise.
    divisions[i] = np.bincount(all_labels[:, i].astype(np.int64), minlength=n_classes)

x = np.arange(n_iters)
plt.figure(figsize=(12,8))
for j in range(n_classes):
    plt.plot(x, divisions[:, j], label = "Class: " + labels[j])

plt.title("Divisions of Classes over 1000 Iterations on Untrained Model")
plt.ylim([0, 12000])
plt.ylabel("Number of images")
plt.xlabel("Iterations")
plt.legend()
plt.show()

In [None]:
########################################################
## Examine the divisions in classes across iterations ##
########################################################
# Same analysis as for the untrained run, on the pretrained-model labels:
# count the images per class after every iteration, plot one curve per class
# and print the entropy of each class's curve.
from scipy.stats import entropy

# hard labels sampled after every iteration for every image;
# indexed below as all_labels[image, iteration]
all_labels = np.load("/scratch/gpfs/eysu/Sampling/pretrained_CIFAR_1000/labels.npy")

n_iters = all_labels.shape[1]
n_classes = 10

# divisions[i, c] = number of images labeled as class c at iteration i
divisions = np.zeros((n_iters, n_classes))
for i in range(n_iters):
    # bincount(minlength=...) keeps a 0 for classes that received no images at
    # this iteration; np.unique(..., return_counts=True) would return fewer
    # than 10 counts in that case and the row assignment would raise.
    divisions[i] = np.bincount(all_labels[:, i].astype(np.int64), minlength=n_classes)

x = np.arange(n_iters)
plt.figure(figsize=(12,8))
for j in range(n_classes):
    plt.plot(x, divisions[:, j], label = "Class: " + labels[j])

    # entropy of class j's image count taken over the iteration axis
    # (scipy normalizes the counts to sum to 1 first)
    ent = entropy(divisions[:, j])
    print(ent)

plt.title("Divisions of Classes over 1000 Iterations on Pretrained Model")
plt.ylim([0, 12000])
plt.ylabel("Number of images")
plt.xlabel("Iterations")
plt.legend()
plt.show()

# Analysis 2: Examine images of each class

In [None]:
###############################################################################
## Look at 10 random images labeled as each class at each 200 iteration mark ##
###############################################################################
from matplotlib.backends.backend_pdf import PdfPages

x_train, x_valid, x_test, y_train, y_valid, y_test, labels = loadData()
# NOTE(review): assumes row k of all_labels describes x_train[k] -- confirm.
all_labels = np.load("/scratch/gpfs/eysu/Sampling/CIFAR_1000/labels.npy")

# NOTE(review): column 1000 requires all_labels to have at least 1001 columns -- confirm.
for CLASS in range(10):
#     pdf = PdfPages('/home/eysu/Sampling/Outputs/Untrained_Class_' + str(CLASS) + '.pdf')
    for it in [0, 200, 400, 600, 800, 1000]:
        # isolate the labels and images that are labeled as CLASS;
        # flatnonzero gives a plain 1-D index array (np.where returns a tuple,
        # which turned class_labels into a (1, k) array and broke the check below)
        class_idxs = np.flatnonzero(all_labels[:, it] == CLASS)
        class_labels = all_labels[class_idxs, it]
        # double check that only the images labeled with CLASS at that iteration have been selected
        assert np.all(class_labels == CLASS)
        if class_idxs.size == 0:
            # nothing was labeled CLASS at this iteration, so there is nothing to draw
            continue
        class_imgs = x_train[class_idxs]

        # show 10 random images (drawn with replacement) from the selected subset
        rand_idx = np.random.randint(0, high=class_imgs.shape[0], size=10)

        for j in rand_idx:
            fig = plt.figure()
            plt.imshow(class_imgs[j])
            plt.title("Class " + str(CLASS) + ", iteration " + str(it))
#             pdf.savefig(fig, bbox_inches = 'tight')
            plt.show()
            # release the figure; 360 figures are created by this cell
            plt.close(fig)
#     pdf.close()
        

In [None]:
###############################################################################
## Look at 10 random images labeled as each class at each 200 iteration mark ##
###############################################################################
from matplotlib.backends.backend_pdf import PdfPages

x_train, x_valid, x_test, y_train, y_valid, y_test, labels = loadData()
# NOTE(review): the other pretrained cells load ".../pretrained_CIFAR_1000/labels.npy";
# confirm that "pretrained_1000" is the intended directory here.
# NOTE(review): assumes row k of all_labels describes x_train[k] -- confirm.
all_labels = np.load("/scratch/gpfs/eysu/Sampling/pretrained_1000/labels.npy")

# NOTE(review): column 1000 requires all_labels to have at least 1001 columns -- confirm.
for CLASS in range(10):
    # one PDF per class; the context manager closes the file even if a step below fails
    with PdfPages('/home/eysu/Sampling/Outputs/Pretrained_Class_' + str(CLASS) + '.pdf') as pdf:
        for it in [0, 200, 400, 600, 800, 1000]:
            # isolate the labels and images that are labeled as CLASS;
            # flatnonzero gives a plain 1-D index array (np.where returns a tuple,
            # which turned class_labels into a (1, k) array and broke the check below)
            class_idxs = np.flatnonzero(all_labels[:, it] == CLASS)
            class_labels = all_labels[class_idxs, it]
            # double check that only the images labeled with CLASS at that iteration have been selected
            assert np.all(class_labels == CLASS)
            if class_idxs.size == 0:
                # nothing was labeled CLASS at this iteration, so there is nothing to draw
                continue
            class_imgs = x_train[class_idxs]

            # show 10 random images (drawn with replacement) from the selected subset
            rand_idx = np.random.randint(0, high=class_imgs.shape[0], size=10)

            for j in rand_idx:
                fig = plt.figure()
                plt.imshow(class_imgs[j])
                plt.title("Class " + str(CLASS) + ", iteration " + str(it))
                # save before showing so the page is written while the figure is still open
                pdf.savefig(fig, bbox_inches = 'tight')
                plt.show()
                plt.close(fig)

# Analysis 3: Mean Images

In [None]:
##################################################################
## Find the average image for each class and iteration interval ##
##################################################################
from matplotlib.backends.backend_pdf import PdfPages

x_train, x_valid, x_test, y_train, y_valid, y_test, labels = loadData()

# load labels from training
# NOTE(review): assumes row k of all_labels describes x_train[k] -- confirm.
all_labels = np.load("/scratch/gpfs/eysu/Sampling/CIFAR_1000/labels.npy")

# NOTE(review): column 1000 requires all_labels to have at least 1001 columns -- confirm.
for CLASS in range(10):
    # one PDF per class; the context manager closes the file even if a step below fails
    with PdfPages('/home/eysu/Sampling/Outputs/mean_imgs/CIFAR_Untrained_Class_' + str(CLASS) + '_mean_img.pdf') as pdf:
        for it in [0, 200, 400, 600, 800, 1000]:
            # isolate the images that are labeled as CLASS at this iteration;
            # flatnonzero gives a plain 1-D index array (np.where returns a tuple,
            # which turned class_labels into a (1, k) array and broke the check below)
            class_idxs = np.flatnonzero(all_labels[:, it] == CLASS)
            class_labels = all_labels[class_idxs, it]
            # double check that only the images labeled with CLASS at that iteration have been selected
            assert np.all(class_labels == CLASS)
            if class_idxs.size == 0:
                # the mean of an empty selection is NaN; skip the page instead
                continue
            class_imgs = x_train[class_idxs]

            # pixel-wise mean over every image currently labeled CLASS
            mean_img = np.mean(class_imgs, axis=0)

            fig = plt.figure()
            plt.imshow(mean_img)
            plt.xticks([])
            plt.yticks([])
            plt.title("Class " + str(CLASS) + ", iteration " + str(it))
            # save before showing so the page is written while the figure is still open
            pdf.savefig(fig, bbox_inches = 'tight')
            plt.show()
            plt.close(fig)

In [None]:
##################################################################
## Find the average image for each class and iteration interval ##
##################################################################
### SAME THING JUST PLOT ALL TOGETHER #####

from matplotlib.backends.backend_pdf import PdfPages

x_train, x_valid, x_test, y_train, y_valid, y_test, labels = loadData()

# load labels from training
# NOTE(review): assumes row k of all_labels describes x_train[k] -- confirm.
all_labels = np.load("/scratch/gpfs/eysu/Sampling/CIFAR_1000/labels.npy")

# NOTE(review): column 1000 requires all_labels to have at least 1001 columns -- confirm.
iters_to_show = [0, 200, 400, 600, 800, 1000]

# grid layout: one row per iteration mark, one column per class
fig, axes = plt.subplots(len(iters_to_show), 10, figsize=(15, 8))

plt.setp(axes, xticks=[], xticklabels=[],
        yticks=[])
plt.suptitle("Average Images of Each Class Label by Iteration for Untrained Model")
for CLASS in range(10):
    for i, it in enumerate(iters_to_show):
        # column titles on the first row, row labels on the first column
        if i == 0:
            axes[i, CLASS].set_title(labels[CLASS])
        if CLASS == 0:
            axes[i, CLASS].set_ylabel("Iter " + str(it))

        # isolate the images that are labeled as CLASS at this iteration;
        # flatnonzero gives a plain 1-D index array (np.where returns a tuple,
        # which turned class_labels into a (1, k) array and broke the check below)
        class_idxs = np.flatnonzero(all_labels[:, it] == CLASS)
        class_labels = all_labels[class_idxs, it]
        # double check that only the images labeled with CLASS at that iteration have been selected
        assert np.all(class_labels == CLASS)
        if class_idxs.size == 0:
            # no image has this label here: leave the panel empty instead of drawing NaNs
            continue
        class_imgs = x_train[class_idxs]

        # pixel-wise mean over every image currently labeled CLASS
        mean_img = np.mean(class_imgs, axis=0)
        axes[i, CLASS].imshow(mean_img)

# write the PDF before showing so the page is saved while the figure is still open;
# the context manager closes the file even if saving fails
with PdfPages('/home/eysu/Sampling/Outputs/mean_imgs/CIFAR_Untrained_Class_mean_imgs.pdf') as pdf:
    pdf.savefig(fig, bbox_inches = 'tight')

plt.show()

In [None]:
# also show mean image of all data
from matplotlib.backends.backend_pdf import PdfPages

x_train, x_valid, x_test, y_train, y_valid, y_test, labels = loadData()

# pixel-wise mean over the whole training split
mean_total = np.mean(x_train, axis=0)

# plt.imshow returns an AxesImage, not a Figure, so keep an explicit Figure
# handle to pass to PdfPages.savefig
fig, ax = plt.subplots()
ax.imshow(mean_total)
ax.set_title("Dataset average")
ax.set_xticks([])
ax.set_yticks([])

# the context manager closes the PDF even if saving fails
with PdfPages('/home/eysu/Sampling/Outputs/mean_imgs/mean_img_all_data.pdf') as pdf:
    pdf.savefig(fig, bbox_inches = 'tight')

plt.show()

In [None]:
##################################################################
## Find the average image for each class and iteration interval ##
##################################################################
from matplotlib.backends.backend_pdf import PdfPages

x_train, x_valid, x_test, y_train, y_valid, y_test, labels = loadData()

# load labels from training
# NOTE(review): assumes row k of all_labels describes x_train[k] -- confirm.
all_labels = np.load("/scratch/gpfs/eysu/Sampling/pretrained_CIFAR_1000/labels.npy")

# NOTE(review): column 1000 requires all_labels to have at least 1001 columns -- confirm.
for CLASS in range(10):
    # one PDF per class; the context manager closes the file even if a step below fails
    with PdfPages('/home/eysu/Sampling/Outputs/mean_imgs/CIFAR_Pretrained_Class_' + str(CLASS) + '_mean_img.pdf') as pdf:
        for it in [0, 200, 400, 600, 800, 1000]:
            # isolate the images that are labeled as CLASS at this iteration;
            # flatnonzero gives a plain 1-D index array (np.where returns a tuple,
            # which turned class_labels into a (1, k) array and broke the check below)
            class_idxs = np.flatnonzero(all_labels[:, it] == CLASS)
            class_labels = all_labels[class_idxs, it]
            # double check that only the images labeled with CLASS at that iteration have been selected
            assert np.all(class_labels == CLASS)
            if class_idxs.size == 0:
                # the mean of an empty selection is NaN; skip the page instead
                continue
            class_imgs = x_train[class_idxs]

            # pixel-wise mean over every image currently labeled CLASS
            mean_img = np.mean(class_imgs, axis=0)

            fig = plt.figure()
            plt.imshow(mean_img)
            plt.title("Mean Image of Class " + str(CLASS) + ", iteration " + str(it))
            # save before showing so the page is written while the figure is still open
            pdf.savefig(fig, bbox_inches = 'tight')
            plt.show()
            plt.close(fig)

In [None]:
##################################################################
## Find the average image for each class and iteration interval ##
##################################################################
### SAME THING JUST PLOT ALL TOGETHER #####

from matplotlib.backends.backend_pdf import PdfPages

x_train, x_valid, x_test, y_train, y_valid, y_test, labels = loadData()

# load labels from training
# NOTE(review): assumes row k of all_labels describes x_train[k] -- confirm.
all_labels = np.load("/scratch/gpfs/eysu/Sampling/pretrained_CIFAR_1000/labels.npy")

# NOTE(review): column 1000 requires all_labels to have at least 1001 columns -- confirm.
iters_to_show = [0, 200, 400, 600, 800, 1000]

# grid layout: one row per iteration mark, one column per class
fig, axes = plt.subplots(len(iters_to_show), 10, figsize=(15, 8))

plt.setp(axes, xticks=[], xticklabels=[],
        yticks=[])
plt.suptitle("Average Images of Each Class Label by Iteration for Pretrained Model")
for CLASS in range(10):
    for i, it in enumerate(iters_to_show):
        # column titles on the first row, row labels on the first column
        if i == 0:
            axes[i, CLASS].set_title(labels[CLASS])
        if CLASS == 0:
            axes[i, CLASS].set_ylabel("Iter " + str(it))

        # isolate the images that are labeled as CLASS at this iteration;
        # flatnonzero gives a plain 1-D index array (np.where returns a tuple,
        # which turned class_labels into a (1, k) array and broke the check below)
        class_idxs = np.flatnonzero(all_labels[:, it] == CLASS)
        class_labels = all_labels[class_idxs, it]
        # double check that only the images labeled with CLASS at that iteration have been selected
        assert np.all(class_labels == CLASS)
        if class_idxs.size == 0:
            # no image has this label here: leave the panel empty instead of drawing NaNs
            continue
        class_imgs = x_train[class_idxs]

        # pixel-wise mean over every image currently labeled CLASS
        mean_img = np.mean(class_imgs, axis=0)
        axes[i, CLASS].imshow(mean_img)

# write the PDF before showing so the page is saved while the figure is still open;
# the context manager closes the file even if saving fails
with PdfPages('/home/eysu/Sampling/Outputs/mean_imgs/CIFAR_Pretrained_Class_mean_imgs.pdf') as pdf:
    pdf.savefig(fig, bbox_inches = 'tight')

plt.show()

# Analysis 4: Cluster Similarity Score

In [None]:
#####################################################################
## Find the similarity of each iter labels to original seed labels ##
#####################################################################
# Similarity is the adjusted Rand index (ARI) between the column-0 labels and
# the labels at each iteration. ARI scores the agreement of two labelings
# (1.0 = identical partitions); it is not a count of correctly labeled images.
from sklearn.metrics.cluster import adjusted_rand_score

# load labels from training
all_labels = np.load("/scratch/gpfs/eysu/Sampling/CIFAR_1000/labels.npy")

# column 0 is used as the reference labeling
true_labels = all_labels[:, 0]

n_iters = all_labels.shape[1]
x = np.arange(n_iters)

# rows 0-9: one-vs-rest ARI for each class; row 10: ARI over all ten classes
all_sims = np.zeros((11, n_iters))

# the reference one-vs-rest masks do not change between iterations, so build them once
true_class_masks = [true_labels == c for c in range(10)]

# `it` rather than `iter`, so the builtin iter() is not overwritten in the kernel namespace
for it in range(n_iters):
    iter_labels = all_labels[:, it]
    all_sims[10, it] = adjusted_rand_score(true_labels, iter_labels)

    # also find each class's similarity score (class vs. everything else)
    for CLASS in range(10):
        pred_class_labels = iter_labels == CLASS
        all_sims[CLASS, it] = adjusted_rand_score(true_class_masks[CLASS], pred_class_labels)

In [None]:
# Draw the ten per-class similarity curves as thin lines and the row-10 curve
# (labelled "Average Similarity") as a thick one. Uses `x`, `all_sims` and
# `labels` from the cells above.
fig = plt.figure(figsize=(8, 6))
for row in range(11):
    is_overall = (row == 10)
    curve_label = "Average Similarity" if is_overall else labels[row]
    curve_width = 2.0 if is_overall else 0.5
    plt.plot(x, all_sims[row, :], label=curve_label, linewidth=curve_width)
plt.legend()
plt.title("Similarity Between Predicted and True Labels (Untrained Model)")
plt.xlabel("Iterations")
plt.ylabel("Similarity Score")
plt.show()

In [None]:
#####################################################################
## Find the similarity of each iter labels to original seed labels ##
#####################################################################
# Similarity is the adjusted Rand index (ARI) between the column-0 labels and
# the labels at each iteration. ARI scores the agreement of two labelings
# (1.0 = identical partitions); it is not a count of correctly labeled images.
from sklearn.metrics.cluster import adjusted_rand_score

# load labels from training
all_labels = np.load("/scratch/gpfs/eysu/Sampling/pretrained_CIFAR_1000/labels.npy")

# column 0 is used as the reference labeling
true_labels = all_labels[:, 0]

n_iters = all_labels.shape[1]
x = np.arange(n_iters)

# rows 0-9: one-vs-rest ARI for each class; row 10: ARI over all ten classes
all_sims = np.zeros((11, n_iters))

# the reference one-vs-rest masks do not change between iterations, so build them once
true_class_masks = [true_labels == c for c in range(10)]

# `it` rather than `iter`, so the builtin iter() is not overwritten in the kernel namespace
for it in range(n_iters):
    iter_labels = all_labels[:, it]
    all_sims[10, it] = adjusted_rand_score(true_labels, iter_labels)

    # also find each class's similarity score (class vs. everything else)
    for CLASS in range(10):
        pred_class_labels = iter_labels == CLASS
        all_sims[CLASS, it] = adjusted_rand_score(true_class_masks[CLASS], pred_class_labels)

In [None]:
# Draw the ten per-class similarity curves as thin lines and the row-10 curve
# (labelled "Average Similarity") as a thick one. Uses `x`, `all_sims` and
# `labels` from the cells above.
fig = plt.figure(figsize=(8, 6))
for row in range(11):
    is_overall = (row == 10)
    curve_label = "Average Similarity" if is_overall else labels[row]
    curve_width = 2.0 if is_overall else 0.5
    plt.plot(x, all_sims[row, :], label=curve_label, linewidth=curve_width)
plt.legend()
plt.title("Similarity Between Predicted and True Labels (Pretrained Model)")
plt.xlabel("Iterations")
plt.ylabel("Similarity Score")
plt.show()

# Analysis 5: Label Changing Visualization

In [None]:
# Scatter the sampled label of one randomly chosen image against the iteration
# number, to show how its label moves between classes.
all_labels = np.load("/scratch/gpfs/eysu/Sampling/CIFAR_1000/labels.npy")

# pick one row of all_labels at random and take its whole label history
picked_row = np.random.randint(0, all_labels.shape[0])
label_history = all_labels[picked_row, :]

# the column-0 label names the image in the title
first_label = label_history[0]
x = np.arange(all_labels.shape[1])

plt.scatter(x, label_history, marker='o', s=2, c='#2ca02c')

plt.title("Label Changes for a Class {} Image".format(labels[int(first_label)]))
plt.xlabel("Iterations")
plt.ylabel("Sampled Label Class")
# one tick per class id, shown with the class name
plt.yticks(list(range(10)), labels=labels)
plt.show()

# Analysis 6: CIFAR10H Agreement

In [None]:
######################################################################
## Compare the fractions of label classes with the CIFAR10H dataset ##
######################################################################
# For every test image, turn its first 900 sampled labels into a 10-bin class
# frequency vector and compare it with the matching row of the CIFAR-10H
# probabilities by cosine similarity.

all_labels = np.load("/scratch/gpfs/eysu/Sampling/CIFAR2_1000/test_labels.npy")
soft = np.load('/scratch/gpfs/eysu/src_data/cifar-10h/data/cifar10h-probs.npy')

# both arrays are indexed by the same image index below
if all_labels.shape[0] != soft.shape[0]:
    raise ValueError("all_labels and soft must have the same number of rows")

n_classes = 10
cos_sims = np.zeros(soft.shape[0])

# find the fractional divisions of labels over the first 900 sampled labels
for img in range(all_labels.shape[0]):
    if img % 1000 == 0:
        print(img)

    window = all_labels[img][:900].astype(np.int64)

    # bincount(minlength=...) puts every class in its own slot, including
    # classes that were never sampled. The earlier hand-written fill loop
    # reused the outer loop variable and so scored the wrong image whenever
    # a class was missing.
    # Divide by the number of labels actually counted so the fractions sum to 1.
    frac_labels = np.bincount(window, minlength=n_classes) / window.size

    cos_sims[img] = np.dot(soft[img], frac_labels) / (
        np.linalg.norm(soft[img]) * np.linalg.norm(frac_labels))

x = np.arange(all_labels.shape[0])
plt.scatter(x, cos_sims, marker='.', linewidths=0.05)
plt.title("Similarity Between Fractions of Sampled Labels to \nHuman Soft Similarity Lables (Iteration 900)")
plt.ylabel("Cosine Similarity")
plt.xlabel("Test Image Index")
plt.show()

In [None]:
#############################################
## Find the entropy of class distributions ##
#############################################
from scipy.stats import entropy

# hard labels sampled after every iteration for every image
all_labels = np.load("/scratch/gpfs/eysu/Sampling/CIFAR_1000/labels.npy")

n_rows = all_labels.shape[0]
n_cols = all_labels.shape[1]

# divisions[r, c] = fraction of the labels in row r of all_labels that equal class c
# (one row per first-axis entry of all_labels, one column per class)
divisions = np.zeros((n_rows, 10))

for row in range(n_rows):
    # progress marker every 10000 rows
    if row % 10000 == 0:
        print(row)

    seen_classes, freq = np.unique(all_labels[row], return_counts=True)
    freq = freq / n_cols

    if len(seen_classes) != 10:
        # some classes never occur in this row: fill only the observed ones
        # and leave the others at 0
        for cls, share in zip(seen_classes, freq):
            divisions[row, int(cls)] = share
    else:
        divisions[row] = freq

In [None]:
# entropy of each row of `divisions` (one class distribution per row), averaged over all rows
print(entropy(divisions, axis=1).mean())