In [None]:
import matplotlib.pyplot as plt 
import numpy as np

In [None]:
##############################################
## Load, partition, and resize MNIST Digits ##
##############################################

def loadData(path="/scratch/gpfs/eysu/src_data/mnist.npz", n_valid=5000):
    """Load MNIST from an ``.npz`` archive, scale it, and carve out a validation split.

    Parameters
    ----------
    path : str, optional
        Location of an ``.npz`` archive containing the arrays ``x_train``,
        ``y_train``, ``x_test`` and ``y_test``. Defaults to the path this
        notebook has always used.
    n_valid : int, optional
        Number of leading training samples moved into the validation split
        (default 5000). The remaining samples stay in the training split.

    Returns
    -------
    tuple
        ``(x_train, x_valid, x_test, y_train, y_valid, y_test, labels)``.
        The ``x_*`` arrays are float32, divided by 255 and reshaped to
        ``(n, 28, 28, 1)``. ``y_valid`` is squeezed; ``y_train`` and ``y_test``
        are returned as stored. ``labels`` is the list ``["0", ..., "9"]``.

    Raises
    ------
    ValueError
        If ``n_valid`` is negative.
    """
    if n_valid < 0:
        raise ValueError("n_valid must be non-negative, got " + str(n_valid))

    # The archive is a file-backed NpzFile; read what we need and close it
    # instead of leaving the handle open for the life of the kernel.
    with np.load(path) as all_data:
        x_train = all_data['x_train']
        y_train = all_data['y_train']
        x_test = all_data['x_test']
        y_test = all_data['y_test']

    # Human-readable class names; position k names class k.
    labels = [str(digit) for digit in range(10)]

    # Scale pixel values from [0, 255] to [0, 1].
    x_train = x_train.astype('float32') / 255
    x_test = x_test.astype('float32') / 255

    # Split the training data: the first n_valid samples become the
    # validation set, everything after them stays in the training set.
    x_train, x_valid = x_train[n_valid:], x_train[:n_valid]
    y_train, y_valid = y_train[n_valid:], np.array(y_train[:n_valid]).squeeze()

    # Reshape input data from (28, 28) to (28, 28, 1)
    w, h = 28, 28
    x_train = x_train.reshape(x_train.shape[0], w, h, 1)
    x_valid = x_valid.reshape(x_valid.shape[0], w, h, 1)
    x_test = x_test.reshape(x_test.shape[0], w, h, 1)

    return x_train, x_valid, x_test, y_train, y_valid, y_test, labels

# Analysis 1: Divisions of classes

In [None]:
########################################################
## Examine the divisions in classes across iterations ##
########################################################
# load the hard labels sampled after every iteration for every image
# (indexed below as all_labels[image, iteration])
all_labels = np.load("/scratch/gpfs/eysu/Sampling/MNIST_1000/labels.npy")

n_classes = 10
n_iters = all_labels.shape[1]

# store all divisions of classes in array
# dimensions are iters x classes
divisions = np.zeros((n_iters, n_classes))

for i in range(n_iters):
    # bincount with minlength always yields one count per class, including a
    # zero for a class that received no images at this iteration.
    # np.unique(return_counts=True) drops absent classes, which made the row
    # assignment fail with a shape mismatch.
    divisions[i] = np.bincount(all_labels[:, i].astype(int), minlength=n_classes)

# one line per class: number of images carrying that label at each iteration
x = np.arange(n_iters)
plt.figure(figsize=(12,8))
for j in range(n_classes):
    plt.plot(x, divisions[:, j], label = "Class: " + str(j))

plt.title("Divisions of Classes over 1000 Iterations on Untrained Model")
plt.ylim([0, 12000])
plt.ylabel("Number of images")
plt.xlabel("Iterations")
plt.legend()
plt.show()

In [None]:
########################################################
## Examine the divisions in classes across iterations ##
########################################################
# load the hard labels sampled after every iteration for every image
# (indexed below as all_labels[image, iteration])
all_labels = np.load("/scratch/gpfs/eysu/Sampling/pretrained_1000/labels.npy")

n_classes = 10
n_iters = all_labels.shape[1]

# store all divisions of classes in array
# dimensions are iters x classes
divisions = np.zeros((n_iters, n_classes))

for i in range(n_iters):
    # bincount with minlength always yields one count per class, including a
    # zero for a class that received no images at this iteration.
    # np.unique(return_counts=True) drops absent classes, which made the row
    # assignment fail with a shape mismatch.
    divisions[i] = np.bincount(all_labels[:, i].astype(int), minlength=n_classes)

# one line per class: number of images carrying that label at each iteration
x = np.arange(n_iters)
plt.figure(figsize=(12,8))
for j in range(n_classes):
    plt.plot(x, divisions[:, j], label = "Class: " + str(j))

plt.title("Divisions of Classes over 1000 Iterations on Pretrained Model")
plt.ylim([0, 12000])
plt.ylabel("Number of images")
plt.xlabel("Iterations")
plt.legend()
plt.show()

# Analysis 2: Examine images of each class

In [None]:
###############################################################################
## Look at 10 random images labeled as each class at each 200 iteration mark ##
###############################################################################
from matplotlib.backends.backend_pdf import PdfPages

x_train, x_valid, x_test, y_train, y_valid, y_test, labels = loadData()
all_labels = np.load("/scratch/gpfs/eysu/Sampling/MNIST_1000/labels.npy")
# NOTE(review): rows of all_labels are used to index x_train, which loadData()
# returns with the validation samples already removed -- confirm the label
# file was produced for that same split.
# NOTE(review): column 1000 is read below, so all_labels needs at least 1001
# columns -- confirm.

for CLASS in range(10):
    # one PDF per class; the context manager closes it even if a page fails
    with PdfPages('/home/eysu/Sampling/Outputs/Untrained_Class_' + str(CLASS) + '.pdf') as pdf:
        for it in [0, 200, 400, 600, 800, 1000]:
            # isolate the labels and images that are labeled as CLASS
            # (flatnonzero gives a plain 1-D index array; np.where returns a
            # tuple, which turned the selected labels into a (1, n) array)
            class_idxs = np.flatnonzero(all_labels[:, it] == CLASS)
            if class_idxs.size == 0:
                # nothing carries this label at this iteration; nothing to draw
                continue
            class_labels = all_labels[class_idxs, it]
            # double check that only the images labeled with CLASS at that iter have been selected
            assert np.all(class_labels == CLASS)
            class_imgs = x_train[class_idxs]

            # print 10 random images from the correctly labeled subset
            # (drawn with replacement, so an image may repeat)
            rand_idx = np.random.randint(0, high=class_idxs.size, size=10)

            for j in rand_idx:
                fig = plt.figure()
                # drop the trailing channel axis: imshow documents (M, N) input
                plt.imshow(np.squeeze(class_imgs[j], axis=-1))
                plt.title("Class " + str(CLASS) + ", iteration " + str(it))

                # write the page before showing so the save does not depend on
                # how the active backend treats the figure after show()
                pdf.savefig(fig, bbox_inches = 'tight')
                plt.show()
                # this loop creates many figures; release each one when done
                plt.close(fig)
        

In [None]:
###############################################################################
## Look at 10 random images labeled as each class at each 200 iteration mark ##
###############################################################################
from matplotlib.backends.backend_pdf import PdfPages

x_train, x_valid, x_test, y_train, y_valid, y_test, labels = loadData()
all_labels = np.load("/scratch/gpfs/eysu/Sampling/pretrained_1000/labels.npy")
# NOTE(review): rows of all_labels are used to index x_train, which loadData()
# returns with the validation samples already removed -- confirm the label
# file was produced for that same split.
# NOTE(review): column 1000 is read below, so all_labels needs at least 1001
# columns -- confirm.

for CLASS in range(10):
    # one PDF per class; the context manager closes it even if a page fails
    with PdfPages('/home/eysu/Sampling/Outputs/Pretrained_Class_' + str(CLASS) + '.pdf') as pdf:
        for it in [0, 200, 400, 600, 800, 1000]:
            # isolate the labels and images that are labeled as CLASS
            # (flatnonzero gives a plain 1-D index array; np.where returns a
            # tuple, which turned the selected labels into a (1, n) array)
            class_idxs = np.flatnonzero(all_labels[:, it] == CLASS)
            if class_idxs.size == 0:
                # nothing carries this label at this iteration; nothing to draw
                continue
            class_labels = all_labels[class_idxs, it]
            # double check that only the images labeled with CLASS at that iter have been selected
            assert np.all(class_labels == CLASS)
            class_imgs = x_train[class_idxs]

            # print 10 random images from the correctly labeled subset
            # (drawn with replacement, so an image may repeat)
            rand_idx = np.random.randint(0, high=class_idxs.size, size=10)

            for j in rand_idx:
                fig = plt.figure()
                # drop the trailing channel axis: imshow documents (M, N) input
                plt.imshow(np.squeeze(class_imgs[j], axis=-1))
                plt.title("Class " + str(CLASS) + ", iteration " + str(it))

                # write the page before showing so the save does not depend on
                # how the active backend treats the figure after show()
                pdf.savefig(fig, bbox_inches = 'tight')
                plt.show()
                # this loop creates many figures; release each one when done
                plt.close(fig)

# Analysis 3: Mean Images

In [None]:
##################################################################
## Find the average image for each class and iteration interval ##
##################################################################
from matplotlib.backends.backend_pdf import PdfPages

x_train, x_valid, x_test, y_train, y_valid, y_test, labels = loadData()

#reshape images back to 28x28
w, h = 28, 28
x_train = x_train.reshape(x_train.shape[0], w, h)

# load labels from training
all_labels = np.load("/scratch/gpfs/eysu/Sampling/MNIST_1000/labels.npy")
# NOTE(review): rows of all_labels are used to index x_train -- confirm both
# cover the same images in the same order.

for CLASS in range(10):
    # one PDF per class; the context manager closes it even if a page fails
    with PdfPages('/home/eysu/Sampling/Outputs/mean_imgs/Untrained_Class_' + str(CLASS) + '_mean_img.pdf') as pdf:
        # only the final iteration mark is rendered in this cell
        for it in [1000]:
            # isolate the labels and images that are labeled as CLASS
            # (flatnonzero gives a plain 1-D index array; np.where returns a
            # tuple, which turned the selected labels into a (1, n) array)
            class_idxs = np.flatnonzero(all_labels[:, it] == CLASS)
            if class_idxs.size == 0:
                # no image carries this label here; a mean image is undefined
                continue
            class_labels = all_labels[class_idxs, it]
            # double check that only the images labeled with CLASS at that iter have been selected
            assert np.all(class_labels == CLASS)
            class_imgs = x_train[class_idxs]

            # find the mean image for each class at each iteration
            mean_img = np.mean(class_imgs, axis=0)

            fig = plt.figure()
            plt.imshow(mean_img)
            plt.xticks([])
            plt.yticks([])
            plt.title("Class " + str(CLASS) + ", iteration " + str(it))

            # write the page before showing so the save does not depend on
            # how the active backend treats the figure after show()
            pdf.savefig(fig, bbox_inches = 'tight')
            plt.show()
            plt.close(fig)

In [None]:
##################################################################
## Find the average image for each class and iteration interval ##
##################################################################
### SAME THING JUST PLOT ALL TOGETHER ##### 

from matplotlib.backends.backend_pdf import PdfPages

x_train, x_valid, x_test, y_train, y_valid, y_test, labels = loadData()

#reshape images back to 28x28
w, h = 28, 28
x_train = x_train.reshape(x_train.shape[0], w, h)

# load labels from training
all_labels = np.load("/scratch/gpfs/eysu/Sampling/MNIST_1000/labels.npy")
# NOTE(review): rows of all_labels are used to index x_train -- confirm both
# cover the same images in the same order.

# Mean image of the whole training split. Computed here so this cell does not
# depend on a variable that is only defined by a cell further down.
mean_total = np.mean(x_train, axis=0)

# grid layout: one row per iteration mark, one column per class
fig, axes = plt.subplots(6, 10, figsize=(15, 8))

plt.setp(axes, xticks=[], xticklabels=[],
        yticks=[])
fig.suptitle("Difference Between Average Images of Each Class and Dataset Average for Untrained Model")
for CLASS in range(10):
    for i, it in enumerate([0, 200, 400, 600, 800, 1000]):
        ax = axes[i, CLASS]
        if i == 0:
            ax.set_title("Class " + str(CLASS))

        if CLASS == 0:
            ax.set_ylabel("Iter " + str(it))

        # isolate the labels and images that are labeled as CLASS
        # (flatnonzero gives a plain 1-D index array; np.where returns a
        # tuple, which turned the selected labels into a (1, n) array)
        class_idxs = np.flatnonzero(all_labels[:, it] == CLASS)
        if class_idxs.size == 0:
            # no image carries this label here; leave the panel blank
            continue
        class_labels = all_labels[class_idxs, it]
        # double check that only the images labeled with CLASS at that iter have been selected
        assert np.all(class_labels == CLASS)
        class_imgs = x_train[class_idxs]

        # find the mean image for each class at each iteration
        mean_img = np.mean(class_imgs, axis=0)

        # show the dataset mean minus the class mean; plot mean_img directly
        # to see the raw class means instead
        ax.imshow(mean_total - mean_img)

# write the PDF before showing so the save does not depend on how the active
# backend treats the figure after show()
with PdfPages('/home/eysu/Sampling/Outputs/mean_imgs/Untrained_Class_mean_imgs.pdf') as pdf:
    pdf.savefig(fig, bbox_inches = 'tight')

plt.show()

In [None]:
# also show mean image of all data
from matplotlib.backends.backend_pdf import PdfPages

x_train, x_valid, x_test, y_train, y_valid, y_test, labels = loadData()

#reshape images back to 28x28
w, h = 28, 28
x_train = x_train.reshape(x_train.shape[0], w, h)

# load labels from training (not used by this cell; kept so the notebook
# namespace is unchanged)
all_labels = np.load("/scratch/gpfs/eysu/Sampling/MNIST_1000/labels.npy")

# pixel-wise mean over every image in the training split
mean_total = np.mean(x_train, axis=0)

# Keep a real Figure handle: plt.imshow() returns an AxesImage, which cannot
# be handed to PdfPages.savefig as a figure.
fig, ax = plt.subplots()
ax.imshow(mean_total)
ax.set_title("Dataset average")
ax.set_xticks([])
ax.set_yticks([])

# The PDF used to be opened but never written or closed; save the page and
# let the context manager close the file.
with PdfPages('/home/eysu/Sampling/Outputs/mean_imgs/mean_img_all_data.pdf') as pdf:
    pdf.savefig(fig)

plt.show()

In [None]:
##################################################################
## Find the average image for each class and iteration interval ##
##################################################################
from matplotlib.backends.backend_pdf import PdfPages

x_train, x_valid, x_test, y_train, y_valid, y_test, labels = loadData()

#reshape images back to 28x28
w, h = 28, 28
x_train = x_train.reshape(x_train.shape[0], w, h)

# load labels from training
all_labels = np.load("/scratch/gpfs/eysu/Sampling/pretrained_1000/labels.npy")
# NOTE(review): rows of all_labels are used to index x_train -- confirm both
# cover the same images in the same order.
# NOTE(review): column 1000 is read below, so all_labels needs at least 1001
# columns -- confirm.

for CLASS in range(10):
    # one PDF per class; the context manager closes it even if a page fails
    with PdfPages('/home/eysu/Sampling/Outputs/mean_imgs/Pretrained_Class_' + str(CLASS) + '_mean_img.pdf') as pdf:
        for it in [0, 200, 400, 600, 800, 1000]:
            # isolate the labels and images that are labeled as CLASS
            # (flatnonzero gives a plain 1-D index array; np.where returns a
            # tuple, which turned the selected labels into a (1, n) array)
            class_idxs = np.flatnonzero(all_labels[:, it] == CLASS)
            if class_idxs.size == 0:
                # no image carries this label here; a mean image is undefined
                continue
            class_labels = all_labels[class_idxs, it]
            # double check that only the images labeled with CLASS at that iter have been selected
            assert np.all(class_labels == CLASS)
            class_imgs = x_train[class_idxs]

            # find the mean image for each class at each iteration
            mean_img = np.mean(class_imgs, axis=0)

            fig = plt.figure()
            plt.imshow(mean_img)
            plt.title("Mean Image of Class " + str(CLASS) + ", iteration " + str(it))

            # write the page before showing so the save does not depend on
            # how the active backend treats the figure after show()
            pdf.savefig(fig, bbox_inches = 'tight')
            plt.show()
            plt.close(fig)

In [None]:
##################################################################
## Find the average image for each class and iteration interval ##
##################################################################
### SAME THING JUST PLOT ALL TOGETHER ##### 

from matplotlib.backends.backend_pdf import PdfPages

x_train, x_valid, x_test, y_train, y_valid, y_test, labels = loadData()

#reshape images back to 28x28
w, h = 28, 28
x_train = x_train.reshape(x_train.shape[0], w, h)

# load labels from training
all_labels = np.load("/scratch/gpfs/eysu/Sampling/pretrained_1000/labels.npy")
# NOTE(review): rows of all_labels are used to index x_train -- confirm both
# cover the same images in the same order.

# Mean image of the whole training split. Computed here so this cell does not
# rely on a value left behind by a different cell.
mean_total = np.mean(x_train, axis=0)

# grid layout: one row per iteration mark, one column per class
fig, axes = plt.subplots(6, 10, figsize=(15, 8))

plt.setp(axes, xticks=[], xticklabels=[],
        yticks=[])
fig.suptitle("Difference Between Average Images of Each Class and Dataset Average for Pretrained Model")
for CLASS in range(10):
    for i, it in enumerate([0, 200, 400, 600, 800, 1000]):
        ax = axes[i, CLASS]
        if i == 0:
            ax.set_title("Class " + str(CLASS))

        if CLASS == 0:
            ax.set_ylabel("Iter " + str(it))

        # isolate the labels and images that are labeled as CLASS
        # (flatnonzero gives a plain 1-D index array; np.where returns a
        # tuple, which turned the selected labels into a (1, n) array)
        class_idxs = np.flatnonzero(all_labels[:, it] == CLASS)
        if class_idxs.size == 0:
            # no image carries this label here; leave the panel blank
            continue
        class_labels = all_labels[class_idxs, it]
        # double check that only the images labeled with CLASS at that iter have been selected
        assert np.all(class_labels == CLASS)
        class_imgs = x_train[class_idxs]

        # find the mean image for each class at each iteration
        mean_img = np.mean(class_imgs, axis=0)

        # show the dataset mean minus the class mean; plot mean_img directly
        # to see the raw class means instead
        ax.imshow(mean_total - mean_img)

# write the PDF before showing so the save does not depend on how the active
# backend treats the figure after show()
with PdfPages('/home/eysu/Sampling/Outputs/mean_imgs/Pretrained_Class_mean_imgs.pdf') as pdf:
    pdf.savefig(fig, bbox_inches = 'tight')

plt.show()

In [None]:
from sklearn.metrics.cluster import adjusted_rand_score

# Scratch check: print the score for two single-element labelings whose
# values differ.
a, b = [True], [False]
print(adjusted_rand_score(a, b))

# Analysis 4: Cluster Similarity Score

In [None]:
#####################################################################
## Find the similarity of each iter labels to original seed labels ##
#####################################################################
# Agreement is scored with adjusted_rand_score, which compares two labelings
# as partitions; it is a similarity score, not a count of correct images.
from sklearn.metrics.cluster import adjusted_rand_score

# load labels from training
all_labels = np.load("/scratch/gpfs/eysu/Sampling/MNIST_1000/labels.npy")

# the labels at iteration 0 serve as the reference labeling
true_labels = all_labels[:, 0]

n_iters = all_labels.shape[1]
x = np.arange(n_iters)

# rows 0-9: one-vs-rest score for each class; row 10: score over all classes
all_sims = np.zeros((11, n_iters))

# The reference one-vs-rest masks do not change between iterations, so build
# them once instead of once per (iteration, class) pair.
true_class_masks = [true_labels == CLASS for CLASS in range(10)]

# `it` rather than `iter`, so the builtin is not shadowed for the rest of the
# kernel session
for it in range(n_iters):
    iter_labels = all_labels[:, it]
    all_sims[10, it] = adjusted_rand_score(true_labels, iter_labels)

    # also find each class's similarity score
    for CLASS, true_class_labels in enumerate(true_class_masks):
        pred_class_labels = iter_labels == CLASS
        all_sims[CLASS, it] = adjusted_rand_score(true_class_labels, pred_class_labels)


In [None]:
# Plot the similarity curves: thin lines for the ten per-class rows of
# all_sims, then a thicker line for row 10 (the score over all classes).
fig = plt.figure(figsize=(8, 6))

for cls in range(10):
    plt.plot(x, all_sims[cls, :], label="Class: " + str(cls), linewidth=0.5)
plt.plot(x, all_sims[10, :], label="Average Similarity", linewidth=2.0)

plt.legend()
plt.title("Similarity Between Predicted and True Labels (Untrained Model)")
plt.xlabel("Iterations")
plt.ylabel("Similarity Score")
plt.show()

In [None]:
#####################################################################
## Find the similarity of each iter labels to original seed labels ##
#####################################################################
# Agreement is scored with adjusted_rand_score, which compares two labelings
# as partitions; it is a similarity score, not a count of correct images.
from sklearn.metrics.cluster import adjusted_rand_score

# load labels from training
all_labels = np.load("/scratch/gpfs/eysu/Sampling/pretrained_1000/labels.npy")

# the labels at iteration 0 serve as the reference labeling
true_labels = all_labels[:, 0]

n_iters = all_labels.shape[1]
x = np.arange(n_iters)

# rows 0-9: one-vs-rest score for each class; row 10: score over all classes
all_sims = np.zeros((11, n_iters))

# The reference one-vs-rest masks do not change between iterations, so build
# them once instead of once per (iteration, class) pair.
true_class_masks = [true_labels == CLASS for CLASS in range(10)]

# `it` rather than `iter`, so the builtin is not shadowed for the rest of the
# kernel session
for it in range(n_iters):
    iter_labels = all_labels[:, it]
    all_sims[10, it] = adjusted_rand_score(true_labels, iter_labels)

    # also find each class's similarity score
    for CLASS, true_class_labels in enumerate(true_class_masks):
        pred_class_labels = iter_labels == CLASS
        all_sims[CLASS, it] = adjusted_rand_score(true_class_labels, pred_class_labels)

In [None]:
# Plot the similarity curves: thin lines for the ten per-class rows of
# all_sims, then a thicker line for row 10 (the score over all classes).
fig = plt.figure(figsize=(12, 8))

for cls in range(10):
    plt.plot(x, all_sims[cls, :], label="Class: " + str(cls), linewidth=0.5)
plt.plot(x, all_sims[10, :], label="Average Similarity", linewidth=2.0)

plt.legend()
plt.title("Similarity Between Predicted and True Labels (Pretrained Model)")
plt.xlabel("Iterations")
plt.ylabel("Similarity Score")
plt.show()

# Analysis 5: Label Changing Visualization

In [None]:
# Read the sampled labels (indexed below as all_labels[image, iteration])
all_labels = np.load("/scratch/gpfs/eysu/Sampling/MNIST_1000/labels.npy")

# choose one image at random and pull out its label at every iteration
rand_idx = np.random.randint(0, all_labels.shape[0])
label_trace = all_labels[rand_idx, :]

# the label at iteration 0 is reported in the title as the image's class
true_cl = label_trace[0]
x = np.arange(all_labels.shape[1])

plt.scatter(x, label_trace, marker='o', s=2, c='#2ca02c')

plt.title("Label Changes for a Class {} Image".format(int(true_cl)))
plt.xlabel("Iterations")
plt.ylabel("Sampled Label Class")
plt.yticks(list(range(10)))
plt.show()

# Analysis 6: Entropy of Class Distribution

In [None]:
#############################################
## Find the entropy of class distributions ##
#############################################
from scipy.stats import entropy

# load the hard labels sampled after every iteration for every image
all_labels = np.load("/scratch/gpfs/eysu/Sampling/pretrained_1000/labels.npy")

n_images = all_labels.shape[0]
n_iters = all_labels.shape[1]

# For every image, the fraction of iterations at which it carried each label.
# dimensions are images x classes
divisions = np.zeros((n_images, 10))

for i in range(n_images):
    # coarse progress indicator
    if i % 10000 == 0:
        print(i)

    vals, counts = np.unique(all_labels[i], return_counts=True)
    # Place each fraction in the column named by its label value. Labels that
    # never occur for this image keep their initial 0. Indexing by value
    # (rather than by position when ten distinct values are seen) keeps the
    # columns aligned with the class ids in every case.
    divisions[i, vals.astype(int)] = counts / n_iters

In [None]:
# Entropy of each row of `divisions`, averaged over all rows.
per_row_entropy = entropy(divisions, axis=1)
print(np.mean(per_row_entropy))