# Copy of KNN_Confusions.ipynb 
This copy differs from the original in one respect: it pulls the activations from the last-layer latent space of the model after 1 iteration of training rather than 24.


In [None]:
###############
## Libraries ##
###############

import tensorflow as tf
import matplotlib.pyplot as plt 
import numpy as np
from tensorflow.keras import datasets, layers, models, losses
from keras.callbacks import ModelCheckpoint
from keras.models import load_model

In [None]:
# Load the MNIST digit data, normalise it, carve out a validation split and
# preview one training image.
# NOTE(review): absolute, user-specific path — consider a configurable data dir.
# The archive is opened in a `with` block so the underlying file handle is
# closed once the four arrays have been read into memory.
with np.load("/scratch/gpfs/eysu/src_data/mnist.npz") as all_data:
    print(all_data.files)
    x_test = all_data['x_test']
    x_train = all_data['x_train']
    y_train = all_data['y_train']
    y_test = all_data['y_test']

# Human-readable class names; position i holds the name of class i
labels = ["0",  # index 0
          "1",  # index 1
          "2",  # index 2 
          "3",  # index 3 
          "4",  # index 4
          "5",  # index 5
          "6",  # index 6 
          "7",  # index 7 
          "8",  # index 8 
          "9"]  # index 9

# Print the shapes of the full (pre-split) training images and labels
print("x_train shape:", x_train.shape, "y_train shape:", y_train.shape)

# keep references to the integer labels as loaded (before the validation split
# and before y_test is one-hot encoded below)
y_train_labels = y_train
y_test_labels = y_test

# Print the number of training and test datasets
print(x_train.shape[0], 'train set')
print(x_test.shape[0], 'test set')


# Scale pixel values from [0, 255] to [0, 1]
x_train = x_train.astype('float32') / 255
x_test = x_test.astype('float32') / 255

# Further break training data into train / validation sets:
# the first 5000 rows become the validation set, the remainder stays in train
(x_train, x_valid) = x_train[5000:], x_train[:5000] 
(y_train, y_valid) = y_train[5000:], y_train[:5000]

# Reshape input data from (28, 28) to (28, 28, 1)
w, h = 28, 28
x_train = x_train.reshape(x_train.shape[0], w, h, 1)
x_valid = x_valid.reshape(x_valid.shape[0], w, h, 1)
x_test = x_test.reshape(x_test.shape[0], w, h, 1)

# One-hot encode the validation and test labels.
# y_train is intentionally left as integer class labels.
y_valid = tf.keras.utils.to_categorical(y_valid, 10)
y_test = tf.keras.utils.to_categorical(y_test, 10)
    
# Image index: any valid row of the post-split x_train (0 .. x_train.shape[0] - 1)
img_index = 5
# y_train contains the labels, ranging from 0 to 9
label_index = y_train[img_index]
# Print the integer label followed by its class name
print ("y = " + str(label_index) + " " +(labels[label_index]))
# Show one of the images from the training dataset
plt.imshow(x_train[img_index])
plt.show()

In [None]:
################################################
## Recreate model from iteration 1 of SR      ##
################################################

# specify path of model to load in
# NOW USING ITER1 MODEL
model_weights = 'Serial-Reproductions-CNN-Research/weights_concise/weights_digits_1_trunc/model.weights.best.iter1.hdf5'

# load model
model = load_model(model_weights)

# examine model
# (the former bare `model.get_weights()` / `model.optimizer` statements were
# removed: mid-cell expression values are discarded, so they displayed nothing
# and get_weights() only copied every weight array for no effect)
model.summary()

# Build a sub-model that stops at the second-to-last layer so that predict()
# returns that layer's activations instead of the final output
layer_output = model.layers[-2].output
activation_model = models.Model(inputs=model.input, outputs=layer_output)

# Run the training images through the truncated model and get activations.
# Expected to be one array with one row per image; the original note gives the
# latent width as 256 — TODO confirm against model.summary()
activations = activation_model.predict(x_train)

print(activations.shape)

In [None]:
############################
## Find Nearest Neighbors ##
############################
# For every training image, find its NUM_NEIGHBORS nearest neighbours in
# activation space (Euclidean norm), separately among images of the same class
# and among images of every other class.
#
# Fixes relative to the previous version of this cell:
#   * np.argpartition(..., range(25)) only guarantees the order of positions
#     0..24, yet the results were sliced with [1:26]; position 25 was therefore
#     an arbitrary element, not the 26th-nearest one.
#   * The "skip the first entry because it is the image itself" slice was also
#     applied to the other-class list, which never contains the image itself,
#     so the true nearest other-class neighbour was silently dropped.
# NOTE(review): if the iter-24 CSVs compared against later were produced with
# the old [1:26] slicing, regenerate them with this logic before comparing.

NUM_IMAGES = x_train.shape[0]
NUM_NEIGHBORS = 25

# create arrays to store results (neighbour indices are integers from the start)
nearest_neighbors_in_class = np.zeros([NUM_IMAGES, NUM_NEIGHBORS], dtype=int)
nearest_neighbors_in_class_norms = np.zeros([NUM_IMAGES, NUM_NEIGHBORS])
nearest_neighbors_other_class = np.zeros([NUM_IMAGES, NUM_NEIGHBORS], dtype=int)
nearest_neighbors_other_class_norms = np.zeros([NUM_IMAGES, NUM_NEIGHBORS])


def _k_nearest(norms, candidate_idxs, k):
    """Return the k entries of candidate_idxs with the smallest norms, nearest first.

    norms          -- 1-D array of distances, indexed by global image index
    candidate_idxs -- 1-D integer array of global image indices to search among
    k              -- number of neighbours to return (must be < len(candidate_idxs))
    """
    # argpartition works on the sub-array, so its result indexes into
    # candidate_idxs; passing range(k) makes positions 0..k-1 fully sorted
    local_order = np.argpartition(norms[candidate_idxs], range(k))[:k]
    # convert sub-array positions back to global image indices
    return candidate_idxs[local_order]


# The in-class / out-of-class index sets depend only on the class, so build
# them once per class instead of once per image
same_class_lookup = {int(c): np.flatnonzero(y_train == c) for c in np.unique(y_train)}
other_class_lookup = {int(c): np.flatnonzero(y_train != c) for c in np.unique(y_train)}

for image_idx in range(NUM_IMAGES):
    image_class = int(y_train[image_idx])

    # norm of the difference between this image's activations and every image's
    norms = np.linalg.norm(activations - activations[image_idx], axis=1)

    # in class: the image itself (distance 0) is among the candidates, so ask
    # for one extra neighbour and drop the image explicitly rather than
    # assuming it sorts first (exact duplicates would tie at distance 0)
    in_class = _k_nearest(norms, same_class_lookup[image_class], NUM_NEIGHBORS + 1)
    in_class = in_class[in_class != image_idx][:NUM_NEIGHBORS]

    # other class: the image is never a candidate, so keep the nearest one too
    other_class = _k_nearest(norms, other_class_lookup[image_class], NUM_NEIGHBORS)

    # update all results arrays
    nearest_neighbors_in_class[image_idx, :] = in_class
    nearest_neighbors_in_class_norms[image_idx, :] = norms[in_class]

    nearest_neighbors_other_class[image_idx, :] = other_class
    nearest_neighbors_other_class_norms[image_idx, :] = norms[other_class]

    # rudimentary method of tracking progress
    if image_idx % 100 == 0:
        print(image_idx / NUM_IMAGES)

print("done")

In [None]:
# export nearest neighbors of all images
import os
import pandas as pd

# Write into the directory that the reload cell reads the *_iter1.csv files
# from. Previously this cell wrote to "Outputs/..." while the files are read
# back from "Serial-Reproductions-CNN-Research/Outputs/...", so a fresh run
# would reload stale or missing files.
NN_OUTPUT_DIR = "Serial-Reproductions-CNN-Research/Outputs/NearestNeighbors/digits"
os.makedirs(NN_OUTPUT_DIR, exist_ok=True)  # to_csv does not create directories

# file name -> matrix to write (one row per image, one column per neighbour)
matrices_to_export = {
    "nearest_neighbors_in_class_iter1.csv": nearest_neighbors_in_class,
    "nearest_neighbors_in_class_norms_iter1.csv": nearest_neighbors_in_class_norms,
    "nearest_neighbors_other_class_iter1.csv": nearest_neighbors_other_class,
    "nearest_neighbors_other_class_norms_iter1.csv": nearest_neighbors_other_class_norms,
}

for file_name, matrix in matrices_to_export.items():
    # headerless, index-less CSV so it can be read back with header=None
    pd.DataFrame(matrix).to_csv(os.path.join(NN_OUTPUT_DIR, file_name), sep = ',', header=None, index=None)



In [None]:
# Read the iter 1 and iter 24 nearest-neighbor matrices back from disk under
# iteration-suffixed names
import pandas as pd


def _read_matrix(csv_path):
    """Read a headerless, comma-separated CSV and return its contents as a numpy array."""
    frame = pd.read_csv(csv_path, sep = ',', header=None)
    return frame.to_numpy()


# iter 1 matrices
nearest_neighbors_in_class_iter1 = _read_matrix("Serial-Reproductions-CNN-Research/Outputs/NearestNeighbors/digits/nearest_neighbors_in_class_iter1.csv")
nearest_neighbors_in_class_norms_iter1 = _read_matrix("Serial-Reproductions-CNN-Research/Outputs/NearestNeighbors/digits/nearest_neighbors_in_class_norms_iter1.csv")
nearest_neighbors_other_class_iter1 = _read_matrix("Serial-Reproductions-CNN-Research/Outputs/NearestNeighbors/digits/nearest_neighbors_other_class_iter1.csv")
nearest_neighbors_other_class_norms_iter1 = _read_matrix("Serial-Reproductions-CNN-Research/Outputs/NearestNeighbors/digits/nearest_neighbors_other_class_norms_iter1.csv")

# iter 24 matrices (files without an iteration suffix)
nearest_neighbors_in_class_iter24 = _read_matrix("Serial-Reproductions-CNN-Research/Outputs/NearestNeighbors/digits/nearest_neighbors_in_class.csv")
nearest_neighbors_in_class_norms_iter24 = _read_matrix("Serial-Reproductions-CNN-Research/Outputs/NearestNeighbors/digits/nearest_neighbors_in_class_norms.csv")
nearest_neighbors_other_class_iter24 = _read_matrix("Serial-Reproductions-CNN-Research/Outputs/NearestNeighbors/digits/nearest_neighbors_other_class.csv")
nearest_neighbors_other_class_norms_iter24 = _read_matrix("Serial-Reproductions-CNN-Research/Outputs/NearestNeighbors/digits/nearest_neighbors_other_class_norms.csv")



In [None]:
# Per-image Kendall tau between the iter 1 and iter 24 neighbor rows, then the
# mean absolute tau across all images.
# NOTE(review): tau is computed on the values stored in each row (presumably
# neighbor image indices) — confirm that this is the intended comparison.
from scipy.stats import kendalltau

NUM_IMAGES = x_train.shape[0]

in_class_corr = np.zeros(NUM_IMAGES)
other_class_corr = np.zeros(NUM_IMAGES)

for row in range(NUM_IMAGES):
    # element [0] of the kendalltau result is the tau statistic
    in_class_result = kendalltau(
        nearest_neighbors_in_class_iter1[row, :],
        nearest_neighbors_in_class_iter24[row, :],
    )
    other_class_result = kendalltau(
        nearest_neighbors_other_class_iter1[row, :],
        nearest_neighbors_other_class_iter24[row, :],
    )
    in_class_corr[row] = in_class_result[0]
    other_class_corr[row] = other_class_result[0]

mean_abs_in_class = np.mean(np.absolute(in_class_corr))
mean_abs_other_class = np.mean(np.absolute(other_class_corr))
print("Average in class correlation: ", mean_abs_in_class)
print("Average out of class correlation: ", mean_abs_other_class)
    

In [None]:
# Show the first image's other-class neighbor row from each iteration, iter 1 first
for first_row in (nearest_neighbors_other_class_iter1[0], nearest_neighbors_other_class_iter24[0]):
    print(first_row)

In [None]:
from scipy.stats import kendalltau

# Kendall tau between the first image's other-class rows at iter 1 and iter 24;
# element [0] of the result is the tau statistic
first_row_result = kendalltau(
    nearest_neighbors_other_class_iter1[0],
    nearest_neighbors_other_class_iter24[0],
)
print(first_row_result[0])

In [None]:
# Pearson correlation between the first image's other-class rows at iter 1 and
# iter 24; entry [0, 1] of the 2x2 matrix is the cross-correlation
first_row_corr_matrix = np.corrcoef(
    nearest_neighbors_other_class_iter1[0],
    nearest_neighbors_other_class_iter24[0],
)
print(first_row_corr_matrix[0, 1])