In [None]:
###############
## Libraries ##
###############

import tensorflow as tf
import matplotlib.pyplot as plt 
import numpy as np
from tensorflow.keras import datasets, layers, models, losses
from keras.callbacks import ModelCheckpoint
from keras.models import load_model
import keras

# Import Data

In [None]:
######################################
## Import CIFAR10 data from Scratch ##
######################################
import pickle

# unpickle the binary files
def unpickle(file):
    """Load and return the object pickled in the binary file at path ``file``.

    The file is opened in binary mode and decoded with ``encoding='bytes'``,
    which is why the loaded dictionaries are indexed with ``b'...'`` keys
    elsewhere in this notebook.

    NOTE(review): ``pickle.load`` can execute arbitrary code while loading;
    only call this on files from a trusted source.
    """
    with open(file, 'rb') as handle:
        return pickle.load(handle, encoding='bytes')

# paths to each batch of data
# Unpickle, in this order: the five training batches, the metadata file and the test batch.
batch1, batch2, batch3, batch4, batch5, meta, test = map(unpickle, (
    "/scratch/gpfs/eysu/src_data/cifar-10-batches-py/data_batch_1",
    "/scratch/gpfs/eysu/src_data/cifar-10-batches-py/data_batch_2",
    "/scratch/gpfs/eysu/src_data/cifar-10-batches-py/data_batch_3",
    "/scratch/gpfs/eysu/src_data/cifar-10-batches-py/data_batch_4",
    "/scratch/gpfs/eysu/src_data/cifar-10-batches-py/data_batch_5",
    "/scratch/gpfs/eysu/src_data/cifar-10-batches-py/batches.meta",
    "/scratch/gpfs/eysu/src_data/cifar-10-batches-py/test_batch",
))

# Pull the labels and the image data out of every training batch.
train_batches = (batch1, batch2, batch3, batch4, batch5)
y_train1, y_train2, y_train3, y_train4, y_train5 = [part[b'labels'] for part in train_batches]
x_train1, x_train2, x_train3, x_train4, x_train5 = [part[b'data'] for part in train_batches]

# Stack the per-batch pieces (batch order preserved) into single training arrays.
y_train = np.concatenate([y_train1, y_train2, y_train3, y_train4, y_train5])
x_train = np.concatenate([x_train1, x_train2, x_train3, x_train4, x_train5], axis=0)

# def shuffle_in_unison(x, y):
#     assert x.shape[0] == y.shape[0]
#     shuffled_x = np.empty(x.shape, dtype=x.dtype)
#     shuffled_y = np.empty(y.shape, dtype=y.dtype)
    
#     # If rerunning: 
#     # permutation = np.loadtxt('/scratch/gpfs/eysu/src_data/cifar-10-batches-py/permutation.csv', delimiter=',').astype(np.int64)
    
#     # If repermuting: 
#     # permutation = np.random.permutation(y.shape[0])
    
#     for old_index, new_index in enumerate(permutation):
#         shuffled_x[new_index] = x[old_index]
#         shuffled_y[new_index] = y[old_index]

#     # IF I WANT TO RUN THIS AGAIN DONT REPERMUTE OR WILL LOSE THIS ORDERING
# #     np.savetxt('/scratch/gpfs/eysu/src_data/cifar-10-batches-py/permutation.csv', permutation, delimiter=',')
#     return shuffled_x, shuffled_y

# x_train, y_train = shuffle_in_unison(x_train, y_train)

# Test-set labels and image data.
# The labels are wrapped in an ndarray so that y_test has a ``.shape`` like
# y_train (which np.concatenate already returns as an array). The CIFAR-10
# python batches store labels as a plain list, and a list would make
# ``print(y_test.shape)`` in the preprocessing cell raise AttributeError.
y_test = np.asarray(test[b'labels'])
x_test = test[b'data']

In [None]:
#################################################
## Preprocess data by reshaping and separating ##
#################################################
# Class names; the position of a name in the list is its class index.
labels = [
    'airplane', 'automobile', 'bird', 'cat', 'deer',   # indices 0-4
    'dog', 'frog', 'horse', 'ship', 'truck',           # indices 5-9
]

# Hold out the first 5,000 training examples as the validation set;
# the remaining 45,000 stay in the training set.
x_valid, y_valid = x_train[:5000], y_train[:5000]
x_train, y_train = x_train[5000:], y_train[5000:]

# Every split gets the same treatment: rows are reshaped to (3, 32, 32), moved to
# channels-last (32, 32, 3) to match the dimensions of cifar10.load_data, cast to
# float32 and divided by 255.
x_train = x_train.reshape(45000, 3, 32, 32).transpose(0, 2, 3, 1).astype('float32')
x_train /= 255

# y_train is deliberately left as integer labels here; it is one-hot encoded
# later, at the start of the training loop.

x_valid = x_valid.reshape(5000, 3, 32, 32).transpose(0, 2, 3, 1).astype('float32')
x_valid /= 255
y_valid = tf.keras.utils.to_categorical(y_valid, 10)

x_test = x_test.reshape(10000, 3, 32, 32).transpose(0, 2, 3, 1).astype('float32')
x_test /= 255


# Print the shape of every split as a sanity check.
# np.shape accepts both ndarrays and plain Python lists, so the check does not
# depend on each label container having a ``.shape`` attribute.
print("TRAINING DATA")
print(np.shape(x_train))
print(np.shape(y_train))

print("VALIDATION DATA")
print(np.shape(x_valid))
print(np.shape(y_valid))

print("TESTING DATA")
print(np.shape(x_test))
print(np.shape(y_test))

In [None]:
# Examine any image

# Image index, you can pick any number between 0 and 44,999
img_index = 35
label_index = y_train[img_index]
class_name = labels[label_index]
# Print the numeric label followed by its class name, for example "y = 3 (cat)"
message = "y = " + str(label_index) + " (" + class_name + ")"
print(message)
plt.imshow(x_train[img_index])
plt.show()

# Iterated Retraining By Sampling

In [None]:
##############################
## Sampling Helper Function ##
##############################

def sample(distributions):
    """Draw one class label per row of ``distributions``.

    Args:
        distributions: 2-D array of shape (N, n_classes). Each row is passed
            to ``np.random.choice`` as the probability vector ``p``, so it
            must be a valid probability distribution.

    Returns:
        A list of N integer labels; entry ``i`` is drawn from row ``i``.
        An input with zero rows yields an empty list.

    The draws use NumPy's global random state, one ``np.random.choice`` call
    per row in row order.
    """
    # Take the number of classes from the input instead of hard-coding 10, so
    # the helper also works for models with a different number of outputs.
    n_classes = distributions.shape[1]
    return [np.random.choice(n_classes, p=row) for row in distributions]

In [None]:
##############################################################
## This cell runs the iterated learning training procedure. ##
##############################################################

# Number of iterations in the serial reproduction
MAX_ITER = 1000
# Number of epochs per training run
EPOCHS = 10
# Directory that receives the checkpoints, the per-iteration arrays and the label tables
OUT_DIR = '/scratch/gpfs/eysu/Sampling/CIFAR2_1000/'


def _build_cnn():
    """Build and compile a fresh, untrained copy of the small CNN classifier."""
    net = tf.keras.Sequential()

    # Must define the input shape in the first layer of the neural network
    net.add(tf.keras.layers.Conv2D(filters=64, kernel_size=2, padding='same', activation='relu', input_shape=(32,32,3)))
    net.add(tf.keras.layers.MaxPooling2D(pool_size=2))
    net.add(tf.keras.layers.Dropout(0.3))

    net.add(tf.keras.layers.Conv2D(filters=32, kernel_size=2, padding='same', activation='relu'))
    net.add(tf.keras.layers.MaxPooling2D(pool_size=2))
    net.add(tf.keras.layers.Dropout(0.3))

    net.add(tf.keras.layers.Flatten())
    net.add(tf.keras.layers.Dense(256, activation='relu'))
    net.add(tf.keras.layers.Dropout(0.5))
    net.add(tf.keras.layers.Dense(10, activation='softmax'))

    # define optimization and energy parameters
    net.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return net


# Label tables, one row per image: column 0 holds the original labels and
# column t + 1 holds the labels sampled after training iteration t.
all_labels = np.zeros((x_train.shape[0], MAX_ITER + 1))
test_labels = np.zeros((x_test.shape[0], MAX_ITER + 1))

# NOTE: this cell overwrites y_train and y_test with one-hot arrays, so it can
# only be run once per kernel after the preprocessing cell.
for iteration in range(MAX_ITER):
    # If iteration is seed, train on original target vectors, else, train on
    # the labels sampled at time t-1
    if iteration == 0:
        # Save the integer labels, then one-hot encode them
        all_labels[:, 0] = y_train
        test_labels[:, 0] = y_test
        y_train = tf.keras.utils.to_categorical(y_train, 10)
        y_test = tf.keras.utils.to_categorical(y_test, 10)
        mpth = 'model.weights.best.hdf5'
        y_hat_test_name = 'y_hat_test_seed'
        y_hat_train_name = 'y_hat_train_seed'
    else:
        # Key step: the labels sampled in the previous iteration become the targets
        y_train = new_train
        mpth = 'model.weights.best.' + 'iter' + str(iteration) + '.hdf5'
        y_hat_test_name = 'y_hat_test_' + 'iter' + str(iteration)
        y_hat_train_name = 'y_hat_train_' + 'iter' + str(iteration)

    # A new model is created on every iteration. Clearing the Keras session first
    # releases the global state left behind by the previous model; without it,
    # memory use keeps growing over the MAX_ITER iterations.
    tf.keras.backend.clear_session()
    model = _build_cnn()

    # Checkpoint callback; with save_best_only=True only the best epoch (as judged
    # by the callback's monitored quantity) is kept on disk.
    checkpointer = ModelCheckpoint(filepath=OUT_DIR + mpth, verbose = 1, save_best_only=True) #True
    # Train the model
    model.fit(x_train,
              y_train,
              batch_size=64,
              epochs=EPOCHS,
              validation_data=(x_valid, y_valid),
              callbacks=[checkpointer])

    # NOTE(review): these predictions use the weights from the end of the last
    # epoch; the best checkpoint is only loaded further down, before evaluation.
    y_hat = model.predict(x_train) #feed back serial reproduction targets
    y_hat_test = model.predict(x_test)

    #### START OF SAMPLING ####

    # use helper function to sample label for every image in train and test
    new_labels = np.array(sample(y_hat))
    new_test_labels = np.array(sample(y_hat_test))

    # store new labels for all images under its corresponding iteration
    all_labels[:, iteration + 1] = new_labels
    test_labels[:, iteration + 1] = new_test_labels
    # one-hot encode the sampled labels; they are the next iteration's training targets
    new_train = tf.keras.utils.to_categorical(new_labels, 10)

    #### END OF SAMPLING ####

    # Restore the checkpointed (best) weights before evaluating
    model.load_weights(OUT_DIR + mpth)
    # Evaluate the model on test set
    score = model.evaluate(x_test, y_test, verbose=0)
    # Print test accuracy
    print('\n', 'Test accuracy:', score[1])

    # Save results for each iteration in the serial reproduction chain
    # NOTE(review): the array stored under y_hat_train_name is y_train (the
    # targets this iteration was trained on), not the prediction y_hat.
    np.save(OUT_DIR + y_hat_train_name + '.npy', y_train)
    print(OUT_DIR + y_hat_train_name)

    np.save(OUT_DIR + y_hat_test_name + '.npy', y_hat_test)
    print(OUT_DIR + y_hat_test_name)


# The label tables are written only once, after the final iteration.
np.save(OUT_DIR + 'labels.npy', all_labels)
np.save(OUT_DIR + 'test_labels.npy', test_labels)
print('Saved labels!')


# Analyze Sampling

In [None]:
##########################################
## Look at divisions in labels by class ##
##########################################

# NOTE(review): this path points at an MNIST_1000 run, while the training cell
# in this notebook writes to CIFAR2_1000 -- confirm which run is meant.
# This also rebinds `labels`, which earlier held the list of class names.
labels_path = '/scratch/gpfs/eysu/Sampling/MNIST_1000/labels.npy'
labels = np.load(labels_path)
print(np.shape(labels))
# print(labels)

# for i in range(201):
#     print(np.unique(labels[:, i], return_counts = True))

In [None]:
########################################################
## Examine the divisions in classes across iterations ##
########################################################
# Label table of one sampling run; assumed to be (images x iterations) like the
# table written by the training cell -- TODO confirm for this file.
all_labels = np.load('/scratch/gpfs/eysu/Sampling/MNIST_200_1/labels.npy')

# store all divisions of classes in array
# dimensions are iters x classes
divisions = np.zeros((all_labels.shape[1], 10))

for i in range(all_labels.shape[1]):
    # bincount with minlength=10 always returns one count per class, with zeros
    # for classes that no longer occur. np.unique only reports the classes that
    # are present, so its counts either do not fit the row or, when a single
    # class remains, get broadcast into all ten columns.
    divisions[i] = np.bincount(all_labels[:, i].astype(int), minlength=10)

plt.figure(figsize=(12,8))
x = np.arange(all_labels.shape[1])
for j in range(10):
    plt.plot(x, divisions[:, j], label = "Class: " + str(j))

plt.title("Divisions of classes over 200 iterations - Run 1")
plt.xlabel("Iteration")
plt.ylabel("Number of images")
plt.ylim([0, 12000])
plt.legend()
plt.show()

In [None]:
########################################################
## Examine the divisions in classes across iterations ##
########################################################
# Label table of one sampling run; assumed to be (images x iterations) like the
# table written by the training cell -- TODO confirm for this file.
all_labels = np.load('/scratch/gpfs/eysu/Sampling/MNIST_200_2/labels.npy')

# store all divisions of classes in array
# dimensions are iters x classes
divisions = np.zeros((all_labels.shape[1], 10))

for i in range(all_labels.shape[1]):
    # bincount with minlength=10 always returns one count per class, with zeros
    # for classes that no longer occur. np.unique only reports the classes that
    # are present, so its counts either do not fit the row or, when a single
    # class remains, get broadcast into all ten columns.
    divisions[i] = np.bincount(all_labels[:, i].astype(int), minlength=10)

plt.figure(figsize=(12,8))
x = np.arange(all_labels.shape[1])
for j in range(10):
    plt.plot(x, divisions[:, j], label = "Class: " + str(j))

plt.title("Divisions of classes over 200 iterations - Run 2")
plt.xlabel("Iteration")
plt.ylabel("Number of images")
plt.ylim([0, 12000])
plt.legend()
plt.show()

In [None]:
########################################################
## Examine the divisions in classes across iterations ##
########################################################
# Label table of one sampling run; assumed to be (images x iterations) like the
# table written by the training cell -- TODO confirm for this file.
all_labels = np.load('/scratch/gpfs/eysu/Sampling/MNIST_200_3/labels.npy')

# store all divisions of classes in array
# dimensions are iters x classes
divisions = np.zeros((all_labels.shape[1], 10))

for i in range(all_labels.shape[1]):
    # bincount with minlength=10 always returns one count per class, with zeros
    # for classes that no longer occur. np.unique only reports the classes that
    # are present, so its counts either do not fit the row or, when a single
    # class remains, get broadcast into all ten columns.
    divisions[i] = np.bincount(all_labels[:, i].astype(int), minlength=10)

plt.figure(figsize=(12,8))
x = np.arange(all_labels.shape[1])
for j in range(10):
    plt.plot(x, divisions[:, j], label = "Class: " + str(j))

plt.title("Divisions of classes over 200 iterations - Run 3")
plt.xlabel("Iteration")
plt.ylabel("Number of images")
plt.ylim([0, 12000])
plt.legend()
plt.show()

In [None]:
########################################################
## Examine the divisions in classes across iterations ##
########################################################
# Label table of one sampling run; assumed to be (images x iterations) like the
# table written by the training cell -- TODO confirm for this file.
all_labels = np.load('/scratch/gpfs/eysu/Sampling/MNIST_200_4/labels.npy')

# store all divisions of classes in array
# dimensions are iters x classes
divisions = np.zeros((all_labels.shape[1], 10))

for i in range(all_labels.shape[1]):
    # bincount with minlength=10 always returns one count per class, with zeros
    # for classes that no longer occur. np.unique only reports the classes that
    # are present, so its counts either do not fit the row or, when a single
    # class remains, get broadcast into all ten columns.
    divisions[i] = np.bincount(all_labels[:, i].astype(int), minlength=10)

plt.figure(figsize=(12,8))
x = np.arange(all_labels.shape[1])
for j in range(10):
    plt.plot(x, divisions[:, j], label = "Class: " + str(j))

plt.title("Divisions of classes over 200 iterations - Run 4")
plt.xlabel("Iteration")
plt.ylabel("Number of images")
plt.ylim([0, 12000])
plt.legend()
plt.show()

In [None]:
########################################################
## Examine the divisions in classes across iterations ##
########################################################
# Label table of one sampling run; assumed to be (images x iterations) like the
# table written by the training cell -- TODO confirm for this file.
all_labels = np.load('/scratch/gpfs/eysu/Sampling/MNIST_200_5/labels.npy')

# store all divisions of classes in array
# dimensions are iters x classes
divisions = np.zeros((all_labels.shape[1], 10))

for i in range(all_labels.shape[1]):
    # bincount with minlength=10 always returns one count per class, with zeros
    # for classes that no longer occur. np.unique only reports the classes that
    # are present, so its counts either do not fit the row or, when a single
    # class remains, get broadcast into all ten columns.
    divisions[i] = np.bincount(all_labels[:, i].astype(int), minlength=10)

plt.figure(figsize=(12,8))
x = np.arange(all_labels.shape[1])
for j in range(10):
    plt.plot(x, divisions[:, j], label = "Class: " + str(j))

plt.title("Divisions of classes over 200 iterations - Run 5")
plt.xlabel("Iteration")
plt.ylabel("Number of images")
plt.ylim([0, 12000])
plt.legend()
plt.show()

# Look at specific samples

In [None]:
from matplotlib.backends.backend_pdf import PdfPages

labels = np.load('/scratch/gpfs/eysu/Sampling/MNIST_200_1/labels.npy')

# One x position per column of the label table; identical for every plot.
x = np.arange(labels.shape[1])

for i in range(10):
    # isolate the rows whose label in column 0 is i (same for all 100 plots)
    class_labels = labels[labels[:, 0] == i]

    # The context manager finalises the PDF even if a plot raises.
    with PdfPages('/home/eysu/Sampling/Outputs/Class_' + str(i) + '.pdf') as pdf:
        for j in range(100):
            # plot the full label sequence of a random image in class i
            rand_idx = np.random.randint(0, class_labels.shape[0])
            fig = plt.figure()
            plt.plot(x, class_labels[rand_idx, :])
            plt.ylim(-1, 10)
            plt.title('Class: ' + str(i))

            # save the page, display it, then close the figure so the 1000
            # figures created by this cell do not all stay in memory
            pdf.savefig(fig, bbox_inches = 'tight')
            plt.show()
            plt.close(fig)
                          

# Class Images Analysis

In [None]:
#####################################################
## Look at how image examples in each class change ##
#####################################################
from matplotlib.backends.backend_pdf import PdfPages

# NOTE(review): the label table comes from an MNIST_1000 run but is used to
# index x_train below; its rows must correspond to x_train's rows -- confirm.
labels = np.load('/scratch/gpfs/eysu/Sampling/MNIST_1000/labels.npy')
CLASS = 0

# The loop variable is not called `iter`, so the builtin stays usable in the kernel.
for snapshot_iter in [0, 200, 400, 600, 800, 1000]:
    # find indices of images labeled with CLASS at this iteration
    class_idxs = np.where(labels[:, snapshot_iter] == CLASS)
    # class images
    class_imgs = x_train[class_idxs]
    # class labels
    class_labels = labels[class_idxs]

    # make sure class_labels only includes CLASS; np.all also holds for an
    # empty selection, where indexing np.unique(...)[0] would raise IndexError
    assert np.all(class_labels[:, snapshot_iter] == CLASS)

    # The context manager finalises the PDF even if a plot raises.
    with PdfPages('/home/eysu/Sampling/Outputs/Class_images_' + str(CLASS) + '_iter_' + str(snapshot_iter) + '.pdf') as pdf:
        for i in range(class_labels.shape[0]):
            fig = plt.figure()
            plt.imshow(class_imgs[i])

            # save the page, display it, then close the figure; one figure is
            # created per image, so leaving them open exhausts memory
            pdf.savefig(fig, bbox_inches = 'tight')
            plt.show()
            plt.close(fig)

In [None]:
CLASS = 0
# find indices of images labeled with CLASS at iteration 1000
class_idxs = np.where(labels[:, 1000] == CLASS)
# class images
class_imgs = x_train[class_idxs]
# class labels
class_labels = labels[class_idxs]

# counts[i, c] = number of the selected images carrying label c at iteration x[i]
x = [0, 200, 400, 600, 800, 1000]
counts = np.zeros((len(x), 10))
for i, snapshot_iter in enumerate(x):
    classes, iter_counts = np.unique(class_labels[:, snapshot_iter], return_counts = True)

    # Write every count into the column of its own class. This covers any number
    # of distinct classes (including none); assigning iter_counts to the whole
    # row only fits when all ten classes are present.
    counts[i, classes.astype(int)] = iter_counts

print(counts)

for j in range(10):
    plt.scatter(x, counts[:, j], label = "class: " + str(j))

plt.legend()
plt.title("Distribution of classes at each iteration for images with final label=0")
plt.show()

In [None]:
CLASS = 0
# find indices of images labeled with CLASS at iteration 1000
class_idxs = np.where(labels[:, 1000] == CLASS)
# class images
class_imgs = x_train[class_idxs]
# class labels
class_labels = labels[class_idxs]

# counts[i, c] = number of the selected images carrying label c at iteration x[i]
x = np.arange(800, 1001)
counts = np.zeros((len(x), 10))
for i, snapshot_iter in enumerate(x):
    classes, iter_counts = np.unique(class_labels[:, snapshot_iter], return_counts = True)

    # Write every count into the column of its own class. This covers any number
    # of distinct classes (including none); assigning iter_counts to the whole
    # row only fits when all ten classes are present.
    counts[i, classes.astype(int)] = iter_counts

print(counts)

for j in range(10):
    plt.scatter(x, counts[:, j], label = "class: " + str(j))

plt.legend()
plt.title("Distribution of classes at each iteration for images with final label=0")
plt.show()

In [None]:
# just print the distribution before the very last one
# Scatter the second-to-last row of `counts` at x = 999 and the last row at x = 1000.
# x is an ndarray so that `x + 1` shifts every position by one; with a plain
# list the addition raises TypeError.
x = np.full(counts.shape[1], 999)

y = counts[-2]
plt.scatter(x, y)
x = x + 1
plt.scatter(x, counts[-1])
plt.show()

# Look at fraction of images remaining in their true classes

In [None]:
x = [200, 400, 600, 800, 1000]
# Labels in column 0 of the table are the reference each later column is compared to.
true = labels[:, 0]
n_images = labels.shape[0]
for checkpoint in x:
    final = labels[:, checkpoint]

    # Number of images whose label at this iteration differs from column 0
    changed = np.count_nonzero((true - final).astype(int))
    remain = 1 - (changed / n_images)
    print("iter " + str(checkpoint) + ": " + str(remain))


# Scratch work