In [None]:
import os
import pathlib

# Every data path in this notebook is built from the current working directory.
base_directory = os.getcwd()
# Folder with the assignment data; the string ends with a trailing '/'.
assignment_two_data = f"{base_directory}/Homework_Two_Data/"

In [None]:
# Load core libraries and utilities
import numpy as np
import matplotlib.pyplot as plt
from math_596_image_analysis_helper_functions import *
from sklearn.cluster import KMeans

# Jupyter notebook "magic"
%matplotlib inline

In [None]:
# Folder holding the cropped Yale images.  assignment_two_data already ends in
# '/', so concatenating '/CroppedYale/' produced a doubled separator; joining
# the components avoids that.  The empty last component keeps a trailing
# separator on the result, as the original string had.
cropped_directory = os.path.join(assignment_two_data, 'CroppedYale', '')
# file_builder is not defined in this notebook (it arrives via the helper
# module's star import); presumably it returns a list of 2-D image arrays
# read from the folder -- TODO confirm against the helper module.
cropped_image_list = file_builder(cropped_directory)

# For Windows users, you might need this instead:
# win_cropped_directory = pathlib.PureWindowsPath(cropped_directory)
# cropped_image_list = file_builder(win_cropped_directory)

In [None]:
def mode_and_kmeans_label_maker(images_mat, mode_indices=(0, 1), n_clusters=2):
    """Project images onto selected modes and label the projections with k-means.

    The images are mean-centered with ``zero_mean``, modes are built with
    ``mode_builder`` and reduced with ``mode_selector`` (all three are helpers
    that are not defined in this notebook), and the centered images are
    projected onto the reduced modes.  The rows of the projection named by
    ``mode_indices`` form the coordinates that k-means clusters.

    Parameters
    ----------
    images_mat : array
        Image data matrix.  The notebook passes a matrix with one flattened
        image per column.
    mode_indices : sequence of int, optional
        Rows of the projected data to cluster on.  The default ``(0, 1)``
        reproduces the original two-mode behavior; e.g. ``(0, 1, 3)`` clusters
        on three modes.
    n_clusters : int, optional
        Number of k-means clusters (default 2, the original behavior).

    Returns
    -------
    labels : array
        ``kmeans.labels_`` -- one cluster label per column of ``cluster_data``.
    centers : array
        ``kmeans.cluster_centers_`` -- one row per cluster, one column per
        entry of ``mode_indices``.
    cluster_data : array
        The selected projection rows, stacked as
        ``len(mode_indices)`` x (number of projected columns).

    Raises
    ------
    ValueError
        If ``mode_indices`` is empty.
    """
    mode_indices = list(mode_indices)
    if not mode_indices:
        raise ValueError("mode_indices must name at least one mode to cluster on.")

    zero_avg, _ = zero_mean(images_mat)
    u_modes, s_vals = mode_builder(zero_avg)
    # The 1. is passed straight through to mode_selector; presumably it is a
    # retention threshold -- TODO confirm against the helper module.
    u_modes_red = mode_selector(u_modes, s_vals, 1.)

    projected_images = u_modes_red.T @ zero_avg

    # One row of cluster_data per requested mode, in the order given.
    cluster_data = np.array([projected_images[idx, :] for idx in mode_indices])
    print(cluster_data.shape)

    # KMeans expects samples as rows, hence the transpose.  random_state is
    # fixed so repeated calls on the same data give the same labels.
    kmeans = KMeans(n_clusters=n_clusters, random_state=0, n_init="auto").fit(cluster_data.T)
    labels = kmeans.labels_
    centers = kmeans.cluster_centers_

    return labels, centers, cluster_data

In [None]:
# Same construction as in the lecture notes: build the image data matrix with
# one column per image, each column being that image flattened to 1-D.

num_crp_images = len(cropped_image_list)
# The first image sets the pixel dimensions used for every column.
first_image_shape = np.shape(cropped_image_list[0])
crp_row, crp_col = first_image_shape
pixels_per_image = crp_row * crp_col

crp_image_mat = np.zeros((pixels_per_image, num_crp_images), dtype=np.float64)
for cnt, image in enumerate(cropped_image_list):
    flat_pixels = image.flatten()
    crp_image_mat[:, cnt] = flat_pixels

print(f"Total number of images is: {num_crp_images}")
print(f"Pixel counts are {crp_row} by {crp_col}")

**Problem 1**: Using the cropped image list from Homework 2, and following our study of k-means clustering on the scatter plot of `projected_images[0, :]` and `projected_images[1, :]` as shown in `Applications_of_the_SVD_and_Cluster_Analysis.ipynb`, we want to study how sensitive the labels we used are.  To do this:

* Remove 10% of the total images and then label the remaining 90% of the images.  Compare the labels you generated for the full data set to your new ones (relative, of course, to the common 90% of images you are working with) and compute the average difference as a percentage of the total number of images in your 90% set.  

* Repeat this process 9 more times by removing a different 10% of the data each time.  This should generate 9 more average differences.  

* Plot your results and comment upon them.  How robust are the labels?  How effective would you say that our labeling scheme is?  

In [None]:
# To build the 90%/10% splits, shuffle the image order once and then build the
# modes and find the labels for the full data set in that shuffled order.
# The generator is seeded so the shuffle really is fixed: re-running this cell
# or restarting the kernel reproduces the same ordering, which keeps the
# full-data labels comparable with the labels from every later 90/10 split.

SHUFFLE_SEED = 0
rng = np.random.default_rng(SHUFFLE_SEED)
image_indices = rng.permutation(num_crp_images)
shuffled_images = crp_image_mat[:, image_indices]

labels, centers, cluster_data = mode_and_kmeans_label_maker(shuffled_images)

# Boolean masks picking out the columns of cluster_data given each label.
group_one = labels == 1
group_two = labels == 0
cluster_one = cluster_data[:, group_one]
cluster_two = cluster_data[:, group_two]

fig, ax = plt.subplots()
ax.scatter(cluster_one[0, :], cluster_one[1, :], s=2., c='r', label='label 1')
ax.scatter(cluster_two[0, :], cluster_two[1, :], s=2., c='k', label='label 0')
# centers has one row per cluster; its columns match the rows of cluster_data.
ax.scatter(centers[:, 0], centers[:, 1], s=40., c='g', marker='x', label='cluster centers')
ax.set_xlabel('projected_images[0, :]')
ax.set_ylabel('projected_images[1, :]')
ax.set_title('k-means labels for the full (shuffled) image set')
ax.legend()
plt.show()

In [None]:
# Now start doing your 90/10 splits.  Note: you don't need to do any more index shuffling,
# and if you do, the problem gets categorically more difficult.

**Problem 2**: Repeat the above analysis using two kmeans labels and the modes `projected_images[0, :]`, `projected_images[1, :]`, and `projected_images[3, :]`.  How do your results change?  

**Problem 3**: Repeat the above analysis using three kmeans labels and the modes `projected_images[0, :]`, `projected_images[1, :]`, and `projected_images[3, :]`.  How do your results change?  Which would you say is the best labeling scheme?  