# Incremental PCA

## Import dependencies for the model

In [None]:
import pickle

import numpy as np
np.random.seed(13)

from sklearn.decomposition import IncrementalPCA
from sklearn.neighbors import KernelDensity

from scipy.spatial import distance

import matplotlib.pyplot as plt

from tqdm import tqdm

import h5py as h5


## Fit the model to the data

In [None]:
# --- Run configuration -------------------------------------------------------
dataset_name = "3iyf-10K-mixed-hit-99"
downsampled_images_output_subdir = "downsample-128x128"

dataset_size = 10000
batch_size = 5
n_latent_dims = 3
# A convergence measure is recorded every this-many batches.
n_iters_to_measure_convergence = 10

h5_file = "/reg/data/ana03/scratch/deebanr/{}/dataset/{}/cspi_synthetic_dataset_diffraction_patterns_3iyf-10K-mixed-hit_uniform_quat_dataset-size={}_diffraction-pattern-shape=1024x1040.hdf5".format(dataset_name, downsampled_images_output_subdir, dataset_size)


def _estimate_projected_density(data, mean, components, bandwidth=0.2):
    """Project ``data`` onto ``components`` and estimate the density at each projected point.

    The data is centred with ``mean`` and multiplied by ``components.T``; a
    Gaussian kernel density estimate is then fitted to the projected points
    and evaluated at those same points.

    Returns a tuple ``(projected_data, kernel_density_estimate, probability_density_estimate)``.
    """
    projected_data = np.dot(data - mean, components.T)
    kernel_density_estimate = KernelDensity(kernel="gaussian", bandwidth=bandwidth).fit(projected_data)
    # score_samples returns log-densities; exponentiate to obtain densities.
    probability_density_estimate = np.exp(kernel_density_estimate.score_samples(projected_data))
    return projected_data, kernel_density_estimate, probability_density_estimate


incremental_pca = IncrementalPCA(n_components=n_latent_dims)

# Any remainder of dataset_size / batch_size is not fitted.
n_batches = dataset_size // batch_size
convergence_measures = []

# The context manager guarantees the HDF5 file is closed even when the fitting
# loop raises or the cell is interrupted part-way through.
with h5.File(h5_file, 'r') as h5_file_handle:

    img_data = h5_file_handle["downsampled_diffraction_patterns"]
    # Whole dataset in memory, one flattened image per row; this is the fixed
    # set of points re-projected at every convergence check.
    data_to_project = img_data[:].reshape((dataset_size, -1))

    for t in tqdm(range(n_batches)):

        data_batch_to_fit = img_data[t * batch_size : (t + 1) * batch_size].reshape((batch_size, -1))

        incremental_pca.partial_fit(data_batch_to_fit)

        if t == 0:

            # The first batch only establishes the baseline density; there is
            # nothing to compare it against yet.
            mean_previous = incremental_pca.mean_
            V_previous = incremental_pca.components_

            (projected_data_previous,
             kernel_density_estimate_previous,
             probability_density_estimate_previous) = _estimate_projected_density(data_to_project, mean_previous, V_previous)

        elif (t + 1) % n_iters_to_measure_convergence == 0:

            mean_current = incremental_pca.mean_
            V_current = incremental_pca.components_

            (projected_data_current,
             kernel_density_estimate_current,
             probability_density_estimate_current) = _estimate_projected_density(data_to_project, mean_current, V_current)

            # Entry i of both density vectors belongs to image i, so the two
            # vectors are compared sample-by-sample.
            convergence_measure = distance.jensenshannon(probability_density_estimate_previous, probability_density_estimate_current)
            convergence_measures.append(convergence_measure)

            # The current state becomes the reference for the next check.
            mean_previous = mean_current
            V_previous = V_current

            projected_data_previous = projected_data_current
            kernel_density_estimate_previous = kernel_density_estimate_current
            probability_density_estimate_previous = probability_density_estimate_current


## Plot the convergence for the model

In [None]:
# Batch counts at which a convergence measure was recorded. The fitting loop
# uses the first batch (t == 0) only as the baseline and records no measure for
# it, so a batch count of 1 is excluded. Without this filter an interval of 1
# would give one more x value than there are measures and plt.plot would fail.
measured_batch_counts = [
    n_batches_processed
    for n_batches_processed in range(n_iters_to_measure_convergence, n_batches + 1, n_iters_to_measure_convergence)
    if n_batches_processed > 1
]

plt.plot(measured_batch_counts, convergence_measures)
# scipy.spatial.distance.jensenshannon returns the Jensen-Shannon *distance*
# (the square root of the divergence), so the axis is labelled accordingly.
plt.ylabel("Jensen-Shannon Distance")
plt.xlabel("Number of batches processed by Incremental PCA")
plot_title = "Convergence of Incremental PCA on {}".format(dataset_name)
plt.title(plot_title)
# Save before show(): the figure may no longer be available to save once it has been displayed.
plt.savefig("incremental-pca-convergence-jensen-shannon-{}-{}-dataset_size={}-batch_size={}-n_iters_to_measure_convergence={}.png".format(dataset_name, downsampled_images_output_subdir, dataset_size, batch_size, n_iters_to_measure_convergence))
plt.show()


## Save the convergence measures

In [None]:
# The output name encodes the run configuration so that results from
# different runs end up in different files.
convergence_measures_file = (
    "incremental-pca-convergence-jensen-shannon-{}-{}-dataset_size={}-batch_size={}-n_iters_to_measure_convergence={}.npy"
).format(
    dataset_name,
    downsampled_images_output_subdir,
    dataset_size,
    batch_size,
    n_iters_to_measure_convergence,
)

# Write the recorded measures in NumPy's .npy format.
np.save(file=convergence_measures_file, arr=convergence_measures)

saved_measures_message = "Saved convergence measures to: {}".format(convergence_measures_file)
print(saved_measures_message)


## Save the model

In [None]:
# The model file name records the dataset and the batch size used for fitting.
incremental_pca_file = (
    "incremental-pca-{}-{}-dataset_size={}-batch_size={}.pkl"
).format(
    dataset_name,
    downsampled_images_output_subdir,
    dataset_size,
    batch_size,
)

# Serialize the fitted IncrementalPCA object to disk with pickle.
with open(incremental_pca_file, 'wb') as model_out:
    pickle.dump(incremental_pca, model_out)

saved_model_message = "Saved Incremental PCA model to: {}".format(incremental_pca_file)
print(saved_model_message)


## Load the model

In [None]:
# Read the pickled model back from the path in incremental_pca_file.
# NOTE: unpickling executes code embedded in the file, so only load pickles
# from a trusted source.
with open(incremental_pca_file, 'rb') as model_in:
    loaded_incremental_pca = pickle.load(model_in)

loaded_model_message = "Loaded Incremental PCA model from: {}".format(incremental_pca_file)
print(loaded_model_message)

# Show the restored principal axes.
principal_components = loaded_incremental_pca.components_
print("Principal components:\n{}".format(principal_components))
