In [None]:
import numpy as np
# Seed NumPy's global RNG so the np.random.choice sample drawn later in the
# notebook is reproducible across kernel restarts.
np.random.seed(13)

from sklearn.decomposition import IncrementalPCA
from sklearn.neighbors import KernelDensity

from scipy.spatial import distance

import matplotlib.pyplot as plt

from tqdm import tqdm

import h5py as h5


In [None]:
# --- Configuration -----------------------------------------------------------
dataset_name = "3iyf-10K-mixed-hit-99"
h5_file = "/reg/data/ana03/scratch/deebanr/{}/dataset/cspi_synthetic_dataset_diffraction_patterns_3iyf-10K-mixed-hit_uniform_quat_dataset-size=10000_diffraction-pattern-shape=1024x1040.hdf5".format(dataset_name)

total_dataset_size = 10000  # number of patterns assumed to be in the file
dataset_size = 10000        # number of patterns sampled for this experiment
batch_size = 5              # patterns per IncrementalPCA.partial_fit call
n_batches = dataset_size // batch_size
n_latent_dims = 3           # number of principal components kept
n_iters = 10                # compare consecutive PCA bases every n_iters batches
# Patterns read from disk at a time when projecting the whole sample. Only
# bounds peak memory; it does not influence the fitted model.
projection_chunk_size = 100

# h5py fancy indexing needs strictly increasing indices, hence the sort.
dataset_idx = np.arange(total_dataset_size)
random_sample_idx = np.sort(np.random.choice(dataset_idx, dataset_size, replace=False))


def project_in_chunks(dataset, sample_idx, mean, components, chunk_size):
    """Project the rows ``dataset[sample_idx]`` onto ``components``.

    Rows are read ``chunk_size`` at a time, flattened, centred with ``mean``
    and multiplied by ``components.T``, so the full sample never has to be
    resident in memory at once.

    Parameters
    ----------
    dataset : indexable (e.g. an h5py dataset)
        Supports ``dataset[index_array]`` with an increasing index array.
    sample_idx : 1-D integer array
        Increasing row indices to project (must be non-empty).
    mean : 1-D array
        Per-feature mean subtracted from every flattened row.
    components : 2-D array, shape (n_components, n_features)
        Basis to project onto.
    chunk_size : int
        Maximum number of rows read per chunk.

    Returns
    -------
    2-D array, shape (len(sample_idx), n_components)
    """
    projected_chunks = []
    for start in range(0, len(sample_idx), chunk_size):
        chunk_idx = sample_idx[start : start + chunk_size]
        chunk = dataset[chunk_idx].reshape((len(chunk_idx), -1))
        projected_chunks.append(np.dot(chunk - mean, components.T))
    return np.concatenate(projected_chunks, axis=0)


# --- Incremental PCA with periodic convergence checks ------------------------
incremental_pca = IncrementalPCA(n_components=n_latent_dims)

mean_previous = None
mean_current = None
V_previous = None
V_current = None
# Projection of the sample under (mean_previous, V_previous). Carried from one
# checkpoint to the next so each checkpoint reads the dataset once, not twice.
projected_data_carryover = None
dissimilarity_measures = []

# The context manager closes the file even if fitting or projecting raises.
with h5.File(h5_file, "r") as h5_file_handle:
    data = h5_file_handle["diffraction_patterns"]

    for t in tqdm(range(n_batches)):

        batch_idx = random_sample_idx[t * batch_size : (t + 1) * batch_size]
        data_batch_to_fit = data[batch_idx].reshape((batch_size, -1))

        incremental_pca.partial_fit(data_batch_to_fit)

        if t == 0:

            # Baseline snapshot after the very first batch.
            mean_previous = incremental_pca.mean_
            V_previous = incremental_pca.components_

        elif (t + 1) % n_iters == 0:

            mean_current = incremental_pca.mean_
            V_current = incremental_pca.components_

            # The "previous" projection equals the "current" projection of the
            # preceding checkpoint; it is only computed from disk the first time.
            if projected_data_carryover is None:
                projected_data_carryover = project_in_chunks(
                    data, random_sample_idx, mean_previous, V_previous, projection_chunk_size
                )
            projected_data_previous = projected_data_carryover
            projected_data_current = project_in_chunks(
                data, random_sample_idx, mean_current, V_current, projection_chunk_size
            )

            kde_p = KernelDensity(kernel="gaussian", bandwidth=0.2).fit(projected_data_previous)
            kde_q = KernelDensity(kernel="gaussian", bandwidth=0.2).fit(projected_data_current)

            # score_samples returns log-densities; exponentiate to get densities.
            p = np.exp(kde_p.score_samples(projected_data_previous))
            q = np.exp(kde_q.score_samples(projected_data_current))

            # scipy's jensenshannon normalises p and q and returns the
            # Jensen-Shannon *distance* (square root of the divergence).
            dissimilarity_measures.append(distance.jensenshannon(p, q))

            # Roll the snapshot forward for the next checkpoint.
            mean_previous = mean_current
            V_previous = V_current
            projected_data_carryover = projected_data_current

# Batch counts at which a dissimilarity value was recorded. The fitting loop
# never scores the first batch (it only takes the baseline snapshot there), so
# a batch count of 1 -- reachable only when n_iters == 1 -- is excluded to keep
# x and y the same length.
checkpoint_batches = [n for n in range(n_iters, n_batches + 1, n_iters) if n > 1]

plt.plot(checkpoint_batches, dissimilarity_measures)
# scipy.spatial.distance.jensenshannon returns the Jensen-Shannon distance
# (the square root of the divergence), so label the axis accordingly.
plt.ylabel("Jensen-Shannon Distance")
plt.xlabel("Number of batches processed by Incremental PCA")
plot_title = "Convergence of Incremental PCA on {}\nfor a random sample of {} using a batch size of {}".format(dataset_name, dataset_size, batch_size)
plt.title(plot_title)
# Save before show(): some backends clear the figure once it has been shown.
plt.savefig("incremental-pca-convergence-jensen-shannon-{}-dataset_size={}-batch_size={}-n_iters={}.png".format(dataset_name, dataset_size, batch_size, n_iters))
plt.show()