In [1]:
%matplotlib notebook
import os
import pandas
from sklearn.decomposition import PCA
import numpy as np
import matplotlib.pyplot as plt
import copy
import glob

In [2]:
# Folder holding the text metadata (element / space-group listings).
data_dir = '/home/timothy/PDF_Lib_2016-03/lib_data'

# Directory listing of data_dir; the result is not used further in this cell.
file_list = os.listdir(data_dir)

# Locate the precomputed arrays by filename pattern under ./Nyqust.
r_grid_name = glob.glob('./Nyqust/*rgrid*')
gr_list_name = glob.glob('./Nyqust/*Gr.npy')
el_list_name = glob.glob('./Nyqust/*Element.npy')
sg_list_name = glob.glob('./Nyqust/*SpaceGroup*.npy')

# Only the first match of each pattern is loaded. The element and space-group
# .npy files are located above but not loaded; the text files below are used instead.
gr_list = np.load(gr_list_name[0])
r_grid = np.load(r_grid_name[0])

# Header-less text tables; the first column of each row is the value of interest.
elm = pandas.read_csv(os.path.join(data_dir, 'None_2016-03-30_Element.txt'), header=None)
sg = pandas.read_csv(os.path.join(data_dir, 'None_2016-03-30_SpaceGroupSymbol.txt'), header=None)
dummy_el_list = elm.values.tolist()
dummy_sg_list = sg.values.tolist()

# Flatten the one-entry-per-row nested lists into plain lists.
sg_list = [row[0] for row in dummy_sg_list]
el_list = [row[0] for row in dummy_el_list]

In [3]:
# Fit a PCA with default settings on the loaded G(r) matrix (fit() returns the
# estimator itself), then display the shape of the component matrix.
pca = PCA().fit(gr_list)
pca.components_.shape

(195, 195)

In [4]:
# Cumulative explained-variance ratio, normalised by the total of the ratios,
# plotted against the position in the component ordering.
cum_sum = np.cumsum(pca.explained_variance_ratio_)
ratio_fig = plt.figure()
ratio_ax = ratio_fig.gca()
ratio_ax.plot(cum_sum / np.sum(pca.explained_variance_ratio_), 'r')
ratio_ax.set_xlabel('number of PCs included')
ratio_ax.set_ylabel('cumulative ratio')
ratio_ax.set_title('Explained Variance Ratio with Nyquist grid')
plt.show()

<IPython.core.display.Javascript object>

In [5]:
# 5 x 2 grid of panels; panel i shows the plain (unweighted) sum of the first
# 5*i rows of pca.components_ against r_grid. Panel 0 therefore shows zeros.
fig, axs = plt.subplots(5, 2, figsize=(6, 6), facecolor='w', edgecolor='k')
fig.subplots_adjust(hspace=.8, wspace=.8)
axs = axs.ravel()
fig.set_size_inches(8, 6)
for i, panel in enumerate(axs):
    n_pcs = i * 5
    # Summing a row slice adds the same rows, in the same order, as an explicit loop.
    new_plot = pca.components_[:n_pcs].sum(axis=0)
    panel.plot(r_grid, new_plot, 'r')
    panel.set_title('Number of PCs = {}'.format(n_pcs))
    panel.yaxis.set_visible(False)

<IPython.core.display.Javascript object>

In [6]:
def cal_score(obs, loading):
    """Project an observation onto every row of a loading matrix.

    Parameters
    ----------
    obs : array-like
        Observation to project; dotted with each row of ``loading``.
    loading : numpy.ndarray
        2-D array whose rows are the directions to project onto.

    Returns
    -------
    numpy.ndarray
        One dot product per row of ``loading``, in row order. ``obs`` is used
        as given; no mean is subtracted here.
    """
    return np.array([np.dot(direction, obs) for direction in loading])

def reconstruct(obs, mean, score, loading, PC_num):
    """Rebuild a signal from its leading principal-component scores.

    Parameters
    ----------
    obs : array-like
        Not used by the computation; kept for the existing call signature.
    mean : numpy.ndarray
        Mean vector added to the weighted sum of components.
    score : numpy.ndarray
        Weight of each component; ``score[k]`` scales ``loading[k]``.
    loading : numpy.ndarray
        2-D array whose rows are the components.
    PC_num : int
        Number of leading components to include.

    Returns
    -------
    numpy.ndarray
        ``mean + sum(score[k] * loading[k] for k < PC_num)``. Because ``mean``
        is added back, the result approximates the original observation only
        when ``score`` holds projections of the mean-subtracted observation.
    """
    weighted_sum = np.zeros_like(mean)
    for k in range(PC_num):
        weighted_sum += score[k] * loading[k]
    return weighted_sum + mean

def PC_projection(data, loading, PC_ind):
    """Project every row of ``data`` onto a single component.

    Parameters
    ----------
    data : array-like
        2-D collection of observations, one per row.
    loading : numpy.ndarray
        Component matrix; row ``PC_ind`` is the direction used.
    PC_ind : int
        0-based row index into ``loading``.

    Returns
    -------
    numpy.ndarray
        Column vector of shape ``(number of rows in data, 1)`` holding the dot
        product of each row with the chosen component. Rows are used as given;
        no mean is subtracted.
    """
    n_obs = np.shape(data)[0]
    direction = loading[PC_ind]
    projections = [np.dot(direction, data[row]) for row in range(n_obs)]
    return np.array(projections).reshape(n_obs, 1)

def sort_ind(array):
    """Return the indices that order ``array`` ascending.

    The result is a plain list of positions; equal values keep their original
    relative order because ``sorted`` is stable.
    """
    positions = range(len(array))
    return sorted(positions, key=array.__getitem__)

In [10]:
from sklearn.cluster import KMeans

# Cluster the G(r) matrix with K-means for 20, 40, ..., 180 clusters and draw a
# histogram of the label counts for each setting.
# After the loop, `kmeans`, `kmeans_classify` and `kmeans_dict` hold the result
# of the LAST setting only (180 clusters).
dim_gr = np.shape(gr_list)
kmeans_class_dict = []  # never filled in this cell; kept so the name stays defined
for i in range(9):
    num = (i + 1) * 20
    # Re-seed the global NumPy RNG so every setting starts from the same state.
    np.random.seed(seed=55)
    kmeans = KMeans(n_clusters=num)
    kmeans_classify = kmeans.fit_predict(gr_list)
    # Map observation index (as a string key) -> assigned cluster label.
    # A separate index name is used so the outer loop variable is not shadowed.
    kmeans_dict = {str(obs_idx): kmeans_classify[obs_idx]
                   for obs_idx in range(dim_gr[0])}
    fig = plt.figure()
    plt.hist(kmeans_classify, bins=num, color='r', alpha=0.65)
    # Annotate BEFORE showing: on non-interactive backends anything set after
    # plt.show() no longer reaches the figure that was displayed.
    plt.xlabel('cluster label')
    plt.ylabel('number of counts')
    plt.title('K-means label with number of cluster = {}'.format(num))
    plt.show()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [42]:
# Rank cluster labels by population (ascending) and pick the three largest.
counts_ind = np.bincount(kmeans_classify)
sorted_label = sort_ind(counts_ind)
first_label, second_label, third0_label = (
    sorted_label[-1], sorted_label[-2], sorted_label[-3])

# Observation indices (the dict keys are index strings) assigned to the most
# populated cluster, then the matching element / space-group entries.
first_label_members = [int(key) for key, label in kmeans_dict.items()
                       if label == first_label]
el_list_first_label = [el_list[member] for member in first_label_members]
sg_list_first_label = [sg_list[member] for member in first_label_members]

In [54]:
# Histogram of space groups found in the most populated cluster.
unique_sg_first = np.unique(sg_list_first_label)
print(unique_sg_first)

# Space-group symbol -> integer position in the sorted unique array.
map_sg_dict = {sg_name: idx for idx, sg_name in enumerate(unique_sg_first)}
print(map_sg_dict)

# Integer code of every member's space group.
ind_sg_list_first_label = [map_sg_dict[sg_name] for sg_name in sg_list_first_label]
print(len(sg_list_first_label))

# One bin per distinct space group, centred on its integer code. The bin count
# follows the data instead of being fixed, and the symbols are shown as tick
# labels (a dict passed as `label=` is not a usable legend entry).
n_sg = len(unique_sg_first)
plt.figure()
plt.hist(ind_sg_list_first_label, bins=np.arange(n_sg + 1) - 0.5)
plt.xticks(range(n_sg), unique_sg_first, rotation=90)
plt.xlabel('space group')
plt.ylabel('number of counts')
plt.show()

['F-43c' 'F-43m' 'Fd-3c' 'Fd-3m' 'Fm-3c' 'Fm-3m' 'I-43d' 'I-43m' 'I23'
 'Ia-3d' 'Im-3m' 'P-43m' 'P-43n' 'Pm-3m' 'Pm-3n' 'Pn-3m' 'Pn-3n']
{'P-43m': 11, 'F-43m': 1, 'Fd-3c': 2, 'Fm-3c': 4, 'I-43m': 7, 'Fm-3m': 5, 'Pm-3n': 14, 'Pm-3m': 13, 'I-43d': 6, 'I23': 8, 'Pn-3n': 16, 'Im-3m': 10, 'P-43n': 12, 'Fd-3m': 3, 'Ia-3d': 9, 'F-43c': 0, 'Pn-3m': 15}
667


<IPython.core.display.Javascript object>

In [106]:
# score along PC
loading = copy.copy(pca.components_)
# 0-based row of pca.components_: 1 selects the SECOND component.
# NOTE(review): the original title said "first PC"; set pc_ind = 0 if the
# first component was actually intended.
pc_ind = 1
pc_project_1 = PC_projection(gr_list, loading, pc_ind)
# PC_projection returns an (n, 1) column; rank on the flat column of scores.
sorted_ind = sort_ind(pc_project_1[:, 0])
sorted_ind_30 = sorted_ind[-30:]
# Entries of el_list for the five highest scores.
top5_el_list = [el_list[el] for el in sorted_ind[-5:]]
print(top5_el_list)
pc_fig = plt.figure()
plt.scatter(sorted_ind_30, pc_project_1[sorted_ind_30], c='r', alpha=0.6)
plt.xlabel('index to data')
plt.ylabel('score along PC')
# The title is derived from pc_ind so it always matches the plotted component.
plt.title('Score along PC {} (0-based index {})'.format(pc_ind + 1, pc_ind))
plt.show()

[['Nd1Ho1In2.cif'], ['Cu1S1.cif'], ['Sm1S1.cif'], ['La1Ag1.cif'], ['Hg1Se1.cif']]


<IPython.core.display.Javascript object>

In [31]:
# Reconstruct observation 255 from its first 1..6 principal components.
ind = 255
obs = gr_list[ind]
mean = copy.copy(pca.mean_)
loading = copy.copy(pca.components_)
# Score the mean-centred observation: reconstruct() adds `mean` back, so
# scoring the raw `obs` would count the mean's projection twice.
score = cal_score(obs - mean, loading)

fig, axs = plt.subplots(3,2, figsize=(15, 8), facecolor='w', edgecolor='k')
fig.subplots_adjust(hspace = .8, wspace=.8)
axs = axs.ravel()
for i in range(6):
    PC_num = (i+1)
    recon = reconstruct(obs, mean, score, loading, PC_num)
    axs[i].plot(r_grid, obs, 'b')      # data
    axs[i].plot(r_grid, recon, 'r')    # PCA reconstruction
    # Negated residual, offset by -10.
    axs[i].plot(r_grid, -((obs-recon)+10), 'g')
    axs[i].set_title('{} with {} PCs'.format(el_list[ind], PC_num))
fig.set_size_inches(12, 8, forward=False)

<IPython.core.display.Javascript object>

In [32]:
# Six panels rebuilt with 10, 15, ..., 35 components. Relies on `obs`, `mean`,
# `score`, `loading` and `ind` already being defined in the kernel.
fig, axs = plt.subplots(3, 2, figsize=(15, 8), facecolor='w', edgecolor='k')
fig.subplots_adjust(hspace=.8, wspace=.8)
axs = axs.ravel()
for i, panel in enumerate(axs):
    PC_num = 5 * (i + 2)
    recon = reconstruct(obs, mean, score, loading, PC_num)
    panel.plot(r_grid, obs, 'b')
    panel.plot(r_grid, recon, 'r')
    # Negated residual, offset by -10.
    panel.plot(r_grid, -((obs - recon) + 10), 'g')
    panel.set_title('{} with {} PCs'.format(el_list[ind], PC_num))
fig.set_size_inches(12, 8, forward=False)

<IPython.core.display.Javascript object>

In [33]:
# Six panels rebuilt with 20, 40, ..., 120 components. Relies on `obs`, `mean`,
# `score`, `loading` and `ind` already being defined in the kernel.
fig, axs = plt.subplots(3, 2, figsize=(15, 8), facecolor='w', edgecolor='k')
fig.subplots_adjust(hspace=.8, wspace=.8)
axs = axs.ravel()
for i, panel in enumerate(axs):
    PC_num = 20 * (i + 1)
    recon = reconstruct(obs, mean, score, loading, PC_num)
    panel.plot(r_grid, obs, 'b')
    panel.plot(r_grid, recon, 'r')
    # Negated residual, offset by -10.
    panel.plot(r_grid, -((obs - recon) + 10), 'g')
    panel.set_title('{} with {} PCs'.format(el_list[ind], PC_num))
fig.set_size_inches(12, 8, forward=False)

<IPython.core.display.Javascript object>

In [34]:
# Reconstruct observation 377 from its first 1..6 principal components.
ind = 377
obs = gr_list[ind]
mean = copy.copy(pca.mean_)
loading = copy.copy(pca.components_)
# Score the mean-centred observation: reconstruct() adds `mean` back, so
# scoring the raw `obs` would count the mean's projection twice.
score = cal_score(obs - mean, loading)

fig, axs = plt.subplots(3,2, figsize=(15, 8), facecolor='w', edgecolor='k')
fig.subplots_adjust(hspace = .8, wspace=.8)
axs = axs.ravel()
for i in range(6):
    PC_num = (i+1)
    recon = reconstruct(obs, mean, score, loading, PC_num)
    axs[i].plot(r_grid, obs, 'b')      # data
    axs[i].plot(r_grid, recon, 'r')    # PCA reconstruction
    # Negated residual, offset by -10.
    axs[i].plot(r_grid, -((obs-recon)+10), 'g')
    axs[i].set_title('{} with {} PCs'.format(el_list[ind], PC_num))
fig.set_size_inches(12, 8, forward=False)

<IPython.core.display.Javascript object>

In [35]:
# Reconstruct observation 377 with 5, 10, ..., 30 principal components.
ind = 377
obs = gr_list[ind]
mean = copy.copy(pca.mean_)
loading = copy.copy(pca.components_)
# Score the mean-centred observation: reconstruct() adds `mean` back, so
# scoring the raw `obs` would count the mean's projection twice.
score = cal_score(obs - mean, loading)

fig, axs = plt.subplots(3,2, figsize=(15, 8), facecolor='w', edgecolor='k')
fig.subplots_adjust(hspace = .8, wspace=.8)
axs = axs.ravel()
for i in range(6):
    PC_num = (i+1)*5
    recon = reconstruct(obs, mean, score, loading, PC_num)
    axs[i].plot(r_grid, obs, 'b')      # data
    axs[i].plot(r_grid, recon, 'r')    # PCA reconstruction
    # Negated residual, offset by -10.
    axs[i].plot(r_grid, -((obs-recon)+10), 'g')
    axs[i].set_title('{} with {} PCs'.format(el_list[ind], PC_num))
fig.set_size_inches(12, 8, forward=False)

<IPython.core.display.Javascript object>

In [36]:
# Reconstruct observation 377 with 20, 40, ..., 120 principal components.
ind = 377
obs = gr_list[ind]
mean = copy.copy(pca.mean_)
loading = copy.copy(pca.components_)
# Score the mean-centred observation: reconstruct() adds `mean` back, so
# scoring the raw `obs` would count the mean's projection twice.
score = cal_score(obs - mean, loading)

fig, axs = plt.subplots(3,2, figsize=(15, 8), facecolor='w', edgecolor='k')
fig.subplots_adjust(hspace = .8, wspace=.8)
axs = axs.ravel()
for i in range(6):
    PC_num = (i+1)*20
    recon = reconstruct(obs, mean, score, loading, PC_num)
    axs[i].plot(r_grid, obs, 'b')      # data
    axs[i].plot(r_grid, recon, 'r')    # PCA reconstruction
    # Negated residual, offset by -10.
    axs[i].plot(r_grid, -((obs-recon)+10), 'g')
    axs[i].set_title('{} with {} PCs'.format(el_list[ind], PC_num))
fig.set_size_inches(12, 8, forward=False)

<IPython.core.display.Javascript object>

In [37]:
# Reconstruct observation 255 with 10, 20, ..., 100 principal components,
# one figure per setting.
ind = 255
obs = gr_list[ind]
mean = copy.copy(pca.mean_)
loading = copy.copy(pca.components_)
# Score the mean-centred observation: reconstruct() adds `mean` back, so
# scoring the raw `obs` would count the mean's projection twice.
score = cal_score(obs - mean, loading)
for i in range(10):
    PC_num = (i+1)*10
    recon = reconstruct(obs, mean, score, loading, PC_num)
    fig = plt.figure()
    plt.plot(r_grid, obs, 'b')      # data
    plt.plot(r_grid, recon, 'r')    # PCA reconstruction
    # Negated residual, offset by -10.
    plt.plot(r_grid, -((obs-recon)+10), 'g')
    plt.title('{} with {} PCs'.format(el_list[ind], PC_num))
    #plt.legend(['data','PCA reconstruct', 'diff'], loc=2)
    # Resize before showing so the displayed figure already has the final size.
    fig.set_size_inches(10, 8, forward=True)
    plt.show()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [33]:
# Reconstruct observation 377 with 10, 20, ..., 100 principal components,
# one figure per setting.
ind = 377
obs = gr_list[ind]
mean = copy.copy(pca.mean_)
loading = copy.copy(pca.components_)
# Score the mean-centred observation: reconstruct() adds `mean` back, so
# scoring the raw `obs` would count the mean's projection twice.
score = cal_score(obs - mean, loading)
for i in range(10):
    PC_num = (i+1)*10
    recon = reconstruct(obs, mean, score, loading, PC_num)
    fig = plt.figure()
    plt.plot(r_grid, obs, 'b')      # data
    plt.plot(r_grid, recon, 'r')    # PCA reconstruction
    # Negated residual, offset by -10.
    plt.plot(r_grid, -((obs-recon)+10), 'g')
    plt.title('{} with {} PCs'.format(el_list[ind], PC_num))
    #plt.legend(['data','PCA reconstruct', 'diff'], loc=2)
    # Resize before showing so the displayed figure already has the final size.
    fig.set_size_inches(10, 8, forward=True)
    plt.show()

[ 0.  0.  0. ...,  0.  0.  0.]


<IPython.core.display.Javascript object>

[ 0.  0.  0. ...,  0.  0.  0.]


<IPython.core.display.Javascript object>

[ 0.  0.  0. ...,  0.  0.  0.]


<IPython.core.display.Javascript object>

[ 0.  0.  0. ...,  0.  0.  0.]


<IPython.core.display.Javascript object>

[ 0.  0.  0. ...,  0.  0.  0.]


<IPython.core.display.Javascript object>

[ 0.  0.  0. ...,  0.  0.  0.]




<IPython.core.display.Javascript object>

[ 0.  0.  0. ...,  0.  0.  0.]


<IPython.core.display.Javascript object>

[ 0.  0.  0. ...,  0.  0.  0.]


<IPython.core.display.Javascript object>

[ 0.  0.  0. ...,  0.  0.  0.]


<IPython.core.display.Javascript object>

[ 0.  0.  0. ...,  0.  0.  0.]


<IPython.core.display.Javascript object>

In [None]:
class myPCA(object):
    '''Small eigen-decomposition based PCA.

    The input matrix dimension should be (num of observations) x (gr points).

    Attributes
    ----------
    data : the matrix passed to the constructor.
    mean_data : column-wise mean of ``data``.
    element_list, space_group_list : optional metadata passed to the
        constructor (``None`` when not given).
    eig_val : eigenvalues of the covariance matrix, largest first
        (available after ``fit``).
    loading : principal directions, one per ROW, in the same order as
        ``eig_val`` (available after ``fit``).
    '''
    def __init__(self, data, el_list = None, sg_list = None):
        self.data = data
        print('your data dimension = {}'.format(np.shape(self.data)))
        self.mean_data = np.mean(data, axis=0)
        # Always store the metadata. The previous `if not ...:` guard stored it
        # only when it was missing, so supplied lists were dropped.
        self.element_list = el_list
        self.space_group_list = sg_list

    def fit(self):
        '''Compute eigenvalues and principal directions of the covariance matrix.'''
        center_mat = self.data - self.mean_data
        cov_mat = np.dot(np.transpose(center_mat), center_mat) / float(np.shape(self.data)[0])
        #cov_mat = np.cov(center_mat, rowvar=0)
        # The covariance matrix is symmetric, so eigh applies: it returns real
        # eigenvalues in ascending order with eigenvectors as COLUMNS.
        (eig_val, eig_vec) = np.linalg.eigh(cov_mat)
        # Largest variance first, so cover_ratio() accumulates in PC order.
        order = np.argsort(eig_val)[::-1]
        self.eig_val = eig_val[order]
        # Store eigenvectors as rows: the other methods read loading[i] as the
        # i-th principal direction.
        self.loading = np.transpose(eig_vec[:, order])

    def cover_ratio(self):
        '''Return the cumulative fraction of total variance per number of PCs.'''
        return np.cumsum(self.eig_val)/np.sum(self.eig_val)

    def _cal_score(self, obs, debug=False):
        '''Calculate scores of an observation along every principal component.

        ``obs`` is projected as given; callers that want PCA scores must pass
        a mean-centred observation. Returns an array with one score per row of
        ``self.loading``.
        '''
        print('dim of observation is {}'.format(np.shape(obs)))
        print('dim of loading is {}'.format(np.shape(self.loading)))
        score_list = []
        dim_loading = np.shape(self.loading)
        for i in range(dim_loading[0]):
            #print('processing loading {} over {}'.format(i, dim_loading[0]))
            score = np.dot(self.loading[i,], obs.T)
            if debug:
                score_shape = np.shape(score)
                print('shape of score = {}'.format(score_shape))
            score_list.append(score)
        score_list_output = np.array(score_list)
        print('dim of score = {}'.format(score_list_output.shape))
        print('End of score calculation....')
        return score_list_output

    def PC_reconstruct(self, obs_ind, PC_num):
        '''Rebuild observation ``obs_ind`` from its first ``PC_num`` components.

        ``fit`` must have been called first (it sets ``self.loading``).

        Returns a tuple ``(reconstruct, obs, pct_error)`` where ``pct_error``
        is the element-wise ``|reconstruct - obs| / |obs|``; entries where
        ``obs`` is zero divide by zero. The observation and its
        reconstruction are also plotted.
        '''
        print('Start reconstruction.....')
        print('using {} principle componets to reconstruct data'.format(PC_num))
        print('index of observation taht is going to be reconstructed = {}'.format(obs_ind))
        obs = self.data[obs_ind]
        obs_dim = np.shape(obs)
        # Score the centred observation: the mean is added back below, so
        # scoring the raw observation would count the mean's projection twice.
        score = self._cal_score(obs - self.mean_data)
        reconstruct = np.zeros(obs_dim) + self.mean_data
        for i in range(PC_num):
            reconstruct += np.dot(self.loading[i,],score[i])
        self._plot_and_compare(obs, reconstruct)
        pct_error = abs(reconstruct - obs)/abs(obs)
        return (reconstruct, obs, pct_error)

    def _plot_and_compare(self, obs, reconstruct):
        '''Plot the observation (red line) and its reconstruction (blue dots).'''
        plt.figure()
        plt.plot(obs, 'r')
        plt.plot(reconstruct, 'bo')
        plt.show()
        return