In [1]:
import pandas as pd
import numpy as np

In [8]:
class PCA:
    def __init__(self):
        self.X = []
        self.y = []
        

    def load_data(self):
            """load a data from given path """
            dtypes = {"sepallength":np.float64,  
                      "sepalwidth":np.float64, 
                      "petallength":np.float64, 
                      "petalwidth":np.float64, 
                      "target":np.str}
            path_to_file = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
            df_data = pd.read_csv(path_to_file, header = None, 
                                  dtype= dtypes)

            print(df_data.info())
            self.X = df_data.iloc[:,0:4].values
            self.y = df_data.iloc[:, 4].values
            print("# of features:%d " %self.X.shape[0])
            
    
    def fit(self):
        """
        Eigenvectors and eigenvalues of a covariance matrix 
        is core of a PCA. The principal components (eigenvector)
        determine the direction of the new features, and eigenvalues
        holds the variance of the data along the new feature axes
        that determine their magnitude.
        """
        
        # standarized
        print(np.mean(self.X, axis=1).shape)
        X_norm = (self.X - np.mean(self.X, axis=0)) / np.std(self.X, axis=0)
        
        # covariance matrix
        # X_mean = np.mean(X_norm, axis=0) # featurewise mean
        # X_cov_test = (X_norm - X_mean).T.dot(X_norm - X_mean)/(X_norm.shape[0] -1)
        # print("X_cov_test ", X_cov_test)
        X_cov = np.cov(X_norm.T)
        print("X_cov ", X_cov)
        
        # eigendecomposition of d*d matrix where d represents # of features
        self._ei_vals, self._ei_vecs = np.linalg.eig(X_cov)
        print("test ", self._ei_vals, self._ei_vecs)
        
        # extract eigenvalue and eigenvector pair
        self._e_pairs = [([np.abs(self._ei_vals[i]) , self._ei_vecs[:,i]]) \
                        for i in range(len(self._ei_vals))]
        # select a top eigen vectors
        self._e_pairs.sort(key=lambda x: x[0], reverse=True)
    
    def largest_eigenvalues(self, k=5):
        """
        After sorting based on eigenvalue (variance), return
        top five eigenvalue and its corresponding eigenvectors.
        """
        if len(self._e_pairs) < k:
            return self._e_pairs
        else:
            return self._e_pairs[:k] 
        
    def explained_variance(self):
        """Computed explained variance"""
        total_variance = sum([self._e_pairs[i][0] for i in range(len(self._e_pairs))])
        variance = [self._e_pairs[i][0]/total_variance for i in range(len(self._e_pairs))]
        return np.cumsum(variance)
        
        
    
    def SVD_matrix_decomp(self):
        # standarized
        X_norm = (X - np.mean(self.X, axis=1)) / np.std(self.X, axis=1)
        X_mean = np.mean(X_norm, axis=0) # featurewise mean
        
        # SVD 
        # X_cov = (X_norm - X_mean).T.dot(X_norm - X_mean)/(X_norm.shape[0] -1)
        U, S, V = np.linalg.svd(X_norm.T)
        # where U and V are unity matrix, dot product of the inverse
        # gives identify matrix 
        
        

    
            

In [9]:
pca = PCA()
pca.load_data()
pca.fit()
print(pca.explained_variance())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
0    150 non-null float64
1    150 non-null float64
2    150 non-null float64
3    150 non-null float64
4    150 non-null object
dtypes: float64(4), object(1)
memory usage: 6.0+ KB
None
# of features:150 
(150,)
X_cov  [[ 1.00671141 -0.11010327  0.87760486  0.82344326]
 [-0.11010327  1.00671141 -0.42333835 -0.358937  ]
 [ 0.87760486 -0.42333835  1.00671141  0.96921855]
 [ 0.82344326 -0.358937    0.96921855  1.00671141]]
test  [2.93035378 0.92740362 0.14834223 0.02074601] [[ 0.52237162 -0.37231836 -0.72101681  0.26199559]
 [-0.26335492 -0.92555649  0.24203288 -0.12413481]
 [ 0.58125401 -0.02109478  0.14089226 -0.80115427]
 [ 0.56561105 -0.06541577  0.6338014   0.52354627]]
m  2.930353775589317
m  0.9274036215173419
m  0.14834222648163944
m  0.02074601399559593
total variance  4.026845637583894
[0.7277045209380135, 0.2303052326768065, 0.03683831957627379, 0.005151926808906321]
[0.72770