In [None]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

# PCA
## Load data

In [None]:
# Read the header-less leaf CSV, then keep only the underlying NumPy array.
leaf_frame = pd.read_csv('./leaf.csv', header=None)
X = leaf_frame.values
# Cell output: the (rows, columns) dimensions of the loaded array.
np.shape(X)

## 1.1

In [None]:
# Standardise every column: subtract the column mean and divide by the
# column standard deviation (np.std default, i.e. population std with ddof=0).
mean = np.mean(X, axis=0)
std = np.std(X, axis=0)
X_scaled = (X-mean) / std

# Covariance matrix of the standardised features. np.cov treats each ROW as
# one variable, hence the transpose.
cov = np.cov(X_scaled.T)

# A covariance matrix is symmetric, so use eigh instead of eig: eigh always
# returns real eigenvalues (eig makes no ordering guarantee and may return
# complex values caused by round-off) sorted in ascending order.
# Eigenvectors are the COLUMNS of eig_vecs: eig_vecs[:, i] pairs with eig_vals[i].
eig_vals, eig_vecs = np.linalg.eigh(cov)
# Reverse to descending order so that index 0 is the largest eigenvalue and
# eig_vecs[:, 0] is the first principal direction.
eig_vals, eig_vecs = eig_vals[::-1], eig_vecs[:, ::-1]

# plot eigenvalues in ascending order
plt.figure(figsize=(8,4))
plt.plot(np.arange(len(eig_vals))+1,np.sort(eig_vals))
plt.xlim([1,len(eig_vals)])
plt.xlabel('eigenvalue index')
plt.ylabel('value')
plt.xticks(np.arange(len(eig_vals))+1)
plt.show()

## 1.2

In [None]:
# Choose k: the smallest number of leading eigenvalues whose cumulative share
# of the total variance reaches 95%.
# Share of the total contributed by each eigenvalue, largest first.
eig_vals_percentage = np.flip(np.sort(eig_vals))/np.sum(eig_vals)
ks = np.arange(len(eig_vals))+1
# Cumulative share covered by the first k eigenvalues. np.cumsum replaces the
# manual running sum, which read k_sums[-1] on its first iteration and only
# worked because the array was zero-initialised.
k_sums = np.cumsum(eig_vals_percentage)
# First k whose coverage reaches the threshold. Using >= also reports a k
# whose coverage lands exactly on 0.95, which the strict >/< pair skipped.
reached = k_sums >= 0.95
if reached.any():
    k_best = int(np.argmax(reached)) + 1
    print(f'k={k_best} is a good choise of k')
coverage_styler = pd.DataFrame({'k':ks, 'coverage %':k_sums}, columns= ['k', 'coverage %']).style
# Styler.hide_index() was deprecated in pandas 1.4 and removed in 2.0; prefer
# Styler.hide(axis='index') and fall back only on older pandas.
coverage_styler.hide(axis='index') if hasattr(coverage_styler, 'hide') else coverage_styler.hide_index()

## 1.3

In [None]:
# printing the eigenvectors for k=2
# NumPy returns eigenvectors as COLUMNS (eig_vecs[:, i] belongs to eig_vals[i]),
# so indexing rows with eig_vecs[0] does not yield an eigenvector. Pick the
# columns that belong to the two largest eigenvalues explicitly, so the result
# does not depend on the order in which the decomposition returned them.
top_two = np.argsort(eig_vals)[::-1][:2]
pd.DataFrame({'w1':eig_vecs[:, top_two[0]], 'w2':eig_vecs[:, top_two[1]]}, columns= ['w1', 'w2'])

## 1.4

In [None]:
# Calculate and plot the data in the 2-D PCA space.
# Eigenvectors are the COLUMNS of eig_vecs (eig_vecs[:, i] pairs with
# eig_vals[i]); select the two with the largest eigenvalues rather than the
# first two rows, which are not eigenvectors.
top_two = np.argsort(eig_vals)[::-1][:2]
# Coordinates of every standardised sample along the first / second component.
alpha1 = X_scaled.dot(eig_vecs[:, top_two[0]])
alpha2 = X_scaled.dot(eig_vecs[:, top_two[1]])
plt.figure(figsize=(6,6))
plt.scatter(alpha1,alpha2)
plt.xlabel(r'$\alpha_1$')
plt.ylabel(r'$\alpha_2$')
plt.show()

## 1.5

In [None]:
# Calculate the mean squared reconstruction error for k=2.
# Eigenvectors are the COLUMNS of eig_vecs (eig_vecs[:, i] pairs with
# eig_vals[i]); take the two columns with the largest eigenvalues instead of
# the first two rows, which are not eigenvectors.
top_two = np.argsort(eig_vals)[::-1][:2]
W = eig_vecs[:, top_two]  # one column per retained component
# Project every sample onto the two components and map it back to feature
# space in a single matrix product. This replaces the per-row loop and the
# np.zeros_like(X) buffer, which inherited X's dtype and would silently
# truncate the reconstruction if X were integer-typed.
X_pca = X_scaled.dot(W).dot(W.T)
# Average squared difference over all entries of the standardised data.
MSE = np.mean((X_scaled-X_pca)**2)
print(f'The Mean squared reconstruction error is {MSE}')