***
# <font color=blue>UNSUPERVISED LEARNING</font>
# <font color=blue>Practice with PCA</font>
# <font color=blue>(lecturer version)</font>
<div style="text-align: right"><font color=magenta>Andrea De Simone</font></div>
***

In [None]:
import numpy as np  
import pandas as pd  
from scipy import ndimage
import matplotlib.pyplot as plt  
%matplotlib inline
from sklearn import datasets

***
# 1. PCA on IRIS data

## 1.1 Load IRIS Dataset

In [None]:
# Load the Iris dataset through sklearn's `datasets` module
iris = datasets.load_iris()

# X: feature matrix (iris.data), Y: targets (iris.target)
X, Y = iris.data, iris.target

In [None]:
def scatter_plot(X):
    """Scatter-plot column 2 of ``X`` against column 3 (axes labelled x_3, x_4).

    ``X`` is indexed as ``X[:, 2]`` and ``X[:, 3]``, so it must be a 2-D
    array with at least four columns. The figure is displayed right away
    with ``plt.show()``; nothing is returned.
    """
    _, axes = plt.subplots()

    # Fully opaque blue crosses, one per row of X
    axes.scatter(X[:, 2], X[:, 3], c='blue', alpha=1, marker='x')

    axes.set_xlabel('$x_3$', size=16)
    axes.set_ylabel('$x_4$', size=16)

    plt.show()

In [None]:
# Plot the raw (not yet standardized) features: column 2 vs column 3 of X
scatter_plot(X)

## 1.2 Covariance matrix

In [None]:
# Standardize features: subtract the per-column mean and divide by the
# per-column standard deviation
X_norm = (X - np.mean(X, axis=0)) / np.std(X, axis=0)

### <font color='magenta'>>>> Q1: Compute the covariance matrix 'Sigma' from 'X_norm' </font>

In [None]:
# Compute the covariance matrix

# Start Edit
#Sigma = 
# End Edit

# 'Sigma' must be assigned in the edit region above; until then this line
# raises a NameError
print(Sigma)

## 1.3 Diagonalization

In [None]:
# Eigen-decomposition of Sigma (eigh: routine for symmetric matrices)
eigenValues, eigenVectors = np.linalg.eigh(Sigma)

# Indices that order the eigenvalues from largest to smallest
idx = np.argsort(eigenValues)[::-1]

# Apply the same ordering to the eigenvalues and to the eigenvector columns
eigenValues, V = eigenValues[idx], eigenVectors[:, idx]

print("Eigenvalues = ",eigenValues)
print("Eigenvectors = \n",V)

In [None]:
# check diagonalization: V^T Sigma V should be diagonal (up to rounding),
# with the sorted eigenvalues on the diagonal
V.T.dot(Sigma).dot(V)

## 1.4 Project and recover data

In [None]:
# Set number of principal components
k=2

# Projection matrix: the first k columns of V, i.e. the eigenvectors of the
# k largest eigenvalues (V's columns are sorted in descending eigenvalue order)
W = V[:,:k]

print(W)

### <font color='magenta'>>>> Q2: Compute the projected data Z along principal components </font>

In [None]:
# Projected data

# Start Edit
#Z = 
# End Edit

# Show the first three projected samples; 'Z' must be assigned in the edit
# region above, otherwise this raises a NameError
print(Z[:3])

In [None]:
# Recover data: map the projected samples Z back to the original feature
# space using the transpose of the projection matrix
X_approx = Z.dot(W.T)
# Display the first ten reconstructed samples
X_approx[:10]

In [None]:
# First plot: reconstruction from the principal components;
# second plot: the standardized data, for comparison
scatter_plot(X_approx)
scatter_plot(X_norm)

In [None]:
# Set number of principal components
k = 3

# Projection matrix: first k columns of V
W = V[:, :k]

# Project onto the principal components, then map back to feature space
Z = np.dot(X_norm, W)
X_approx = np.dot(Z, W.T)

# Reconstruction first, standardized data second
scatter_plot(X_approx)
scatter_plot(X_norm)

## 1.5 Explained Variance

### <font color='magenta'>>>> Q3: Compute the % of variance explained for k=1,2,3</font>

In [None]:
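# The plotting cell below reads 'eigenvalue_fractions' (one value per
# component) and 'var_explained' (indexed so that entry k-1 refers to the
# first k components), so both must be defined here.
# Start Edit

# End Edit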

In [None]:
# Number of feature columns in X, used as the number of bars/points below
D = X.shape[1]
fig,ax = plt.subplots()

# Bars: eigenvalue_fractions per component; line with markers: var_explained.
# Both arrays must have been defined in the Q3 cell.
ax.bar(range(D),eigenvalue_fractions, color='orange')
ax.plot(range(D), var_explained, marker="o")

ax.set_xlabel("Principal Component")
ax.set_ylabel("% Variance explained")

# Tick positions are 0-based; the labels show components 1..D
ax.set_xticks(range(D))
ax.set_xticklabels(range(1,D+1))
plt.show()

***
# 2. PCA on Faces

## 2.1 Load Dataset 'Faces'

In [None]:
# Load data: comma-separated numeric values, one row per line of the file
X = np.loadtxt('dataset_faces.csv', delimiter=',')
# Peek at the first two rows
print(X[:2])
# One row per face; plot_faces reshapes each row to 32x32 grayscale pixel values.
# NOTE(review): the original note said "10 faces" - confirm against the printed shape.
print(X.shape)

In [None]:
def plot_faces(X):
    """Show the first 10 rows of ``X`` side by side as 32x32 grayscale images.

    Each row ``X[i, :]`` is reshaped to (32, 32), rotated by 90 degrees with
    ``ndimage.rotate`` and drawn with the axes hidden. ``X`` therefore needs
    at least 10 rows of 1024 values each. The figure is displayed with
    ``plt.show()``; nothing is returned.
    """
    N_faces = 10
    _, axes = plt.subplots(1, N_faces, figsize=(16, 4))

    for i, panel in enumerate(axes):
        # Row i holds one flattened face
        face = X[i, :].reshape(32, 32)
        panel.imshow(ndimage.rotate(face, 90), origin="lower", cmap='gray')
        panel.set_axis_off()

    plt.show()

In [None]:
# Display the first 10 faces of the raw dataset
plot_faces(X)

In [None]:
# Normalize Features: zero mean and unit standard deviation per pixel column
X_norm = (X - np.mean(X, axis=0)) / np.std(X, axis=0)

# Covariance Matrix of the standardized data (divided by the number of rows)
Sigma = np.dot(X_norm.T, X_norm) / X_norm.shape[0]

# Diagonalization (eigh: routine for symmetric matrices)
eigenValues, eigenVectors = np.linalg.eigh(Sigma)

# Sort in descending order of eigenvalues, reordering the eigenvector
# columns to match
idx = np.argsort(eigenValues)[::-1]
eigenValues, V = eigenValues[idx], eigenVectors[:, idx]

## 2.2 Vary the number of principal components

### <font color='magenta'>>>> Q4: For what number of principal components is more than 95% of the variance explained?</font>

In [None]:
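# The cells below read 'eigenvalue_fractions' and 'var_explained' for the
# faces data (entry k-1 of 'var_explained' is printed as the % explained by
# the first k components), so both must be recomputed here.
# Start Edit

# End Edit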

In [None]:
# Only the first max_pc principal components are shown
max_pc=20
fig,ax = plt.subplots()

# Bars: eigenvalue_fractions per component; line with markers: var_explained.
# Both arrays must have been defined in the Q4 cell.
ax.bar(range(max_pc),eigenvalue_fractions[:max_pc], color='orange')
ax.plot(range(max_pc), var_explained[:max_pc], marker="o")

ax.set_xlabel("Principal Component")
ax.set_ylabel("% Variance explained")

# Tick positions are 0-based; the labels show components 1..max_pc
ax.set_xticks(range(max_pc))
ax.set_xticklabels(range(1,max_pc+1))
plt.show()

In [None]:
# Set number of principal components
k = 3

# Projection matrix: first k columns of V
W = V[:, :k]

# Projected data
Z = np.dot(X_norm, W)

# Recovered data (still in standardized units)
X_approx = np.dot(Z, W.T)

print("% of variance explained: {:.1f}%".format(var_explained[k - 1]))

# Undo the standardization (rescale by std, add back the mean) before plotting,
# then show the original faces for comparison
plot_faces(X_approx * np.std(X, axis=0) + np.mean(X, axis=0))
plot_faces(X)

In [None]:
# Set number of principal components
k = 5

# Projection matrix: first k columns of V
W = V[:, :k]

# Projected data
Z = np.dot(X_norm, W)

# Recovered data (still in standardized units)
X_approx = np.dot(Z, W.T)

print("% of variance explained: {:.1f}%".format(var_explained[k - 1]))

# Undo the standardization (rescale by std, add back the mean) before plotting,
# then show the original faces for comparison
plot_faces(X_approx * np.std(X, axis=0) + np.mean(X, axis=0))
plot_faces(X)

In [None]:
# Set number of principal components
k = 10

# Projection matrix: first k columns of V
W = V[:, :k]

# Projected data
Z = np.dot(X_norm, W)

# Recovered data (still in standardized units)
X_approx = np.dot(Z, W.T)

print("% of variance explained: {:.1f}%".format(var_explained[k - 1]))

# Undo the standardization (rescale by std, add back the mean) before plotting,
# then show the original faces for comparison
plot_faces(X_approx * np.std(X, axis=0) + np.mean(X, axis=0))
plot_faces(X)