# Geometrical Methods in Machine Learning
## Seminar 1: PCA

In [None]:
from __future__ import print_function

%matplotlib inline
import matplotlib.pyplot as plt

import numpy as np

from sklearn.datasets import load_wine, load_boston, fetch_olivetti_faces, fetch_mldata
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler

## 1. Model data

#### Exercise

Consider a generated dataset which is sampled from multivariate normal distribution with covariance matrix $C = \begin{pmatrix} 3 & 1 \\ 1 & 2 \end{pmatrix}$

In [None]:
# covariance and (zero) mean of the 2-D Gaussian the sample is drawn from
C = np.array([[3, 1],
              [1, 2]])
mu = np.zeros(2)

# draw the sample
sample_size = 500
data = np.random.multivariate_normal(mu, C, size=sample_size)

# true principal directions via eigendecomposition C = V * diag(l) * V^-1
# (eigenvalues in l, matching eigenvectors in the columns of V)
l, V = np.linalg.eig(C)
l, V

Find PCs and projection using scikit-learn

In [None]:
# fit a default PCA on the empirical sample and keep its estimated principal axes
pca = PCA().fit(data)
eigenvectors = pca.components_

In [None]:
def plot_principal_components(data, model, scatter=True):
    """Draw the principal axes of a fitted 2-D model as lines through the origin.

    Parameters
    ----------
    data : array-like, indexed as ``data[:, 0]`` / ``data[:, 1]``
        Points that fix the square axis limits; optionally drawn as a scatter.
    model : object exposing ``components_``
        Each of the first two rows of ``components_`` is read as the
        direction vector of one axis (only its first two coordinates are used).
    scatter : bool, default True
        When true, ``data`` is scatter-plotted before the axes are drawn.

    Notes
    -----
    Every axis is drawn along its own direction vector, so axis-aligned
    components (one coordinate equal to zero) are rendered without any
    division by zero.
    """
    data = np.asarray(data)
    W_pca = np.asarray(model.components_)

    if scatter:
        plt.scatter(data[:, 0], data[:, 1])

    # square viewport around the data with a 0.5 margin on both ends
    lower = np.minimum(np.amin(data[:, 0]), np.amin(data[:, 1])) - 0.5
    upper = np.maximum(np.amax(data[:, 0]), np.amax(data[:, 1])) + 0.5

    # half-length that reaches any corner of the viewport from the origin
    reach = np.sqrt(2.0) * max(abs(lower), abs(upper))

    for axis_vec in W_pca[:2]:
        dx, dy = axis_vec[0], axis_vec[1]
        length = np.hypot(dx, dy)
        if length == 0:
            # a zero vector has no direction to draw
            continue
        ux, uy = dx / length, dy / length
        plt.plot([-reach * ux, reach * ux], [-reach * uy, reach * uy], color="c")

    plt.axis('equal')
    plt.xlim(lower, upper)
    plt.ylim(lower, upper)
    plt.draw()

In [None]:
fig = plt.figure(figsize=(6,6))
plt.xlim(-7, 7)
plt.ylim(-7, 7)
plt.grid(linestyle="dotted")

# plot true principal components
# np.linalg.eig stores the eigenvectors as the COLUMNS of V, so the i-th
# principal axis points along (V[0, i], V[1, i]) and has slope V[1, i] / V[0, i].
for i in range(V.shape[1]):
    plt.plot(data[:,0], (V[1,i] / V[0,i]) * data[:,0], color="g")

# plot estimated principal components
plot_principal_components(data, pca, scatter=False)

plt.title("Multivariate gaussian")
plt.xlabel("X")
plt.ylabel("Y")
plt.scatter(data[:,0], data[:,1], alpha=0.2)
plt.show()

#### Exercise

Find PCs and projection manually

Estimator of covariance matrix of normally distributed variables:

$$\hat{S}_{\mathcal{N}} = \frac{1}{n} (\mathbf{X} - \mathbf{\bar{x}})^T (\mathbf{X} - \mathbf{\bar{x}})$$

Use either eigendecomposition `np.linalg.eig`:
$$\mathbf{A} = \mathbf{V} \mathbf{\Lambda} \mathbf{V}^{-1}$$

or SVD `np.linalg.svd`:

$$\mathbf{A} = \mathbf{U} \mathbf{S} \mathbf{V}^T$$

In [None]:
# center the data
mean = 
data_centered = 

# estimate data covariance matrix, X.T * X
covariance_hat = 

# find eigenvalues and eigenvectors of data covariance matrix
eigenvalues, eigenvectors = 

# report the mean measured AFTER centering (this is the quantity the label
# announces and that should be numerically zero), then the estimates
print("\nNumerical mean of centered data (should be zero):\n", np.mean(data_centered, axis=0))
print("\nEstimation of covariance matrix:\n", covariance_hat)
print("\nEigenvalues:\n", eigenvalues)
print("\nEigenvectors\n", eigenvectors)

#### Exercise

Compute and plot the projection on eigenbasis:

Let $\mathbf{V}'$ be the $D \times d$ matrix whose columns are the eigenvectors corresponding to the $d$ largest eigenvalues, and let $\mathbf{X}$ be the original $n \times D$ data matrix whose rows contain the different observations. Then the projected data $\mathbf{X}'$ is obtained as $\mathbf{X}' = \mathbf{X} \mathbf{V}'$.

In [None]:
# compute projection
proj = 
proj.shape

In [None]:
# scatter of the sample expressed in principal-component coordinates
fig = plt.figure(figsize=(6, 6))
plt.scatter(proj[:, 0], proj[:, 1], alpha=0.2)

plt.title("Projected data")
plt.xlabel("PC 1")
plt.ylabel("PC 2")
plt.grid(linestyle="dotted")

# fixed square viewport
plt.xlim(-7, 7)
plt.ylim(-7, 7)
plt.show()

## 2. Real data

### 2.1 Airfoils

In [None]:
# airfoil data set, in order: X coordinates, 199 wings, 200th wing
ref_points, X_train, X_test = (
    np.loadtxt(csv_path, delimiter=',')
    for csv_path in ('./data/ref_points.csv',
                     './data/airfoils.csv',
                     './data/test_afl.csv')
)

In [None]:
ref_points

In [None]:
fig = plt.figure(figsize=(12, 5))
plt.ylim(-0.23, 0.23)
plt.grid(linestyle="dotted")

# one curve per training row 0..2, drawn against the reference X coordinates
airfoil_names = ['Airfoil #1', 'Airfoil #2', 'Airfoil #3']
for row, airfoil_name in enumerate(airfoil_names):
    plt.plot(ref_points, X_train[row, :], '-*', label=airfoil_name)

plt.title("Airfoils")
plt.xlabel("X")
plt.ylabel("Y")
plt.legend(airfoil_names)

### Airfoils principal components

#### Exercise  
Plot eigenvectors for the 5 largest and 5 smallest eigenvalues

In [None]:
# default PCA of the training airfoils; the fitted axes are kept in `components`
pca = PCA().fit(X_train)
components = pca.components_

In [None]:
# indices of the first five components followed by the last five
n_pcs = len(pca.components_)
selected = list(range(5)) + list(range(n_pcs - 5, n_pcs))

# one small figure per selected component, plotted over the reference X coordinates
for i in selected:
    fig = plt.figure(figsize=(6, 2.5))
    plt.plot(ref_points, pca.components_[i, :], '-*')
    plt.ylim(-0.3, 0.3)
    plt.grid(linestyle="dotted")
    plt.xlabel("X")
    plt.ylabel("Y")
    plt.legend(['Component #{}'.format(i+1)])
    plt.show()

#### Exercise

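Explained variance is the ratio of the sum of the $d$ retained (largest) eigenvalues of the covariance matrix to the sum of all $D$ eigenvalues (equivalently, the same ratio of the squared singular values $\sigma_i^2$ of the centered data matrix):

$$EV(d) = \frac{\lambda_1 + \dots + \lambda_d}{\lambda_1 + \dots + \lambda_D} = \frac{\sum_{i=1}^{d} \lambda_i}{\sum_{j=1}^{D} \lambda_j} = \frac{\sum_{i=1}^{d} \sigma_i^2}{\sum_{j=1}^{D} \sigma_j^2}$$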

Plot the explained variance (`pca.explained_variance_ratio_`) and the cumulative explained variance, choose the sample dimensionality, and state the rule used

In [None]:
EV = 
CEV = 

In [None]:
# two panels: per-component EV on the left, cumulative EV on the right
fig = plt.figure(figsize=(12, 5.25))

plt.subplot(121)
plt.plot(EV, "o-")
plt.title("Explained variance")
plt.xlabel("# PCs")
plt.grid(linestyle="dotted")

plt.subplot(122)
plt.title("Cumulative explained variance")
# horizontal red reference lines at fixed cumulative levels
for level in (0.99, 0.95, 0.9, 0.8):
    plt.axhline(linewidth=1, y=level, color='r')
plt.xlabel("# PCs")
plt.grid(linestyle="dotted")
plt.plot(CEV, "o-")

#### Exercise 
Describe original airfoil vector in eigenbasis. Try different number of basis eigenvectors.

In [None]:
# PCA of the training airfoils restricted to a single component
pca = PCA(n_components=1)
X_train_pca = pca.fit_transform(X_train)
eigen_vectors = pca.components_

# centre X_test on the training mean, project it on the kept component(s),
# then map the coordinates back and restore the mean
sample_mean = X_train.mean(axis=0)
test_scores = np.dot(X_test - sample_mean, eigen_vectors.T)
X_test_pcs = np.dot(test_scores, eigen_vectors) + sample_mean

In [None]:
fig = plt.figure(figsize=(12,5))
plt.ylim(-0.23, 0.23)
plt.grid(linestyle="dotted")

# original test airfoil versus its reconstruction from the eigenbasis
plt.plot(ref_points, X_test, '-*', label = 'Airfoil #200')
plt.plot(ref_points, X_test_pcs, '-*', label = 'Airfoil #200 in eigenspace')

plt.title("Airfoils")
plt.xlabel("X")
plt.ylabel("Y")
# build the legend from the curve labels so it cannot disagree with them
plt.legend()

#### Exercise 
Measure reconstruction loss of out-of-sample airfoil `X_test` with different number of principal components in terms of 2-norm, conclude.

In [None]:
# apply PCA
norms = []

for n in range(X_train.shape[1]):
    pca = PCA(n_components=(n+1))
    X_train_pca = pca.fit_transform(X_train)
    eigen_vectors = pca.components_
    
    sample_mean = X_train.mean(axis=0)
    X_test_pcs = np.dot(np.dot(X_test - sample_mean, eigen_vectors.T), eigen_vectors) + sample_mean
    
    norms.append(###) # your code here

In [None]:
# curve of the values collected in `norms`, in the order they were appended
fig = plt.figure(figsize=(6, 6.5))
plt.plot(norms)
plt.grid(linestyle="dotted")

plt.title("Reconstruction loss")
plt.xlabel("# PCs")
plt.ylabel("2-norm")
plt.show()

### 2.2. MNIST

In [None]:
# NOTE(review): fetch_mldata appears to have been removed from recent
# scikit-learn releases (fetch_openml is the replacement) - confirm against
# the installed version.
mnist = fetch_mldata('MNIST original')
train_size = 60000

# the first `train_size` rows form the training split, the remainder the test
# split; labels are split the same way (test labels go to y_test, so y_train
# keeps the training labels)
X_train, X_test = mnist.data[:train_size, :], mnist.data[train_size:, :]
y_train, y_test = mnist.target[:train_size], mnist.target[train_size:]

print("Dataset summary:\nSamples: {}, features: {}, classes: {}"
      .format(X_train.shape[0] + X_test.shape[0], X_train.shape[1], np.unique(y_train).shape[0]))

In [None]:
# apply PCA: fit with default settings (no n_components given) on X_train;
# the fit call is the cell's last expression
pca = PCA()
pca.fit(X_train)

#### Exercise

Plot the explained and cumulative explained variance, choose the sample dimensionality, and state the rule used

In [None]:
EV = 
CEV = 

In [None]:
# two panels: per-component EV on the left, cumulative EV on the right
fig = plt.figure(figsize=(12, 5.25))

plt.subplot(121)
plt.plot(EV)
plt.title("Explained variance")
plt.xlabel("# PCs")
plt.grid(linestyle="dotted")

plt.subplot(122)
plt.title("Cumulative explained variance")
plt.xlabel("# PCs")
plt.grid(linestyle="dotted")
# horizontal red reference lines at fixed cumulative levels
for level in (0.99, 0.95, 0.9, 0.8):
    plt.axhline(linewidth=1, y=level, color='r')
plt.plot(CEV)

#### Exercise

Choose one of the digits $0 \dots 9$ and encode it in terms of different numbers of principal components.

In [None]:
# two-component PCA fitted on X_train
pca = PCA(n_components=2).fit(X_train)
eigenvectors = pca.components_

# per-component explained-variance ratio and its running total
EV = pca.explained_variance_ratio_
CEV = pca.explained_variance_ratio_.cumsum()

In [None]:
# centre X_test on the training mean, project on the eigenvectors,
# map back and restore the mean
sample_mean = X_train.mean(axis=0)
centred_test = X_test - sample_mean
X_test_pcs = np.dot(np.dot(centred_test, eigenvectors.T), eigenvectors) + sample_mean

In [None]:
# 10 x 10 grid without ticks; cell i shows row i of X_test_pcs as a 28 x 28 image
shape = (28, 28)
_, ax = plt.subplots(nrows=10, ncols=10, figsize=(10, 10),
                     subplot_kw=dict(xticks=[], yticks=[]),
                     gridspec_kw={'hspace': 0.1, 'wspace': 0.1})

for i, a in enumerate(ax.flat):
    digit_image = X_test_pcs[i].reshape(shape)
    a.imshow(digit_image, cmap=plt.cm.gray, interpolation="nearest")

#### Exercise 
Measure reconstruction loss of chosen digit within test data `X_test` with different number of principal components in terms of mean 2-norm and PSNR metric.

In [None]:
from psnr import psnr

# your code here

## 3. PCs interpretation

### 3.1. US arrests

In [None]:
# numeric block: columns 1-4 of every line after the first
X_arrests = np.loadtxt("./data/usarrests.csv", delimiter=",", skiprows=1, usecols=(1, 2, 3, 4))

# row names: first column with the header line skipped, double quotes removed
targets = list(
    np.genfromtxt('./data/usarrests.csv', delimiter=',', dtype='str', skip_header=True)[:, 0]
)
y_arrests = [row_name.replace('"', '') for row_name in targets]

# column names: first line without its first cell, double quotes removed
header_cells = list(
    np.genfromtxt('./data/usarrests.csv', delimiter=',', dtype='str', skip_header=False)[0, 1:]
)
features_arrest = [cell.replace('"', '') for cell in header_cells]

#### Exercise

Standardize data to zero mean and unit variance with `StandardScaler`

In [None]:
# standartize data to zero mean and unit variance
X_arrests_std = 

In [None]:
# fit a default PCA on X_arrests_std; the fit call is the cell's last expression
pca = PCA()
pca.fit(X_arrests_std)

In [None]:
# indices of the two components that span the biplot
pc1, pc2 = 0, 1
eigenvector_x, eigenvector_y = pca.components_[pc1], pca.components_[pc2]

# coordinates of every row in the chosen PC plane
projected = pca.transform(X_arrests_std)
xs, ys = projected[:, pc1], projected[:, pc2]

plt.figure(figsize=(10, 10))

# one red arrow per feature: its loadings on both PCs, stretched by the
# largest projected coordinate along each axis
x_scale, y_scale = max(xs), max(ys)
for i in range(eigenvector_x.shape[0]):
    tip_x = eigenvector_x[i] * x_scale
    tip_y = eigenvector_y[i] * y_scale
    plt.arrow(0, 0, tip_x, tip_y, color='r', width=0.0005)
    plt.text(tip_x + 0.05, tip_y, features_arrest[i], color='r')

# one labelled blue dot per row of the data
for i, (point_x, point_y) in enumerate(zip(xs, ys)):
    plt.plot(point_x, point_y, 'bo', alpha=0.25)
    plt.text(point_x - 0.05, point_y + 0.1, y_arrests[i], color='b', alpha=0.25)

plt.title("Biplot")
plt.xlabel("PC " + str(pc1 + 1))
plt.ylabel("PC " + str(pc2 + 1))
plt.grid(linestyle="dotted")
plt.xlim((-3.5, 3.5))
plt.ylim((-3.5, 3.5))
plt.show()

### 3.2. Wine dataset

In [None]:
# wine data set: feature matrix, class labels and hand-written feature names
wine = load_wine()
X_wine, y_wine = wine.data, wine.target
features_wine = [
    'Alcohol',
    'Malic acid',
    'Ash',
    'Alcalinity of ash',
    'Magnesium',
    'Total phenols',
    'Flavanoids',
    'Nonflavanoid phenols',
    'Proanthocyanins',
    'Color intensity',
    'Hue',
    'OD280/OD315 of diluted wines',
    'Proline',
]

In [None]:
# rescale the wine features with StandardScaler, then fit a default PCA on the result
scaler = StandardScaler()
X_wine_std = scaler.fit_transform(X_wine)

pca = PCA()
pca.fit(X_wine_std)

In [None]:
# indices of the two components that span the biplot
pc1, pc2 = 0, 1
eigenvector_x, eigenvector_y = pca.components_[pc1], pca.components_[pc2]

# coordinates of every row in the chosen PC plane
projected = pca.transform(X_wine_std)
xs, ys = projected[:, pc1], projected[:, pc2]

plt.figure(figsize=(10, 10))

# one red arrow per feature: its loadings on both PCs, stretched by the
# largest projected coordinate along each axis
x_scale, y_scale = max(xs), max(ys)
for i in range(eigenvector_x.shape[0]):
    tip_x = eigenvector_x[i] * x_scale
    tip_y = eigenvector_y[i] * y_scale
    plt.arrow(0, 0, tip_x, tip_y, color='r', width=0.0005, alpha=0.5)
    plt.text(tip_x + 0.05, tip_y, features_wine[i], color='r')

# one dot per row, coloured by its class label
colors = ['r', 'g', 'b']
for i, (point_x, point_y) in enumerate(zip(xs, ys)):
    plt.plot(point_x, point_y, 'o', color=colors[y_wine[i]], alpha=0.25)

plt.title("Biplot")
plt.xlabel("PC " + str(pc1 + 1))
plt.ylabel("PC " + str(pc2 + 1))
plt.grid(linestyle="dotted")
plt.xlim((-4.5, 4.5))
plt.ylim((-4.5, 4.5))
plt.show()

## 4. PCA for decision making

- in a high-dimensional space, complex models are more prone to overfitting than simple ones
- PCs are orthogonal, decorrelated features
    - they may improve linear classifiers if the data suffers from multicollinearity
    - for classifiers with complex decision boundaries, they can be a problem even in lower dimensions
- variable scaling: ensure your data is scaled to zero mean and unit variance, $\sim \mathcal{N}(0, 1)$, or at least that the features have a similar scale (`MinMaxScaler`)
- you can estimate only the first $q$ PCs using iterative optimization, instead of computing the full eigenvalue/SVD decomposition and then taking the top PCs corresponding to the largest eigenvalues

### 4.1. Eigenfaces

Eigenfaces as principal components and logistic regression classification

In [None]:
# Olivetti faces, shuffled with a fixed seed
dataset = fetch_olivetti_faces(shuffle=True, random_state=1)
train_size = 350
shape = (64, 64)

# first `train_size` rows go to the training split, the rest to the test split
faces, labels = dataset.data, dataset.target
X_train, X_test = faces[:train_size, :], faces[train_size:, :]
y_train, y_test = labels[:train_size], labels[train_size:]

print("Dataset summary:\nSamples: {}, features: {}, classes: {}".format(
    faces.shape[0], faces.shape[1], np.unique(labels).shape[0]))


In [None]:
# 10 x 10 grid without ticks; cell i shows row i of X_train reshaped to `shape`
_, ax = plt.subplots(nrows=10, ncols=10, figsize=(10, 10),
                     subplot_kw=dict(xticks=[], yticks=[]),
                     gridspec_kw={'hspace': 0.1, 'wspace': 0.1})

for i, a in enumerate(ax.flat):
    face_image = X_train[i].reshape(shape)
    a.imshow(face_image, cmap=plt.cm.gray, interpolation="nearest")

#### Exercise

Plot first 64 eigenfaces, comment the difference between the first and the last ones

In [None]:
# fit a default PCA on X_train and keep the transformed training data in X_train_pca
pca = PCA()
X_train_pca = pca.fit_transform(X_train)

In [None]:
# 8 x 8 grid without ticks; cell i shows component i reshaped to `shape`
_, ax = plt.subplots(nrows=8, ncols=8, figsize=(10, 10),
                     subplot_kw=dict(xticks=[], yticks=[]),
                     gridspec_kw={'hspace': 0.1, 'wspace': 0.1})

for i, a in enumerate(ax.flat):
    eigenface = pca.components_[i].reshape(shape)
    a.imshow(eigenface, cmap=plt.cm.gray, interpolation="nearest")

#### Exercise
Encode any face from the test dataset in eigenbases of different sizes, `n_components` = [2, 4, 8, 16, 32, 64, $\dots$]

In [None]:
# index of the test face to encode, and a two-component PCA of the training faces
face_id = 10
pca = PCA(n_components=2)
X_train_pca = pca.fit_transform(X_train)
eigen_vectors = pca.components_

# centre the chosen face on the training mean, project it on the kept
# components, then map the coordinates back and restore the mean
sample_mean = X_train.mean(axis=0)
face_scores = np.dot(X_test[face_id] - sample_mean, eigen_vectors.T)
X_test_pcs = np.dot(face_scores, eigen_vectors) + sample_mean

In [None]:
# left: the chosen test face, right: its reconstruction
_, ax = plt.subplots(nrows=1, ncols=2, figsize=(5, 5),
                     subplot_kw=dict(xticks=[], yticks=[]),
                     gridspec_kw={'hspace': 0.1, 'wspace': 0.1})

original_face = X_test[face_id].reshape(shape)
restored_face = X_test_pcs.reshape(shape)
ax[0].imshow(original_face, cmap=plt.cm.gray, interpolation="nearest")
ax[1].imshow(restored_face, cmap=plt.cm.gray, interpolation="nearest")

#### Exercise 
Classify faces in original dimension with kNN, logistic regression and random forest, conclude

In [None]:
clf = 
clf.fit(X_train, y_train)

acc_full_train = accuracy_score(clf.predict(X_train), y_train)
acc_full_test = accuracy_score(clf.predict(X_test), y_test)

acc_full_train, acc_full_test

#### Exercise

Perform classification in the space of reduced dimension of your choice, conclude

In [None]:
n_components = 

pca = PCA(n_components=n_components)
pca.fit(X_train)

X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)

clf = 
clf.fit(X_train_pca, y_train)
accuracy_score(clf.predict(X_train_pca), y_train), accuracy_score(clf.predict(X_test_pca), y_test), 

#### Exercise

- Perform classification for a whole range of dimensions, with a classifier of your choice.
- State the maximum number of principal components for this space, conclude. 
- Plot the classification performance.

In [None]:
acc_train = []
acc_test = []

for n in range():
    

In [None]:
# accuracy curves from acc_train / acc_test against a red reference line at acc_full_test
fig = plt.figure(figsize=(12, 6))
plt.title("Classifier performance")
plt.xlabel("# PCs")
plt.ylabel("Accuracy")
plt.grid(linestyle="dotted")

# drawing order matches the order of the legend entries below
plt.axhline(linewidth=1, y=acc_full_test, color='r')
for curve, colour in ((acc_train, 'b'), (acc_test, 'g')):
    plt.plot(curve, colour)

plt.legend(['Best on test in original dimension', 'Train in reduced dimension', 'Test in reducted dimension'])
plt.show()

## 5. Homework

Sberbank housing dataset  
https://www.kaggle.com/c/sberbank-russian-housing-market

### 5.1. PCA + interpretation

As homework, check how many principal components are needed to explain 80%, 95% and 99.5% of the variance. Draw and interpret biplots of the first two principal components and the top 10 features with the highest influence (largest absolute eigenvector coefficient) on each of the first two principal components.

### 5.2. PCA + regression

Check the regression performance of various regressors on the raw data and on reduced data of various dimensions.

Do a feature selection, selecting top 10 performing features, with any technique you know, compare them with top 10 features influencing first two principal components.

In [None]:
# your code here