# PCA

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

## Load and explore data

In [None]:
from sklearn.datasets import load_wine

# Load the wine dataset as pandas objects, then split it into the feature
# table (X) and the class labels (y)
wine = load_wine(as_frame=True)
X, y = wine.data, wine.target

# Keep the original column names so later results can be labelled with them
wine_features = X.columns

# Preview the first rows of the feature table
X.head()

In [None]:
# Dimensions of the feature table as (number of rows, number of columns)
X.shape

In [None]:
# Count how many samples carry each distinct target label
y.value_counts()

## Center and Scale Data

In [None]:
# NOTE: data must be centered around its mean before applying PCA.
# StandardScaler centers every feature and also rescales it to unit variance.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
# Fit and apply the scaler in one step, then restore the feature names
X = pd.DataFrame(scaler.fit_transform(X), columns=wine_features)
X

In [None]:
# Heatmap of the pairwise feature correlations: strongly positive or strongly
# negative cells point at features that carry overlapping information
sns.heatmap(
    data=pd.DataFrame(X).corr(),
    cmap='coolwarm',
);

## Compute Principal Components

In [None]:
from sklearn.decomposition import PCA

# Fit PCA on the standardized data; no n_components is given, so the
# estimator's default (keep all components) applies
pca = PCA().fit(X)
pca

In [None]:
# Shape of the fitted components: (number of PCs, number of original features)
pca.components_.shape

In [None]:
# Access our PCs: pca.components_ stores one PC per ROW
W = pca.components_

# Print PCs as COLUMNS, with one row per original feature.
# The PC labels are derived from the number of fitted components instead of a
# hard-coded 13, so this cell keeps working if the feature count changes.
W = pd.DataFrame(W.T,
                 index=wine_features,
                 columns=[f'PC{i}' for i in range(1, W.shape[0] + 1)])
W

## Project Dataset into PCs

In [None]:
# At this point we have:
# Our standardized dataset (X)
    # 178 rows with 13 features each
# A PC map (W) showing how to convert a given example into PC values
    # currently 13 columns, one for each PC
    # each holding 13 rows: the weight given to each original feature when building that PC


### Single Manual Conversion

In [None]:
# This is our first actual wine example (row label 0) with its scaled feature values
X.loc[0]

In [None]:
# This is the map to convert an example into a PC1 value:
# one weight per original feature
W.PC1

In [None]:
# The conversion itself: weight each scaled feature of the first sample by the
# matching PC1 entry (element-wise product, matched on feature name)
convert_row = X.loc[0].mul(W['PC1'])
convert_row

In [None]:
# The final number we are after is the dot product (multiply, then sum):
# adding up the weighted features gives the sample's PC1 value
convert_row.sum()

In [None]:
# Verified with numpy's dot function: this should give the same value as
# convert_row.sum()
np.dot(X.loc[0], W.PC1)

### Full Set conversion

In [None]:
# Convert the whole data set: project every sample onto every PC
X_proj = pca.transform(X)

# Label the columns from the actual number of projected components rather
# than a hard-coded 13, so the cell survives a change in feature count
X_proj = pd.DataFrame(X_proj,
                      columns=[f'PC{i}' for i in range(1, X_proj.shape[1] + 1)])
X_proj

In [None]:
# The collinearity is completely removed: principal components are mutually
# uncorrelated, so every off-diagonal cell should be ~0
sns.heatmap(
    data=X_proj.corr(),
    cmap='coolwarm',
);

## Plotting in our new space

In [None]:
# 2D slices of the data before and after PCA, drawn side by side
fig, (ax_before, ax_after) = plt.subplots(1, 2, figsize=(13, 5))

# Left panel: first two columns of the scaled data (alcohol and malic acid)
ax_before.set_title('X1 vs. X0 before PCA (initial space)')
ax_before.set_xlabel('X0')
ax_before.set_ylabel('X1')
ax_before.scatter(X.iloc[:, 0], X.iloc[:, 1], c=y)

# Right panel: PC1 vs PC2 of the transformed data (each of these PCs is a
# combination of all 13 original features)
ax_after.set_title('PC1 vs PC2 (new space)')
ax_after.set_xlabel('PC 1')
ax_after.set_ylabel('PC 2')
ax_after.scatter(X_proj.iloc[:, 0], X_proj.iloc[:, 1], c=y);

## Check the math

In [None]:
# A quick look at the covariance matrix for our features.
# X has been standardized (zero mean per column), so X^T X / n is its
# covariance matrix. n is read from X instead of hard-coding 178 rows.
pd.DataFrame(np.dot(X.T, X) / X.shape[0])

In [None]:
# For our covariance matrix we can compute its
    # Eigenvectors - directions used to project observations onto a line (the PCs)
    # Eigenvalues - amount of underlying observation variance maintained with each projection
#
# Why it is written this way:
#   * X^T X is divided by the number of rows so that the decomposed matrix is
#     the covariance matrix itself; without the division the eigenvalues come
#     out n times larger than the variances they are meant to represent.
#   * The covariance matrix is symmetric, so eigh is the appropriate solver.
#   * Neither eig nor eigh returns eigenvalues in decreasing order, so the
#     pairs are sorted explicitly: afterwards column k of eig_vecs is PC(k+1).
X_values = np.asarray(X)
cov_matrix = X_values.T @ X_values / X_values.shape[0]
eig_vals, eig_vecs = np.linalg.eigh(cov_matrix)

# Reorder both outputs together so each eigenvalue stays with its eigenvector
order = np.argsort(eig_vals)[::-1]
eig_vals, eig_vecs = eig_vals[order], eig_vecs[:, order]

In [None]:
# Eigenvalues hold 13 values, one for each PC being created;
# eig_vals[i] belongs to the eigenvector stored in column i of eig_vecs
eig_vals

In [None]:
# Eigenvectors hold
    # 1 column per PC being created
    # each column's length matches the number of columns in the dataset
# The PC labels are derived from eig_vecs instead of a hard-coded 13.
# NOTE(review): labelling the columns PC1..PCn is only meaningful when the
# columns of eig_vecs are ordered by decreasing eigenvalue. NumPy's eigen
# solvers do not promise that order, so confirm the decomposition step sorts them.
W2 = pd.DataFrame(eig_vecs,
                  index=wine_features,
                  columns=[f'PC{i}' for i in range(1, eig_vecs.shape[1] + 1)])
W2