# Interpreting PCA

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [None]:
# Reproducible toy data set: x2 follows x1, x3 mirrors x1 (both with added
# noise), and x4 is drawn independently of the other three.
np.random.seed(42)
n_samples = 50
x1 = np.random.normal(loc=0, scale=1, size=n_samples)
x2 = x1 + np.random.normal(loc=0, scale=0.5, size=n_samples)
x3 = -x1 + np.random.normal(loc=0, scale=0.5, size=n_samples)
x4 = np.random.normal(loc=0, scale=1, size=n_samples)

In [None]:
# Stack the four 1-D samples side by side, one feature per column.
feature_names = ['X1', 'X2', 'X3', 'X4']
X = pd.DataFrame(np.column_stack((x1, x2, x3, x4)), columns=feature_names)

## Correlation between column features

In [None]:
# Pairwise correlation matrix of the four feature columns.
X.corr()

## Standardize features

In [None]:
# Center each feature (with_mean) and scale it to unit variance (with_std).
scaler = StandardScaler(with_mean=True, with_std=True)

In [None]:
# Learn the per-column statistics from X and apply the scaling in one step.
X_scaled = scaler.fit_transform(X)

## PCA

In [None]:
# Keep only the first two principal components, which is all the 2-D plots below need.
pca = PCA(n_components=2)

In [None]:
# Fit on the standardized features so every variable enters the PCA on the
# same scale; PCA itself only centers the data, it does not rescale it.
X_pca = pca.fit_transform(X_scaled)

### How much variance is explained by each PC?

In [None]:
# Fraction of the total variance captured by each retained component.
pca.explained_variance_ratio_

In [None]:
# Running total: variance captured by the first k components taken together.
np.cumsum(pca.explained_variance_ratio_)

### What are the directions of maximum variance?

In [None]:
# Principal axes: one row per component, one column per feature.
pca.components_

### Loadings

In [None]:
# Transpose so that each row is a feature and each column a component:
# coeffs[i, j] is the weight of feature i in component j.
coeffs = pca.components_.T
coeffs

### PCA plot, loadings plot, and biplot

In [None]:
# Three side-by-side panels on shared axes: the projected samples, the
# feature weights, and both overlaid.
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(12, 4),
                         sharex=True, sharey=True)

# (title, draw the projected samples?, draw the feature arrows?)
panels = [
    ('PCA plot', True, False),
    ('Loadings plot', False, True),
    ('Biplot', True, True),
]

for ax, (title, show_scores, show_loadings) in zip(axes, panels):
    if show_scores:
        ax.scatter(X_pca[:, 0], X_pca[:, 1])
    if show_loadings:
        for i, (w1, w2) in enumerate(coeffs):
            ax.arrow(0, 0, w1, w2, color='red', head_width=0.1)
            # The label sits at twice the arrow's length so it clears the arrow head.
            ax.text(2*w1, 2*w2, f'X{i+1}', fontsize=16)
    ax.set_xlabel('PC1', fontsize=16)
    ax.set_ylabel('PC2', fontsize=16)
    ax.set_title(title, fontsize=16)

# Tick labels are blanked via the first panel, as in the original layout.
axes[0].set_xticklabels([])
axes[0].set_yticklabels([])

# A trailing no-op keeps the notebook from echoing the last call's return value.
pass

### Observations about loadings

- X1, X2 and X3 contribute most to PC1
- X4 contributes most to PC2
- X1 and X2 are highly correlated
- X1 and X2 are highly anti-correlated with X3
- X4 is essentially uncorrelated with X1, X2 and X3
- The arrows for X1, X2 and X3 are shorter than the arrow for X4: each principal direction is a unit vector, so the weight of PC1 is shared among X1, X2 and X3, while the weight of PC2 is concentrated almost entirely on X4