# Unsupervised Learning

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

In [None]:
import numpy as np
import pandas as pd

## Data

In [None]:
# Location of the Titanic passenger spreadsheet that is loaded below.
url = 'http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic3.xls'

In [None]:
# Keep the spreadsheet as loaded in df_orig and work on an independent copy
# in df, so that columns removed from df later remain available in df_orig.
df_orig = pd.read_excel(url)
df = df_orig.copy()

In [None]:
! python3 -m pip install --quiet category_encoders missingno yellowbrick

In [None]:
import category_encoders as ce

In [None]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [None]:
from sklearn.preprocessing import StandardScaler

### Preprocess data

In [None]:
# NOTE(review): isnull() returns a boolean frame with the same shape as df,
# so this displays the same value as df.shape. If a count of missing values
# was intended, df.isnull().sum() would show it - confirm.
df.isnull().shape

In [None]:
import missingno as mn

#### Drop leaky or low-information variables

In [None]:
df = df.drop(columns = ['survived', 'name', 'ticket', 'boat', 'body', 'cabin', 'home.dest'])

In [None]:
mn.matrix(df);

In [None]:
df.select_dtypes('object')

In [None]:
import warnings
# Silence FutureWarning messages for the rest of the session.
warnings.simplefilter(action='ignore', category=FutureWarning)

#### Convert categorical values

In [None]:
df = pd.get_dummies(df, drop_first=True)

In [None]:
df.columns

#### Impute missing values

In [None]:
imputer = IterativeImputer()

In [None]:
df.loc[:, :] = imputer.fit_transform(df)

In [None]:
df.head()

In [None]:
df.isnull().sum().sum()

In [None]:
mn.matrix(df);

In [None]:
scaler = StandardScaler()

In [None]:
df.loc[:, :] = scaler.fit_transform(df)

In [None]:
df.head()

## Dimension reduction

### PCA

If your knowledge of PCA is fading, see the notebook `B03A_PCA.ipynb`

In [None]:
from sklearn.decomposition import PCA

In [None]:
# PCA estimator with default settings (no n_components given).
pca = PCA()

In [None]:
# Fit on the preprocessed frame and keep the projected data in X.
X = pca.fit_transform(df)

In [None]:
# Running total of the per-component explained-variance fractions,
# plotted against the 1-based component number.
y = np.cumsum(pca.explained_variance_ratio_)
x = 1 + np.arange(len(y))
ax = plt.gca()
ax.bar(x, y, alpha=0.2)
ax.plot(x, y)
ax.set_xlabel('Principal components')
ax.set_ylabel('Explained variance')
ax.set_title('Cumulative fraction of variance explained', fontsize=16);

In [None]:
X.shape

In [None]:
df_orig.survived

In [None]:
df_X = pd.DataFrame(X[:, :4], columns = [f'PC{i}' for i in range(1, 5)])
df_X['survived'] = df_orig.survived

In [None]:
import seaborn as sns

In [None]:
sns.pairplot(df_X, hue='survived');

In [None]:
df.columns

In [None]:
plt.figure(figsize=(8,8))
plt.imshow(pca.components_.T, cmap='Spectral', vmin=-1, vmax=1)
plt.colorbar(fraction=0.046, pad=0.04)
plt.yticks(range(len(df.columns)), df.columns, fontsize=14);
plt.xticks(range(len(df.columns)), 1+np.arange(len(df.columns)), fontsize=14);

In [None]:
from yellowbrick.features import PCA as PCA_

In [None]:
# Use a larger default font for the figures that follow.
plt.rcParams.update({'font.size': 14})
# Yellowbrick PCA visualizer, fitted on the preprocessed features with the
# survival outcome passed as the target.
pca_viz = PCA_(scale=True, proj_features=True)
pca_viz.fit_transform(df, df_orig.survived)
pca_viz.show();

### Interpreting the biplot

Bi = PCA plot + loadings plot

Each PC is just a linear combination of the original variables. 

$$
v = \alpha_1 x_1 + \alpha_2 x_2 + \ldots + \alpha_n x_n
$$

The coefficients $\alpha_i$ are known as *loadings* for each PC. These are stored in the `components_` attribute of the `sklearn` PCA instance. The loadings plot shows the contributions of the original features to the PC axes. Here we show how `pclass` and `age` are projected as arrows onto the first 2 PC axes to make the process explicit.

- Arrows that point in the same direction indicate that the corresponding features are positively correlated
- Arrows that point in opposite directions indicate that the corresponding features are negatively correlated
- Arrows that are orthogonal indicate that the corresponding features are uncorrelated

In [None]:
loadings = pd.DataFrame(pca.components_.T, columns = df.columns)
loadings

In [None]:
x1, y1 = loadings.pclass[:2]
x2, y2 = loadings.age[:2]

In [None]:
plt.arrow(0,0,x1,y1, head_width=0.02)
plt.arrow(0,0,x2,y2, head_width=0.02)
plt.text(x1 + 0.05, y1, loadings.columns[0])
plt.text(x2 - 0.05, y2, loadings.columns[1])
plt.tight_layout()

We expect these to be negatively correlated since they point in approximately opposite directions.

In [None]:
# Correlation matrix restricted to the first two columns of df
# (presumably pclass and age, matching the arrows above - confirm via df.columns).
df.corr().iloc[:2, :2]

## Other dimension reduction methods

### PCA does not preserve local structure

## Limitations of PCA

We will project a 2-d data set onto 1-d to see one limitation of PCA. This provides motivation for learning non-linear methods of dimension reduction.

In [None]:
# Three Gaussian blobs with identity covariance, 100 points each.
centers = [[-3, 3], [3, 3], [0, -10]]
n_per_cluster = 100
x1, x2, x3 = [
    np.random.multivariate_normal(center, np.eye(2), n_per_cluster)
    for center in centers
]
xs = np.concatenate([x1, x2, x3], axis=0)
# Centre each column, then divide by the single standard deviation taken
# over all entries of the stacked (uncentred) array.
xs = (xs - xs.mean(axis=0)) / xs.std()
# Cluster label (0., 1., 2.) for every row of xs, in stacking order.
zs = np.repeat([0.0, 1.0, 2.0], n_per_cluster)

In [None]:
plt.scatter(xs[:, 0], xs[:, 1], c=zs, cmap='Set1')
plt.axis('equal')
pass

In [None]:
pca = PCA(n_components=1)

In [None]:
ys = pca.fit_transform(xs)

In [None]:
plt.scatter(ys[:, 0], np.random.uniform(-1, 1, len(ys)), c=zs, cmap='Set1')
plt.axhline(0, c='red')
pass

### T-SNE preserves locality

The t-SNE algorithm was designed to preserve local distances between points in the original space, as we saw in the example above. This means that t-SNE is particularly effective at preserving **clusters** in the original space. The full t-SNE algorithm is quite complex, so we just sketch the ideas here.

For more details, see the original [series of papers](https://lvdmaaten.github.io/tsne/) and this Python [tutorial](https://www.oreilly.com/learning/an-illustrated-introduction-to-the-t-sne-algorithm). The algorithm is also clearly laid out in the fairly comprehensive [tutorial](https://www.analyticsvidhya.com/blog/2017/01/t-sne-implementation-r-python/).

### Outline of t-SNE

t-SNE is similar in outline to MDS, with two main differences - "distances" are based on probabilistic concepts and depend on the local neighborhood of the point.

#### Original space

- Find the conditional similarity between points in the original space based on a Gaussian kernel

$$
p_{j \mid i} = \frac{f(\vert x_i - x_j \vert)}{\sum_{k \ne i} {f(\vert x_i - x_k \vert)}}
$$

where

$$
f(z) = {e^\frac{{-z^2}}{2\sigma_i^2}}
$$

- Symmetrize the conditional similarity (this is necessary because each kernel has its own variance)

$$
p_{ij} = \frac{p_{i \mid j} + p_{j \mid i}}{2}
$$

- This gives a similarity matrix $p_{ij}$ that is fixed

Notes

- In t-SNE, the variance of the Gaussian kernel depends on the point $x_i$. Intuitively, we want the variance to be small if $x_i$ is in a locally dense region, and to be large if $x_i$ is in a locally sparse region. This is done by an iterative algorithm that depends on a user-defined variable called **perplexity**. Roughly, perplexity determines the number of meaningful neighbors each point should have.

#### Map space

- Find the similarity between points in the map space based on a Cauchy kernel, normalized over all pairs of points

$$
q_{ij} = \frac{g(\vert y_i - y_j \vert)}{\sum_{k \ne l} {g(\vert y_k - y_l \vert)}}
$$

where

$$
g(z) = \frac{1}{1+z^2}
$$

- This gives a similarity matrix $q_{ij}$ that depends on the points in the map space that we can vary

#### Optimization

- Minimize the Kullback-Leibler divergence between $p_{ij}$ and $q_{ij}$

$$
\text{KL}(P \mid\mid  Q) = \sum p_{ij} \log{\frac{p_{ij}}{q_{ij}}}
$$

#### Normal and Cauchy distributions

The Cauchy has much fatter tails than the normal distribution.  This means that two points that are widely separated in the original space would be pushed much further apart in the map space.

In [None]:
! python3 -m pip install --quiet fitsne

In [None]:
import fitsne

In [None]:
# Embed the toy data with FIt-SNE using its default settings.
ys = fitsne.FItSNE(xs)

In [None]:
# NOTE(review): only the first column of the embedding is plotted, against
# random vertical jitter, mirroring the 1-d PCA plot. If FItSNE returns more
# than one dimension by default, the remaining columns are ignored here -
# confirm whether a 1-d embedding was intended.
plt.scatter(ys[:, 0], np.random.uniform(-1, 1, len(ys)), c=zs, cmap='Set1')
plt.axhline(0, c='red')
pass

### Illustrating with MNIST digits

In [None]:
from sklearn.datasets import fetch_openml

# Request plain NumPy arrays: with a DataFrame return value X[0] would be a
# column lookup rather than the first image, and the later
# X.copy(order='C') call is an ndarray method.
X, y = fetch_openml('mnist_784', version=1, return_X_y=True, as_frame=False)
# Each row of X is one flattened 28x28 image; show the first with its label.
plt.imshow(X[0].reshape((28, 28)), cmap='binary')
plt.title(f'label = {y[0]}');

In [None]:
%%capture
sc = plt.scatter(np.arange(10), np.arange(10), c=np.arange(10), cmap='tab10')

In [None]:
%%time
# Project the MNIST images onto the first two principal components.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)

In [None]:
# The labels in y are not integers at this point, so comparing them with the
# loop index would select nothing. Convert once and use the integer labels
# for both the colours and the per-digit masks.
labels = np.asarray(y).astype(int)
plt.scatter(X_pca[:, 0], X_pca[:, 1], s=1,
            c=labels, cmap='tab10')
plt.title('Images in PCA space colored by label')
for i in range(10):
    idx = labels == i
    # Write the digit at the centroid of its points.
    μ = X_pca[idx].mean(axis=0)
    plt.text(*μ, str(i), va='center', ha='center',
             bbox=dict(facecolor='yellow', alpha=0.5))
# Legend entries come from the dummy 10-point scatter stored in `sc`.
plt.legend(*sc.legend_elements(),
           bbox_to_anchor=(1, 1),
           fontsize=20,
           markerscale=2);

### t-SNE preserves local structure

In [None]:
X.shape

In [None]:
import fitsne

In [None]:
%%time
# Make a C-ordered (row-major) copy of X before handing it to FItSNE
# (presumably the layout the library expects - confirm).
X = X.copy(order='C')
X_tsne = fitsne.FItSNE(X)

In [None]:
# The labels in y are not integers at this point, so comparing them with the
# loop index would select nothing. Convert once and use the integer labels
# for both the colours and the per-digit masks.
labels = np.asarray(y).astype(int)
plt.scatter(X_tsne[:, 0], X_tsne[:, 1], s=1,
            c=labels, cmap='tab10')
plt.title('Images in t-SNE space colored by label')
for i in range(10):
    idx = labels == i
    # Write the digit at the centroid of its points.
    μ = X_tsne[idx].mean(axis=0)
    plt.text(*μ, str(i), va='center', ha='center',
             bbox=dict(facecolor='yellow', alpha=0.5))
# Legend entries come from the dummy 10-point scatter stored in `sc`.
plt.legend(*sc.legend_elements(),
           bbox_to_anchor=(1, 1),
           fontsize=20,
           markerscale=2);

### UMAP preserves local and (maybe) global structure

Normally I would refer you to the [original paper](https://arxiv.org/pdf/1802.03426). But the original paper is hard to read unless you have graduate training in pure mathematics, so visit this [tutorial](https://pair-code.github.io/understanding-umap/) instead.

In [None]:
import umap

In [None]:
%%time
# Embed the MNIST images with UMAP using its default settings.
X_umap = umap.UMAP().fit_transform(X)

In [None]:
# Convert the labels to integers so they can be compared with digit values.
y = y.astype('int')

In [None]:
# Work from an explicit integer copy of the labels so this cell gives the
# same result whether or not y has already been converted in place.
labels = np.asarray(y).astype(int)
plt.scatter(X_umap[:, 0], X_umap[:, 1], s=1,
            c=labels, cmap='tab10')
plt.title('Images in UMAP space colored by label')
for i in range(10):
    idx = labels == i
    # Write the digit at the centroid of its points.
    μ = X_umap[idx].mean(axis=0)
    plt.text(*μ, str(i), va='center', ha='center',
             bbox=dict(facecolor='yellow', alpha=0.5))
# Legend entries come from the dummy 10-point scatter stored in `sc`.
plt.legend(*sc.legend_elements(),
           bbox_to_anchor=(1, 1),
           fontsize=20,
           markerscale=2);