# Unsupervised Learning

![image.png](https://miro.medium.com/max/1000/0*qJ3y2frzPU8hbt9L.png)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

### Principal Component Analysis
##### Dimensionality Reduction

In [None]:
# Build two noisy clusters that both follow the line y = 0.25*x + 2,
# one over x in [0, 1.5] ("red") and one over x in [3, 5] ("blue").
x1 = np.linspace(0, 1.5, 15)
y1 = .25*x1 + 2 + np.random.normal(0, .2, len(x1))
x2 = np.linspace(3, 5, 15)
# Size the noise from x2 itself; reusing len(x1) only worked because both
# clusters happen to contain the same number of points.
y2 = .25*x2 + 2 + np.random.normal(0, .2, len(x2))
data = pd.DataFrame({
    "x": np.concatenate([x1, x2]),
    "y": np.concatenate([y1, y2]),
    # Label counts follow the cluster sizes instead of a hard-coded 15.
    "label": ["red"]*len(x1) + ["blue"]*len(x2),
})
data.head()

In [None]:
# Plot the raw points, coloured by the value in the "label" column.
plt.scatter(x=data["x"], y=data["y"], c=data["label"]);

In [None]:
# Compute the centroid (mean x, mean y) and draw it on top of the points.
cx = data["x"].mean()
cy = data["y"].mean()
plt.grid()
plt.scatter(x=data["x"], y=data["y"])
plt.scatter(x=cx, y=cy, s=100);

In [None]:
# Subtract the centroid so the cloud is centred on the origin,
# then mark the origin with a larger dot.
plt.grid()
plt.scatter(x=data["x"] - cx, y=data["y"] - cy)
plt.scatter(x=0, y=0, s=100);

In [None]:
# Overlay the best-fit direction (PC1) on the centred data.
# The slope 0.25 is written in by hand here, not estimated from the points.
plt.grid()
plt.scatter(x=data["x"] - cx, y=data["y"] - cy)
x = np.linspace(start=(data["x"] - cx).min(), stop=(data["x"] - cx).max())
y = x * .25
plt.plot(x, y, color="red");

In [None]:
# Add the line orthogonal to PC1 (PC2). Its slope, -4, is the negative
# reciprocal of PC1's slope 0.25. A square figure with equal axis limits
# is used for the plot.
plt.figure(figsize=(8, 8))
plt.xlim(-2.5, 2.5)
plt.ylim(-2.5, 2.5)
plt.scatter(x=data["x"] - cx, y=data["y"] - cy)

# PC1: spans the x-range of the centred data.
x = np.linspace(start=(data["x"] - cx).min(), stop=(data["x"] - cx).max())
y = x * .25
plt.plot(x, y, color="red")

# PC2: two end points are enough to draw a straight line.
x = np.linspace(-2, 2, num=2)
y = x * -4
plt.plot(x, y, color="red")
plt.grid()

In [None]:
# Fit a two-component PCA on the coordinate columns and project the points.
pca = PCA(n_components=2)
data_t = pca.fit_transform(data.loc[:, ["x", "y"]])

In [None]:
# Scatter the transformed points: first column on x, second column on y.
plt.grid()
plt.scatter(x=data_t[:, 0], y=data_t[:, 1]);

In [None]:
# Display the share of the total variance attributed to each component.
pca.explained_variance_ratio_

In [None]:
# Bar chart of the variance share of the two components.
plt.bar(x=[1, 2], height=pca.explained_variance_ratio_, tick_label=["PC1", "PC2"]);

- The maximum number of principal components equals the number of variables or the number of data points, whichever is lower.
- Each PC explains part of the variance of the original data.

In [None]:
# Demonstrate the upper bound on the number of components: only two
# variables ("x" and "y") are passed in, so asking for three components
# makes scikit-learn raise a ValueError. The error is caught and printed so
# the limit is still shown but "Run All" can continue past this cell.
pca = PCA(3)
try:
    data_t = pca.fit_transform(data[["x","y"]])
except ValueError as err:
    print(f"PCA(3) rejected: {err}")

### More dimensions

In [None]:
from sklearn.datasets import load_iris

In [None]:
# Load the iris dataset once (the original called load_iris() three times)
# and pull out the pieces used below: feature matrix, integer targets,
# feature names and class names.
iris = load_iris()
X, y = iris["data"], iris["target"]
cols = iris["feature_names"]
labels = iris["target_names"]

In [None]:
# Translate each integer target into its class name, then build the frame
# with the measurements first and the "species" column appended last.
species_names = [labels[code] for code in y]
data = pd.DataFrame(data=X, columns=cols)
data["species"] = species_names

In [None]:
# Preview the first rows of the frame.
data.head()

In [None]:
# Fit a four-component PCA on every column except the last one.
pca = PCA(n_components=4)
data_t = pca.fit_transform(data[data.columns[:-1]])

In [None]:
# Keep the per-component variance shares in a variable and display them.
var_ratio = pca.explained_variance_ratio_
var_ratio

In [None]:
# One bar per component. Bar positions and tick labels are both derived
# from var_ratio, so they stay in sync if the number of components changes
# (the positions were previously a hard-coded range(4)).
n_pcs = len(var_ratio)
plt.bar(range(n_pcs), var_ratio, tick_label=[f"PC{i+1}" for i in range(n_pcs)]);

In [None]:
import seaborn as sns

# First two transformed columns, coloured by species.
# x and y are passed by keyword: seaborn >= 0.12 rejects them as positional
# arguments (0.11 only warned), so the positional form fails on current
# releases.
plt.grid()
sns.scatterplot(x=data_t[:,0], y=data_t[:,1], hue=data["species"]);

### t-SNE
##### t-distributed stochastic neighbor embedding 

In [None]:
# Two-dimensional t-SNE.
# The iteration count is left at its default of 1000, which is what the
# original n_iter=1000 asked for. The keyword itself was renamed to max_iter
# in scikit-learn 1.5 and the old name was later removed, so passing neither
# works on old and new releases alike.
tsne = TSNE(n_components=2, perplexity=30, learning_rate=200)

In [None]:
# Embed every column except the last one with t-SNE.
data_t = tsne.fit_transform(data[data.columns[:-1]])

In [None]:
# Embedded points coloured by species.
# x and y are passed by keyword: seaborn >= 0.12 rejects them as positional
# arguments (0.11 only warned).
plt.grid()
sns.scatterplot(x=data_t[:,0], y=data_t[:,1], hue=data["species"]);

In [None]:
# Three tight Gaussian blobs (std 0.1) of l points each, centred at
# (0, 0), (0, 4) and (4, 4), with one colour name per point.
l = 15
x1 = np.random.normal(loc=0, scale=.1, size=l)
y1 = np.random.normal(loc=0, scale=.1, size=l)
x2 = np.random.normal(loc=0, scale=.1, size=l)
y2 = np.random.normal(loc=4, scale=.1, size=l)
x3 = np.random.normal(loc=4, scale=.1, size=l)
y3 = np.random.normal(loc=4, scale=.1, size=l)
x = np.concatenate((x1, x2, x3))
y = np.concatenate((y1, y2, y3))
labels = l*["b"] + l*["r"] + l*["pink"]

In [None]:
# Plot the points in 2-D, coloured per point.
plt.scatter(x=x, y=y, c=labels);

In [None]:
# Keep only the x coordinate: every point is drawn at the same height, 1.
plt.scatter(x, [1 for _ in labels], c=labels)

In [None]:
# Keep only the y coordinate: every point is drawn at the same abscissa, 1.
plt.scatter([1 for _ in labels], y, c=labels)

In [None]:
# Turn x and y into column vectors and join them side by side,
# giving one row per point with columns (x, y).
data = np.concatenate([x[:, None], y[:, None]], axis=1)

In [None]:
# One-dimensional t-SNE.
# The iteration count is left at its default of 1000, which is what the
# original n_iter=1000 asked for. The keyword itself was renamed to max_iter
# in scikit-learn 1.5 and the old name was later removed, so passing neither
# works on old and new releases alike.
tsne = TSNE(n_components=1, perplexity=30, learning_rate=100)

In [None]:
# Fit t-SNE on `data` and keep the embedded coordinates.
x_t = tsne.fit_transform(data)

In [None]:
# Draw the embedding along the x axis, every point at the same height, 1.
plt.scatter(x_t, [1 for _ in labels], c=labels)