# Unsupervised Learning

## Import Packages and Load Data

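The Clustering section below uses the UCI Glass Identification dataset (`glass.data`), available at: https://archive.ics.uci.edu/ml/datasets/Glass+Identification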

In [None]:
# import
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn import datasets, metrics
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans, DBSCAN
from sklearn.mixture import GaussianMixture
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster

## Unsupervised vs. Supervised Learning 

In [None]:
# load the wine dataset as pandas objects
data = datasets.load_wine(as_frame=True)
# "frame" is the full table (it includes the "target" column used later);
# "target" is the label column on its own
df, labels = data["frame"], data["target"]

In [None]:
# peek at five randomly chosen rows of the table
df.sample(n=5)

In [None]:
# scatter two of the features against each other, coloured by the true labels
fig, ax = plt.subplots(figsize=(8, 5))
sns.scatterplot(
    data=df,
    x="flavanoids",
    y="proline",
    hue=labels,
    palette="tab10",
    ax=ax,
)
plt.show()

In [None]:
# fraction of the samples that is held out as the test set
test_size = 0.50  # try changing this value
# separate the feature columns from the label column
X = df.drop(columns="target")
y = df["target"]
# split data (fixed random_state so the split is the same on every run)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size,
    random_state=42,
)

In [None]:
# hyperparameters of the tree
max_depth = 5  # try changing this value
min_samples_leaf = 2  # try changing this value
# build the decision tree and fit it to the training split in one step
dt = DecisionTreeClassifier(
    max_depth=max_depth, min_samples_leaf=min_samples_leaf
).fit(X_train, y_train)
# predict the held-out samples and report the fraction predicted correctly
y_pred = dt.predict(X_test)
metrics.accuracy_score(y_test, y_pred)

In [None]:
# create the cluster model
n_clusters = 3  # try changing this value
# fix the seed so the clustering (and the score below) is repeatable,
# matching the other KMeans cells in this notebook
km = KMeans(n_clusters=n_clusters, random_state=42).fit(X)
# cluster the data
y_pred = km.predict(X)
# compare the clusters to the actual labels
# cluster ids are arbitrary (cluster 0 need not correspond to class 0), so a
# plain accuracy score would depend on how the ids happen to be numbered;
# the adjusted Rand index is invariant to that numbering
metrics.adjusted_rand_score(y, y_pred)

In [None]:
# same scatter as before, but coloured by the cluster assignments
# look at how the clusters vary in the data -- what does that tell us?
fig, ax = plt.subplots(figsize=(8, 5))
sns.scatterplot(
    data=df,
    x="flavanoids",
    y="proline",
    hue=y_pred,
    palette="tab10",
    ax=ax,
)
plt.show()

In [None]:
# plot proline against each of the other features, coloured by cluster
# again, what does this tell us about the data?
df_clusters = X.copy()
df_clusters["cluster"] = y_pred
sns.pairplot(
    data=df_clusters,
    y_vars=["proline"],
    hue="cluster",
    palette="tab10",
)

## Dimensionality Reduction

In [None]:
# PCA
# number of dimensions to reduce the features to
n_components = 2  # try changing this value
# fit the projection on X, then project X onto the fitted components
pca = PCA(n_components=n_components).fit(X)
pca_emb = pca.transform(X)
# the explained variance ratios summed over the kept components tell us how
# much of the variation in the data the PCA dimensions account for
pca.explained_variance_ratio_.sum()

In [None]:
# list the column names of the table (candidates for colouring the plots)
df.columns

In [None]:
# plot the PCA embedding, coloured by one of the original features
# try different features for the colormap:
# 'alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash', 'magnesium',
# 'total_phenols', 'flavanoids', 'nonflavanoid_phenols',
# 'proanthocyanins', 'color_intensity', 'hue',
# 'od280/od315_of_diluted_wines', 'proline'
feature = "ash"  # change this value
fig, ax = plt.subplots(figsize=(13, 8))
sns.scatterplot(
    x=pca_emb[:, 0],
    y=pca_emb[:, 1],
    hue=df[feature],
    palette="viridis",
    ax=ax,
)

In [None]:
# PCA also lets us look at how much each feature contributes to each component
# try colouring the plots with the features that have the greatest magnitude
# one row per feature, one column per principal component
loadings = pd.DataFrame(pca.components_, columns=X.columns).T
sns.heatmap(data=loadings, vmin=-1, vmax=1, cmap="coolwarm")

In [None]:
# n_components is the number of dimensions to reduce to
n_components = 2  # change this value
# perplexity controls how many nearby data points the distance measure is
# based on -- how does changing it affect the plot?
perplexity = 15  # change this value
reducer = TSNE(
    n_components=n_components,
    perplexity=perplexity,
    random_state=42,
    learning_rate="auto",
)
# note: `tsne` holds the embedded coordinates, not the fitted model
tsne = reducer.fit_transform(X)
fig, ax = plt.subplots(figsize=(13, 8))
sns.scatterplot(x=tsne[:, 0], y=tsne[:, 1], ax=ax)

In [None]:
# plot the t-SNE embedding, coloured by one of the original features
# try different features for the colormap:
# 'alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash', 'magnesium',
# 'total_phenols', 'flavanoids', 'nonflavanoid_phenols',
# 'proanthocyanins', 'color_intensity', 'hue',
# 'od280/od315_of_diluted_wines', 'proline'
feature = "ash"  # change this value
fig, ax = plt.subplots(figsize=(13, 8))
sns.scatterplot(
    x=tsne[:, 0],
    y=tsne[:, 1],
    hue=df[feature],
    palette="viridis",
    ax=ax,
)

## Clustering

In [None]:
# load the new dataset
# make sure this file path points to the glass.data dataset
file_path = "glass.data"

# don't change any of the code below; it loads the file and adds column names
cols = [
    "Id",
    "refractive_index",
    "Sodium",
    "Magnesium",
    "Aluminum",
    "Silicon",
    "Potassium",
    "Calcium",
    "Barium",
    "Iron",
    "glass",
]
# the row id and the glass-type label are dropped, leaving only measurements
non_features = ["Id", "glass"]
df_glass = pd.read_csv(file_path, names=cols).drop(columns=non_features)

# every remaining column, in file order, is a feature
features = [name for name in cols if name not in non_features]
df_glass[features]

In [None]:
# Fit k-means for a range of cluster counts and record the inertia and the
# silhouette score of each fit.
# From the plots, can you determine how many clusters are in the data?

X = df_glass[features]
k_range = range(2, 11)
sil_scores, inertias = [], []

for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=42).fit(X)
    y_km = kmeans.labels_
    inertias.append(kmeans.inertia_)
    sil_scores.append(metrics.silhouette_score(X, y_km))

# inertia on the top panel, silhouette score on the bottom panel
fig, axs = plt.subplots(2, 1, figsize=(13, 8))
curves = [(inertias, "Inertia"), (sil_scores, "Silhouette")]
for ax, (values, name) in zip(axs, curves):
    sns.lineplot(ax=ax, x=np.array(k_range), y=values, label=name)
plt.show()

In [None]:
# choose the number of clusters you think is ideal based on the plots above
# (3 is only a placeholder so the cell runs under "Run All")
k = 3  # change this value
# select the glass features *before* fitting, so the clustering and the PCA
# projection below always use the same data regardless of what X held before
X = df_glass[features]
kmeans = KMeans(n_clusters=k, random_state=42).fit(X)
y_km = kmeans.labels_

# this will use PCA to plot the data, don't change the code below
# how does the plot help your assessment of the number of clusters?
pca = PCA(n_components=2)
pca.fit(X)
pca_emb  = pca.transform(X)

# 2-D PCA projection of the glass data, coloured by k-means cluster
plt.figure(figsize=(13, 8))
sns.scatterplot(x=pca_emb[:,0], y=pca_emb[:,1],
                hue=y_km,
                palette="viridis")

## Difficulty and Subjectivity

In [None]:
# we are switching back to the wine dataset
X = df.drop(columns="target")
y = df["target"]
# and re-running the sweep over the number of k-means clusters to see which
# cluster count looks best for this dataset
k_range = range(2, 11)
sil_scores, inertias = [], []
for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=42).fit(X)
    y_km = kmeans.labels_
    inertias.append(kmeans.inertia_)
    sil_scores.append(metrics.silhouette_score(X, y_km))

# inertia on the top panel, silhouette score on the bottom panel
fig, axs = plt.subplots(2, 1, figsize=(13, 8))
curves = [(inertias, "Inertia"), (sil_scores, "Silhouette")]
for ax, (values, name) in zip(axs, curves):
    sns.lineplot(ax=ax, x=np.array(k_range), y=values, label=name)
plt.show()

In [None]:
# select the number of clusters that seems ideal based on the plots above
# (3 is only a placeholder so the cell runs under "Run All")
k = 3  # change this value
kmeans = KMeans(n_clusters=k, random_state=42).fit(X)
y_km = kmeans.labels_

# project the same data to 2-D with PCA so the clusters can be plotted
pca = PCA(n_components=2)
pca.fit(X)
pca_emb  = pca.transform(X)

# PCA projection coloured by k-means cluster
plt.figure(figsize=(13, 8))
sns.scatterplot(x=pca_emb[:,0], y=pca_emb[:,1],
                hue=y_km,
                palette="tab10")

In [None]:
# build the hierarchical clustering (Ward linkage on euclidean distances)
# and draw its dendrogram
# how many clusters seem ideal, and how does it compare to k-means?
Z = linkage(X, method="ward", metric="euclidean")
fig = plt.figure(figsize=(21, 8))
dn = dendrogram(Z)

In [None]:
# choose a distance based on the dendrogram above to cluster the data and plot
# the results. How do the cluster results look? How do they compare to k-means?
# The default below is 70% of the largest merge distance in Z (column 2 of the
# linkage matrix), which is the threshold scipy's dendrogram uses by default
# to colour its branches -- so the clusters match the colours drawn above.
distance = 0.7 * Z[:, 2].max()  # change this value
# cut the tree: points joined below `distance` end up in the same cluster
y_hier = fcluster(Z, distance, criterion='distance')
plt.figure(figsize=(13, 8))
sns.scatterplot(x=pca_emb[:,0], y=pca_emb[:,1], hue=y_hier, palette="tab10")

In [None]:
# for completeness, plot the same PCA projection coloured by the actual labels
# of the dataset. How do these compare to the clusters? Does anything surprise
# you?
fig, ax = plt.subplots(figsize=(13, 8))
sns.scatterplot(
    x=pca_emb[:, 0],
    y=pca_emb[:, 1],
    hue=df["target"],
    palette="tab10",
    ax=ax,
)

## Example Application

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_digits

In [None]:
# load scikit-learn's bundled handwritten digits dataset
# (small 8x8 images -- note this is not the MNIST dataset itself)
digits = load_digits()
# dataset is 1797 images that are 8x8 pixels, or 64 features
digits.data.shape

In [None]:
# let's take a look at the first sample: print its label, then show its image
idx = 0
first_label = digits.target[idx]
first_image = digits.images[idx]
print(first_label)
plt.imshow(first_image)
plt.show()

In [None]:
# hold out a third of the digits for testing (seeded so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    digits.data,
    digits.target,
    test_size=0.33,
    random_state=42,
)

In [None]:
# baseline: a decision tree trained on the raw pixel features,
# scored on the held-out test split
model = DecisionTreeClassifier().fit(X_train, y_train)
model.score(X_test, y_test)

In [None]:
# now let's extract some meaningful features from the data and see how a
# model does when trained only on those
# a floating point value for n_components makes PCA pick however many
# components are needed to explain that fraction of the variance
n_components = 0.80  # change this value
# fit on the training split only, then project the training split
pca = PCA(n_components=n_components).fit(X_train)
pca_emb = pca.transform(X_train)
print(pca.explained_variance_ratio_)

In [None]:
# try running this cell multiple times; how do the results compare to the
# baseline? Better? Worse? The same?
# the test split is projected with the PCA that was fitted on the training data
X_test_emb = pca.transform(X_test)
model = DecisionTreeClassifier().fit(pca_emb, y_train)
model.score(X_test_emb, y_test)

In [None]:
# view the first principal component as an 8x8 image
first_component = pca.components_[0].reshape(8, 8)
plt.imshow(first_component)
plt.show()