# 5.4 Unsupervised Learning Tutorial

This notebook covers key unsupervised learning techniques including:
- Principal Component Analysis (PCA)
- t-SNE
- K-means Clustering
- Hierarchical Clustering
- DBSCAN

In [None]:
# Import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.datasets import make_blobs, make_moons
from scipy.cluster.hierarchy import dendrogram, linkage

# Seed NumPy's legacy global random generator so that the np.random.* draws
# in this notebook repeat after Restart & Run All. The scikit-learn calls
# below receive their own random_state argument and do not rely on this seed.
np.random.seed(seed=42)

## 1. Principal Component Analysis (PCA)

Let's explore dimensionality reduction using PCA.

In [None]:
# Generate high-dimensional data that really lies close to a low-dimensional
# subspace: n_informative latent factors are mixed into n_features observed
# features, then a little isotropic noise is added.
n_samples = 1000
n_features = 50
n_informative = 5
noise_scale = 0.1  # standard deviation of the noise added to every feature

# Each row of `important_features` holds one observed feature's loadings on
# the latent factors. Projecting the factors *up* keeps X at n_features
# columns; multiplying an (n_samples, n_features) matrix by an
# (n_features, n_informative) matrix would instead shrink X to n_informative
# columns and leave PCA nothing to reduce.
latent_factors = np.random.randn(n_samples, n_informative)
important_features = np.random.randn(n_features, n_informative)
X = latent_factors @ important_features.T
X += noise_scale * np.random.randn(n_samples, n_features)
assert X.shape == (n_samples, n_features)

# Scale the data so every feature contributes on the same footing
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply PCA, keeping every component so the full variance spectrum is visible
pca = PCA()
X_pca = pca.fit_transform(X_scaled)

# Plot cumulative explained variance against the number of components kept.
# The x values start at 1 so that they match the axis label (a bare
# plt.plot(y) would place the first component at x=0).
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
component_counts = np.arange(1, len(cumulative_variance) + 1)

plt.figure(figsize=(10, 6))
plt.plot(component_counts, cumulative_variance, marker='o', markersize=3)
plt.axvline(n_informative, color='gray', linestyle='--',
            label=f'n_informative = {n_informative}')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance Ratio')
plt.title('Explained Variance Ratio vs Number of Components')
plt.legend()
plt.grid(True)
plt.show()

# Plot first two principal components
plt.figure(figsize=(10, 6))
plt.scatter(X_pca[:, 0], X_pca[:, 1], alpha=0.5)
plt.xlabel('First Principal Component')
plt.ylabel('Second Principal Component')
plt.title('Data Projected onto First Two Principal Components')
plt.show()

# Print variance explained by first few components
print("Variance explained by first 5 components:")
for i, var in enumerate(pca.explained_variance_ratio_[:5], 1):
    print(f"Component {i}: {var:.4f}")

## 2. t-SNE

Let's visualize high-dimensional data using t-SNE.

In [None]:
# Synthetic 10-dimensional dataset with five labelled blobs; the labels are
# used only to colour the points in the plots below.
X, y = make_blobs(n_samples=500, n_features=10, centers=5, random_state=42)

# Non-linear 2-D embedding with t-SNE
tsne = TSNE(n_components=2, random_state=42)
X_tsne = tsne.fit_transform(X)

# Linear 2-D projection with PCA, for comparison
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)

# Draw one figure per embedding, t-SNE first and PCA second
embeddings = (
    (X_tsne, 't-SNE Visualization of High-Dimensional Data'),
    (X_pca, 'PCA Visualization of High-Dimensional Data'),
)
for embedded, figure_title in embeddings:
    fig, ax = plt.subplots(figsize=(10, 6))
    scatter = ax.scatter(embedded[:, 0], embedded[:, 1], c=y, cmap='viridis')
    fig.colorbar(scatter, ax=ax)
    ax.set_title(figure_title)
    plt.show()

## 3. K-means Clustering

Let's explore K-means clustering and its properties.

In [None]:
# Generate clustered data (four well-separated blobs with known labels)
X, true_labels = make_blobs(n_samples=300, centers=4, cluster_std=0.60, random_state=42)

# Find optimal number of clusters using elbow method.
# n_init is set explicitly: scikit-learn changed its default between
# releases, so relying on the default makes inertia and labels depend on the
# installed version (and emits a FutureWarning on some releases).
inertias = []
K = range(1, 10)

for k in K:
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(X)
    inertias.append(kmeans.inertia_)

plt.figure(figsize=(10, 6))
plt.plot(K, inertias, 'bx-')
plt.xlabel('k')
plt.ylabel('Inertia')
plt.title('Elbow Method For Optimal k')
plt.show()

# Apply K-means with optimal k (the value read off the elbow plot above)
optimal_k = 4
kmeans = KMeans(n_clusters=optimal_k, n_init=10, random_state=42)
cluster_labels = kmeans.fit_predict(X)

# Plot clustering results next to the ground truth. Cluster ids are
# arbitrary, so matching groups may be drawn in different colours in the
# two panels; compare the grouping, not the colours.
plt.figure(figsize=(12, 5))

plt.subplot(121)
plt.scatter(X[:, 0], X[:, 1], c=true_labels, cmap='viridis')
plt.title('True Labels')

plt.subplot(122)
plt.scatter(X[:, 0], X[:, 1], c=cluster_labels, cmap='viridis')
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1],
            marker='x', s=200, linewidths=3, color='r', label='Centroids')
plt.title('K-means Clustering')
plt.legend()

plt.show()

## 4. Hierarchical Clustering

Let's implement hierarchical clustering and visualize the dendrogram.

In [None]:
# Small blob dataset; kept to 50 points so the dendrogram stays legible
X, _ = make_blobs(n_samples=50, centers=3, random_state=42)

# Merge history computed by SciPy with Ward linkage
linkage_matrix = linkage(X, method='ward')

# Draw the merge tree
dendro_fig, dendro_ax = plt.subplots(figsize=(10, 7))
dendrogram(linkage_matrix, ax=dendro_ax)
dendro_ax.set(
    title='Hierarchical Clustering Dendrogram',
    xlabel='Sample Index',
    ylabel='Distance',
)
plt.show()

# Flat clustering into a fixed number of groups with scikit-learn
n_clusters = 3
hc = AgglomerativeClustering(n_clusters=n_clusters)
cluster_labels = hc.fit(X).labels_

# Scatter the points coloured by their assigned cluster
result_fig, result_ax = plt.subplots(figsize=(10, 6))
result_ax.scatter(X[:, 0], X[:, 1], c=cluster_labels, cmap='viridis')
result_ax.set_title('Hierarchical Clustering Results')
plt.show()

## 5. DBSCAN

Let's explore density-based clustering using DBSCAN.

In [None]:
# Generate non-spherical clusters (two interleaving half-moons)
X, _ = make_moons(n_samples=200, noise=0.05, random_state=42)

# Apply DBSCAN over a grid of parameters: one row per eps value,
# one column per min_samples value.
eps_values = [0.1, 0.2, 0.3]
min_samples_values = [5, 10, 15]

fig, axes = plt.subplots(len(eps_values), len(min_samples_values), figsize=(15, 15))

for i, eps in enumerate(eps_values):
    for j, min_samples in enumerate(min_samples_values):
        # Fit DBSCAN
        dbscan = DBSCAN(eps=eps, min_samples=min_samples)
        cluster_labels = dbscan.fit_predict(X)

        # Plot results. DBSCAN labels noise points -1, so they are drawn in
        # the lowest colour of the colormap.
        ax = axes[i, j]
        ax.scatter(X[:, 0], X[:, 1], c=cluster_labels, cmap='viridis')
        ax.set_title(f'eps={eps}, min_samples={min_samples}')

plt.tight_layout()
plt.show()

# Compare with K-means.
# n_init is set explicitly: scikit-learn changed its default between
# releases, so relying on the default makes the labels depend on the
# installed version (and emits a FutureWarning on some releases).
kmeans = KMeans(n_clusters=2, n_init=10, random_state=42)
kmeans_labels = kmeans.fit_predict(X)

# Best DBSCAN parameters (one of the grid combinations plotted above)
best_dbscan = DBSCAN(eps=0.2, min_samples=5)
dbscan_labels = best_dbscan.fit_predict(X)

plt.figure(figsize=(12, 5))

plt.subplot(121)
plt.scatter(X[:, 0], X[:, 1], c=kmeans_labels, cmap='viridis')
plt.title('K-means Clustering')

plt.subplot(122)
plt.scatter(X[:, 0], X[:, 1], c=dbscan_labels, cmap='viridis')
plt.title('DBSCAN Clustering')

plt.show()

## Practice Exercises

1. Apply PCA to a real-world dataset and analyze the results.

2. Compare t-SNE with UMAP for dimensionality reduction.

3. Implement the silhouette score to evaluate clustering quality.

4. Try different linkage methods in hierarchical clustering.

5. Use DBSCAN to detect outliers in a dataset.