# In [None]:
#1.  Loading and Preprocessing

from sklearn.datasets import load_iris
import pandas as pd

# Load the dataset
# Load the Iris dataset and wrap its feature matrix in a DataFrame with one
# named column per feature.
iris = load_iris()
X = pd.DataFrame(iris.data, columns=iris.feature_names)

# Display the first few rows. A bare `X.head()` is only rendered when it is the
# last expression of a notebook cell; print() shows the rows wherever this runs.
print(X.head())

#--------------------------------------------------------------------

#2 A. KMeans Clustering

# KMeans is an unsupervised learning algorithm that partitions the data into K clusters. It works as follows:
# 1. Initialize K centroids randomly.
# 2. Assign each data point to the nearest centroid (based on Euclidean distance).
# 3. Recalculate each centroid as the mean of the points assigned to it.
# 4. Repeat steps 2-3 until convergence (no change in clusters or centroids).


# Why is it suitable for the Iris dataset?
# The Iris dataset has continuous numerical features.
# It’s relatively small and balanced.
# Natural clusters exist (3 species), which KMeans can discover.

from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Apply KMeans
# Apply KMeans with three clusters.
# n_init is pinned explicitly: its default changed from 10 to 'auto' in
# scikit-learn 1.4 (and 1.2-1.3 emit a FutureWarning when it is left unset),
# so relying on the default makes the result depend on the installed version.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
kmeans_labels = kmeans.fit_predict(X)

# Reduce dimensions for visualization: project the features onto the first two
# principal components. Clustering above was fitted on X, not on X_pca.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)

# Plot clusters: one point per sample in PCA space, colored by KMeans label
plt.figure(figsize=(8, 6))
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=kmeans_labels, cmap='viridis', s=50)
plt.title("KMeans Clustering on Iris Dataset (PCA reduced)")
plt.xlabel("PCA Component 1")
plt.ylabel("PCA Component 2")
plt.show()

#-----------------------------------------------------------------------

#2 B. Hierarchical Clustering


# Hierarchical Clustering builds a hierarchy of clusters using:
# Agglomerative method (bottom-up): Each point starts as a single cluster. Closest clusters are merged iteratively.
# Dendrograms are used to visualize cluster merges.

# Why is it suitable for the Iris dataset?
# Suitable for small datasets.
# No need to specify number of clusters initially.
# Helps visualize natural grouping via dendrogram.

    
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster

# Create linkage matrix
# Build the agglomerative merge tree over all samples using Ward linkage
linked = linkage(X, method='ward')

# Dendrogram, truncated to the top five levels of the merge tree
dendro_fig, dendro_ax = plt.subplots(figsize=(10, 6))
dendrogram(linked, truncate_mode='level', p=5, ax=dendro_ax)
dendro_ax.set_title("Hierarchical Clustering Dendrogram (truncated)")
dendro_ax.set_xlabel("Data Points")
dendro_ax.set_ylabel("Distance")
plt.show()

# Cut the tree so that at most three flat clusters remain
hier_labels = fcluster(linked, t=3, criterion='maxclust')

# Scatter the samples in the existing 2-D PCA projection, colored by cluster
scatter_fig, scatter_ax = plt.subplots(figsize=(8, 6))
scatter_ax.scatter(
    X_pca[:, 0],
    X_pca[:, 1],
    c=hier_labels,
    cmap='plasma',
    s=50,
)
scatter_ax.set_title("Hierarchical Clustering on Iris Dataset (PCA reduced)")
scatter_ax.set_xlabel("PCA Component 1")
scatter_ax.set_ylabel("PCA Component 2")
plt.show()

