K-means
Artificially generate a dataset

In [None]:
from sklearn.datasets import make_blobs

# Build the toy dataset: 1000 samples drawn around 4 blob centres.
# The second return value of make_blobs is discarded (bound to `_`),
# so only the feature matrix X is kept for clustering.
X, _ = make_blobs(
    n_samples=1000,
    centers=4,
    cluster_std=0.5,
    random_state=0,
)


Modeling (note: we do not split the dataset here. This step can be called training if new data will come in later to be assigned to the learned clusters. When the dataset is fixed, it is often simply called modeling.)

In [None]:
# Import the libraries needed for clustering and plotting
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Cluster the samples into 4 groups.
# random_state pins the centroid initialisation so that Restart & Run All
# reproduces the same assignment (and therefore the same plot colours).
# n_init is given explicitly because its default value differs between
# scikit-learn releases.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=0)

# fit_predict fits the model on X and returns the cluster index of each sample.
y_kmeans = kmeans.fit_predict(X)



Plot graph

In [None]:
# Scatter the samples coloured by assigned cluster, then overlay the centroids.
fig, ax = plt.subplots()
ax.scatter(X[:, 0], X[:, 1], c=y_kmeans, s=50, cmap='viridis')

# Centroids found by the fitted model, drawn as large translucent red markers.
centers = kmeans.cluster_centers_
ax.scatter(centers[:, 0], centers[:, 1], c='red', s=200, alpha=0.5)

ax.set_title("K-Means Clustering")
ax.set_xlabel("Feature 1")
ax.set_ylabel("Feature 2")
plt.show()


Parameters:
Set the number of clusters with n_clusters (default: n_clusters=8).
Choose the centroid initialization method with init (default: init='k-means++').
Set the maximum number of iterations with max_iter (default: max_iter=300).

In [None]:
# Re-run K-means with non-default settings: 10 clusters, random initial
# centroids and a higher iteration cap.
# random_state makes the random initialisation repeatable between runs;
# n_init is given explicitly because its default value differs between
# scikit-learn releases.
kmeans = KMeans(n_clusters=10, init='random', max_iter=1000, n_init=10, random_state=0)

# fit_predict fits the model on X and returns the cluster index of each sample.
y_kmeans = kmeans.fit_predict(X)

Let's try the Iris data again.
If we ignore the Iris labels while clustering, we can apply K-means to the features alone and then compare the resulting clusters with the true labels in a confusion matrix.

In [None]:
# Import necessary libraries
from sklearn.datasets import load_iris
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import numpy as np

# Load the Iris dataset: feature matrix in X, target labels in y.
iris = load_iris()
X = iris.data
y = iris.target

# Cluster into 3 groups, since the dataset is known to contain 3 iris species.
# n_init is given explicitly because its default value differs between
# scikit-learn releases; together with random_state this keeps the clustering
# (and the accuracy computed from it) the same from run to run.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)

# Fit on the features only (y is never passed to the model) and keep the
# cluster index assigned to each sample.
labels = kmeans.fit_predict(X)

# Visualise the clusters using only the first two features for simplicity.
fig, ax = plt.subplots()
ax.scatter(X[:, 0], X[:, 1], c=labels, cmap='viridis', edgecolor='k', s=50)

# Overlay the centroids (same two feature columns) as red 'X' markers.
centers = kmeans.cluster_centers_
ax.scatter(centers[:, 0], centers[:, 1], c='red', s=200, alpha=0.5, marker='X')

ax.set_title('K-Means Clustering on Iris Dataset')
ax.set_xlabel(iris.feature_names[0])
ax.set_ylabel(iris.feature_names[1])
plt.show()

# Since we have the true labels, we can compare them to our k-means labels
# Note: This is just for educational purposes; in real-world unsupervised learning scenarios, we usually don't have true labels.
from sklearn.metrics import confusion_matrix, accuracy_score

# Relabel clusters with their majority true label so they can be scored against the ground truth
def relabel_clusters(labels, true_labels):
    """Map every cluster id to the most frequent true label among its members.

    Cluster ids produced by a clustering algorithm are arbitrary, so they have
    to be aligned with the ground-truth labels before a confusion matrix or an
    accuracy score is meaningful.

    Parameters
    ----------
    labels : array-like of int
        Cluster id assigned to each sample.
    true_labels : array-like
        Ground-truth label of each sample; same length as ``labels``.

    Returns
    -------
    numpy.ndarray
        Array with the shape and dtype of ``labels`` in which every sample
        carries the majority true label of its cluster. When several true
        labels tie within a cluster, the smallest one is used.
    """
    labels = np.asarray(labels)
    true_labels = np.asarray(true_labels)
    new_labels = np.zeros_like(labels)

    # Iterate over the cluster ids that actually occur instead of assuming
    # exactly three clusters numbered 0..2.
    for cluster_id in np.unique(labels):
        mask = labels == cluster_id
        # np.unique returns the values sorted, and argmax picks the first
        # maximum, so ties resolve to the smallest label.
        values, counts = np.unique(true_labels[mask], return_counts=True)
        new_labels[mask] = values[np.argmax(counts)]
    return new_labels

# Align the cluster ids with the true labels, then score the aligned labels.
new_labels = relabel_clusters(labels, y)

cm = confusion_matrix(y, new_labels)
acc = accuracy_score(y, new_labels)

# Heading and matrix go on separate lines; accuracy is shown with two decimals.
print("Confusion Matrix:", cm, sep="\n")
print(f"Accuracy: {acc:.2f}")



Hierarchical Clustering
Create some sample data first

In [None]:
from sklearn.datasets import make_blobs
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram, linkage
import matplotlib.pyplot as plt
import numpy as np

# Build the toy dataset: 150 samples drawn around 3 blob centres.
# Both return values of make_blobs are kept: the features as X and the
# second return value as labels_true.
X, labels_true = make_blobs(
    n_samples=150,
    centers=3,
    cluster_std=0.60,
    random_state=0,
)


Develop the Dendrogram

In [None]:
# Linkage criterion used to build the hierarchy. It is also shown in the figure
# title so this dendrogram can be told apart from ones built with other criteria.
linkage_method = 'ward'

# Compute the linkage matrix for X with the chosen criterion.
Z = linkage(X, method=linkage_method)

# Plot the dendrogram.
plt.figure(figsize=(10, 7))
plt.title(f"Hierarchical Clustering Dendrogram ({linkage_method} linkage)")
plt.xlabel("Sample index")
plt.ylabel("Distance")
dendrogram(
    Z,
    leaf_rotation=90.,  # rotates the x axis labels
    leaf_font_size=8.,  # font size for the x axis labels
)
plt.show()

Linkage methods:
'single': Uses the minimum of the distances between all observations of the two sets.
'complete': Uses the maximum of the distances between all observations of the two sets.
'average': Uses the average of the distances of each observation of the two sets.
'ward': Minimizes the variance of the clusters being merged.

In [None]:
# Linkage criterion used to build the hierarchy. It is also shown in the figure
# title so this dendrogram can be told apart from ones built with other criteria.
linkage_method = 'complete'

# Compute the linkage matrix for X with the chosen criterion.
Z = linkage(X, method=linkage_method)

# Plot the dendrogram.
plt.figure(figsize=(10, 7))
plt.title(f"Hierarchical Clustering Dendrogram ({linkage_method} linkage)")
plt.xlabel("Sample index")
plt.ylabel("Distance")
dendrogram(
    Z,
    leaf_rotation=90.,  # rotates the x axis labels
    leaf_font_size=8.,  # font size for the x axis labels
)
plt.show()