In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px


from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.decomposition import PCA
from umap import UMAP
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, StandardScaler
from sklearn.datasets import fetch_openml

from plot_silhouette import plot_silhouette_scores
from sklearn.metrics import silhouette_score, silhouette_samples

# Suppress FutureWarning messages for the rest of the session.
# NOTE(review): this is a global filter, so deprecation notices from every
# library used in this notebook (not just one call) are hidden.
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Iris Data

In [None]:
from sklearn.datasets import load_iris

# Load the iris measurements and standardise each feature column before
# clustering, so that no single measurement dominates the distance metric.
iris = load_iris()
features = iris.feature_names

scaler = StandardScaler()
scaler.fit(iris.data)
X = scaler.transform(iris.data)

In [None]:
# Fit one K-Means model per candidate k (1..9) on the standardised iris data.
# random_state pins the centroid initialisation so the elbow curve and the
# cluster labels reused by the following cells are the same on every run;
# n_init is given explicitly instead of relying on the version-dependent default.
kmeans_per_k = [KMeans(n_clusters=k, n_init='auto', random_state=42).fit(X)
                for k in range(1, 10)]
inertias = [model.inertia_ for model in kmeans_per_k]

# Elbow plot: within-cluster sum of squares (inertia) against k.
# The x values are read from the fitted models so they always match the sweep.
plt.figure(figsize=(10,4))
plt.plot([model.n_clusters for model in kmeans_per_k], inertias, marker="o")
plt.xlabel('Number of Clusters, K')
plt.ylabel('WCSS');

In [None]:
# Mean silhouette score for every model with at least two clusters
# (kmeans_per_k[0] is the k=1 model, for which the score is undefined).
silhouette_scores = []
for fitted in kmeans_per_k[1:]:
    silhouette_scores.append(silhouette_score(X, fitted.labels_))

fig, ax = plt.subplots(figsize=(8, 3))
ax.plot(range(2, 10), silhouette_scores, "bo-")
ax.set_xlabel("$k$")
ax.set_ylabel("Silhouette score")
plt.show()

In [None]:
# Standardised features as a frame, with the ground-truth species code attached
# for reference only (it is not used to fit the clusters).
plot_df = pd.DataFrame(data=X, columns=iris.feature_names).assign(truth=iris.target)

In [None]:
# Preview the first rows: the scaled feature columns plus the 'truth' label.
plot_df.head()

In [None]:
# Number of clusters to inspect; kmeans_per_k starts at k=1, hence the -1 offset.
n_clusters = 2
cluster_labels = kmeans_per_k[n_clusters-1].labels_

plt.scatter(
    x=plot_df['petal length (cm)'],
    y=plot_df['petal width (cm)'],
    c=cluster_labels,
    cmap='viridis',
)
plt.xlabel('petal length (cm)')
plt.ylabel('petal width (cm)')
plt.title('KMeans Clustering');

# Optional interactive version of the same scatter (plotly):
# fig = px.scatter(plot_df, x='petal length (cm)', y='petal width (cm)', color=kmeans_per_k[n_clusters-1].labels_,
#                  hover_data=['truth'])
# fig.show()

In [None]:
# Per-sample silhouette coefficient for the selected model, shown as point colour.
si = silhouette_samples(X, kmeans_per_k[n_clusters-1].labels_)

points = plt.scatter(
    x=plot_df['petal length (cm)'],
    y=plot_df['petal width (cm)'],
    c=si,
    cmap='viridis',
)
plt.colorbar(points)
plt.xlabel('petal length (cm)')
plt.ylabel('petal width (cm)')
plt.title('Silhouette Scores');

# Optional interactive version of the same scatter (plotly):
# fig = px.scatter(plot_df, x='petal length (cm)', y='petal width (cm)', color=si,
#                  hover_data=['truth'])
# fig.show()

### Pairwise Plots

In [None]:
# Attach the cluster assignment of the selected model (k = n_clusters) as a
# column so the pairwise plot can colour points by cluster.
plot_df['clusters'] = kmeans_per_k[n_clusters-1].labels_

In [None]:
# Pairwise feature scatter plots coloured by cluster; the ground-truth column
# is dropped so it is not drawn as a feature.
feature_and_cluster_df = plot_df.drop(columns='truth')
sns.pairplot(feature_and_cluster_df, hue='clusters', palette='Dark2');

### Cluster Centers

In [None]:
# Centroids of the selected model, one row per cluster. Coordinates are in the
# standardised feature space because the model was fitted on scaled X.
centers = kmeans_per_k[n_clusters-1].cluster_centers_

In [None]:
# Tabulate the centroids (rows = clusters, columns = features), print the raw
# numbers, then draw them as an annotated heatmap.
centers_df = pd.DataFrame(data=centers, columns=iris.feature_names)
print(centers_df)
sns.heatmap(data=centers_df, cmap='viridis', annot=True);

# MNIST

In [None]:
# OpenML dataset id to download (the MNIST digits, per this section's heading).
data_id = 554
# Digit labels as strings. NOTE(review): not referenced by the cells visible here.
class_names = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"]
X,y = fetch_openml(data_id=data_id, return_X_y=True, as_frame=False, parser='auto') # will return numpy arrays
# Rescale pixel intensities; presumably the raw range is 0-255 so this maps to [0, 1] — TODO confirm.
X = X/255.0

In [None]:
# Fit one K-Means model per candidate k (1..14) on the scaled MNIST pixels.
# This is the expensive step of the section, so random_state is fixed to make
# the inertia values identical on every run instead of depending on a random
# centroid initialisation.
kmeans_per_k = [KMeans(n_clusters=k, n_init='auto', random_state=42).fit(X)
                for k in range(1, 15)]
inertias = [model.inertia_ for model in kmeans_per_k]

In [None]:
# Elbow plot for the MNIST sweep (disabled). Uncomment to plot `inertias`
# against k; presumably the static "WCSS Plot" that follows shows its output — TODO confirm.
# plt.figure(figsize=(10,5))
# plt.plot(np.arange(len(inertias))+1,inertias,marker="o")
# plt.xlabel('Number of Clusters, K')
# plt.ylabel('WCSS');

WCSS Plot

In [None]:
## Takes about 8 minutes to run
## Silhouette sweep for MNIST (disabled because of the run time). Uncomment to
## score every model with k >= 2 and plot the scores against k.

# silhouette_scores = [silhouette_score(X, model.labels_)
#                      for model in kmeans_per_k[1:]]

# plt.figure(figsize=(8, 3))
# plt.plot(np.arange(len(silhouette_scores))+2, silhouette_scores, "bo-")
# plt.xlabel("$k$")
# plt.ylabel("Silhouette score")
# plt.show()

Silhouette Plot

In [None]:
# Number of clusters used for the rest of the MNIST section.
k_centers = 7
# Keyword arguments make the intent explicit. n_init matches the sweep above,
# and random_state keeps cluster numbering (and therefore every plot below)
# stable across runs.
km = KMeans(n_clusters=k_centers, n_init='auto', random_state=42)
km.fit(X)

In [None]:
# Pixel matrix as a frame (one column per pixel), with the K-Means cluster
# assignment appended as the last column.
plot_df = pd.DataFrame(data=X)
plot_df.insert(loc=len(plot_df.columns), column='clusters', value=km.labels_)

### UMAP

In [None]:
# Project the images to 2-D for plotting.
# fit_transform learns the embedding and returns it in a single step, instead
# of fitting and then pushing the very same training data through transform().
# The fitted reducer stays available as `umap`.
umap = UMAP(n_components=2)
u = umap.fit_transform(X)


In [None]:

# 2-D embedding, one point per image, coloured by K-Means cluster id.
embedding_x, embedding_y = u[:, 0], u[:, 1]
points = plt.scatter(embedding_x, embedding_y, c=km.labels_)
plt.colorbar(points);

### Center Means

In [None]:
# Centroids of the fitted MNIST model: one row per cluster, one column per pixel.
centers = km.cluster_centers_

In [None]:
# Reshape each centroid back into a 28x28 image and show them side by side.
plot_centers = centers.reshape(k_centers, 28, 28)
fig, ax = plt.subplots(nrows=1, ncols=k_centers, figsize=(10,4))
for panel, centroid_image in zip(ax.flat, plot_centers):
    panel.set_xticks([])
    panel.set_yticks([])
    panel.imshow(centroid_image, cmap='Blues', interpolation='nearest')

### Top Features of each cluster

In [None]:
# For every cluster, pixel indices sorted from largest to smallest centroid value.
ascending_idx = np.argsort(km.cluster_centers_, axis=-1)
order_centroids = ascending_idx[:, ::-1]

In [None]:
# Binary mask per cluster: 1 at the n_top pixels with the largest centroid
# value, 0 elsewhere, reshaped to 28x28 images for display.
n_top = 50
top_pixels = order_centroids[:k_centers, :n_top]
cluster_rows = np.arange(k_centers)[:, None]

plot_main = np.zeros((k_centers,784))
plot_main[cluster_rows, top_pixels] = 1
plot_main = plot_main.reshape(k_centers, 28,28)

In [None]:
# One panel per cluster showing its top-pixel mask.
fig, ax = plt.subplots(nrows=1, ncols=k_centers, figsize=(10,4))
for panel, mask_image in zip(ax.flat, plot_main):
    panel.set_xticks([])
    panel.set_yticks([])
    panel.imshow(mask_image, cmap='Blues', interpolation='nearest')

## Text

In [None]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import MiniBatchKMeans

In [None]:
# Stop words handed to the TF-IDF vectoriser: common English function words,
# followed by "from" and "subject" (presumably to drop e-mail header residue — TODO confirm).
# NOTE(review): 'from' appears twice in the list; the duplicate is harmless.
sw = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't", "from", "subject"]

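The 20 newsgroups dataset comprises around 18,000 newsgroup posts on 20 topics, split into two subsets: one for training (or development) and the other for testing (or performance evaluation). Since clustering is unsupervised, the next cell loads both subsets together (`subset="all"`), restricted to five of the topics.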


In [None]:
## Choose a subset of categories
categories = ['rec.autos', 'rec.motorcycles', 'rec.sport.baseball',
              'rec.sport.hockey', 'sci.space']

# Fetch both the train and test posts for those categories. Headers, footers
# and quoted replies are stripped, and the order is shuffled with a fixed seed.
fetch_options = dict(
    subset="all",
    categories=categories,
    remove=("headers", "footers", "quotes"),
    shuffle=True,
    random_state=42,
)
dataset = fetch_20newsgroups(**fetch_options)

In [None]:
# Tokens are alphabetic words, optionally hyphenated. Terms must appear in at
# least 100 documents and in at most half of them to be kept.
word_pattern = r'(?u)\b[a-zA-Z]+(?:-[a-zA-Z]+)*\b'
tf = TfidfVectorizer(
    stop_words=sw,
    token_pattern=word_pattern,
    min_df=100,
    max_df=.5,
)
X = tf.fit_transform(dataset.data)
X.shape

In [None]:
# Dense copy of the TF-IDF matrix with one named column per vocabulary term.
vocabulary = tf.get_feature_names_out()
Xdf = pd.DataFrame(data=X.toarray(), columns=vocabulary)
Xdf.head(n=3)

In [None]:
# Fit a seeded K-Means model for every k from 1 to 29 on the TF-IDF matrix and
# record each model's inertia (WCSS).
kmeans_per_k = []
for n_k in range(1, 30):
    estimator = KMeans(n_clusters=n_k, n_init='auto', random_state=42)
    kmeans_per_k.append(estimator.fit(X))
inertias = [fitted.inertia_ for fitted in kmeans_per_k]

In [None]:
# Elbow plot for the text sweep: inertia (WCSS) against the number of clusters.
plt.figure(figsize=(10,5))
plt.plot(np.arange(len(inertias))+1,inertias,marker="o")
plt.xlabel('Number of Clusters, K')
# Trailing semicolon suppresses the Text(...) repr that would otherwise be
# echoed under the figure, matching the other plotting cells.
plt.ylabel('WCSS');

In [None]:
# Mean silhouette score for every text model with k >= 2 (index 0 is k=1).
silhouette_scores = []
for fitted in kmeans_per_k[1:]:
    silhouette_scores.append(silhouette_score(X, fitted.labels_))

k_values = np.arange(2, len(silhouette_scores) + 2)
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(k_values, silhouette_scores, "bo-")
ax.set_xlabel("$k$")
ax.set_ylabel("Silhouette score")
plt.show()

In [None]:
# Number of topic clusters to extract from the TF-IDF matrix.
k = 4
# Mini-batch K-Means samples random batches, so without a seed the cluster
# numbering and the top terms printed below change on every run. random_state
# fixes them, matching the seeded sweep above.
kmeans = MiniBatchKMeans(n_clusters=k, n_init='auto', random_state=42)
kmeans.fit(X)
# Cluster id for every document.
labs = kmeans.predict(X)

In [None]:
# For each cluster, list the 15 vocabulary terms with the largest centroid weight.
order_centroids = np.argsort(kmeans.cluster_centers_, axis=-1)[:, ::-1]
terms = tf.get_feature_names_out()
for i in range(k):
    top_terms = ''.join(' %s' % terms[ind] for ind in order_centroids[i, :15])
    print("Cluster %d:" % (i+1) + top_terms)

In [None]:
## might need to install wordcloud
## conda install -c conda-forge wordcloud
## or
## pip install wordcloud

from wordcloud import WordCloud

In [None]:
# Size of the vocabulary kept by the TF-IDF vectoriser.
terms.shape

In [None]:
num_clusters = kmeans.n_clusters

# One word cloud per cluster, sized by the mean TF-IDF weight of each term
# over the documents assigned to that cluster.
for cluster in range(num_clusters):
    plt.figure(figsize=(6, 4))

    # Documents assigned to this cluster
    in_cluster = kmeans.labels_ == cluster
    cluster_data = Xdf.loc[in_cluster]

    # Average weight per term; terms with zero weight are dropped
    word_freq = cluster_data.mean(axis=0)
    word_dict = word_freq[word_freq > 0].to_dict()

    # Build the cloud from the term -> weight mapping
    cloud_builder = WordCloud(width=800, height=400, background_color="white")
    wordcloud = cloud_builder.generate_from_frequencies(word_dict)

    # Render it without axes
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.title(f"Cluster {cluster+1}")
    plt.show()

In [None]:
# 2-D UMAP embedding of the (densified) TF-IDF matrix, one point per document.
reducer = UMAP(n_components=2)
X_embedded = reducer.fit_transform(X.toarray())

In [None]:
# Sanity check: one row per document and two embedding columns are expected.
X_embedded.shape

In [None]:
# Wrap the 2-D embedding in a frame with clean column names. The first column
# is 'U1', without the stray trailing comma that made it 'U1,'.
Udf = pd.DataFrame(X_embedded, columns=['U1', 'U2'])

In [None]:
def get_top_words(row, terms, n=10):
    """Return the highest-weighted terms of one document as a comma-separated string.

    Parameters
    ----------
    row : sequence of numbers
        Weights for a single document, positionally aligned with ``terms``.
    terms : sequence of str
        Vocabulary; ``terms[i]`` is the word whose weight is ``row[i]``.
    n : int, default 10
        Maximum number of words to return.

    Returns
    -------
    str
        Up to ``n`` words ordered from largest to smallest weight and joined
        with ", ". Words whose weight is not strictly positive are omitted, so
        the result can be shorter than ``n`` (or empty).
    """
    ranked = np.argsort(row)[::-1]  # positions from largest to smallest weight
    return ", ".join(terms[idx] for idx in ranked[:n] if row[idx] > 0)


# Top five TF-IDF words of every document, in document order
Udf["top_words"] = [get_top_words(doc_weights, terms, n=5)
                    for doc_weights in Xdf.to_numpy()]

# Interactive UMAP scatter: points coloured by cluster, top words shown on hover
embedding_x, embedding_y = X_embedded[:, 0], X_embedded[:, 1]
hover_columns = {"Top Words": Udf["top_words"]}
fig = px.scatter(
    x=embedding_x,
    y=embedding_y,
    color=kmeans.labels_,
    hover_data=hover_columns,
    labels={'color': 'Cluster'},
    title='UMAP',
)

fig.show()

# Fun application of k-means:  Image color segmentation

In [None]:
from sklearn.datasets import load_sample_image
# Sample photo bundled with scikit-learn, loaded as an array of pixel values.
china = load_sample_image("china.jpg")

In [None]:
# Show the original photo without axis ticks.
ax = plt.axes()
ax.set(xticks=[], yticks=[])
ax.imshow(china);

In [None]:
# Image dimensions; the flattening step that follows treats the last axis as 3 colour channels.
china.shape

In [None]:
# Scale the channel values by 1/255 and flatten the image into one row per
# pixel with three colour columns. reshape(-1, 3) infers the pixel count from
# the array itself, so this works for any image size, not only 427 x 640.
data = china / 255
data = data.reshape(-1, 3)
data.shape

In [None]:
# First five pixels: one row per pixel, three scaled channel values per row.
data[0:5,:]

In [None]:
# Count the distinct colours: unique pixel rows, reported as (n_colours, 3).
distinct_pixels = np.unique(data, axis=0)
distinct_pixels.shape

![colors](colors.png)

In [None]:
k = 10 # choose how many colors
# Cluster the pixels in colour space; each centroid becomes one palette colour.
# random_state makes the palette (and its ordering) identical on every run,
# because mini-batch K-Means otherwise samples its batches randomly.
kmeans = MiniBatchKMeans(n_clusters=k, n_init='auto', random_state=42)
#kmeans = KMeans(n_clusters=k, n_init='auto')
kmeans.fit(data)
# Palette index for every pixel, then the palette colour itself per pixel.
labs = kmeans.predict(data)
new_colors = kmeans.cluster_centers_[labs]

In [None]:
# Draw the palette: one large dot per cluster centre, filled with that centre's colour.
x1 = np.arange(k)
fig, ax = plt.subplots(figsize=(15,5))
for x_pos, rgb in zip(x1, kmeans.cluster_centers_):
    ax.plot(x_pos, 0, marker='o', color=tuple(rgb), markersize=40)

In [None]:
# Rebuild the image from the per-pixel palette colours and compare it with the
# original, side by side and without axis ticks.
china_recolored = new_colors.reshape(china.shape)

fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(16,6),
                       subplot_kw={'xticks': [], 'yticks': []})
fig.subplots_adjust(wspace=0.05)

panels = [(china, 'Original Image'), (china_recolored, str(k)+'-color Image')]
for panel, (image, heading) in zip(ax, panels):
    panel.imshow(image)
    panel.set_title(heading, size=16)