# Imports

In [70]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from sklearn.datasets import load_iris, load_wine
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score, adjusted_rand_score, adjusted_mutual_info_score
from sklearn.preprocessing import StandardScaler

# Load the data

In [22]:
# Public URL of the synthetic "basic5" CSV; it is passed to pd.read_csv in a later cell.
basic = "https://storage.googleapis.com/edulabs-public-datasets/synthetic/basic5.csv"


In [2]:
# Load scikit-learn's bundled Iris dataset and expose its measurements as a
# DataFrame whose columns carry the dataset's own feature names.
iris = load_iris()
X = pd.DataFrame(
    data=iris.data,
    columns=iris.feature_names,
)

In [23]:
# Read the synthetic CSV referenced by `basic` into a DataFrame.
basic_df = pd.read_csv(basic)

In [31]:
# Scatter the raw synthetic points, colored by the "color" column.
fig = px.scatter(
    basic_df,
    x='x',
    y='y',
    color="color",
)
# Anchor the y axis to the x axis at a 1:1 ratio so both axes share one scale.
fig.update_yaxes(scaleanchor="x", scaleratio=1)
fig.show()

# Synthetic Data

## Standardize the Data

In [46]:
# Keep only the two coordinate columns; "color" is held out and used later as
# the reference labelling for the evaluation metrics.
basic_X = basic_df[['x', 'y']]

# StandardScaler is already imported in the notebook's import cell, so it is
# not re-imported here.
basic_scaler = StandardScaler()
basic_scaled = basic_scaler.fit_transform(basic_X)

## K-Means Clustering (k=3)

In [47]:
# Fit a 3-cluster K-Means model on the standardized synthetic data; the fixed
# random_state keeps the centroid initialisation repeatable between runs.
k = 3
kmeans = KMeans(n_clusters=k, random_state=42).fit(basic_scaled)
# Cluster index assigned to each row of basic_scaled by the fitted model.
labels = kmeans.labels_

In [48]:
# Display the cluster labels returned by KMeans.fit_predict.
labels

array([0, 0, 2, ..., 2, 1, 1], dtype=int32)

In [49]:
# Display the fitted centroids (one row per cluster), in the standardized space the model was fit on.
kmeans.cluster_centers_

array([[ 1.20118496, -0.61553433],
       [-0.18445041,  1.34599988],
       [-1.07981314, -0.66411344]])

In [50]:
# Wrap a copy of the standardized coordinates in a DataFrame (columns are the
# positional labels 0 and 1) and attach the cluster labels as strings so that
# plotly treats "Cluster" as a categorical color.
df_clusters = pd.DataFrame(basic_scaled.copy()).assign(
    Cluster=labels.astype(str),
)

In [51]:
# Display the standardized coordinates together with their assigned cluster.
df_clusters

Unnamed: 0,0,1,Cluster
0,1.475562,-0.980411,0
1,0.762902,0.182968,0
2,-1.075933,-1.028417,2
3,-1.161917,0.042997,2
4,-0.945600,1.456344,1
...,...,...,...
3995,0.205212,1.354899,1
3996,1.396698,-0.835845,0
3997,-0.909301,-0.393804,2
3998,0.188626,1.771557,1


## Visualize clusters

In [59]:
# Scatter plot of the clustered synthetic data.
# df_clusters holds the *standardized* coordinates under the positional column
# labels 0 and 1, so the title names the synthetic dataset and scaled features
# (the previous title was copied from the Iris section and was wrong here).
fig = px.scatter(
    df_clusters,
    x=0,
    y=1,
    color="Cluster",
    title="K-Means Clustering on Synthetic Dataset (Standardized Features)",
    color_discrete_sequence=px.colors.qualitative.Set1,
    width=700, height=500,
    opacity=0.3
)

# Add cluster centers as black Xs. The centroids are in the same standardized
# space as the plotted points, so they can be overlaid directly.
fig.add_trace(
    go.Scatter(
        x=kmeans.cluster_centers_[:, 0],
        y=kmeans.cluster_centers_[:, 1],
        mode='markers',
        marker=dict(color='black', size=12, symbol='x'),
        name='Centroids'
    )
)

fig.show()

## Elbow Method

In [61]:
# Inertia (within-cluster sum of squares, WCSS) of the fitted k=3 model.
kmeans.inertia_

966.4066850118362

In [62]:
# Elbow method: fit K-Means for 1..10 clusters on the standardized synthetic
# data and record each model's inertia (within-cluster sum of squares).
# The iteration variable is deliberately not called `k`: a plain `for k in ...`
# loop would leave the notebook-level `k` at 10, out of sync with the k=3 model
# bound to `kmeans`.
K_range = range(1, 11)
inertias = [
    KMeans(n_clusters=n_clusters, random_state=42).fit(basic_scaled).inertia_
    for n_clusters in K_range
]

# Plot inertia against the number of clusters; the "elbow" is where the curve
# stops dropping sharply.
fig = go.Figure()
fig.add_trace(go.Scatter(x=list(K_range), y=inertias, mode='lines+markers'))
fig.update_layout(title='Elbow Method for Optimal k',
                  xaxis_title='Number of Clusters (k)',
                  yaxis_title='Inertia (WCSS)')
fig.show()

## Silhouette Score

In [63]:
# Silhouette analysis: score a K-Means clustering for each candidate number of
# clusters. The silhouette score needs at least two clusters, so the range
# starts at 2.
# The candidate range is defined once and reused for both the scores and the
# x axis so the two can never drift apart, and the iteration variable is not
# named `k` so the notebook-level `k` is left untouched.
silhouette_k_range = range(2, 11)
silhouette_scores = [
    silhouette_score(
        basic_scaled,
        KMeans(n_clusters=n_clusters, random_state=42).fit_predict(basic_scaled),
    )
    for n_clusters in silhouette_k_range
]

fig_sil = go.Figure()
fig_sil.add_trace(go.Scatter(x=list(silhouette_k_range), y=silhouette_scores, mode='lines+markers'))
fig_sil.update_layout(title='Silhouette Score by Number of Clusters',
                      xaxis_title='k', yaxis_title='Silhouette Score')
fig_sil.show()

## Final Evaluation Metrics


### Adjusted Rand Index (ARI)
The Adjusted Rand Index (ARI) helps us measure how accurate a clustering result is by comparing it to the true labels (ground truth).

It checks how well the pairs of points are grouped:

- Are the same pairs together in both the real and predicted clusters?
- Are different pairs also kept apart correctly?

The score ranges from -0.5 to 1:
- 1 means perfect match - the clustering is exactly right.
- 0 means random guess - no better than chance.
- Below 0 means worse than random - very poor clustering.

In [67]:
# Compare the K-Means labels with the reference "color" column. ARI is
# invariant to how the cluster ids are numbered, so no label matching is needed.
ari = adjusted_rand_score(basic_df['color'], labels)
print("Adjusted Rand Index (ARI):", ari)

Adjusted Rand Index (ARI): 0.998531837597305


### Mutual Information (MI)
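Mutual Information measures how much two variables are related or connected. In clustering, it compares how well the true cluster labels match the predicted labels: it shows how much knowing one labelling helps us predict the other. The more agreement there is, the higher the score. The code below uses the Adjusted Mutual Information (AMI) variant, which corrects the raw score for chance, so a perfect match scores 1 and a random labelling scores about 0.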

- Higher values mean better agreement between the clusters.
- Zero means no agreement at all.


In [68]:
# adjusted_mutual_info_score is the chance-adjusted variant (AMI), not raw
# mutual information, so the printed label names the metric actually computed.
ami = adjusted_mutual_info_score(basic_df['color'], labels)
print("Adjusted Mutual Information (AMI):", ami)

Mutual Information Index (MI): 0.9965920068085719


# Iris dataset

## Standardize the Data

In [33]:
# Standardize the Iris features held in `X`.
# This cell previously scaled `basic_df` (the synthetic dataset, including its
# "color" label column), which does not match the rest of this section: the
# cluster labels are attached to a copy of `X` further down.
# StandardScaler is already imported in the notebook's import cell.
# NOTE: the variable keeps its original name `basic_scaler` so any cell that
# references it still works; after this cell it holds the Iris-fitted scaler.
basic_scaler = StandardScaler()
X_scaled = basic_scaler.fit_transform(X)

## K-Means Clustering (k=3)

In [15]:
# Fit a 3-cluster K-Means model on the standardized data in X_scaled, with a
# fixed random_state so the result is repeatable.
k = 3
kmeans = KMeans(n_clusters=k, random_state=42).fit(X_scaled)
# Cluster index assigned to each row of X_scaled by the fitted model.
labels = kmeans.labels_

In [16]:
# Display the cluster labels returned by KMeans.fit_predict.
labels

array([1, 2, 2, 2, 1, 1, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 2, 1, 1, 1, 2, 2, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1, 2, 2, 1,
       1, 2, 1, 2, 1, 1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 2, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32)

In [17]:
# Display the fitted centroids (one row per cluster), expressed in the standardized space of X_scaled.
kmeans.cluster_centers_

array([[ 0.57100359, -0.37176778,  0.69111943,  0.66315198],
       [-0.81623084,  1.31895771, -1.28683379, -1.2197118 ],
       [-1.32765367, -0.373138  , -1.13723572, -1.11486192]])

## Add Cluster Labels to Original Data

In [18]:
# Build a new frame from the original (unscaled) features in `X` plus the
# cluster labels as strings, so plotly treats "Cluster" as categorical.
# `assign` returns a new DataFrame, leaving `X` itself unmodified.
df_clusters = X.assign(Cluster=labels.astype(str))

## Visualize Using First Two Features

In [20]:
# First coordinate of every centroid (column 0 of the centroid matrix), in standardized units.
kmeans.cluster_centers_[:, 0]

array([ 0.57100359, -0.81623084, -1.32765367])

In [21]:
# Second coordinate of every centroid (column 1 of the centroid matrix), in standardized units.
kmeans.cluster_centers_[:, 1]

array([-0.37176778,  1.31895771, -0.373138  ])

In [19]:
# Scatter the samples on their first two original (unscaled) features,
# colored by K-Means cluster.
fig = px.scatter(
    df_clusters,
    x=iris.feature_names[0],
    y=iris.feature_names[1],
    color="Cluster",
    title="K-Means Clustering on Iris Dataset (Original Features)",
    color_discrete_sequence=px.colors.qualitative.Set1,
    width=700, height=500
)

# K-Means was fit on the standardized matrix X_scaled, so cluster_centers_ is
# expressed in standardized units, while the points above are in original
# units. Map the centroids back through the scaler that produced X_scaled so
# the markers land in the same coordinate system as the data.
centers_original = basic_scaler.inverse_transform(kmeans.cluster_centers_)

# Add cluster centers as black Xs
fig.add_trace(
    go.Scatter(
        x=centers_original[:, 0],
        y=centers_original[:, 1],
        mode='markers',
        marker=dict(color='black', size=12, symbol='x'),
        name='Centroids'
    )
)

fig.show()

## Elbow Method

In [12]:
# Inertia (within-cluster sum of squares, WCSS) of the model currently bound to `kmeans`.
kmeans.inertia_

191.02473685317958

In [11]:
# Elbow method: fit K-Means for 1..10 clusters on X_scaled and record each
# model's inertia (within-cluster sum of squares).
# The iteration variable is deliberately not called `k`: a plain `for k in ...`
# loop would leave the notebook-level `k` at 10, out of sync with the k=3 model
# bound to `kmeans`.
K_range = range(1, 11)
inertias = [
    KMeans(n_clusters=n_clusters, random_state=42).fit(X_scaled).inertia_
    for n_clusters in K_range
]

# Plot inertia against the number of clusters; the "elbow" is where the curve
# stops dropping sharply.
fig = go.Figure()
fig.add_trace(go.Scatter(x=list(K_range), y=inertias, mode='lines+markers'))
fig.update_layout(title='Elbow Method for Optimal k',
                  xaxis_title='Number of Clusters (k)',
                  yaxis_title='Inertia (WCSS)')
fig.show()