In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.metrics import silhouette_score
import plotly.express as px
import plotly.graph_objects as go




In [5]:
# Fetch the Iris bunch and wrap its measurements in a column-labelled frame
iris = load_iris()
X = pd.DataFrame(data=iris["data"], columns=iris["feature_names"])
y_true = iris["target"]  # target labels; optional, only needed for evaluation



In [6]:

# Pairplot (scatter matrix).
# Colour by species *name* instead of the raw integer class code, and give the
# legend a meaningful title, so the figure can be read without the code.
species = pd.Categorical.from_codes(y_true, categories=iris.target_names)
fig = px.scatter_matrix(X, dimensions=X.columns,
                        color=species,
                        labels={"color": "Species"},
                        title="Iris Feature Pairplot")
fig.update_traces(diagonal_visible=False)  # hide the diagonal panels
fig.show()



In [7]:
# Feature distributions: one overlaid histogram (with a box-plot marginal) per feature.
# The species labels are loop-invariant, so they are built once up front; using the
# species names (not the 0/1/2 codes) keeps each legend self-explanatory.
species = pd.Categorical.from_codes(y_true, categories=iris.target_names)
for col in X.columns:
    fig = px.histogram(X, x=col, color=species,
                       labels={"color": "Species"},
                       barmode='overlay', marginal='box',
                       title=f'Distribution of {col}')
    fig.show()


In [8]:
# Correlation heatmap: pairwise correlations between the feature columns,
# drawn on a fixed [-1, 1] colour range.
corr = X.corr()
heatmap = go.Heatmap(
    x=corr.columns,
    y=corr.columns,
    z=corr.values,
    zmin=-1,
    zmax=1,
    colorscale='Viridis',
)
fig = go.Figure(data=heatmap)
fig.update_layout(title='Feature Correlation Heatmap')
fig.show()

In [10]:
# ----------------------------
# Standardize features
# ----------------------------
# Fit the scaler on X, then apply it to the same data (two explicit steps).
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [11]:
# ----------------------------
# Elbow method for KMeans
# ----------------------------
# Candidate cluster counts; defined once so the fit loop and the plot's
# x-axis are guaranteed to use the same values.
k_values = list(range(1, 11))

# Within-cluster sum of squares (inertia) for each candidate K
wcss = []
for k in k_values:
    km = KMeans(n_clusters=k, n_init=10, random_state=42).fit(X_scaled)
    wcss.append(km.inertia_)

elbow_trace = go.Scatter(x=k_values, y=wcss, mode='lines+markers')
fig = go.Figure(data=[elbow_trace])
fig.update_layout(
    title='Elbow Method',
    xaxis_title='Number of Clusters (K)',
    yaxis_title='WCSS',
)
fig.show()

In [12]:
# ----------------------------
# Train KMeans variations
# ----------------------------
optimal_k = 3

# 1. Standard K-Means with K-Means++ initialization
kmeans_pp = KMeans(
    n_clusters=optimal_k,
    init='k-means++',
    n_init=10,
    random_state=42,
)
# fit() returns the estimator itself, so the training labels can be read directly
y_kmeans_pp = kmeans_pp.fit(X_scaled).labels_

In [13]:
# 2. Mini-Batch K-Means
# n_init is pinned explicitly, mirroring the KMeans cell. scikit-learn changed
# this default from 3 to 'auto' in 1.4 (and emits a FutureWarning in 1.2-1.3
# when it is left unset), so an unset value makes the clustering - and the
# reported silhouette score - depend on the installed version. 3 is the
# historical default.
mini_batch = MiniBatchKMeans(n_clusters=optimal_k, batch_size=20, n_init=3, random_state=42)
y_mini_batch = mini_batch.fit_predict(X_scaled)

In [14]:
# ----------------------------
# Evaluation
# ----------------------------
# Silhouette score of each model's labelling on the standardized features
silhouette_kmeans = silhouette_score(X=X_scaled, labels=y_kmeans_pp)
silhouette_mini = silhouette_score(X=X_scaled, labels=y_mini_batch)

# One print call, one line per model
print(
    f"Silhouette Score (K-Means++): {silhouette_kmeans:.3f}",
    f"Silhouette Score (Mini-Batch K-Means): {silhouette_mini:.3f}",
    sep="\n",
)

Silhouette Score (K-Means++): 0.460
Silhouette Score (Mini-Batch K-Means): 0.464


In [15]:

def plot_clusters(X_scaled, labels, centroids=None, title="Clusters"):
    """Scatter-plot cluster assignments over the first two feature columns.

    Parameters
    ----------
    X_scaled : array-like of shape (n_samples, n_features)
        Feature matrix. Only the first two columns are plotted; they appear
        on the axes as 'Feature1' and 'Feature2'. Anything ``np.asarray`` can
        convert is accepted (ndarray, DataFrame, nested list).
    labels : array-like of shape (n_samples,)
        Cluster label for each row of ``X_scaled``. Labels are converted to
        strings so Plotly colours them as discrete categories.
    centroids : array-like of shape (n_clusters, n_features), optional
        Cluster centres. When given, their first two coordinates are overlaid
        as red 'x' markers.
    title : str, default "Clusters"
        Figure title.

    Returns
    -------
    None
        The figure is displayed via ``fig.show()``; nothing is returned, so a
        call placed last in a notebook cell does not render the figure twice.

    Raises
    ------
    ValueError
        If ``X_scaled`` is not 2-D with at least two columns.
    """
    # Coerce inputs so lists and DataFrames work, not just NumPy arrays.
    points = np.asarray(X_scaled)
    if points.ndim != 2 or points.shape[1] < 2:
        raise ValueError("X_scaled must be 2-D with at least two feature columns")
    cluster_ids = np.asarray(labels)

    df_plot = pd.DataFrame(points[:, :2], columns=['Feature1', 'Feature2'])
    df_plot['Cluster'] = cluster_ids.astype(str)

    # Without an explicit order Plotly assigns colours/legend entries by first
    # appearance in the data, which differs from one labelling to the next.
    # np.unique sorts on the original (e.g. numeric) values before stringifying.
    legend_order = np.unique(cluster_ids).astype(str).tolist()

    fig = px.scatter(df_plot, x='Feature1', y='Feature2', color='Cluster', title=title,
                     category_orders={'Cluster': legend_order})

    if centroids is not None:
        centers = np.asarray(centroids)
        fig.add_trace(go.Scatter(x=centers[:, 0], y=centers[:, 1],
                                 mode='markers', marker=dict(color='red', size=12, symbol='x'),
                                 name='Centroids'))
    fig.show()


In [16]:
# Render each model's cluster assignments together with its fitted centres
plot_clusters(
    X_scaled,
    labels=y_kmeans_pp,
    centroids=kmeans_pp.cluster_centers_,
    title="K-Means++ Clusters",
)
plot_clusters(
    X_scaled,
    labels=y_mini_batch,
    centroids=mini_batch.cluster_centers_,
    title="Mini-Batch K-Means Clusters",
)
