In [2]:
import warnings

# Ignore all warnings
warnings.filterwarnings("ignore")


In [3]:
import numpy as np
import pandas as pd

from sklearn.datasets import make_blobs
X, _ = make_blobs(n_samples=1000, centers=4, n_features=10, random_state=42)
X

array([[ -9.00045423,  10.01793583,   4.73753592, ...,  -0.15855051,
         -2.16747656,  -4.09972226],
       [ -1.66855407,   8.36166215,   4.1936954 , ...,   8.08393757,
          2.80810039,   4.58690912],
       [-10.15814304,   9.72770578,   5.13167912, ...,   1.75227785,
         -1.89643471,  -3.81696358],
       ...,
       [ -3.49492367,   9.51833264,   4.10962122, ...,   6.12564502,
          3.98702537,   4.19671511],
       [  1.57641033,  -5.71781494,  -8.18945458, ...,  -8.02533507,
          2.75006664,  -1.11936474],
       [ -1.42614638,  10.06808818,   3.26220947, ...,  11.17625441,
          2.59319075,   5.2970172 ]])

## Standardize the data (important for PCA and some clustering algorithms)

In [5]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled

array([[-1.43799539,  1.09624239,  0.80653375, ..., -0.03861019,
        -1.77942242, -0.31561534],
       [ 0.05770845,  0.89237746,  0.72070476, ...,  1.44475233,
         0.59515544,  1.4400295 ],
       [-1.67416331,  1.06051899,  0.86873749, ...,  0.30527277,
        -1.65006859, -0.25846732],
       ...,
       [-0.31487   ,  1.03474799,  0.70743616, ...,  1.09232747,
         1.15779353,  1.36116784],
       [ 0.71967951, -0.84061614, -1.23360585, ..., -1.45435909,
         0.56745901,  0.28674121],
       [ 0.10715949,  1.10241547,  0.57369751, ...,  2.00126232,
         0.49259051,  1.5835486 ]])

## Evalution

In [4]:
from sklearn.metrics import silhouette_score

# ========== K-Means Clustering ==========


In [5]:
from sklearn.cluster import KMeans

kmeans_model = KMeans(n_clusters=4, random_state=42)
kmeans_model.fit(X_scaled)
kmeans_pred = kmeans_model.predict(X_scaled)

from sklearn.metrics import silhouette_score
kmeans_silhouette = silhouette_score(X_scaled, kmeans_pred)

print("K-Means Clustering")
print(f"  Silhouette Score: {kmeans_silhouette:.2f}")
print("-" * 40)

K-Means Clustering
  Silhouette Score: 0.73
----------------------------------------


# ========== DBSCAN ==========


In [7]:
from sklearn.cluster import DBSCAN

dbscan_model = DBSCAN(eps=0.5, min_samples=5)
dbscan_pred = dbscan_model.fit_predict(X_scaled)

from sklearn.metrics import silhouette_score
dbscan_silhouette = silhouette_score(X_scaled, dbscan_pred)

print("DBSCAN Clustering")
print(f"  Silhouette Score: {dbscan_silhouette:.2f}")
print("-" * 40)


DBSCAN Clustering
  Silhouette Score: 0.07
----------------------------------------


# ========== Agglomerative Clustering ==========

In [6]:

from sklearn.cluster import AgglomerativeClustering

agg_model = AgglomerativeClustering(n_clusters=4)
agg_pred = agg_model.fit_predict(X_scaled)

from sklearn.metrics import silhouette_score
agg_silhouette = silhouette_score(X_scaled, agg_pred)

print("Agglomerative Clustering")
print(f"  Silhouette Score: {agg_silhouette:.2f}")

Agglomerative Clustering
  Silhouette Score: 0.73


# ========== Principal Component Analysis (PCA) ==========


In [8]:
from sklearn.decomposition import PCA

pca_model = PCA(n_components=2)
X_pca = pca_model.fit_transform(X_scaled)

print("Principal Component Analysis (PCA)")
print(f"  Explained Variance Ratio: {pca_model.explained_variance_ratio_}")


Principal Component Analysis (PCA)
  Explained Variance Ratio: [0.61044647 0.22112436]
