# Metrics Table for Clustering using pycaret

### Installation and Import

In [1]:
!pip install pycaret

Collecting pycaret
  Downloading pycaret-3.2.0-py3-none-any.whl.metadata (17 kB)
Collecting kaleido>=0.2.1 (from pycaret)
  Downloading kaleido-0.2.1-py2.py3-none-manylinux1_x86_64.whl (79.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.9/79.9 MB[0m [31m20.4 MB/s[0m eta [36m0:00:00[0m
Collecting matplotlib<=3.6,>=3.3.0 (from pycaret)
  Downloading matplotlib-3.6.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (11.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.8/11.8 MB[0m [31m94.7 MB/s[0m eta [36m0:00:00[0m
Collecting pandas<2.0.0,>=1.3.0 (from pycaret)
  Downloading pandas-1.5.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting plotly-resampler>=0.8.3.1 (from pycaret)
  Downloading plotly_resampler-0.9.2-py3-none-any.whl.metadata (13 kB)
Collecting pmdarima!=1.8.1,<3.0.0,>=1.8.0 (from pycaret)
  Downloading pmdarima-2.0.4-cp310-cp310-manylinux_2_17_x86_64.manylin

In [2]:
import numpy as np
import pandas as pd

In [3]:
from pycaret.clustering import *

### Dataset

In [4]:
from pycaret.datasets import get_data
dataSets = get_data('index')

Unnamed: 0,Dataset,Data Types,Default Task,Target Variable 1,Target Variable 2,# Instances,# Attributes,Missing Values
0,anomaly,Multivariate,Anomaly Detection,,,1000,10,N
1,france,Multivariate,Association Rule Mining,InvoiceNo,Description,8557,8,N
2,germany,Multivariate,Association Rule Mining,InvoiceNo,Description,9495,8,N
3,bank,Multivariate,Classification (Binary),deposit,,45211,17,N
4,blood,Multivariate,Classification (Binary),Class,,748,5,N
5,cancer,Multivariate,Classification (Binary),Class,,683,10,N
6,credit,Multivariate,Classification (Binary),default,,24000,24,N
7,diabetes,Multivariate,Classification (Binary),Class variable,,768,9,N
8,electrical_grid,Multivariate,Classification (Binary),stabf,,10000,14,N
9,employee,Multivariate,Classification (Binary),left,,14999,10,N


In [5]:
df = get_data("seeds")

Unnamed: 0,area,perimeter,compactness,length,width,asymmetry,length.1
0,15.26,14.84,0.871,5.763,3.312,2.221,5.22
1,14.88,14.57,0.8811,5.554,3.333,1.018,4.956
2,14.29,14.09,0.905,5.291,3.337,2.699,4.825
3,13.84,13.94,0.8955,5.324,3.379,2.259,4.805
4,16.14,14.99,0.9034,5.658,3.562,1.355,5.175


### Testing the approach

In [6]:
s = setup(df)

Unnamed: 0,Description,Value
0,Session id,1979
1,Original data shape,"(210, 7)"
2,Transformed data shape,"(210, 7)"
3,Numeric features,7
4,Rows with missing values,1.4%
5,Preprocess,True
6,Imputation type,simple
7,Numeric imputation,mean
8,Categorical imputation,mode
9,CPU Jobs,-1


In [7]:
models()

Unnamed: 0_level_0,Name,Reference
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
kmeans,K-Means Clustering,sklearn.cluster._kmeans.KMeans
ap,Affinity Propagation,sklearn.cluster._affinity_propagation.Affinity...
meanshift,Mean Shift Clustering,sklearn.cluster._mean_shift.MeanShift
sc,Spectral Clustering,sklearn.cluster._spectral.SpectralClustering
hclust,Agglomerative Clustering,sklearn.cluster._agglomerative.AgglomerativeCl...
dbscan,Density-Based Spatial Clustering,sklearn.cluster._dbscan.DBSCAN
optics,OPTICS Clustering,sklearn.cluster._optics.OPTICS
birch,Birch Clustering,sklearn.cluster._birch.Birch
kmodes,K-Modes Clustering,kmodes.kmodes.KModes


In [8]:
kmeans = create_model('kmeans', num_clusters=4)

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.4015,275.848,0.9174,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

In [9]:
x = pull()

In [10]:
x.iloc[0]

Silhouette             0.4015
Calinski-Harabasz    275.8480
Davies-Bouldin         0.9174
Homogeneity            0.0000
Rand Index             0.0000
Completeness           0.0000
Name: 0, dtype: float64

### Making Metric Tables for Clustering Techniques

In [11]:
model = ['kmeans','hclust','dbscan']
cluster_no = [3,4,5]

In [12]:
kmeans_metric = pd.DataFrame()
hclust_metric = pd.DataFrame()
dbscan_metric = pd.DataFrame()

In [13]:
n=[False, True, False, False, True, True]
t=[False, False, True, False, True, True]
p=[False, False, False, True, False, True]

In [14]:
for i in range(len(n)):
    for k,m in enumerate(model):
        for j,c in enumerate(cluster_no):
            s = setup(df, normalize = n[i], transformation = t[i], pca = p[i], verbose=False)
            cm = create_model(m, num_clusters=c, verbose=False)
            x = pull().iloc[0]
            
            if(m=='kmeans'):
                kmeans_metric[i*3+j] = x
            elif(m=='hclust'):
                hclust_metric[i*3+j] = x
            elif(m=='dbscan'):
                dbscan_metric[i*3+j] = x

In [15]:
kmeans_metric

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
Silhouette,0.454,0.403,0.3628,0.3854,0.3954,0.3285,0.5234,0.5343,0.5207,0.454,0.3985,0.3556,0.3958,0.3216,0.267,0.396,0.3605,0.3045
Calinski-Harabasz,324.4954,275.5166,256.8724,155.4493,164.3684,145.3292,492.833,653.319,736.6037,324.4955,276.0282,256.1555,196.4406,153.3858,130.5825,196.5351,153.441,131.9544
Davies-Bouldin,0.7987,0.8609,0.9686,1.0248,0.8729,1.0094,0.5664,0.5415,0.5515,0.7987,0.9254,0.9779,0.9925,1.3857,1.4056,0.9848,1.3485,1.4165
Homogeneity,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Rand Index,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Completeness,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
hclust_metric

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
Silhouette,0.4066,0.3914,0.3734,0.3794,0.3624,0.3006,0.5064,0.5129,0.4994,0.4066,0.3914,0.3734,0.3822,0.387,0.359,0.3822,0.387,0.359
Calinski-Harabasz,253.3362,256.1088,231.3251,144.008,149.012,132.3875,438.9553,608.5274,630.6559,253.3361,256.1087,231.325,188.0839,149.2392,125.5824,188.0838,149.2392,125.5824
Davies-Bouldin,0.8383,0.9231,0.9811,1.121,0.8866,1.0281,0.5312,0.5506,0.5332,0.8383,0.9231,0.9811,1.0136,1.3004,1.075,1.0136,1.3004,1.075
Homogeneity,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Rand Index,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Completeness,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
dbscan_metric

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
Silhouette,-0.2832,-0.2832,-0.2832,-0.3184,-0.3184,-0.3184,0,0,0,-0.2832,-0.2832,-0.2832,0,0,0,0,0,0
Calinski-Harabasz,16.0052,16.0052,16.0052,5.0309,5.0309,5.0309,0,0,0,16.0052,16.0052,16.0052,0,0,0,0,0,0
Davies-Bouldin,1.6832,1.6832,1.6832,2.4907,2.4907,2.4907,0,0,0,1.6832,1.6832,1.6832,0,0,0,0,0,0
Homogeneity,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0
Rand Index,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0
Completeness,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0


### Applying Multi-indexing for better understanding

In [18]:
kmeans_metric.columns = pd.MultiIndex.from_product([['No Preprocessing', 'Normalization', 'Transformation', 'PCA', 'N + T', 'N + T + P'], ['c=3','c=4','c=5']])
hclust_metric.columns = pd.MultiIndex.from_product([['No Preprocessing', 'Normalization', 'Transformation', 'PCA', 'N + T', 'N + T + P'], ['c=3','c=4','c=5']])
dbscan_metric.columns = pd.MultiIndex.from_product([['No Preprocessing', 'Normalization', 'Transformation', 'PCA', 'N + T', 'N + T + P'], ['c=3','c=4','c=5']])

In [19]:
kmeans_metric

Unnamed: 0_level_0,No Preprocessing,No Preprocessing,No Preprocessing,Normalization,Normalization,Normalization,Transformation,Transformation,Transformation,PCA,PCA,PCA,N + T,N + T,N + T,N + T + P,N + T + P,N + T + P
Unnamed: 0_level_1,c=3,c=4,c=5,c=3,c=4,c=5,c=3,c=4,c=5,c=3,c=4,c=5,c=3,c=4,c=5,c=3,c=4,c=5
Silhouette,0.454,0.403,0.3628,0.3854,0.3954,0.3285,0.5234,0.5343,0.5207,0.454,0.3985,0.3556,0.3958,0.3216,0.267,0.396,0.3605,0.3045
Calinski-Harabasz,324.4954,275.5166,256.8724,155.4493,164.3684,145.3292,492.833,653.319,736.6037,324.4955,276.0282,256.1555,196.4406,153.3858,130.5825,196.5351,153.441,131.9544
Davies-Bouldin,0.7987,0.8609,0.9686,1.0248,0.8729,1.0094,0.5664,0.5415,0.5515,0.7987,0.9254,0.9779,0.9925,1.3857,1.4056,0.9848,1.3485,1.4165
Homogeneity,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Rand Index,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Completeness,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
hclust_metric

Unnamed: 0_level_0,No Preprocessing,No Preprocessing,No Preprocessing,Normalization,Normalization,Normalization,Transformation,Transformation,Transformation,PCA,PCA,PCA,N + T,N + T,N + T,N + T + P,N + T + P,N + T + P
Unnamed: 0_level_1,c=3,c=4,c=5,c=3,c=4,c=5,c=3,c=4,c=5,c=3,c=4,c=5,c=3,c=4,c=5,c=3,c=4,c=5
Silhouette,0.4066,0.3914,0.3734,0.3794,0.3624,0.3006,0.5064,0.5129,0.4994,0.4066,0.3914,0.3734,0.3822,0.387,0.359,0.3822,0.387,0.359
Calinski-Harabasz,253.3362,256.1088,231.3251,144.008,149.012,132.3875,438.9553,608.5274,630.6559,253.3361,256.1087,231.325,188.0839,149.2392,125.5824,188.0838,149.2392,125.5824
Davies-Bouldin,0.8383,0.9231,0.9811,1.121,0.8866,1.0281,0.5312,0.5506,0.5332,0.8383,0.9231,0.9811,1.0136,1.3004,1.075,1.0136,1.3004,1.075
Homogeneity,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Rand Index,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Completeness,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
dbscan_metric

Unnamed: 0_level_0,No Preprocessing,No Preprocessing,No Preprocessing,Normalization,Normalization,Normalization,Transformation,Transformation,Transformation,PCA,PCA,PCA,N + T,N + T,N + T,N + T + P,N + T + P,N + T + P
Unnamed: 0_level_1,c=3,c=4,c=5,c=3,c=4,c=5,c=3,c=4,c=5,c=3,c=4,c=5,c=3,c=4,c=5,c=3,c=4,c=5
Silhouette,-0.2832,-0.2832,-0.2832,-0.3184,-0.3184,-0.3184,0,0,0,-0.2832,-0.2832,-0.2832,0,0,0,0,0,0
Calinski-Harabasz,16.0052,16.0052,16.0052,5.0309,5.0309,5.0309,0,0,0,16.0052,16.0052,16.0052,0,0,0,0,0,0
Davies-Bouldin,1.6832,1.6832,1.6832,2.4907,2.4907,2.4907,0,0,0,1.6832,1.6832,1.6832,0,0,0,0,0,0
Homogeneity,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0
Rand Index,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0
Completeness,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0
