In [4]:
from pycaret.datasets import get_data
from pycaret.clustering import *
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score

# Load the data from UCI library
data = get_data('diabetes')

# Setup the environment
exp_clustering = setup(data, normalize=True, pca=True, pca_method='linear', pca_components=2, verbose=True)

best_model = None
best_num_clusters = None
best_silhouette_score = float('-inf')
best_calinski_harabasz_score = float('-inf')
best_davies_bouldin_score = float('inf')  # Initialize with infinity for minimization

# Try different numbers of clusters for KMeans
for num_clusters in range(2, 11):
    print(f"Trying KMeans with {num_clusters} clusters...")
    kmeans_model = create_model('kmeans', num_clusters=num_clusters, verbose=False)
    labels = assign_model(kmeans_model)
    silhouette = silhouette_score(data, labels.iloc[:, 0])  # Extracting the cluster labels as a 1-dimensional array
    calinski_harabasz = calinski_harabasz_score(data, labels.iloc[:, 0])
    davies_bouldin = davies_bouldin_score(data, labels.iloc[:, 0])
    print(f"Silhouette Score: {silhouette}, Calinski-Harabasz Score: {calinski_harabasz}, Davies-Bouldin Score: {davies_bouldin}")
    
    # Update best scores and model if necessary
    if silhouette > best_silhouette_score:
        best_model = kmeans_model
        best_num_clusters = num_clusters
        best_silhouette_score = silhouette
        best_calinski_harabasz_score = calinski_harabasz
        best_davies_bouldin_score = davies_bouldin

# Print the best KMeans model and its scores
print("Best KMeans Model:")
print(best_model)
print("Best Number of Clusters:", best_num_clusters)
print("Best Silhouette Score:", best_silhouette_score)
print("Best Calinski-Harabasz Score:", best_calinski_harabasz_score)
print("Best Davies-Bouldin Score:", best_davies_bouldin_score)

Unnamed: 0,Number of times pregnant,Plasma glucose concentration a 2 hours in an oral glucose tolerance test,Diastolic blood pressure (mm Hg),Triceps skin fold thickness (mm),2-Hour serum insulin (mu U/ml),Body mass index (weight in kg/(height in m)^2),Diabetes pedigree function,Age (years),Class variable
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


Unnamed: 0,Description,Value
0,Session id,3066
1,Original data shape,"(768, 9)"
2,Transformed data shape,"(768, 2)"
3,Numeric features,9
4,Preprocess,True
5,Imputation type,simple
6,Numeric imputation,mean
7,Categorical imputation,mode
8,Normalize,True
9,Normalize method,zscore


Trying KMeans with 2 clusters...
Silhouette Score: -0.36903974723195737, Calinski-Harabasz Score: 1.3367510768156425, Davies-Bouldin Score: 15.209199963290507
Trying KMeans with 3 clusters...
Silhouette Score: -0.36903974723195737, Calinski-Harabasz Score: 1.3367510768156425, Davies-Bouldin Score: 15.209199963290507
Trying KMeans with 4 clusters...
Silhouette Score: -0.36903974723195737, Calinski-Harabasz Score: 1.3367510768156425, Davies-Bouldin Score: 15.209199963290507
Trying KMeans with 5 clusters...
Silhouette Score: -0.36903974723195737, Calinski-Harabasz Score: 1.3367510768156425, Davies-Bouldin Score: 15.209199963290507
Trying KMeans with 6 clusters...
Silhouette Score: -0.36903974723195737, Calinski-Harabasz Score: 1.3367510768156425, Davies-Bouldin Score: 15.209199963290507
Trying KMeans with 7 clusters...
Silhouette Score: -0.36903974723195737, Calinski-Harabasz Score: 1.3367510768156425, Davies-Bouldin Score: 15.209199963290507
Trying KMeans with 8 clusters...
Silhouette Sc

In [5]:
# Try different numbers of clusters for Agglomerative Clustering
for num_clusters in range(2, 11):
    print(f"Trying Agglomerative Clustering with {num_clusters} clusters...")
    agg_model = create_model('hclust', num_clusters=num_clusters, verbose=False)
    labels = assign_model(agg_model)
    silhouette = silhouette_score(data, labels.iloc[:, 0])  # Extracting the cluster labels as a 1-dimensional array
    calinski_harabasz = calinski_harabasz_score(data, labels.iloc[:, 0])
    davies_bouldin = davies_bouldin_score(data, labels.iloc[:, 0])
    print(f"Silhouette Score: {silhouette}, Calinski-Harabasz Score: {calinski_harabasz}, Davies-Bouldin Score: {davies_bouldin}")
    
    # Update best scores and model if necessary
    if silhouette > best_silhouette_score:
        best_model = agg_model
        best_num_clusters = num_clusters
        best_silhouette_score = silhouette
        best_calinski_harabasz_score = calinski_harabasz
        best_davies_bouldin_score = davies_bouldin

# Print the best Agglomerative Clustering model and its scores
print("Best Agglomerative Clustering Model:")
print(best_model)
print("Best Number of Clusters:", best_num_clusters)
print("Best Silhouette Score:", best_silhouette_score)
print("Best Calinski-Harabasz Score:", best_calinski_harabasz_score)
print("Best Davies-Bouldin Score:", best_davies_bouldin_score)

Trying Agglomerative Clustering with 2 clusters...
Silhouette Score: -0.36903974723195737, Calinski-Harabasz Score: 1.3367510768156425, Davies-Bouldin Score: 15.209199963290507
Trying Agglomerative Clustering with 3 clusters...
Silhouette Score: -0.36903974723195737, Calinski-Harabasz Score: 1.3367510768156425, Davies-Bouldin Score: 15.209199963290507
Trying Agglomerative Clustering with 4 clusters...
Silhouette Score: -0.36903974723195737, Calinski-Harabasz Score: 1.3367510768156425, Davies-Bouldin Score: 15.209199963290507
Trying Agglomerative Clustering with 5 clusters...
Silhouette Score: -0.36903974723195737, Calinski-Harabasz Score: 1.3367510768156425, Davies-Bouldin Score: 15.209199963290507
Trying Agglomerative Clustering with 6 clusters...
Silhouette Score: -0.36903974723195737, Calinski-Harabasz Score: 1.3367510768156425, Davies-Bouldin Score: 15.209199963290507
Trying Agglomerative Clustering with 7 clusters...
Silhouette Score: -0.36903974723195737, Calinski-Harabasz Score:

In [6]:
# Try different numbers of clusters for DBSCAN
for eps in range(1, 11):
    print(f"Trying DBSCAN with eps={eps}...")
    dbscan_model = create_model('dbscan', eps=eps, verbose=False)
    labels = assign_model(dbscan_model)
    silhouette = silhouette_score(data, labels.iloc[:, 0])  # Extracting the cluster labels as a 1-dimensional array
    calinski_harabasz = calinski_harabasz_score(data, labels.iloc[:, 0])
    davies_bouldin = davies_bouldin_score(data, labels.iloc[:, 0])
    print(f"Silhouette Score: {silhouette}, Calinski-Harabasz Score: {calinski_harabasz}, Davies-Bouldin Score: {davies_bouldin}")
    
    # Update best scores and model if necessary
    if silhouette > best_silhouette_score:
        best_model = dbscan_model
        best_num_clusters = None  # DBSCAN does not use a fixed number of clusters
        best_silhouette_score = silhouette
        best_calinski_harabasz_score = calinski_harabasz
        best_davies_bouldin_score = davies_bouldin

# Print the best DBSCAN model and its scores
print("Best DBSCAN Model:")
print(best_model)
print("Best Number of Clusters:", best_num_clusters)
print("Best Silhouette Score:", best_silhouette_score)
print("Best Calinski-Harabasz Score:", best_calinski_harabasz_score)
print("Best Davies-Bouldin Score:", best_davies_bouldin_score)

Trying DBSCAN with eps=1...
Silhouette Score: -0.36903974723195737, Calinski-Harabasz Score: 1.3367510768156425, Davies-Bouldin Score: 15.209199963290507
Trying DBSCAN with eps=2...
Silhouette Score: -0.36903974723195737, Calinski-Harabasz Score: 1.3367510768156425, Davies-Bouldin Score: 15.209199963290507
Trying DBSCAN with eps=3...
Silhouette Score: -0.36903974723195737, Calinski-Harabasz Score: 1.3367510768156425, Davies-Bouldin Score: 15.209199963290507
Trying DBSCAN with eps=4...
Silhouette Score: -0.36903974723195737, Calinski-Harabasz Score: 1.3367510768156425, Davies-Bouldin Score: 15.209199963290507
Trying DBSCAN with eps=5...
Silhouette Score: -0.36903974723195737, Calinski-Harabasz Score: 1.3367510768156425, Davies-Bouldin Score: 15.209199963290507
Trying DBSCAN with eps=6...
Silhouette Score: -0.36903974723195737, Calinski-Harabasz Score: 1.3367510768156425, Davies-Bouldin Score: 15.209199963290507
Trying DBSCAN with eps=7...
Silhouette Score: -0.36903974723195737, Calinski