In [11]:
from pycaret.datasets import get_data
from pycaret.clustering import *
from sklearn.metrics import silhouette_score

# Load data
data = get_data('diabetes')

# Setup the environment
exp_clustering = setup(data, normalize=True, pca=True, pca_method='linear', pca_components=2, verbose=True)

best_model = None
best_num_clusters = None
best_silhouette_score = float('-inf')

# Try different numbers of clusters for KMeans
for num_clusters in range(2, 11):
    print(f"Trying KMeans with {num_clusters} clusters...")
    kmeans_model = create_model('kmeans', num_clusters=num_clusters, verbose=False)
    labels = assign_model(kmeans_model)
    silhouette = silhouette_score(data, labels.iloc[:, 0])  # Extracting the cluster labels as a 1-dimensional array
    print(f"Silhouette Score: {silhouette}")
    
    if silhouette > best_silhouette_score:
        best_model = kmeans_model
        best_num_clusters = num_clusters
        best_silhouette_score = silhouette

print(f"Best KMeans model has {best_num_clusters} clusters with Silhouette Score: {best_silhouette_score}")

# Print the overall best model and its parameters
print("Overall Best Model:")
print(best_model)
print("Best Number of Clusters:", best_num_clusters)
print("Best Silhouette Score:", best_silhouette_score)


Unnamed: 0,Number of times pregnant,Plasma glucose concentration a 2 hours in an oral glucose tolerance test,Diastolic blood pressure (mm Hg),Triceps skin fold thickness (mm),2-Hour serum insulin (mu U/ml),Body mass index (weight in kg/(height in m)^2),Diabetes pedigree function,Age (years),Class variable
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


Unnamed: 0,Description,Value
0,Session id,2302
1,Original data shape,"(768, 9)"
2,Transformed data shape,"(768, 2)"
3,Numeric features,9
4,Preprocess,True
5,Imputation type,simple
6,Numeric imputation,mean
7,Categorical imputation,mode
8,Normalize,True
9,Normalize method,zscore


Trying KMeans with 2 clusters...
Silhouette Score: -0.36903974723195737
Trying KMeans with 3 clusters...
Silhouette Score: -0.36903974723195737
Trying KMeans with 4 clusters...
Silhouette Score: -0.36903974723195737
Trying KMeans with 5 clusters...
Silhouette Score: -0.36903974723195737
Trying KMeans with 6 clusters...
Silhouette Score: -0.36903974723195737
Trying KMeans with 7 clusters...
Silhouette Score: -0.36903974723195737
Trying KMeans with 8 clusters...
Silhouette Score: -0.36903974723195737
Trying KMeans with 9 clusters...
Silhouette Score: -0.36903974723195737
Trying KMeans with 10 clusters...
Silhouette Score: -0.36903974723195737
Best KMeans model has 2 clusters with Silhouette Score: -0.36903974723195737
Overall Best Model:
KMeans(n_clusters=2, random_state=2302)
Best Number of Clusters: 2
Best Silhouette Score: -0.36903974723195737
