In [1]:
import matplotlib.pyplot as plt
import optuna
from sklearn.cluster import KMeans
from sklearn.datasets import load_digits
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Fetch the digits dataset and standardize its feature matrix.
# `data` and `scaler` are kept as module-level names so later cells can reuse them.
data = load_digits()
X = data["data"]
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [3]:
def objective(trial):
    """Optuna objective: silhouette score of a KMeans fit on ``X_scaled``.

    Samples the KMeans hyperparameters from ``trial``:

    * ``n_clusters`` -- integer in [2, 10]
    * ``init``       -- ``'k-means++'`` or ``'random'``
    * ``max_iter``   -- integer in [100, 500]
    * ``tol``        -- float in [1e-5, 1e-1], sampled on a log scale

    Returns the silhouette score of the resulting cluster assignment
    (higher is better), which the study maximizes.
    """
    # Dict entries are evaluated top to bottom, so the suggest_* calls run in
    # the same order as the parameter names appear in the trial.
    searched = {
        'n_clusters': trial.suggest_int('n_clusters', 2, 10),
        'init': trial.suggest_categorical('init', ['k-means++', 'random']),
        'max_iter': trial.suggest_int('max_iter', 100, 500),
        'tol': trial.suggest_float('tol', 1e-5, 1e-1, log=True),
    }

    # Fixed seed and restart count keep each trial's clustering deterministic.
    model = KMeans(random_state=42, n_init=10, **searched)
    assignments = model.fit_predict(X_scaled)

    return silhouette_score(X_scaled, assignments)

In [4]:
# Create Optuna study to maximize silhouette score.
# The sampler is seeded explicitly so that Restart & Run All explores the same
# sequence of trials and yields the same best parameters; KMeans and PCA in
# this notebook are already pinned to seed 42, the search itself was not.
sampler = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(direction='maximize', sampler=sampler)
study.optimize(objective, n_trials=50)

# Best trial and its hyperparameters; `best_params` is reused by the
# visualization cell to refit KMeans.
best_trial = study.best_trial

best_params = best_trial.params

print(best_params)


[I 2025-07-15 21:50:01,629] A new study created in memory with name: no-name-2eeb2a7d-b4f4-478d-adab-071d023ad10a
[I 2025-07-15 21:50:01,968] Trial 0 finished with value: 0.12238683836395757 and parameters: {'n_clusters': 8, 'init': 'random', 'max_iter': 465, 'tol': 0.06740084895298537}. Best is trial 0 with value: 0.12238683836395757.
[I 2025-07-15 21:50:02,081] Trial 1 finished with value: 0.10229832879268379 and parameters: {'n_clusters': 5, 'init': 'k-means++', 'max_iter': 180, 'tol': 2.139731766349455e-05}. Best is trial 0 with value: 0.12238683836395757.
[I 2025-07-15 21:50:02,245] Trial 2 finished with value: 0.13598453312825654 and parameters: {'n_clusters': 9, 'init': 'k-means++', 'max_iter': 335, 'tol': 0.00013058451710958984}. Best is trial 2 with value: 0.13598453312825654.
[I 2025-07-15 21:50:02,362] Trial 3 finished with value: 0.11257715108993595 and parameters: {'n_clusters': 7, 'init': 'random', 'max_iter': 493, 'tol': 0.006753655215994728}. Best is trial 2 with value:

{'n_clusters': 10, 'init': 'k-means++', 'max_iter': 278, 'tol': 0.0002052282313219989}


In [None]:
# Refit KMeans with the best hyperparameters found by the Optuna study.
kmeans_best = KMeans(
    n_clusters=best_params['n_clusters'],
    init=best_params['init'],
    max_iter=best_params['max_iter'],
    tol=best_params['tol'],
    random_state=42,
    n_init=10,
)
labels_pred = kmeans_best.fit_predict(X_scaled)

# Ground-truth digit classes from the loaded dataset; used only to colour the
# reference panel, never for fitting.
y_true = data.target

# Reduce dimensionality for visualization
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X_scaled)

# Two side-by-side panels over the same 2-D projection:
# (point colours, panel title, legend title)
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
panels = [
    (y_true, "True Digit Labels", "Digits"),
    (labels_pred, f"KMeans Clusters (k={best_params['n_clusters']})", "Clusters"),
]
for ax, (point_colors, panel_title, legend_title) in zip(axes, panels):
    scatter = ax.scatter(X_pca[:, 0], X_pca[:, 1], c=point_colors, cmap='tab10', s=15)
    ax.set_title(panel_title)
    ax.set_xlabel("PCA Component 1")
    ax.set_ylabel("PCA Component 2")
    ax.legend(*scatter.legend_elements(), title=legend_title, bbox_to_anchor=(1, 1))
    ax.grid(True)

fig.tight_layout()
plt.show()