In [1]:
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import homogeneity_score, completeness_score, adjusted_rand_score
import pandas as pd

In [2]:
# Load the Iris dataset: the feature matrix, plus the ground-truth class
# labels that the clustering metrics are scored against in later cells.
iris = datasets.load_iris()
X, true_labels = iris.data, iris.target

# Standardize the features; the scaled matrix is the input for both the
# plain KMeans run and the PCA + KMeans runs.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Accumulates one metrics row (a dict) per evaluated configuration.
results = []

In [3]:
# Baseline: cluster the standardized features directly, with no PCA step.
kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
kmeans.fit(X_scaled)
labels = kmeans.labels_

# Score the cluster assignment against the ground-truth labels.
homogeneity_val = homogeneity_score(true_labels, labels)
completeness_val = completeness_score(true_labels, labels)
ari_score = adjusted_rand_score(true_labels, labels)

# Keep the cell idempotent: `results` is shared notebook state, so a plain
# append would add a duplicate baseline row every time this cell is re-run.
# Drop any baseline row left by a previous run, then put the fresh one first
# so the table order is the same no matter how often the cell is executed.
results[:] = [row for row in results if row['NumComponents'] != "without component"]
results.insert(0, {
    'NumComponents': "without component",
    'Homogeneity': homogeneity_val,
    'Completeness': completeness_val,
    'ARI': ari_score,
})

In [4]:
# Numbers of principal components to evaluate.
component_counts = range(1, 4)

# Keep the cell idempotent: `results` is shared notebook state, so remove the
# PCA rows appended by an earlier run of this cell before adding them again.
# Rows whose 'NumComponents' is not one of the swept counts are left untouched.
results[:] = [row for row in results if row['NumComponents'] not in component_counts]

for n in component_counts:
    # Project the standardized features onto the first n principal components.
    # random_state only matters for PCA's randomized/iterative solvers; it is
    # pinned so the projection stays reproducible whichever solver is chosen.
    pca = PCA(n_components=n, random_state=42)
    X_pca = pca.fit_transform(X_scaled)

    # Cluster the projected data with the KMeans settings used throughout.
    kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
    kmeans.fit(X_pca)
    labels = kmeans.labels_

    # Score the cluster assignment against the ground-truth labels.
    homogeneity_val = homogeneity_score(true_labels, labels)
    completeness_val = completeness_score(true_labels, labels)
    ari_score = adjusted_rand_score(true_labels, labels)

    results.append({
        'NumComponents': n,
        'Homogeneity': homogeneity_val,
        'Completeness': completeness_val,
        'ARI': ari_score,
    })

# One row per configuration; built once from the accumulated list of dicts.
results_df = pd.DataFrame(results)

print(results_df)

       NumComponents  Homogeneity  Completeness       ARI
0  without component     0.659127      0.659848  0.620135
1                  1     0.789914      0.790010  0.801550
2                  2     0.659127      0.659848  0.620135
3                  3     0.659127      0.659848  0.620135


In [5]:
# Locate the row with the highest ARI once, then read every reported value
# from that same row. This avoids chained indexing and avoids re-finding the
# row by equality on 'NumComponents', a mixed str/int column that is not
# guaranteed to be unique. When several rows tie, idxmax picks the first one.
best_idx = results_df['ARI'].idxmax()

max_ari = results_df.at[best_idx, 'ARI']
best_n_components_ari = results_df.at[best_idx, 'NumComponents']
homogeneity_for_best_ari = results_df.at[best_idx, 'Homogeneity']
completeness_for_best_ari = results_df.at[best_idx, 'Completeness']

print("Максимальное значение ARI:", max_ari)
print("Оптимальное значение n_components по ARI:", best_n_components_ari)
print("Homogeneity для оптимальной конфигурации:", homogeneity_for_best_ari)
print("Completeness для оптимальной конфигурации:", completeness_for_best_ari)

Максимальное значение ARI: 0.8015498244339508
Оптимальное значение n_components по ARI: 1
Homogeneity для оптимальной конфигурации: 0.7899143618189679
Completeness для оптимальной конфигурации: 0.7900102479875966
