In [90]:
import json
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
from sklearn.decomposition import PCA
from sklearn.metrics import adjusted_rand_score
from sklearn.neighbors import NearestNeighbors

def _load_dataset(csv_path):
    """Read one CSV and return (full frame, sample_id column, features without sample_id)."""
    frame = pd.read_csv(csv_path)
    return frame, frame["sample_id"], frame.drop("sample_id", axis=1)


def _report_dataset(title, features):
    """Print the structure, summary statistics and per-column null counts of a frame."""
    print(title)
    # DataFrame.info() writes its report to stdout itself and returns None, so
    # wrapping it in print() only adds a stray "None" line to the output.
    features.info()
    print(features.describe())
    print("\nПропущенные значения:\n", features.isnull().sum())


df1, sample_ids1, X1 = _load_dataset("data/S07-hw-dataset-01.csv")
df2, sample_ids2, X2 = _load_dataset("data/S07-hw-dataset-02.csv")
df3, sample_ids3, X3 = _load_dataset("data/S07-hw-dataset-03.csv")

_report_dataset("Первый датасет:", X1)
_report_dataset("\nВторой датасет:", X2)
_report_dataset("\nТретий датасет:", X3)

Первый датасет:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 8 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   f01     12000 non-null  float64
 1   f02     12000 non-null  float64
 2   f03     12000 non-null  float64
 3   f04     12000 non-null  float64
 4   f05     12000 non-null  float64
 5   f06     12000 non-null  float64
 6   f07     12000 non-null  float64
 7   f08     12000 non-null  float64
dtypes: float64(8)
memory usage: 750.1 KB
None
                f01           f02           f03           f04           f05  \
count  12000.000000  12000.000000  12000.000000  12000.000000  12000.000000   
mean      -2.424716     19.107804     -0.222063     -8.284501     -0.190717   
std       11.014315     60.790338      0.500630     59.269838      7.026435   
min      -19.912573    -92.892652     -1.590979   -134.303679    -11.869169   
25%       -9.472623    -40.282955     -0.125145    -48.345007   

In [91]:
# Each dataset gets its own StandardScaler so the fitted statistics of one
# dataset never leak into another; the scalers stay available for later reuse.
scaler1, scaler2, scaler3 = StandardScaler(), StandardScaler(), StandardScaler()

X1_processed = scaler1.fit_transform(X1)
X2_processed = scaler2.fit_transform(X2)
X3_processed = scaler3.fit_transform(X3)

print("Препроцессинг завершен для всех датасетов")

Препроцессинг завершен для всех датасетов


In [92]:
def select_kmeans(X, dataset_name, max_k=15):
    """Sweep KMeans over k = 2..max_k and select k by the best silhouette score.

    Every k is fitted once (random_state=42, n_init=10) and scored with the
    silhouette, Davies-Bouldin and Calinski-Harabasz metrics. The silhouette
    and Davies-Bouldin curves are saved as PNG files under ``artifacts/figures``.

    Args:
        X: Feature matrix passed unchanged to KMeans and the metric functions.
        dataset_name: Label used in plot titles, log lines and file names.
        max_k: Largest number of clusters to try (inclusive).

    Returns:
        Tuple ``(best_k, best_labels, silhouette, davies_bouldin,
        calinski_harabasz, best_model)`` for the silhouette-optimal k.

    Raises:
        ValueError: If ``max_k`` is below 2, so there is no k to evaluate.
    """
    if max_k < 2:
        raise ValueError("max_k must be at least 2")

    # The figures are written from inside this function, so the folder must
    # exist here and not only in a later notebook cell.
    figures_dir = "artifacts/figures"
    os.makedirs(figures_dir, exist_ok=True)
    file_prefix = dataset_name.lower().replace(' ', '_')

    k_range = range(2, max_k + 1)
    silhouette_scores = []
    db_scores = []
    ch_scores = []
    fitted = {}  # k -> (model, labels); lets the winner be reused without refitting

    for k in k_range:
        kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
        labels = kmeans.fit_predict(X)
        silhouette_scores.append(silhouette_score(X, labels))
        db_scores.append(davies_bouldin_score(X, labels))
        ch_scores.append(calinski_harabasz_score(X, labels))
        fitted[k] = (kmeans, labels)

    def _save_curve(values, line_style, y_label, title, file_suffix):
        """Plot one metric against k and save it next to the other figures."""
        fig, ax = plt.subplots(figsize=(10, 6))
        ax.plot(k_range, values, line_style)
        ax.set_xlabel('Number of clusters (k)')
        ax.set_ylabel(y_label)
        ax.set_title(title)
        ax.grid(True)
        fig.savefig(f"{figures_dir}/{file_prefix}_{file_suffix}.png")
        plt.close(fig)

    _save_curve(silhouette_scores, 'bo-', 'Silhouette Score',
                f'{dataset_name}: Silhouette Score vs k', 'silhouette_vs_k')
    _save_curve(db_scores, 'ro-', 'Davies-Bouldin Score',
                f'{dataset_name}: Davies-Bouldin Score vs k (lower is better)', 'db_vs_k')

    best_idx = int(np.argmax(silhouette_scores))
    best_k_silhouette = k_range[best_idx]
    best_k_db = k_range[int(np.argmin(db_scores))]
    best_k_ch = k_range[int(np.argmax(ch_scores))]

    print(f"\n{dataset_name} - KMeans results:")
    print(f"Лучший K по Silhouette: {best_k_silhouette} (score: {max(silhouette_scores):.3f})")
    print(f"Лучший K по Davies-Bouldin: {best_k_db} (score: {min(db_scores):.3f})")
    print(f"Лучший K по Calinski-Harabasz: {best_k_ch} (score: {max(ch_scores):.3f})")

    # Silhouette is the selection criterion. The fit for this k is deterministic
    # (fixed random_state), so the sweep's model and scores are reused as-is.
    best_k = best_k_silhouette
    best_model, best_labels = fitted[best_k]
    final_silhouette = silhouette_scores[best_idx]
    final_db = db_scores[best_idx]
    final_ch = ch_scores[best_idx]

    print(f"Выбранный K = {best_k} с метриками:")
    print(f"  Silhouette: {final_silhouette:.3f}")
    print(f"  Davies-Bouldin: {final_db:.3f}")
    print(f"  Calinski-Harabasz: {final_ch:.3f}")

    return best_k, best_labels, final_silhouette, final_db, final_ch, best_model

In [93]:
# KMeans sweep on each scaled dataset; the unpacked names feed every later cell.
k1, labels1_kmeans, sil1, db1, ch1, model1_kmeans = select_kmeans(
    X=X1_processed, dataset_name="Dataset-01"
)
k2, labels2_kmeans, sil2, db2, ch2, model2_kmeans = select_kmeans(
    X=X2_processed, dataset_name="Dataset-02"
)
k3, labels3_kmeans, sil3, db3, ch3, model3_kmeans = select_kmeans(
    X=X3_processed, dataset_name="Dataset-03"
)


Dataset-01 - KMeans results:
Лучший K по Silhouette: 2 (score: 0.522)
Лучший K по Davies-Bouldin: 2 (score: 0.685)
Лучший K по Calinski-Harabasz: 2 (score: 11786.955)
Выбранный K = 2 с метриками:
  Silhouette: 0.522
  Davies-Bouldin: 0.685
  Calinski-Harabasz: 11786.955

Dataset-02 - KMeans results:
Лучший K по Silhouette: 2 (score: 0.307)
Лучший K по Davies-Bouldin: 15 (score: 0.966)
Лучший K по Calinski-Harabasz: 2 (score: 3573.393)
Выбранный K = 2 с метриками:
  Silhouette: 0.307
  Davies-Bouldin: 1.323
  Calinski-Harabasz: 3573.393

Dataset-03 - KMeans results:
Лучший K по Silhouette: 3 (score: 0.316)
Лучший K по Davies-Bouldin: 15 (score: 1.093)
Лучший K по Calinski-Harabasz: 2 (score: 7004.796)
Выбранный K = 3 с метриками:
  Silhouette: 0.316
  Davies-Bouldin: 1.158
  Calinski-Harabasz: 6957.163


In [94]:
def select_agglomerative(X, dataset_name, best_k):
    """Compare agglomerative linkages at a fixed cluster count.

    Fits AgglomerativeClustering with the ward, complete, average and single
    linkages, scores each with silhouette, Davies-Bouldin and
    Calinski-Harabasz, saves a grouped bar chart under ``artifacts/figures``
    and picks the linkage with the highest silhouette.

    Args:
        X: Feature matrix passed to the clustering model and the metrics.
        dataset_name: Label used in plot titles, log lines and file names.
        best_k: Number of clusters requested from every linkage.

    Returns:
        Tuple ``(best_linkage, best_result, all_results)`` where each result is
        a dict with keys ``silhouette``, ``davies_bouldin``,
        ``calinski_harabasz`` and ``labels``; ``all_results`` maps linkage
        name to its result dict.
    """
    # The chart is written from inside this function, so the folder must exist here.
    os.makedirs("artifacts/figures", exist_ok=True)

    linkages = ['ward', 'complete', 'average', 'single']
    results = {}

    for linkage in linkages:
        if linkage == 'ward':
            model = AgglomerativeClustering(n_clusters=best_k, linkage=linkage)
        else:
            model = AgglomerativeClustering(n_clusters=best_k, linkage=linkage, metric='euclidean')

        labels = model.fit_predict(X)
        sil = silhouette_score(X, labels)
        db = davies_bouldin_score(X, labels)
        ch = calinski_harabasz_score(X, labels)

        results[linkage] = {
            'silhouette': sil,
            'davies_bouldin': db,
            'calinski_harabasz': ch,
            'labels': labels.copy()
        }

        print(f"\n{dataset_name} - Agglomerative ({linkage}):")
        print(f"  Silhouette: {sil:.3f}")
        print(f"  Davies-Bouldin: {db:.3f}")
        print(f"  Calinski-Harabasz: {ch:.3f}")

    sil_scores = [results[name]['silhouette'] for name in linkages]
    db_scores = [results[name]['davies_bouldin'] for name in linkages]

    positions = np.arange(len(linkages))
    width = 0.35

    # Only one figure is created; it is closed explicitly after saving so no
    # empty figure is left behind for the notebook to render.
    fig, ax = plt.subplots(figsize=(12, 6))
    rects1 = ax.bar(positions - width/2, sil_scores, width, label='Silhouette (higher better)')
    # Davies-Bouldin is "lower is better", so it is drawn negated to share the
    # "higher is better" direction with silhouette.
    rects2 = ax.bar(positions + width/2, [-score for score in db_scores], width,
                    label='Davies-Bouldin (inverted, higher better)')

    ax.set_ylabel('Scores')
    ax.set_title(f'{dataset_name}: Agglomerative Clustering - linkage comparison')
    ax.set_xticks(positions)
    ax.set_xticklabels(linkages)
    ax.legend()

    def autolabel(rects, values):
        """Write each value just beyond the free end of its bar."""
        for rect, val in zip(rects, values):
            height = rect.get_height()
            # Negative bars grow downward: put the label below the tip rather
            # than inside the bar.
            points_down = height < 0
            ax.annotate(f'{val:.3f}',
                        xy=(rect.get_x() + rect.get_width() / 2, height),
                        xytext=(0, -3 if points_down else 3),
                        textcoords="offset points",
                        ha='center', va='top' if points_down else 'bottom')

    autolabel(rects1, sil_scores)
    autolabel(rects2, db_scores)

    fig.tight_layout()
    fig.savefig(f"artifacts/figures/{dataset_name.lower().replace(' ', '_')}_agglomerative_linkage.png")
    plt.close(fig)

    # max() returns the first maximum, so ties resolve in the order of `linkages`.
    best_linkage = max(linkages, key=lambda name: results[name]['silhouette'])
    print(f"\n{dataset_name} - Best linkage: {best_linkage}")
    print(f"Metrics: Silhouette={results[best_linkage]['silhouette']:.3f}, "
          f"Davies-Bouldin={results[best_linkage]['davies_bouldin']:.3f}")

    return best_linkage, results[best_linkage], results

In [95]:
# Linkage comparison for Dataset-01, using the cluster count chosen by the KMeans sweep.
best_linkage_ds1, best_agg_ds1, all_agg_ds1 = select_agglomerative(
    X=X1_processed,
    dataset_name="Dataset-01",
    best_k=k1,
)


Dataset-01 - Agglomerative (ward):
  Silhouette: 0.522
  Davies-Bouldin: 0.685
  Calinski-Harabasz: 11786.955

Dataset-01 - Agglomerative (complete):
  Silhouette: 0.522
  Davies-Bouldin: 0.685
  Calinski-Harabasz: 11786.955

Dataset-01 - Agglomerative (average):
  Silhouette: 0.522
  Davies-Bouldin: 0.685
  Calinski-Harabasz: 11786.955

Dataset-01 - Agglomerative (single):
  Silhouette: 0.522
  Davies-Bouldin: 0.685
  Calinski-Harabasz: 11786.955

Dataset-01 - Best linkage: ward
Metrics: Silhouette=0.522, Davies-Bouldin=0.685


<Figure size 1200x800 with 0 Axes>

In [96]:
def select_dbscan(X, dataset_name, eps_values=None, min_samples_values=None):
    """Grid-search DBSCAN parameters and keep the best silhouette on non-noise points.

    Saves a k-distance plot under ``artifacts/figures``, then fits DBSCAN for
    every (eps, min_samples) pair. Configurations with fewer than two clusters
    are skipped. The rest are scored on the non-noise points only.

    Args:
        X: Feature matrix; it is indexed with a boolean mask, so it is expected
            to be a NumPy array.
        dataset_name: Label used in plot titles, log lines and file names.
        eps_values: Candidate ``eps`` values; defaults to
            ``[0.3, 0.4, 0.5, 0.6, 0.7]``.
        min_samples_values: Candidate ``min_samples`` values; defaults to
            ``[5, 10, 15]``.

    Returns:
        Tuple ``(best_params, best_result)`` where ``best_params`` is
        ``(eps, min_samples)`` and ``best_result`` is a dict with keys
        ``silhouette``, ``davies_bouldin``, ``calinski_harabasz``,
        ``noise_ratio``, ``n_clusters`` and ``labels``. Returns
        ``(None, None)`` when no configuration could be scored.
    """
    if eps_values is None:
        eps_values = [0.3, 0.4, 0.5, 0.6, 0.7]
    if min_samples_values is None:
        min_samples_values = [5, 10, 15]

    # The plot is written from inside this function, so the folder must exist here.
    os.makedirs("artifacts/figures", exist_ok=True)

    nn = NearestNeighbors(n_neighbors=5)
    nn.fit(X)
    # NOTE(review): X is passed explicitly, so per the scikit-learn docs each
    # point is returned as its own first neighbour (distance 0). The last
    # column is therefore the 5th point counting the point itself - confirm
    # this is the intended "5th nearest neighbor".
    distances, _ = nn.kneighbors(X)
    k_distances = distances[:, -1]
    k_distances_sorted = np.sort(k_distances)[::-1]

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(range(len(k_distances_sorted)), k_distances_sorted, 'b-')
    ax.set_xlabel('Points sorted by distance to 5th nearest neighbor')
    ax.set_ylabel('Distance to 5th nearest neighbor')
    ax.set_title(f'{dataset_name}: k-distance graph for DBSCAN parameter selection')
    ax.grid(True)
    fig.savefig(f"artifacts/figures/{dataset_name.lower().replace(' ', '_')}_dbscan_kdist.png")
    plt.close(fig)

    results = {}
    # -inf rather than -1: silhouette lies in [-1, 1], so even a worst-case
    # score must still be selectable when it is the only candidate.
    best_score = -np.inf
    best_params = None

    for eps in eps_values:
        for min_samples in min_samples_values:
            labels = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(X)

            n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
            if n_clusters <= 1:
                continue

            non_noise_mask = labels != -1
            n_non_noise = int(np.sum(non_noise_mask))
            # The silhouette score needs fewer labels than scored samples.
            if n_non_noise <= n_clusters:
                continue

            noise_ratio = np.sum(labels == -1) / len(labels)
            X_core, labels_core = X[non_noise_mask], labels[non_noise_mask]
            sil = silhouette_score(X_core, labels_core)
            db = davies_bouldin_score(X_core, labels_core)
            ch = calinski_harabasz_score(X_core, labels_core)

            results[(eps, min_samples)] = {
                'silhouette': sil,
                'davies_bouldin': db,
                'calinski_harabasz': ch,
                'noise_ratio': noise_ratio,
                'n_clusters': n_clusters,
                'labels': labels.copy()
            }

            print(f"{dataset_name} - DBSCAN(eps={eps}, min_samples={min_samples}):")
            print(f"  Clusters: {n_clusters}, Noise ratio: {noise_ratio:.2%}")
            print(f"  Silhouette (non-noise): {sil:.3f}, DB: {db:.3f}, CH: {ch:.3f}")

            if sil > best_score:
                best_score = sil
                best_params = (eps, min_samples)

    if best_params is None:
        # Say why nothing is returned instead of failing silently.
        print(f"\n{dataset_name} - DBSCAN: no parameter combination produced at least two clusters")
        return None, None

    best = results[best_params]
    print(f"\n{dataset_name} - лучшие DBSCAN параметры: eps={best_params[0]}, min_samples={best_params[1]}")
    print(f"Метрики:")
    print(f"  Silhouette: {best['silhouette']:.3f}")
    print(f"  Davies-Bouldin: {best['davies_bouldin']:.3f}")
    print(f"  Calinski-Harabasz: {best['calinski_harabasz']:.3f}")
    print(f"  Noise ratio: {best['noise_ratio']:.2%}")
    print(f"  Number of clusters: {best['n_clusters']}")

    return best_params, best

In [97]:
# DBSCAN grid search for Dataset-02; both names are None when nothing could be scored.
best_params_ds2, best_dbscan_ds2 = select_dbscan(
    X=X2_processed,
    dataset_name="Dataset-02",
)

Dataset-02 - DBSCAN(eps=0.3, min_samples=5):
  Clusters: 5, Noise ratio: 7.24%
  Silhouette (non-noise): 0.085, DB: 0.590, CH: 21.952
Dataset-02 - DBSCAN(eps=0.4, min_samples=5):
  Clusters: 8, Noise ratio: 6.19%
  Silhouette (non-noise): 0.135, DB: 0.604, CH: 31.396
Dataset-02 - DBSCAN(eps=0.5, min_samples=5):
  Clusters: 14, Noise ratio: 4.50%
  Silhouette (non-noise): -0.058, DB: 0.790, CH: 49.822
Dataset-02 - DBSCAN(eps=0.5, min_samples=10):
  Clusters: 2, Noise ratio: 6.36%
  Silhouette (non-noise): 0.251, DB: 0.742, CH: 26.169
Dataset-02 - DBSCAN(eps=0.6, min_samples=5):
  Clusters: 12, Noise ratio: 2.26%
  Silhouette (non-noise): 0.138, DB: 0.902, CH: 69.332
Dataset-02 - DBSCAN(eps=0.6, min_samples=10):
  Clusters: 4, Noise ratio: 5.46%
  Silhouette (non-noise): 0.248, DB: 0.661, CH: 70.877
Dataset-02 - DBSCAN(eps=0.7, min_samples=10):
  Clusters: 5, Noise ratio: 3.64%
  Silhouette (non-noise): 0.305, DB: 0.601, CH: 114.429
Dataset-02 - DBSCAN(eps=0.7, min_samples=15):
  Cluster

In [98]:
# DBSCAN grid search for Dataset-03; both names are None when nothing could be scored.
best_params_ds3, best_dbscan_ds3 = select_dbscan(
    X=X3_processed,
    dataset_name="Dataset-03",
)

Dataset-03 - DBSCAN(eps=0.3, min_samples=5):
  Clusters: 22, Noise ratio: 12.53%
  Silhouette (non-noise): -0.236, DB: 0.930, CH: 323.431
Dataset-03 - DBSCAN(eps=0.3, min_samples=10):
  Clusters: 8, Noise ratio: 21.57%
  Silhouette (non-noise): 0.038, DB: 1.232, CH: 1788.419
Dataset-03 - DBSCAN(eps=0.3, min_samples=15):
  Clusters: 7, Noise ratio: 29.93%
  Silhouette (non-noise): 0.269, DB: 1.010, CH: 3927.566
Dataset-03 - DBSCAN(eps=0.4, min_samples=5):
  Clusters: 6, Noise ratio: 4.95%
  Silhouette (non-noise): -0.040, DB: 1.010, CH: 1256.322
Dataset-03 - DBSCAN(eps=0.4, min_samples=10):
  Clusters: 4, Noise ratio: 7.51%
  Silhouette (non-noise): 0.120, DB: 1.020, CH: 2125.743
Dataset-03 - DBSCAN(eps=0.4, min_samples=15):
  Clusters: 4, Noise ratio: 10.27%
  Silhouette (non-noise): 0.224, DB: 1.276, CH: 3902.274
Dataset-03 - DBSCAN(eps=0.5, min_samples=5):
  Clusters: 4, Noise ratio: 2.22%
  Silhouette (non-noise): -0.104, DB: 0.842, CH: 12.756
Dataset-03 - DBSCAN(eps=0.5, min_sample

In [99]:
def plot_pca_clusters(X, labels, dataset_name, algorithm_name, noise_ratio=None):
    """Project the data onto two principal components and save a cluster scatter plot.

    Args:
        X: Feature matrix to project with PCA (n_components=2, random_state=42).
        labels: Cluster label per row of ``X``; converted to strings so seaborn
            colours them as categories.
        dataset_name: Label used in the title, the log line and the file name.
        algorithm_name: Algorithm description used in the title and file name.
        noise_ratio: Optional noise fraction appended to the title.

    Returns:
        The fitted ``PCA`` instance.
    """
    # The figure is written from inside this function, so the folder must exist here.
    os.makedirs("artifacts/figures", exist_ok=True)

    pca = PCA(n_components=2, random_state=42)
    X_pca = pca.fit_transform(X)

    df_pca = pd.DataFrame({
        'PC1': X_pca[:, 0],
        'PC2': X_pca[:, 1],
        # np.asarray lets plain lists or Series be passed as well as arrays.
        'Cluster': np.asarray(labels).astype(str)
    })

    if noise_ratio is not None:
        title = f'{dataset_name}: {algorithm_name} (Noise ratio: {noise_ratio:.1%})'
    else:
        title = f'{dataset_name}: {algorithm_name}'

    fig, ax = plt.subplots(figsize=(12, 8))
    sns.scatterplot(data=df_pca, x='PC1', y='PC2', hue='Cluster',
                    palette='viridis', alpha=0.8, s=50, ax=ax)

    ax.set_title(title)
    ax.set_xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.1%} variance)')
    ax.set_ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.1%} variance)')
    ax.grid(True)

    filename = f"{dataset_name.lower().replace(' ', '_')}_{algorithm_name.lower().replace(' ', '_')}_pca.png"
    fig.savefig(f"artifacts/figures/{filename}")
    plt.close(fig)

    print(f"PCA визуализация сохранена для {dataset_name} с {algorithm_name}")
    return pca

In [100]:
# plot_pca_clusters returns the fitted PCA. Binding the result keeps the
# notebook from echoing the estimator repr as the cell's output.
pca_ds1_kmeans = plot_pca_clusters(X1_processed, labels1_kmeans, "Dataset-01", f"KMeans (k={k1})")
pca_ds1_agglomerative = plot_pca_clusters(
    X1_processed,
    all_agg_ds1[best_linkage_ds1]['labels'],
    "Dataset-01",
    f"Agglomerative ({best_linkage_ds1})",
)

PCA визуализация сохранена для Dataset-01 с KMeans (k=2)
PCA визуализация сохранена для Dataset-01 с Agglomerative (ward)


0,1,2
,n_components,2
,copy,True
,whiten,False
,svd_solver,'auto'
,tol,0.0
,iterated_power,'auto'
,n_oversamples,10
,power_iteration_normalizer,'auto'
,random_state,42


In [101]:
# PCA views for Dataset-02: KMeans always, DBSCAN only when the grid search found a result.
plot_pca_clusters(
    X=X2_processed,
    labels=labels2_kmeans,
    dataset_name="Dataset-02",
    algorithm_name=f"KMeans (k={k2})",
)

if best_dbscan_ds2 is not None:
    plot_pca_clusters(
        X=X2_processed,
        labels=best_dbscan_ds2['labels'],
        dataset_name="Dataset-02",
        algorithm_name=f"DBSCAN (eps={best_params_ds2[0]}, ms={best_params_ds2[1]})",
        noise_ratio=best_dbscan_ds2['noise_ratio'],
    )

PCA визуализация сохранена для Dataset-02 с KMeans (k=2)
PCA визуализация сохранена для Dataset-02 с DBSCAN (eps=0.7, ms=15)


In [102]:
# PCA views for Dataset-03: KMeans always, DBSCAN only when the grid search found a result.
plot_pca_clusters(
    X=X3_processed,
    labels=labels3_kmeans,
    dataset_name="Dataset-03",
    algorithm_name=f"KMeans (k={k3})",
)

if best_dbscan_ds3 is not None:
    plot_pca_clusters(
        X=X3_processed,
        labels=best_dbscan_ds3['labels'],
        dataset_name="Dataset-03",
        algorithm_name=f"DBSCAN (eps={best_params_ds3[0]}, ms={best_params_ds3[1]})",
        noise_ratio=best_dbscan_ds3['noise_ratio'],
    )

PCA визуализация сохранена для Dataset-03 с KMeans (k=3)
PCA визуализация сохранена для Dataset-03 с DBSCAN (eps=0.3, ms=15)


In [103]:
def select_stability_kmeans(X, n_clusters, n_runs=5, dataset_name="Dataset-01"):
    """Measure how stable KMeans is across random seeds.

    Fits KMeans ``n_runs`` times with ``random_state = 0..n_runs-1``, computes
    the pairwise adjusted Rand index (ARI) between the label vectors, and saves
    an ARI heatmap and an inertia-per-run plot under ``artifacts/figures``.

    Args:
        X: Feature matrix passed to KMeans.
        n_clusters: Number of clusters for every run.
        n_runs: Number of differently seeded runs.
        dataset_name: Label used in titles, log lines and file names. The
            default reproduces the previously hard-coded names.

    Returns:
        Tuple ``(all_labels, ari_matrix, inertias)``: the label vectors per
        run, the symmetric ``n_runs x n_runs`` ARI matrix and the inertia of
        each run.
    """
    # The figures are written from inside this function, so the folder must exist here.
    os.makedirs("artifacts/figures", exist_ok=True)
    file_prefix = dataset_name.lower().replace(' ', '_')

    all_labels = []
    inertias = []
    for seed in range(n_runs):
        model = KMeans(n_clusters=n_clusters, random_state=seed, n_init=10)
        all_labels.append(model.fit_predict(X))
        inertias.append(model.inertia_)

    # A partition compared with itself has ARI 1, so the diagonal starts at 1.
    # A zero diagonal would read as "no agreement" on the heatmap.
    ari_matrix = np.eye(n_runs)
    for i in range(n_runs):
        for j in range(i + 1, n_runs):
            ari = adjusted_rand_score(all_labels[i], all_labels[j])
            ari_matrix[i, j] = ari
            ari_matrix[j, i] = ari

    run_names = [f'Run {i}' for i in range(n_runs)]
    fig, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(ari_matrix, annot=True, fmt=".3f", cmap='viridis',
                xticklabels=run_names, yticklabels=run_names, ax=ax)
    ax.set_title(f'{dataset_name}: KMeans Stability (ARI between runs, k={n_clusters})')
    fig.tight_layout()
    fig.savefig(f"artifacts/figures/{file_prefix}_kmeans_stability_ari.png")
    plt.close(fig)

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(range(n_runs), inertias, 'bo-')
    ax.set_xlabel('Run number (random_state)')
    ax.set_ylabel('Inertia')
    ax.set_title(f'{dataset_name}: KMeans Inertia across runs (k={n_clusters})')
    ax.grid(True)
    fig.savefig(f"artifacts/figures/{file_prefix}_kmeans_stability_inertia.png")
    plt.close(fig)

    # Only distinct pairs count; with fewer than two runs there are none, so
    # report NaN explicitly instead of averaging an empty array.
    pairwise_ari = ari_matrix[np.triu_indices(n_runs, k=1)]
    mean_ari = float(np.mean(pairwise_ari)) if pairwise_ari.size else float('nan')
    print(f"\n{dataset_name} - K стабильные (k={n_clusters}):")
    print(f"Среднее ARI: {mean_ari:.3f}")
    print(f"Инерция: {[f'{x:.1f}' for x in inertias]}")
    print(f"Средняя инерция: {np.mean(inertias):.1f}, Std: {np.std(inertias):.1f}")

    return all_labels, ari_matrix, inertias

In [104]:
# Seed-stability check for Dataset-01 at the k selected by the KMeans sweep.
all_labels_ds1, ari_matrix_ds1, inertias_ds1 = select_stability_kmeans(
    X=X1_processed,
    n_clusters=k1,
)


Dataset-01 - K стабильные (k=2):
Среднее ARI: 1.000
Инерция: ['48425.9', '48425.9', '48425.9', '48425.9', '48425.9']
Средняя инерция: 48425.9, Std: 0.0


In [105]:
# Side-by-side internal metrics for the two Dataset-01 candidates.
print("\nDataset-01 - Сравнение моделей:")

kmeans_line_ds1 = f"KMeans (k={k1}): Silhouette={sil1:.3f}, DB={db1:.3f}, CH={ch1:.3f}"
print(kmeans_line_ds1)

agglomerative_line_ds1 = (
    f"Agglomerative ({best_linkage_ds1}, k={k1}): "
    f"Silhouette={best_agg_ds1['silhouette']:.3f}, "
    f"DB={best_agg_ds1['davies_bouldin']:.3f}, "
    f"CH={best_agg_ds1['calinski_harabasz']:.3f}"
)
print(agglomerative_line_ds1)


Dataset-01 - Сравнение моделей:
KMeans (k=2): Silhouette=0.522, DB=0.685, CH=11786.955
Agglomerative (ward, k=2): Silhouette=0.522, DB=0.685, CH=11786.955


In [106]:
# Agglomerative wins only on a strictly higher silhouette; a tie keeps KMeans.
agglomerative_wins_ds1 = best_agg_ds1['silhouette'] > sil1

if not agglomerative_wins_ds1:
    best_model_ds1 = "KMeans"
    best_labels_ds1 = labels1_kmeans
    best_metrics_ds1 = dict(
        silhouette=sil1,
        davies_bouldin=db1,
        calinski_harabasz=ch1,
        noise_ratio=0.0,
    )
    best_params_ds1 = dict(n_clusters=k1)
else:
    best_model_ds1 = "Agglomerative"
    best_labels_ds1 = best_agg_ds1['labels']
    best_metrics_ds1 = dict(
        silhouette=best_agg_ds1['silhouette'],
        davies_bouldin=best_agg_ds1['davies_bouldin'],
        calinski_harabasz=best_agg_ds1['calinski_harabasz'],
        noise_ratio=0.0,
    )
    best_params_ds1 = dict(linkage=best_linkage_ds1, n_clusters=k1)

print(f"Лучшая модель для Dataset-01: {best_model_ds1}")

Лучшая модель для Dataset-01: KMeans


In [107]:
print("\nDataset-02 - Сравнение моделей:")
print(f"KMeans (k={k2}): Silhouette={sil2:.3f}, DB={db2:.3f}, CH={ch2:.3f}")

dbscan_available_ds2 = best_dbscan_ds2 is not None
if dbscan_available_ds2:
    print(f"DBSCAN (eps={best_params_ds2[0]}, min_samples={best_params_ds2[1]}):")
    print(f" Silhouette (non-noise)={best_dbscan_ds2['silhouette']:.3f}, "
          f"DB={best_dbscan_ds2['davies_bouldin']:.3f}, "
          f"CH={best_dbscan_ds2['calinski_harabasz']:.3f}, "
          f"Noise ratio={best_dbscan_ds2['noise_ratio']:.2%}")

# DBSCAN is chosen only when it actually beats KMeans on silhouette. This is
# the same rule the Dataset-03 cell applies and the one recorded as
# "selection_metric" in best_configs.json. Previously DBSCAN won whenever it
# merely produced a result.
# NOTE: best_params_ds2 is rebound below from the (eps, min_samples) tuple to a
# params dict, so re-running this cell requires re-running the DBSCAN search first.
if dbscan_available_ds2 and best_dbscan_ds2['silhouette'] > sil2:
    best_model_ds2 = "DBSCAN"
    best_labels_ds2 = best_dbscan_ds2['labels']
    best_metrics_ds2 = {
        'silhouette': best_dbscan_ds2['silhouette'],
        'davies_bouldin': best_dbscan_ds2['davies_bouldin'],
        'calinski_harabasz': best_dbscan_ds2['calinski_harabasz'],
        'noise_ratio': best_dbscan_ds2['noise_ratio']
    }
    best_params_ds2 = {'eps': best_params_ds2[0], 'min_samples': best_params_ds2[1]}
else:
    best_model_ds2 = "KMeans"
    best_labels_ds2 = labels2_kmeans
    best_metrics_ds2 = {
        'silhouette': sil2,
        'davies_bouldin': db2,
        'calinski_harabasz': ch2,
        'noise_ratio': 0.0
    }
    best_params_ds2 = {'n_clusters': k2}

print(f"Лучшая модель для Dataset-02: {best_model_ds2}")


Dataset-02 - Сравнение моделей:
KMeans (k=2): Silhouette=0.307, DB=1.323, CH=3573.393
DBSCAN (eps=0.7, min_samples=15):
 Silhouette (non-noise)=0.349, DB=0.820, CH=133.196, Noise ratio=5.05%
Лучшая модель для Dataset-02: DBSCAN


In [108]:
print("\nDataset-03 - Сравнение моделей:")
print(f"KMeans (k={k3}): Silhouette={sil3:.3f}, DB={db3:.3f}, CH={ch3:.3f}")

dbscan_available_ds3 = best_dbscan_ds3 is not None
if dbscan_available_ds3:
    print(f"DBSCAN (eps={best_params_ds3[0]}, min_samples={best_params_ds3[1]}):")
    print(f" Silhouette (non-noise)={best_dbscan_ds3['silhouette']:.3f}, "
          f"DB={best_dbscan_ds3['davies_bouldin']:.3f}, "
          f"CH={best_dbscan_ds3['calinski_harabasz']:.3f}, "
          f"Noise ratio={best_dbscan_ds3['noise_ratio']:.2%}")

# DBSCAN must exist and be strictly better on silhouette; every other case
# (no DBSCAN result, tie, or worse score) falls back to KMeans.
if dbscan_available_ds3 and best_dbscan_ds3['silhouette'] > sil3:
    best_model_ds3 = "DBSCAN"
    best_labels_ds3 = best_dbscan_ds3['labels']
    best_metrics_ds3 = {
        'silhouette': best_dbscan_ds3['silhouette'],
        'davies_bouldin': best_dbscan_ds3['davies_bouldin'],
        'calinski_harabasz': best_dbscan_ds3['calinski_harabasz'],
        'noise_ratio': best_dbscan_ds3['noise_ratio']
    }
    best_params_ds3 = {'eps': best_params_ds3[0], 'min_samples': best_params_ds3[1]}
else:
    best_model_ds3 = "KMeans"
    best_labels_ds3 = labels3_kmeans
    best_metrics_ds3 = {
        'silhouette': sil3,
        'davies_bouldin': db3,
        'calinski_harabasz': ch3,
        'noise_ratio': 0.0
    }
    best_params_ds3 = {'n_clusters': k3}

print(f"Лучшая модель для Dataset-03: {best_model_ds3}")


Dataset-03 - Сравнение моделей:
KMeans (k=3): Silhouette=0.316, DB=1.158, CH=6957.163
DBSCAN (eps=0.3, min_samples=15):
 Silhouette (non-noise)=0.269, DB=1.010, CH=3927.566, Noise ratio=29.93%
Лучшая модель для Dataset-03: KMeans


In [109]:
import json
import os

# Output folders for figures and label files.
for _folder in ('artifacts/figures', 'artifacts/labels'):
    os.makedirs(_folder, exist_ok=True)

# One CSV per dataset: sample id next to the label from the selected model.
_label_exports = (
    ("artifacts/labels/labels_hw07_ds1.csv", sample_ids1, best_labels_ds1),
    ("artifacts/labels/labels_hw07_ds2.csv", sample_ids2, best_labels_ds2),
    ("artifacts/labels/labels_hw07_ds3.csv", sample_ids3, best_labels_ds3),
)
for _csv_path, _sample_ids, _cluster_labels in _label_exports:
    _labels_frame = pd.DataFrame({
        "sample_id": _sample_ids,
        "cluster_label": _cluster_labels
    })
    _labels_frame.to_csv(_csv_path, index=False)

# Metrics of the winning model, keyed by dataset and then by model name.
metrics_summary = {
    "dataset_01": {best_model_ds1: best_metrics_ds1},
    "dataset_02": {best_model_ds2: best_metrics_ds2},
    "dataset_03": {best_model_ds3: best_metrics_ds3},
}

with open('artifacts/metrics_summary.json', 'w') as f:
    json.dump(metrics_summary, f, indent=4)

# Winning model, its parameters and the criterion used to pick it.
_dbscan_criterion = "silhouette score (non-noise points for DBSCAN)"
best_configs = {
    "dataset_01": {
        "best_model": best_model_ds1,
        "params": best_params_ds1,
        "selection_metric": "silhouette score",
    },
    "dataset_02": {
        "best_model": best_model_ds2,
        "params": best_params_ds2,
        "selection_metric": _dbscan_criterion,
    },
    "dataset_03": {
        "best_model": best_model_ds3,
        "params": best_params_ds3,
        "selection_metric": _dbscan_criterion,
    },
}

with open('artifacts/best_configs.json', 'w') as f:
    json.dump(best_configs, f, indent=4)

print("\nВсе артефакты успешно сохранены в папку artifacts/")


Все артефакты успешно сохранены в папку artifacts/


In [110]:
# Final written summary. Only the "Обоснование выбора" line depends on which
# model was selected; every other line is fixed text.
print("ИТОГОВЫЕ ВЫВОДЫ")

print("\nDataset-01:")
print(f"Лучшая модель: {best_model_ds1}")
print("Сложности: разномасштабные признаки, шумовые признаки")
if best_model_ds1 == "Agglomerative":
    reason_ds1 = f"Agglomerative с linkage='{best_linkage_ds1}' показал лучшее качество по silhouette score"
else:
    reason_ds1 = "KMeans показал стабильные результаты с хорошим silhouette score"
print(f"Обоснование выбора: {reason_ds1}")
print("Препроцессинг был критичен для корректной работы алгоритмов кластеризации")

print("\nDataset-02:")
print(f"Лучшая модель: {best_model_ds2}")
print("Сложности: нелинейная структура кластеров, выбросы")
if best_model_ds2 == "DBSCAN":
    reason_ds2 = "DBSCAN эффективно обработал нелинейную структуру и выделил шумовые точки"
else:
    reason_ds2 = "KMeans показал приемлемое качество, несмотря на нелинейную структуру"
print(f"Обоснование выбора: {reason_ds2}")
print("DBSCAN лучше справился с выбросами, отнеся их к шуму")

print("\nDataset-03:")
print(f"Лучшая модель: {best_model_ds3}")
print("Сложности: кластеры разной плотности, фоновый шум")
if best_model_ds3 == "DBSCAN":
    reason_ds3 = "DBSCAN лучше обработал кластеры разной плотности и выделил шум"
else:
    reason_ds3 = "KMeans показал более стабильные результаты при наличии шума"
print(f"Обоснование выбора: {reason_ds3}")
print("Подбор параметров eps и min_samples был критичен для DBSCAN")

print("\nОбщие выводы:")
for general_point in (
    "- Препроцессинг (масштабирование) необходим для всех distance-based методов кластеризации",
    "- KMeans эффективен для сферических кластеров одинаковой плотности",
    "- DBSCAN лучше подходит для нелинейных структур и данных с шумом/выбросами",
    "- Agglomerative Clustering с правильным linkage может дать хорошие результаты для сложных структур",
    "- Внутренние метрики (особенно silhouette score) помогают в выборе оптимальных параметров",
):
    print(general_point)

ИТОГОВЫЕ ВЫВОДЫ

Dataset-01:
Лучшая модель: KMeans
Сложности: разномасштабные признаки, шумовые признаки
Обоснование выбора: KMeans показал стабильные результаты с хорошим silhouette score
Препроцессинг был критичен для корректной работы алгоритмов кластеризации

Dataset-02:
Лучшая модель: DBSCAN
Сложности: нелинейная структура кластеров, выбросы
Обоснование выбора: DBSCAN эффективно обработал нелинейную структуру и выделил шумовые точки
DBSCAN лучше справился с выбросами, отнеся их к шуму

Dataset-03:
Лучшая модель: KMeans
Сложности: кластеры разной плотности, фоновый шум
Обоснование выбора: KMeans показал более стабильные результаты при наличии шума
Подбор параметров eps и min_samples был критичен для DBSCAN

Общие выводы:
- Препроцессинг (масштабирование) необходим для всех distance-based методов кластеризации
- KMeans эффективен для сферических кластеров одинаковой плотности
- DBSCAN лучше подходит для нелинейных структур и данных с шумом/выбросами
- Agglomerative Clustering с прав