In [123]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
import pandas as pd


n_samples = 1500
dataset = datasets.make_blobs(n_samples=n_samples, centers=2, center_box=(-7.0, 7.5),
                              cluster_std=[1.4, 1.7],
                              random_state=42)
X_2, _ = datasets.make_blobs(n_samples=n_samples, random_state=170, centers=[[-4, -3]], cluster_std=[1.9])
transformation = [[1.2, -0.8], [-0.4, 1.7]]
X_2 = np.dot(X_2, transformation)
X, y = np.concatenate((dataset[0], X_2)), np.concatenate((dataset[1], np.array([2] * len(X_2))))

In [8]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# сначала получим предсказанные кластеры при помощи метода кластеризации
kmeans = KMeans(n_clusters=3, random_state=42)
kmeans.fit(X)
kmeans_pred = kmeans.labels_

# теперь посчитаем коэффициент силуэта
silhouette_score(X=X, labels=kmeans_pred, metric='euclidean')

0.5131660482634046

In [16]:
from sklearn.mixture import GaussianMixture

gm = GaussianMixture(n_components=3, random_state=42)
gm.fit(X)
gm_labels = gm.predict(X)
np.round(silhouette_score(X=X, labels=gm_labels, metric='euclidean'), 1)

0.4

In [18]:
from sklearn.cluster import AgglomerativeClustering, DBSCAN

ac = AgglomerativeClustering(n_clusters=3)
dbscan = DBSCAN(eps=0.9, min_samples=35)

ac.fit(X)
dbscan.fit(X)

print(np.round(silhouette_score(X=X, labels=kmeans.labels_, metric='euclidean'), 2))
print(np.round(silhouette_score(X=X, labels=gm_labels, metric='euclidean'), 2))
print(np.round(silhouette_score(X=X, labels=ac.labels_, metric='euclidean'), 2))
print(np.round(silhouette_score(X=X, labels=dbscan.labels_, metric='euclidean'), 2))


0.51
0.4
0.48
0.45


In [35]:
best_scores = dict(kmeans={'scores':[], 'n_clusters':[]}, em={'scores':[], 'n_clusters':[]}, ac={'scores':[], 'n_clusters':[]})

for n_clusters in range(2, 11):
    clfs = [
        (KMeans(n_clusters=n_clusters, random_state=42), 0, 'kmeans'),
        (GaussianMixture(n_components=n_clusters, random_state=42), 1, 'em'),
        (AgglomerativeClustering(n_clusters=n_clusters), 0, 'ac')
    ]

    for clf in clfs:
        print(f'Fitting {clf[0]}')
        
        clf[0].fit(X)

        if clf[1]:
            labels = clf[0].predict(X)
        else:
            labels = clf[0].labels_

        best_scores[clf[2]]['scores'].append(np.round(silhouette_score(X=X, labels=labels, metric='euclidean'), 3))
        best_scores[clf[2]]['n_clusters'].append(n_clusters)



    

Fitting KMeans(n_clusters=2, random_state=42)
Fitting GaussianMixture(n_components=2, random_state=42)
Fitting AgglomerativeClustering()
Fitting KMeans(n_clusters=3, random_state=42)
Fitting GaussianMixture(n_components=3, random_state=42)
Fitting AgglomerativeClustering(n_clusters=3)
Fitting KMeans(n_clusters=4, random_state=42)
Fitting GaussianMixture(n_components=4, random_state=42)
Fitting AgglomerativeClustering(n_clusters=4)
Fitting KMeans(n_clusters=5, random_state=42)
Fitting GaussianMixture(n_components=5, random_state=42)
Fitting AgglomerativeClustering(n_clusters=5)
Fitting KMeans(n_clusters=6, random_state=42)
Fitting GaussianMixture(n_components=6, random_state=42)
Fitting AgglomerativeClustering(n_clusters=6)
Fitting KMeans(n_clusters=7, random_state=42)
Fitting GaussianMixture(n_components=7, random_state=42)
Fitting AgglomerativeClustering(n_clusters=7)
Fitting KMeans(random_state=42)
Fitting GaussianMixture(n_components=8, random_state=42)
Fitting AgglomerativeClusteri

In [46]:
kmeans_scores = dict(zip(best_scores['kmeans']['n_clusters'], best_scores['kmeans']['scores']))
em_scores = dict(zip(best_scores['em']['n_clusters'], best_scores['em']['scores']))
ac_scores = dict(zip(best_scores['ac']['n_clusters'], best_scores['ac']['scores']))

print('KMEANS', max(kmeans_scores, key=kmeans_scores.get))
print('EM', max(em_scores, key=em_scores.get))
print('AC', max(ac_scores, key=ac_scores.get))

KMEANS 3
EM 4
AC 4


In [85]:
from sklearn.cluster import KMeans
from sklearn.metrics.cluster import homogeneity_score, completeness_score, v_measure_score
from sklearn.preprocessing import StandardScaler


# сначала получим предсказанные кластеры при помощи метода кластеризации
kmeans = KMeans(n_clusters=3, random_state=42)
gm = GaussianMixture(n_components=3, random_state=42)
ac = AgglomerativeClustering(n_clusters=3)
dbscan = DBSCAN(eps=0.9, min_samples=35)

# Нормируем данные
X_scaled = StandardScaler().fit_transform(X)

# Обучаем модели
kmeans.fit(X_scaled) 
gm.fit(X_scaled)
ac.fit(X_scaled)
dbscan.fit(X_scaled)

# Получаем метки
kmeans_pred = kmeans.labels_ 
gm_pred = gm.predict(X_scaled)
ac_pred = ac.labels_ 
dbscan_pred = dbscan.labels_ 

# теперь посчитаем однородность 
print('Однородность')
print(f'KMEANS: {np.round(homogeneity_score(labels_true=y, labels_pred=kmeans_pred), 2)}')
print(f'EM: {np.round(homogeneity_score(labels_true=y, labels_pred=gm_pred), 2)}')
print(f'AC: {np.round(homogeneity_score(labels_true=y, labels_pred=ac_pred), 2)}')
print(f'DBSCAN: {np.round(homogeneity_score(labels_true=y, labels_pred=dbscan_pred), 2)}')

# Полнота
print('Полнота')
print(f'KMEANS: {np.round(completeness_score(labels_true=y, labels_pred=kmeans_pred), 2)}')
print(f'EM: {np.round(completeness_score(labels_true=y, labels_pred=gm_pred), 2)}')
print(f'AC: {np.round(completeness_score(labels_true=y, labels_pred=ac_pred), 2)}')
print(f'DBSCAN: {np.round(completeness_score(labels_true=y, labels_pred=dbscan_pred), 2)}')

# V-мера
print('V-мера')
print(f'KMEANS: {np.round(v_measure_score(labels_true=y, labels_pred=kmeans_pred), 2)}')
print(f'EM: {np.round(v_measure_score(labels_true=y, labels_pred=gm_pred), 2)}')
print(f'AC: {np.round(v_measure_score(labels_true=y, labels_pred=ac_pred), 2)}')
print(f'DBSCAN: {np.round(v_measure_score(labels_true=y, labels_pred=dbscan_pred), 2)}')


Однородность
KMEANS: 0.8
EM: 0.93
AC: 0.91
DBSCAN: 0.0
Полнота
KMEANS: 0.78
EM: 0.93
AC: 0.91
DBSCAN: 0.08
V-мера
KMEANS: 0.79
EM: 0.93
AC: 0.91
DBSCAN: 0.0


In [67]:
kmeans1 = KMeans(n_clusters=3, init='k-means++', n_init=1, random_state=42)
kmeans2 = KMeans(n_clusters=3, init='random', n_init=1, random_state=42)

kmeans1.fit(X_scaled)
kmeans2.fit(X_scaled)

# V-мера
print('V-мера')
print(f'KMEANS 1: {np.round(v_measure_score(labels_true=y, labels_pred=kmeans1.labels_), 2)}')
print(f'KMEANS 2: {np.round(v_measure_score(labels_true=y, labels_pred=kmeans2.labels_), 2)}')



V-мера
KMEANS 1: 0.79
KMEANS 2: 0.79


In [69]:
from sklearn.cluster import MiniBatchKMeans

kmeans = KMeans(n_clusters=3, n_init=1, random_state=42)
mbkmeans = MiniBatchKMeans(n_clusters=3, n_init=1, random_state=42)

kmeans.fit(X_scaled)
mbkmeans.fit(X_scaled)

# V-мера
print('V-мера')
print(f'KMEANS 1: {np.round(v_measure_score(labels_true=y, labels_pred=kmeans.labels_), 2)}')
print(f'KMEANS 2: {np.round(v_measure_score(labels_true=y, labels_pred=mbkmeans.labels_), 2)}')


V-мера
KMEANS 1: 0.79
KMEANS 2: 0.75


In [76]:
params = ['ward', 'complete', 'average', 'single']

for param in params:
    clf = AgglomerativeClustering(n_clusters=3, linkage=param)
    clf.fit(X_scaled)
    print(f'linkage={param} v-score: {np.round(v_measure_score(labels_true=y, labels_pred=clf.labels_), 2)}')

linkage=ward v-score: 0.91
linkage=complete v-score: 0.58
linkage=average v-score: 0.68
linkage=single v-score: 0.0


In [78]:
from sklearn.neighbors import kneighbors_graph

connectivity = kneighbors_graph(X, n_neighbors=6, include_self=False)
connectivity = 0.5 * (connectivity + connectivity.T)

clf = AgglomerativeClustering(n_clusters=3)
clf_c = AgglomerativeClustering(n_clusters=3, connectivity=connectivity)

clf.fit(X_scaled)
clf_c.fit(X_scaled)

print(f'connectivity=False v-score: {np.round(v_measure_score(labels_true=y, labels_pred=clf.labels_), 2)}')
print(f'connectivity=True v-score: {np.round(v_measure_score(labels_true=y, labels_pred=clf_c.labels_), 2)}')

connectivity=False v-score: 0.91
connectivity=True v-score: 0.88


In [110]:
dbscan1 = DBSCAN(eps=0.9, min_samples=35)
dbscan2 = DBSCAN(eps=0.8, min_samples=35)

dbscan1.fit(X)
dbscan2.fit(X)

print(f'eps=0.9 v-score: {np.round(v_measure_score(labels_true=y, labels_pred=dbscan1.labels_), 2)}')
print(f'eps=0.8 v-score: {np.round(v_measure_score(labels_true=y, labels_pred=dbscan2.labels_), 2)}')

eps=0.9 v-score: 0.77
eps=0.8 v-score: 0.71


In [114]:
X_df = pd.DataFrame(X, columns=['x1', 'x2'])
X_df['cluster'] = dbscan1.labels_
X_df['y'] = y

In [118]:
X_without_outliers = X_df[X_df.cluster != -1].drop('cluster', axis=1).copy()
y_without_outliers = X_without_outliers['y']

In [121]:
dbscan = DBSCAN(eps=0.9, min_samples=35)
dbscan.fit(X_without_outliers.drop('y', axis=1))
print(f'eps=0.9 v-score: {np.round(v_measure_score(labels_true=y_without_outliers, labels_pred=dbscan.labels_), 2)}')


eps=0.9 v-score: 0.97


In [130]:
from sklearn.preprocessing import MinMaxScaler
X_scaled = MinMaxScaler().fit_transform(X)

clf = AgglomerativeClustering(n_clusters=3)
clf_scaled = AgglomerativeClustering(n_clusters=3)
clf.fit(X)
clf_scaled.fit(X_scaled)

print(f'Normal v-score: {np.round(v_measure_score(labels_true=y, labels_pred=clf.labels_), 2)}')
print(f'Scaled v-score: {np.round(v_measure_score(labels_true=y, labels_pred=clf_scaled.labels_), 2)}')



Normal v-score: 0.7
Scaled v-score: 0.89


In [128]:
X.shape

(3000, 2)