# model selection

**2017301470026 王淳**

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
np.random.seed(42)
plt.style.use('ggplot')
%matplotlib inline
%config InlineBackend.figure_format='retina'

In [2]:
import time
import warnings

import numpy as np
import matplotlib.pyplot as plt

from sklearn import cluster, datasets, mixture
from sklearn.neighbors import kneighbors_graph
from sklearn.preprocessing import MinMaxScaler
from itertools import cycle, islice

np.random.seed(42)
n_samples = 699
from sklearn.cluster import KMeans, SpectralClustering
from sklearn.metrics.cluster import normalized_mutual_info_score

In [3]:
# 导入数据集
df = pd.read_csv('breast.csv', sep=' ', header=None)
df.columns = ['x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9', 'label']
# 分离标签
labels_true = np.array(df.label)
data = df.iloc[:, :-1]

In [4]:
def model_selection(model, data, labels_ture):
    labels_pred = model.fit_predict(data)
    nmi = normalized_mutual_info_score(labels_ture, labels_pred)
    print(nmi)

# k-means

In [5]:
model = KMeans(n_clusters=2)
model_selection(model, data, labels_true)

0.7360780617978822


# SpectralClustering：核变换学习非线性边界

In [6]:
model = SpectralClustering(n_clusters=2,
                           affinity='nearest_neighbors',
                           n_neighbors=10,
                           assign_labels='kmeans')
model_selection(model, data, labels_true)

0.8202939814290227


# t-sne非线性嵌入+谱聚类

In [7]:
from sklearn.manifold import TSNE
tsne = TSNE(n_components=3, init='pca')
data_trans = tsne.fit_transform(data)
model = SpectralClustering(n_clusters=2,
                           affinity='nearest_neighbors',
                           assign_labels='kmeans')
model_selection(model, data_trans, labels_true)

0.6938960213793031


# t-sne非线性嵌入+高斯混合模型

In [8]:
from sklearn.mixture import GaussianMixture
tsne = TSNE(n_components=2, init='pca')
data_trans = tsne.fit_transform(data)
model = GaussianMixture(n_components=2, covariance_type="full")
model_selection(model, data_trans, labels_true)

0.5161061679296487


# 模型排名
- MeanShift
- DBSCAN
- Ward
- AffinityPropagation
- AgglomerativeClustering
- OPTICS
- Birch
- MiniBatchKMeans
- SpectralClustering

In [9]:
noisy_circles = datasets.make_circles(n_samples=n_samples, factor=.5,
                                      noise=.05)
noisy_moons = datasets.make_moons(n_samples=n_samples, noise=.05)
blobs = datasets.make_blobs(n_samples=n_samples, random_state=8)
no_structure = np.random.rand(n_samples, 2), None

# Anisotropicly distributed data
random_state = 42
X, y = datasets.make_blobs(n_samples=n_samples, random_state=random_state)
transformation = [[0.6, -0.6], [-0.4, 0.8]]
X_aniso = np.dot(X, transformation)
aniso = (X_aniso, y)
# blobs with varied variances
varied = datasets.make_blobs(n_samples=n_samples,
                             cluster_std=[1.0, 2.5, 0.5],
                             random_state=random_state)

# ============
# Set up cluster parameters
# ============
default_base = {'quantile': .3,
                'eps': .3,
                'damping': .9,
                'preference': -200,
                'n_neighbors': 10,
                'n_clusters': 3,
                'min_samples': 20,
                'xi': 0.05,
                'min_cluster_size': 0.1}

datasets = [
    (noisy_circles, {'damping': .77, 'preference': -240,
                     'quantile': .2, 'n_clusters': 2,
                     'min_samples': 20, 'xi': 0.25}),
    (noisy_moons, {'damping': .75, 'preference': -220, 'n_clusters': 2}),
    (varied, {'eps': .18, 'n_neighbors': 2,
              'min_samples': 5, 'xi': 0.035, 'min_cluster_size': .2}),
    (aniso, {'eps': .15, 'n_neighbors': 2,
             'min_samples': 20, 'xi': 0.1, 'min_cluster_size': .2}),
    (blobs, {}),
    (no_structure, {})]

for i_dataset, (dataset, algo_params) in enumerate(datasets):
    # update parameters with dataset-specific values
    params = default_base.copy()
    params.update(algo_params)

    X, y = dataset

    # normalize dataset for easier parameter selection
    X = MinMaxScaler().fit_transform(data)

    # estimate bandwidth for mean shift
    bandwidth = cluster.estimate_bandwidth(X, quantile=params['quantile'])

    # connectivity matrix for structured Ward
    connectivity = kneighbors_graph(
        X, n_neighbors=params['n_neighbors'], include_self=True)
    # make connectivity symmetric
    connectivity = 0.5 * (connectivity + connectivity.T)

In [10]:
# ============
# Create cluster objects
# ============
ms = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)
two_means = cluster.MiniBatchKMeans(n_clusters=2)
birch = cluster.Birch(n_clusters=2)
dbscan = cluster.DBSCAN(eps=params['eps'])
ward = cluster.AgglomerativeClustering(n_clusters=2,
                                       linkage='ward',
                                       connectivity=connectivity
                                      )
spectral = cluster.SpectralClustering(n_clusters=2,
                                      eigen_solver='arpack',
                                      affinity="nearest_neighbors"
                                     )
optics = cluster.OPTICS(min_samples=params['min_samples'],
                        xi=params['xi'],
                        min_cluster_size=params['min_cluster_size']
                       )
affinity_propagation = cluster.AffinityPropagation(damping=params['damping'],
                                                   preference=params['preference']
                                                  )
average_linkage = cluster.AgglomerativeClustering(linkage="average",
                                                  affinity="cityblock",
                                                  n_clusters=2,
                                                  connectivity=connectivity)
gmm = mixture.GaussianMixture(n_components=2,
                              covariance_type='full'
                             )

In [11]:
clustering_algorithms = (
    ('MiniBatchKMeans', two_means),
    ('AffinityPropagation', affinity_propagation),
    ('MeanShift', ms),
    ('SpectralClustering', spectral),
    ('Ward', ward),
    ('AgglomerativeClustering', average_linkage),
    ('DBSCAN', dbscan),
    ('OPTICS', optics),
    ('Birch', birch),
    ('GaussianMixture', gmm)
)

In [12]:
for name, algorithm in clustering_algorithms:
        algorithm.fit(data)
        if hasattr(algorithm, 'labels_'):
            y_pred = algorithm.labels_.astype(np.int)
        else:
            y_pred = algorithm.predict(data)
        nmi = normalized_mutual_info_score(labels_true, y_pred)
        print(f'{name:<25}:  {nmi:.4f}')

MiniBatchKMeans          :  0.7495
AffinityPropagation      :  0.3732
MeanShift                :  0.2346
SpectralClustering       :  0.8203
Ward                     :  0.7941
AgglomerativeClustering  :  0.0307
DBSCAN                   :  0.1582
OPTICS                   :  0.4144
Birch                    :  0.6569
GaussianMixture          :  0.5582


  ratio = reachability_plot[:-1] / reachability_plot[1:]
