介绍了利用高斯混合模型（GMM）解决聚类问题

In [16]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [6]:
# 1.载入鸢尾花数据
iris=load_iris()
x=iris.data
y=iris.target

In [11]:
# 2. 统计特征均值和方差
mu = np.array([np.mean(x[y==i], axis=0) for i in range(3)])
std = np.array([np.std(x[y==i], axis=0) for i in range(3)])
print('实际均值为\n', mu)
print('实际标准差为\n', std)

实际均值为
 [[5.006 3.428 1.462 0.246]
 [5.936 2.77  4.26  1.326]
 [6.588 2.974 5.552 2.026]]
实际标准差为
 [[0.34894699 0.37525458 0.17191859 0.10432641]
 [0.51098337 0.31064449 0.46518813 0.19576517]
 [0.62948868 0.31925538 0.54634787 0.27188968]]


In [19]:
# 3. 利用K-means算法进行聚类
kmeans = KMeans(n_clusters=3, init='k-means++', random_state=0)
y_hat1 = kmeans.fit_predict(x)
mu1 = np.array([np.mean(x[y_hat1==i], axis=0) for i in range(3)])
print('Kmeans算法得到的各聚类的均值为\n', mu1)

Kmeans算法得到的各聚类的均值为
 [[5.9016129  2.7483871  4.39354839 1.43387097]
 [5.006      3.428      1.462      0.246     ]
 [6.85       3.07368421 5.74210526 2.07105263]]


In [21]:
# 可见作为无监督学习，其cluster类与最初的标签并不一致，需要手动调整类0和类1
mask0 = (y_hat1==0)
mask1 = (y_hat1==1)
y_hat1[mask0] = 1
y_hat1[mask1] = 0
print("分类正确率为:{}".format(accuracy_score(y, y_hat1)))

分类正确率为:0.8933333333333333


In [24]:
# 4. 利用GMM算法进行聚类
gmm = GaussianMixture(n_components=3, covariance_type='full', random_state=0)
gmm.fit(x)
print("GMM算法得到的各聚类的均值为\n", gmm.means_)

GMM算法得到的各聚类的均值为
 [[5.006      3.428      1.462      0.246     ]
 [6.54639415 2.94946365 5.48364578 1.98726565]
 [5.9170732  2.77804839 4.20540364 1.29848217]]


In [25]:
# 需要手动调整类1和类2
y_hat2 = gmm.predict(x)
mask1 = (y_hat2==1)
mask2 = (y_hat2==2)
y_hat2[mask1] = 2
y_hat1[mask2] = 1
print("分类正确率为:{}".format(accuracy_score(y, y_hat2)))

分类正确率为:0.6666666666666666
