In [59]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn import datasets
from sklearn.datasets import fetch_openml

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from sklearn.naive_bayes import GaussianNB#高斯朴素贝叶斯算法
from sklearn.decomposition import PCA#主成分分析

from scipy.stats import multivariate_normal

from collections import Counter
import math

In [60]:
class MMG:
    def __init__(self):
        self.model = None
        self.k = 2
        
    def GMM_component(self, X, theta,param, c):#由联合正态分布计算GMM的单个成员
        sign = X-theta['mu'][c]
        exponent = math.exp((-0.5*np.dot(sign.T,np.dot(np.linalg.inv(theta['sigma'][c]),sign))))
        pp = exponent/(((2*math.pi)**(param["dim"]/2))*(np.linalg.det(theta['sigma'][c])**0.5))
#         return theta['pi'][c]*multivariate_normal(theta['mu'][c], theta['sigma'][c,:,:]).pdf(X)
        return theta['pi'][c]*pp

    def E_step(self, theta, param,X):#E步：更新隐变量概率分布q(Z)。这里的X是一个样本
        q = np.zeros((param['k'],len(X)))
        for i in range(param['k']):
            for j in range(len(X)):
                q[i,j] = self.GMM_component(X[j], theta ,param, i)
            
        q /= q.sum(axis=0)
        return q

    def M_step(self,X,q,theta,param):#M步：使用q(Z)更新GMM参数。
        pi_temp = q.sum(axis=1); pi_temp /= len(X) # 计算pi
        mu_temp = q.dot(X); mu_temp /= q.sum(axis=1)[:, None] # 计算mu
        sigma_temp = np.zeros((param['k'], param['dim'], param['dim']))
        for i in range(param['k']):
            ys = X - mu_temp[i, :]
            sigma_temp[i] = np.sum(q[i, :, None, None]*np.matmul(ys[..., None], ys[:, None, :]), axis=0)
        sigma_temp /= np.sum(q, axis=1)[:, None, None] # 计算sigma
        theta['pi'] = pi_temp; theta['mu'] = mu_temp; theta['sigma'] = sigma_temp
        return theta

    def likelihood(self,X,theta,param):#计算GMM的对数似然。
        ll = 0
        for i in range(param['k']):
            ll += self.GMM_component(X[0], theta, i)
        ll = np.log(ll).sum()
        return ll

    def EM_GMM(self,X,theta,param,eps=1e-5,max_iter=1000):
        '''
        高斯混合模型的EM算法求解
            theta: GMM模型参数; param: 其它系数; eps: 计算精度; max_iter: 最大迭代次数
            返回对数似然和参数theta，theta是包含pi、mu、sigma的Python字典
        '''
        print(X[1].shape)#是一个由array构成的list
        for i in range(max_iter):
            ll_old = 0
            q = self.E_step(theta, param,X)# E-step
            theta = self.M_step(X, q, theta, param)# M-step
#             ll_new = self.likelihood(X, theta,param)
#             if np.abs(ll_new - ll_old) < eps:
#                 break;
#             else:
#                 ll_old = ll_new
        ll_new = 1
        return ll_new, theta

    # 分类别求出数学期望和标准差
    def train(self, X, y):
        theta = {}; param = {}
        param['k'] = self.k; param['N'] = X.shape[0]; param['dim'] = X.shape[1]
        theta['pi'] = np.ones(param['k'])/param['k']                 # 均匀初始化
        theta['mu'] = np.random.random((param['k'],param['dim']))    # 随机初始化
        theta['sigma'] = np.array([np.eye(param['dim'])]*param['k']) # 初始化为单位正定矩阵
        
        labels = list(set(y))#标签的列表
        data = {label:[] for label in labels}#{0.0: [], 1.0: []}
        for f, label in zip(X, y):
            data[label].append(f)#print(data)#形成一个字典，根据标签将训练样本进行分类
#         print(data)
        self.model = {label: self.EM_GMM(value,theta,param,eps=1e-5,max_iter=50) for label, value in data.items()}
        print(self.model)
        return self.model

    # 计算概率
    def calculate_probabilities(self, input_data):
        probabilities = {}
        pp = 0.0
        dim = np.size(input_data)
        for label, value in self.model.items():#value是一个字典
            for i in range(self.k):
                mu,sigma = value['mu'][i],value['sigma'][i]
                sign = input_data-mu
                exponent = math.exp((-0.5*np.dot(sign,np.dot(np.linalg.inv(sigma),sign.T))))
                pp = exponent/(((2*math.pi)**(dim/2))*(np.linalg.det(sigma)**0.5))
                probabilities[label] += value['pi'][i]*pp

        return probabilities
    
    # 类别
    def predict(self, X_test):
        label = list(range(X_test.shape[0]))
        for i in range(X_test.shape[0]):#每个样本迭代一次
            label[i] = sorted(self.calculate_probabilities(X_test[i,:]).items(), key=lambda x: x[-1])[-1][0]
        
        return label
    
    def score(self, X_test, y_test):
        right = 0
        for X, y in zip(X_test, y_test):
            label = self.predict(X)
            if label == y:
                right += 1

        return right / float(len(X_test))

In [61]:
iris = datasets.load_iris()
X=iris.data
y=iris.target
print(X.shape)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=1)


model = MMG()
model.train(X_train, y_train)
y_pred = model.predict(X_test)

print("IRIS:Number of mislabeled points out of a total %d points : %d, Acc: %f%%"
      % (X_test.shape[0], (y_test != y_pred).sum(),100*(y_test == y_pred).sum()/X_test.shape[0]))


(150, 4)
(4,)
(4,)
(4,)




{0: (1, {'pi': array([nan, nan]), 'mu': array([[nan, nan, nan, nan],
       [nan, nan, nan, nan]]), 'sigma': array([[[nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan]],

       [[nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan]]])}), 1: (1, {'pi': array([nan, nan]), 'mu': array([[nan, nan, nan, nan],
       [nan, nan, nan, nan]]), 'sigma': array([[[nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan]],

       [[nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan]]])}), 2: (1, {'pi': array([nan, nan]), 'mu': array([[nan, nan, nan, nan],
       [nan, nan, nan, nan]]), 'sigma': array([[[nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan]],

       [[nan, nan, nan, nan],
        [nan, nan, nan, nan]

TypeError: tuple indices must be integers or slices, not str