In [46]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_openml
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import GaussianNB
from sklearn.decomposition import PCA

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import math

这是多变量高斯贝叶斯分类器的正式版本：为每个类别拟合一个带完整协方差矩阵的高斯分布，并把样本判给概率密度最大的类别。

### 多变量高斯分布的贝叶斯分类器  
$$p(x)=N(x|\mu,\Sigma)=\frac1{(2\pi)^\frac{p}2}\frac1{|\Sigma|^\frac1{2}}e^{-\frac1{2}(x-\mu)^T\Sigma^{-1}(x-\mu)}$$

其中$\mu=\frac1{N}\sum_{i=1}^{N}x_i$

$\Sigma=\frac1N\sum_{i=1}^{N}(x_i-\mu)(x_i-\mu)^T$

In [47]:
class MVGaussion:
    """Classifier that fits one full-covariance multivariate Gaussian per class.

    ``train`` estimates, for every class label, the per-feature mean and the
    maximum-likelihood (1/N) covariance matrix of that class's training rows.
    ``predict`` assigns each sample to the class whose Gaussian density is the
    largest. Class priors are not applied, so the decision rule is maximum
    likelihood (equivalent to assuming equal priors).
    """

    def __init__(self):
        # Maps label -> {'mu': list of p feature means, 'sigma': (p, p) array}.
        # Stays None until train() has been called.
        self.model = None

    @staticmethod
    def mu(X):
        """Return the arithmetic mean of the items in ``X``."""
        return sum(X) / float(len(X))

    def sigma(self, train_data, train_mu):
        """Return the (p, p) covariance of ``train_data`` around ``train_mu``.

        Args:
            train_data: N samples with p features each (array-like, N x p).
            train_mu: the p feature means to centre the data on.

        Returns:
            The maximum-likelihood estimate ``sum((x - mu)(x - mu)^T) / N``.
        """
        data = np.asarray(train_data, dtype=float)
        centered = data - np.asarray(train_mu, dtype=float)
        # One matrix product replaces the per-sample outer-product loop.
        return np.dot(centered.T, centered) / data.shape[0]

    def summarize(self, train_data):
        """Estimate the Gaussian parameters of one class.

        Args:
            train_data: the N x p samples belonging to a single class.

        Returns:
            ``{'mu': [p means], 'sigma': (p, p) covariance}``.
        """
        data = np.asarray(train_data, dtype=float)
        mu = [self.mu(column) for column in data.T]
        sigma = self.sigma(data, np.array(mu))
        return {'mu': mu, 'sigma': sigma}

    def train(self, X, y):
        """Fit one Gaussian per distinct label in ``y``.

        Args:
            X: iterable of feature vectors.
            y: iterable of hashable labels, aligned with ``X``.

        Returns:
            The fitted model dict (also stored on ``self.model``).
        """
        data = {label: [] for label in set(y)}
        for features, label in zip(X, y):
            data[label].append(features)
        self.model = {label: self.summarize(rows) for label, rows in data.items()}
        return self.model

    def _log_densities(self, samples):
        """Return ``(labels, scores)`` with ``scores[i, k] = log N(samples[i] | class k)``.

        ``samples`` must be a 2-D float array (n_samples x p). Working in log
        space keeps the ranking of classes meaningful even when the densities
        themselves would underflow to 0.0.
        """
        if self.model is None:
            raise RuntimeError("model is not trained; call train() first")
        labels = list(self.model)
        n_features = samples.shape[1]
        scores = np.empty((samples.shape[0], len(labels)))
        for k, label in enumerate(labels):
            params = self.model[label]
            sigma = np.asarray(params['sigma'], dtype=float)
            diff = samples - np.asarray(params['mu'], dtype=float)
            # log|Sigma| without forming the (possibly tiny or huge) determinant.
            _, logdet = np.linalg.slogdet(sigma)
            # Solve Sigma * z = diff^T once per class instead of inverting Sigma
            # for every sample; rows of `solved` are Sigma^{-1} (x - mu).
            solved = np.linalg.solve(sigma, diff.T).T
            mahalanobis = np.sum(diff * solved, axis=1)
            scores[:, k] = -0.5 * (n_features * math.log(2 * math.pi) + logdet + mahalanobis)
        return labels, scores

    def calculate_probabilities(self, input_data):
        """Return ``{label: density}`` of one sample under every class Gaussian.

        Args:
            input_data: a single feature vector of length p.

        Returns:
            Dict mapping each class label to the multivariate normal density
            of ``input_data`` under that class (no prior is applied).
        """
        sample = np.asarray(input_data, dtype=float).reshape(1, -1)
        labels, scores = self._log_densities(sample)
        return {label: np.exp(score) for label, score in zip(labels, scores[0])}

    def predict(self, X_test):
        """Return the list of predicted labels for the rows of ``X_test``.

        Args:
            X_test: n_samples x p array-like; a single 1-D feature vector is
                also accepted and treated as one sample.

        Returns:
            A list with one label per sample. Classes are compared by log
            density, so samples far from every class are still assigned to
            the closest Gaussian instead of tying at a density of 0.0.
        """
        samples = np.asarray(X_test, dtype=float)
        if samples.ndim == 1:
            samples = samples.reshape(1, -1)
        labels, scores = self._log_densities(samples)
        return [labels[k] for k in np.argmax(scores, axis=1)]

    def score(self, X_test, y_test):
        """Return the fraction of samples in ``X_test`` classified as ``y_test``.

        Raises:
            ValueError: if the test set is empty or ``y_test`` does not have
                one label per sample.
        """
        # predict() works on the whole matrix; calling it row by row (as the
        # earlier version did) fails because a single row is 1-D.
        predictions = self.predict(X_test)
        if not predictions:
            raise ValueError("cannot score an empty test set")
        if len(predictions) != len(y_test):
            raise ValueError("X_test and y_test must contain the same number of samples")
        right = sum(1 for guess, truth in zip(predictions, y_test) if guess == truth)
        return right / float(len(predictions))

### 1. 使用多变量高斯贝叶斯分类器解决IRIS分类问题

In [48]:
# Load the Iris dataset and keep its feature matrix and class labels.
iris = datasets.load_iris()
X=iris.data
y=iris.target
# Hold out half of the samples for testing; random_state=1 makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=1)


# Fit one Gaussian per class on the training half, then label the held-out half.
model = MVGaussion()
model.train(X_train, y_train)
y_pred = model.predict(X_test)

# Report the number of misclassified test samples and the accuracy in percent.
# y_pred is a plain Python list (MVGaussion.predict builds one); the comparisons
# below presumably rely on y_test being a NumPy array so that != / == are
# evaluated elementwise and .sum() counts the hits.
print("IRIS:Number of mislabeled points out of a total %d points : %d, Acc: %f%%"
      % (X_test.shape[0], (y_test != y_pred).sum(),100*(y_test == y_pred).sum()/X_test.shape[0]))


IRIS:Number of mislabeled points out of a total 75 points : 0, Acc: 100.000000%


### 2. 使用多变量高斯贝叶斯分类器解决手写数字（USPS）分类问题

In [49]:
# Fetch version 2 of the USPS handwritten-digit dataset from OpenML.
# NOTE(review): data_home is a hard-coded absolute Windows path, so this cell is
# not portable to other machines — consider a configurable or relative cache dir.
digits=fetch_openml(name='USPS',version=2,data_home='E:/scikit_learn_data')
X_d=digits.data
y_d=digits.target
# Distinct raw labels; not used again within this cell.
y_u=np.unique(y_d)
# Encode the raw labels with LabelEncoder; `name` keeps the encoder's class list.
enc = LabelEncoder()
enc.fit(y_d)
name=enc.classes_

y1=enc.transform(y_d)
# Reduce the features to 30 principal components.
# NOTE(review): PCA is fitted on the full dataset before the train/test split, so
# the test half influences the projection — fitting on the training half only
# would avoid that leakage. Confirm whether this is intended.
X_reduced = PCA(n_components=30).fit_transform(X_d)# feature extraction

# Hold out half of the samples for testing; random_state=1 makes the split repeatable.
X_train_d, X_test_d, y_train_d, y_test_d = train_test_split(X_reduced, y1, test_size=0.5, random_state=1)

# Fit one Gaussian per digit class on the training half, then label the held-out half.
model_d = MVGaussion()
model_d.train(X_train_d, y_train_d)
y_pred_d = model_d.predict(X_test_d)

# Report the number of misclassified test samples and the accuracy in percent.
print("USPS:Number of mislabeled points out of a total %d points : %d, Acc: %f%%"
      %(X_test_d.shape[0], (y_test_d != y_pred_d).sum(),100*(y_test_d == y_pred_d).sum()/X_test_d.shape[0]))

USPS:Number of mislabeled points out of a total 4649 points : 184, Acc: 96.042160%
