In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn import datasets
from sklearn.datasets import fetch_openml

from sklearn.model_selection import train_test_split#模型选择
from sklearn.preprocessing import LabelEncoder#预处理
from sklearn.decomposition import PCA#主成分分析

from sklearn.naive_bayes import GaussianNB#高斯朴素贝叶斯算法

from collections import Counter
import math

In [21]:
class MVGaussion:
    def __init__(self):
        self.model = None

    @staticmethod
    # mathematical expectation
    def mu(X):
        return sum(X) / float(len(X))#生成一个p*1的特征均值矩阵

    # sigma natrix(p*p)
    def sigma(self, train_data,train_mu):
        train_data_shape = np.shape(train_data)
        sigma = np.zeros((train_data_shape[1],train_data_shape[1]))
        for sample in zip(train_data):
            dot = sample-train_mu
            sigma += np.dot(dot.T,dot)
        return sigma/train_data_shape[0]

    # 处理X_train
    def summarize(self, train_data):#train_data为一个N*p的矩阵
        mu = [self.mu(i) for i in zip(*train_data)]#1*P
        #muT = np.array(mu).reshape(np.size(mu),1)
        sigma = self.sigma(train_data,np.array(mu))
        summaries = {'mu':mu,'sigma':sigma}#
        return summaries#summaries得到的模型参数应该为一个p*1的mu矩阵和一个p*p的大sigma矩阵

    # 分类别求出数学期望和标准差
    def train(self, X, y):
        labels = list(set(y))
        data = {label:[] for label in labels}#
        for f, label in zip(X, y):
            data[label].append(f)
        self.model = {label: self.summarize(value) for label, value in data.items()}
#         print(self.model)
#         for label, value in self.model.items():
#             mu,sigma = value['mu'],value['sigma']
#             print(mu)
#             print(sigma)
        return self.model

    # 计算概率
    def calculate_probabilities(self, input_data):
        probabilities = {}
        p = np.size(input_data)
        for label, value in self.model.items():#value是一个字典
            mu,sigma = value['mu'],value['sigma']
            sign = input_data-mu
            exponent = math.exp((-0.5*np.dot(sign,np.dot(np.linalg.inv(sigma),sign.T))))
            probabilities[label] = exponent/(((2*math.pi)**(p/2))*(np.linalg.det(sigma)**0.5))
        
        return probabilities
    
    # 类别
    def predict(self, X_test):
        label = list(range(X_test.shape[0]))
        for i in range(X_test.shape[0]):
            label[i] = sorted(self.calculate_probabilities(X_test[i,:]).items(), key=lambda x: x[-1])[-1][0]
        
        return label

    def score(self, X_test, y_test):
        right = 0
        for X, y in zip(X_test, y_test):
            label = self.predict(X)
            if label == y:
                right += 1

        return right / float(len(X_test))

**注意下边的代码中，数据集是按0到9的顺序排列的，因此才会出现取小的数据量时候更准确的情况(在分解数据时会只去一部分相同的数据)，这只是一个错觉，实际使用的过程中一定要将数据打乱的**


In [61]:
from sklearn.naive_bayes import GaussianNB#高斯朴素贝叶斯算法
from sklearn.decomposition import PCA#主成分分析
from sklearn.datasets import fetch_mldata
 
mnist = fetch_mldata('MNIST original',data_home="E:\scikit_learn_data")
print(mnist.keys())

X, y = mnist["data"], mnist["target"]
print(X.shape,y.shape)

# plt.imshow(X[0].reshape(28,28), cmap = matplotlib.cm.binary,interpolation="nearest")

X_reduced = PCA(n_components=40).fit_transform(X)#特征提取
X_train_d, X_test_d, y_train_d, y_test_d = train_test_split(X_reduced, y, test_size=0.1, random_state=1)

gate = 10000

model_d = MVGaussion()
model_d.train(X_train_d[0:gate], y_train_d[0:gate])
y_pred_d = model_d.predict(X_test_d)

print("USPS:Number of mislabeled points out of a total %d points : %d, Acc: %f%%"
      %(X_test_d.shape[0], (y_test_d != y_pred_d).sum(),100*(y_test_d == y_pred_d).sum()/X_test_d.shape[0]))

(70000, 784) (70000,)
USPS:Number of mislabeled points out of a total 7000 points : 321, Acc: 95.414286%


In [57]:
print(y_test_d.shape)

(7000,)


**经分析可得，改变数据训练量和主成分分析的生成数量均可对测试精度造成影响，训练数据越大，总体是越精确但是可能会出现过拟合的现象；主成分分析的生成数量不能太小，也不能太大，只有在一个合适的值时候才能得到最大精度**

下面用多变量高斯测试CIFAR数据集

In [68]:
def unpickle(file):
    import pickle
    with open(file, 'rb') as fo:
        dict = pickle.load(fo, encoding='bytes')
    return dict
cifar_batch1 = unpickle("E:\scikit_learn_data\cifar-10-batches-py\data_batch_1")
cifar_batch2 = unpickle("E:\scikit_learn_data\cifar-10-batches-py\data_batch_2")
cifar_batch3 = unpickle("E:\scikit_learn_data\cifar-10-batches-py\data_batch_3")
cifar_batch4 = unpickle("E:\scikit_learn_data\cifar-10-batches-py\data_batch_4")
cifar_batch5 = unpickle("E:\scikit_learn_data\cifar-10-batches-py\data_batch_5")
cifar_test = unpickle("E:\scikit_learn_data\cifar-10-batches-py\\test_batch")
cifar_batch_meta = unpickle("E:\scikit_learn_data\cifar-10-batches-py\\batches.meta")

X_c,y_c = cifar_batch1[b'data'],cifar_batch1[b'labels']
print(X_c.shape)

X_c_reduced = PCA(n_components=40).fit_transform(X_c)#特征提取
X_c_train_d, X_c_test_d, y_c_train_d, y_c_test_d = train_test_split(X_c_reduced, y_c, test_size=0.3, random_state=1)
print(X_c_train_d.shape)
print(X_c_test_d.shape)

gate = 1000

model_c_d = MVGaussion()
model_c_d.train(X_c_train_d, y_c_train_d)
y_c_pred_d = model_c_d.predict(X_c_test_d)

print("USPS:Number of mislabeled points out of a total %d points : %d, Acc: %f%%"
      %(X_c_test_d.shape[0], (np.array(y_c_test_d) != np.array(y_c_pred_d)).sum(),100*(np.array(y_c_test_d) == np.array(y_c_pred_d)).sum()/X_c_test_d.shape[0]))

(10000, 3072)
(7000, 40)
(3000, 40)
USPS:Number of mislabeled points out of a total 3000 points : 1676, Acc: 44.133333%


In [59]:
x = [6.0, 9.0,3.0] 
y = [6.0, 9.0,2.0]
print(np.array(x)==np.array(y))

[ True  True False]
