# 朴素贝叶斯
* 高斯模型
* 多项式模型
* 伯努利模型

### 高斯模型
- 假设在给定类别的条件下，每个特征的取值（即似然）服从高斯分布（正态分布）
- 概率密度函数为：$$P(x_i \mid y_k)=\frac{1}{\sqrt{2\pi\sigma^2_{y_k,i}}}\exp\left(-\frac{(x_i-\mu_{y_k,i})^2}{2\sigma^2_{y_k,i}}\right)$$ 其中 $\mu_{y_k,i}$ 和 $\sigma^2_{y_k,i}$ 分别是类别 $y_k$ 下第 $i$ 个特征的均值和方差

### 1、不用sklearn来实现

In [3]:
import csv
import random
import math

* 数据集为 wine.data.csv, 在该数据集中，每一行为一条数据，共178行，每一行的第一个数为类别，后面13项为特征值

In [9]:
# Column index of the class label inside every data row (the label is column 0).
class_index = 0
# Name of the CSV file that holds the wine data set.
filename = "wine.data.csv"

In [10]:
# Load the data set
def load_csv(filename):
    """Read a purely numeric CSV file into a list of rows of floats.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list with one entry per non-empty line of the file; each entry is
        the list of that line's fields converted with ``float``.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a field cannot be converted to ``float``.
    """
    # ``with`` closes the handle even when a conversion fails, and
    # ``newline=''`` is the mode the csv module asks for on text files.
    with open(filename, 'r', newline='') as csv_file:
        # csv.reader yields [] for blank lines (e.g. a trailing newline);
        # such rows carry no data and would break later indexing, so skip them.
        return [
            [float(value) for value in row]
            for row in csv.reader(csv_file)
            if row
        ]

In [17]:
# Load the whole data set, then report how many rows it has, how many values
# each row holds, and the class label (column 0) of the first row.
dataset = load_csv(filename)
print("数据个数为：{}".format(len(dataset)))
print("每条数据长度为：{}".format(len(dataset[0])))
print("第一条数据的类别为：{}".format(dataset[0][0]))
# The class labels in this data set are 1, 2 and 3.

数据个数为：178
每条数据长度为：14
第一条数据的类别为：1.0


In [8]:
# Split the data into a training set and a test set. spiltRatio is a fraction:
# e.g. 0.67 puts 67% of the rows in the training set and the rest in the test set.
def split_dataset(dataset, spiltRatio):
    """Randomly partition *dataset* into a training part and a test part.

    Args:
        dataset: Sequence of rows to split; it is not modified.
        spiltRatio: Fraction of the rows that go into the training part.

    Returns:
        ``[train_rows, test_rows]``. The training rows are in the order they
        were drawn; the test rows keep their original relative order.
    """
    wanted = int(len(dataset) * spiltRatio)
    # Draw from a shallow copy so the caller's list stays intact.
    remaining = list(dataset)
    drawn = []
    for _ in range(wanted):
        pick = random.randrange(len(remaining))
        drawn.append(remaining.pop(pick))
    return [drawn, remaining]

In [21]:
# Put 67% of the rows in the training set and the remainder in the test set,
# then report the size of each part.
spiltRadio = 0.67
trainSet, testSet = split_dataset(dataset, spiltRadio)
print("训练集的长度为：{}".format(len(trainSet)))
print("测试集的长度为：{}".format(len(testSet)))

训练集的长度为：119
测试集的长度为：59


In [22]:
# Group the rows by class, using a dict keyed by class label
def sepreateByClass(dataset):
    """Group the rows of *dataset* by their class label.

    Args:
        dataset: Sequence of rows; the label of a row is ``row[class_index]``.

    Returns:
        A dict mapping each label to the list of complete rows (label
        included) that carry it, in their original order.
    """
    groups = {}
    for row in dataset:
        # setdefault creates the bucket the first time a label shows up.
        groups.setdefault(row[class_index], []).append(row)
    return groups

In [24]:
# Group the training rows by class and report how many distinct classes appear.
se = sepreateByClass(trainSet)
print("训练集的类别数为：{}".format(len(se)))

训练集的类别数为：3


In [25]:
# Mean and standard deviation helpers
def mean(numbers):
    """Return the arithmetic mean of *numbers* as a float."""
    count = float(len(numbers))
    return sum(numbers) / count

def stdev(numbers):
    """Return the sample standard deviation of *numbers*.

    The squared deviations are divided by ``len(numbers) - 1``, so at least
    two values are required.
    """
    center = mean(numbers)
    squared_deviations = ((value - center) ** 2 for value in numbers)
    variance = sum(squared_deviations) / float(len(numbers) - 1)
    return math.sqrt(variance)

In [26]:
# Extract the Gaussian parameters of a set of rows: the mean and standard
# deviation of every feature column, e.g. [(mean1, stdev1), (mean2, stdev2)]
def summarize(dataset):
    """Return the ``(mean, stdev)`` pair of every feature column of *dataset*.

    Args:
        dataset: Sequence of equally long rows, label column included.

    Returns:
        A list of ``(mean, stdev)`` tuples, one per column in column order,
        with the entry of the label column (``class_index``) removed.
    """
    stats = []
    # zip(*dataset) transposes the rows into columns.
    for column in zip(*dataset):
        stats.append((mean(column), stdev(column)))
    # The label column is not a feature, so its statistics are discarded.
    stats.pop(class_index)
    return stats

def summarizeByClass(dataset):
    """Compute the per-feature ``(mean, stdev)`` pairs separately for each class.

    Args:
        dataset: Sequence of rows, label column included.

    Returns:
        A dict of the form
        ``{label: [(mean1, stdev1), (mean2, stdev2), ...], ...}``.
    """
    rows_by_class = sepreateByClass(dataset)
    return {label: summarize(rows) for label, rows in rows_by_class.items()}

In [27]:
# Estimate the per-class Gaussian parameters from the training set and show
# those of class 1 (the int key 1 finds the float label 1.0 because the two
# compare and hash equal).
summaries = summarizeByClass(trainSet)
print("summaries:{}".format(summaries[1]))

summaries:[(13.737105263157895, 0.46833753071289624), (1.9747368421052627, 0.6523998808644286), (2.436578947368421, 0.22578722722342026), (17.26842105263158, 2.434116218233865), (105.84210526315789, 10.898862613084471), (2.8357894736842097, 0.33925699083754773), (2.998684210526316, 0.36412925424635767), (0.29236842105263156, 0.07492101103653662), (1.933684210526316, 0.4107305360664921), (5.6605263157894745, 1.1563174243153342), (1.051052631578947, 0.1134183710197758), (3.1352631578947374, 0.347895113442724), (1130.921052631579, 195.92764543961292)]


In [28]:
# Gaussian probability density function, taken directly from the formula
def calculateProbability(x, mean, stdev):
    """Evaluate the Gaussian density with the given mean and stdev at *x*.

    Args:
        x: Point at which the density is evaluated.
        mean: Mean of the distribution.
        stdev: Standard deviation of the distribution.

    Returns:
        The density value as a float. When ``stdev`` is 0 the distribution is
        treated as a point mass at ``mean``: 1.0 if ``x == mean``, else 0.0.
    """
    if stdev == 0:
        # A feature that is constant within a class has zero spread; the
        # formula below would divide by zero, so fall back to a point mass.
        return 1.0 if x == mean else 0.0
    exponent = math.exp(-(math.pow(x - mean, 2)) / (2 * math.pow(stdev, 2)))
    return (1 / (math.sqrt(2 * math.pi) * stdev)) * exponent

# Score one row against every class. inputVector is a full data row, i.e. it
# still contains the class label next to the feature values.
def calculateClassProbabilities(summaries, inputVector):
    """Return, for each class, the product of the feature densities of a row.

    Args:
        summaries: Dict mapping each class label to its list of
            ``(mean, stdev)`` pairs, one per feature.
        inputVector: One data row, with the label at position ``class_index``.

    Returns:
        A dict mapping each class label to the product of the Gaussian
        densities of the row's features under that class. The values are
        unnormalised scores and do not include a class prior.
    """
    # Remove the label by position, exactly as summarize() removes the label
    # column, so features and summaries line up for any class_index
    # (previously the label was assumed to be at index 0).
    features = list(inputVector)
    del features[class_index]

    probabilities = {}
    for classValue, classSummaries in summaries.items():
        likelihood = 1
        for i, (mu, sigma) in enumerate(classSummaries):
            likelihood *= calculateProbability(features[i], mu, sigma)
        probabilities[classValue] = likelihood
    # NOTE(review): a product of many small densities can underflow to 0.0;
    # summing logarithms would avoid that but would change the returned values.
    return probabilities

In [29]:
# Score the first test row against every class and show the resulting values.
inputvector = testSet[0]
result = calculateClassProbabilities(summaries, inputvector)
print(result)

{1.0: 5.6021186514362725e-08, 2.0: 3.4710043955901537e-13, 3.0: 1.7194108149444513e-35}


In [30]:
# Single prediction: return the class with the highest score
def predict(summaries, inputVector):
    """Predict the class label of one data row.

    Args:
        summaries: Per-class ``(mean, stdev)`` lists, keyed by class label.
        inputVector: One data row, label included.

    Returns:
        The label whose score is largest; on ties the label that comes first
        in ``summaries`` wins. ``None`` when there are no classes.
    """
    scores = calculateClassProbabilities(summaries, inputVector)
    # max() keeps the first maximal key, and default covers an empty dict.
    return max(scores, key=scores.get, default=None)

# Batch prediction: predict a label for every row
def getPredictions(summaries, testSet):
    """Return the predicted label of every row of *testSet*, in row order.

    Args:
        summaries: Per-class ``(mean, stdev)`` lists, keyed by class label.
        testSet: Sequence of data rows, label included.
    """
    return [predict(summaries, row) for row in testSet]

In [31]:
# Predict a label for every test row and show the prediction for the first one.
predictions = getPredictions(summaries, testSet)
print(predictions[0])

1.0


In [32]:
# Finally, compute the accuracy: a prediction counts as correct when it equals
# the row's label
def getAccuracy(testSet, predictions):
    """Return the percentage of rows whose prediction matches their label.

    Args:
        testSet: Sequence of data rows; the true label is ``row[class_index]``.
        predictions: Predicted labels, aligned by index with *testSet*.

    Returns:
        The accuracy as a float between 0.0 and 100.0.
    """
    hits = sum(
        1
        for position, row in enumerate(testSet)
        if row[class_index] == predictions[position]
    )
    return (hits / float(len(testSet))) * 100.0

In [33]:
# Compare the predictions with the true labels and report the accuracy in percent.
accuracy = getAccuracy(testSet, predictions)
print("Accuracy:{}".format(accuracy))

Accuracy:100.0


### 2、用sklearn实现

In [34]:
from sklearn.naive_bayes import GaussianNB
import numpy as np

In [35]:
# Turn the row lists into 2-D arrays so columns can be sliced directly.
trainSet = np.array(trainSet)
testSet = np.array(testSet)

# Column 0 holds the label; the remaining columns are the features.
labelOftrainSet = trainSet[:, 0]
labelOftestSet = testSet[:, 0]
trainSet = trainSet[:, 1:]
testSet = testSet[:, 1:]

In [36]:
# Fit scikit-learn's Gaussian naive Bayes classifier on the training features and labels.
clf = GaussianNB().fit(trainSet, labelOftrainSet)

In [37]:
# Mean accuracy on the test set; this is a fraction, whereas getAccuracy above
# returns a percentage.
clf.score(testSet, labelOftestSet)

1.0