# 准备工作

In [1]:
# 导入相应的包
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
# 导入数据
pima = pd.read_csv('./dataset/pima/pima.csv')

In [3]:
pima.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
def split_data(data, ratio=0.8):
    index = np.random.permutation(data.shape[0])
    x_traing, y_traing = data.loc[index[:int(len(index)*0.8)]].iloc[:,:data.shape[1]-1], data.loc[index[:int(len(index)*0.8)]].iloc[:,data.shape[1]-1] 
    x_test, y_test = data.loc[index[int(len(index)*0.8):]].iloc[:,:data.shape[1]-1], data.loc[index[int(len(index)*0.8):]].iloc[:,data.shape[1]-1] 
    return x_traing, y_traing, x_test, y_test

In [5]:
x_traing, y_traing, x_test, y_test = split_data(pima)

In [6]:
x_traing.shape

(614, 8)

In [7]:
y_traing.shape

(614,)

In [8]:
x_test.shape

(154, 8)

In [9]:
y_test.shape

(154,)

In [10]:
traing_mean = x_traing.groupby(y_traing).mean()# 训练集患病的特征平均值和没患病的特征平均值

In [11]:
traing_std = x_traing.groupby(y_traing).std()# 训练集患病特征的标准差和没患病的特征的标准差

In [12]:
traing_mean

Unnamed: 0_level_0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
Outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,3.276961,111.235294,68.45098,19.294118,67.558824,30.341912,0.422833,31.129902
1,4.961165,140.31068,71.334951,21.985437,96.65534,35.311165,0.52335,37.514563


In [13]:
traing_std

Unnamed: 0_level_0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
Outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,2.993729,26.513054,18.433293,15.001614,96.069035,7.935615,0.301809,11.523943
1,3.803522,32.461769,21.489179,17.648182,133.178745,7.046682,0.343938,10.99256


In [14]:
p_0 = np.sum(y_traing == 0)/y_traing.shape[0]#没患病的概率

In [15]:
p_1 = np.sum(y_traing == 1)/y_traing.shape[0]# 患病的概率

In [16]:
pra = [p_0, p_1]

In [17]:
# 用高斯分布计算概率分布密度
def pdf(mean, std, x):
    e_part = np.power(np.e, - np.square(np.tile(x, (mean.shape[0], 1)) - mean)/(2* np.square(std)))
    return e_part.div((np.sqrt(2 * np.pi) * std))

In [18]:
test = pd.DataFrame(pima.head(1).iloc[:,:pima.shape[1]-1])
test


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,33.6,0.627,50


In [19]:
p = pdf(traing_mean, traing_std, test)
p = p.cumprod(1).iloc[:,p.shape[1]-1]
p

Outcome
0    2.363274e-13
1    1.030595e-12
Name: Age, dtype: float64

In [20]:
p.idxmax(axis=0)

1

In [21]:
def predict(data):
    x_traing, y_traing, x_test, y_test = split_data(data)
    traing_mean, traing_std = x_traing.groupby(y_traing).mean(), x_traing.groupby(y_traing).std()
    p_0 = np.sum(y_traing == 0)/y_traing.shape[0]#没患病的概率
    p_1 = np.sum(y_traing == 1)/y_traing.shape[0]# 患病的概率
    pra = [p_0, p_1]
    count = 0
    for index, test in x_test.iterrows():
        pd = pdf(traing_mean, traing_std, test)
        pd = pd.cumprod(1).iloc[:, pd.shape[1]-1]
        pd = pd*pra
        pred = pd.idxmax(axis=0)
        if pred == y_test.loc[index]:
            count+=1
    print('acc:%f%%' %int(count*100/y_test.shape[0]))

In [22]:
predict(pima)

acc:69.000000%


# 使用sklearn

In [23]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

X_train, X_test, y_train, y_test = train_test_split(pima.iloc[:,:pima.shape[1]-1], pima.iloc[:,pima.shape[1]-1], test_size=0.2)

clf = GaussianNB()
clf.fit(X_train, y_train)
cross_val_score(clf, X_test, y_test, cv=3)

array([0.67307692, 0.71153846, 0.68      ])