# 准备工作

In [1]:
# 导入相应的包
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
# 导入数据
pima = pd.read_csv('./dataset/pima/pima.csv')

In [3]:
pima.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
def split_data(data, ratio=0.8):
    index = np.random.permutation(data.shape[0])
    x_traing, y_traing = data.loc[index[:int(len(index)*0.8)]].iloc[:,:data.shape[1]-1], data.loc[index[:int(len(index)*0.8)]].iloc[:,data.shape[1]-1] 
    x_test, y_test = data.loc[index[int(len(index)*0.8):]].iloc[:,:data.shape[1]-1], data.loc[index[int(len(index)*0.8):]].iloc[:,data.shape[1]-1] 
    return x_traing, y_traing, x_test, y_test

In [5]:
x_traing, y_traing, x_test, y_test = split_data(pima)

In [6]:
x_traing.shape

(614, 8)

In [7]:
y_traing.shape

(614,)

In [8]:
x_test.shape

(154, 8)

In [9]:
y_test.shape

(154,)

In [10]:
traing_mean = x_traing.groupby(y_traing).mean()# 训练集患病的特征平均值和没患病的特征平均值

In [11]:
traing_std = x_traing.groupby(y_traing).std()# 训练集患病特征的标准差和没患病的特征的标准差

In [12]:
traing_mean

Unnamed: 0_level_0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
Outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,3.405542,109.062972,68.40806,19.556675,67.292191,30.356423,0.433214,31.370277
1,4.981567,140.62212,69.488479,21.585253,91.465438,34.745622,0.55524,37.437788


In [13]:
traing_std

Unnamed: 0_level_0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
Outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,3.060711,25.708222,17.719753,14.892881,93.120674,7.437469,0.312574,11.578298
1,3.785894,32.443756,22.645029,17.999572,132.486233,7.08262,0.390773,10.953904


In [14]:
p_0 = np.sum(y_traing == 0)/y_traing.shape[0]#没患病的概率

In [15]:
p_1 = np.sum(y_traing == 1)/y_traing.shape[0]# 患病的概率

In [16]:
pra = [p_0, p_1]

In [17]:
# 用高斯分布计算概率分布密度
def pdf(mean, std, x):
    e_part = np.power(np.e, - np.square(np.tile(x, (mean.shape[0], 1)) - mean)/(2* np.square(std)))
    return e_part.div((np.sqrt(2 * np.pi) * std))

In [18]:
test = pd.DataFrame(pima.head(1).iloc[:,:pima.shape[1]-1])
test


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,33.6,0.627,50


In [19]:
p = pdf(traing_mean, traing_std, test)
p = p.cumprod(1).iloc[:,p.shape[1]-1]
p

Outcome
0    2.477261e-13
1    8.944695e-13
Name: Age, dtype: float64

In [20]:
p.idxmax(axis=0)

1

In [21]:
def predict(data):
    x_traing, y_traing, x_test, y_test = split_data(data)
    traing_mean, traing_std = x_traing.groupby(y_traing).mean(), x_traing.groupby(y_traing).std()
    p_0 = np.sum(y_traing == 0)/y_traing.shape[0]#没患病的概率
    p_1 = np.sum(y_traing == 1)/y_traing.shape[0]# 患病的概率
    pra = [p_0, p_1]
    count = 0
    for index, test in x_test.iterrows():
        pd = pdf(traing_mean, traing_std, test)
        pd = pd.cumprod(1).iloc[:, pd.shape[1]-1]
        pd = pd*pra
        pred = pd.idxmax(axis=0)
        if pred == y_test.loc[index]:
            count+=1
    print('acc:%f%%' %int(count*100/y_test.shape[0]))

In [22]:
predict(pima)

acc:72.000000%


# 使用sklearn

In [23]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

X_train, X_test, y_train, y_test = train_test_split(pima.iloc[:,:pima.shape[1]-1], pima.iloc[:,pima.shape[1]-1], test_size=0.2)

clf = GaussianNB()
clf.fit(X_train, y_train)
cross_val_score(clf, X_test, y_test, cv=3)

array([0.73076923, 0.80769231, 0.8       ])

# 使用贝叶斯处理非结构化数据

In [24]:
import os

In [25]:
def input_data_file(path='./dataset/txt_sentoken/'):
    data_file = pd.DataFrame()
    count = 0
    for dirpath, dirnames, filenames in os.walk(path):
        if filenames is not None:
            files = pd.DataFrame()
            files['file_name'] = filenames
            files['dir'] = [dirpath]*len(filenames)
            files['class'] = [dirpath.split('/')[-1]] * len(filenames)
            data_file = data_file.append(files, ignore_index=True)
    return data_file
    

In [26]:
data_file = input_data_file()

In [27]:
x_training_file, y_training_file, x_test_file, y_test_file = split_data(data_file)

In [28]:
x_training_file.shape

(1600, 2)

In [29]:
y_training_file.shape

(1600,)

In [30]:
x_test_file.shape

(400, 2)

In [31]:
y_test_file.shape

(400,)

In [32]:
y_training_file.groupby(y_training_file).count()

class
neg    792
pos    808
Name: class, dtype: int64

In [33]:
#计算类别的概率
total_ratio = y_training_file.groupby(y_training_file).count()/y_training_file.shape[0]
total_ratio

class
neg    0.495
pos    0.505
Name: class, dtype: float64

In [34]:
def count_vocabulary(data):
    """计算总的词汇表
    """
    vocabularys = pd.DataFrame()
    count = 0
    for index, row in data.iterrows():
        count +=1
        print('%.2f %%' %(count*100/data.shape[0]), end='\r')
        with open (row['dir']+'/'+row['file_name'], 'r') as file:
            for line in file:
                tokens = line.split()
                for token in tokens:
                    token = token.strip('\'".,?:-')
                    token = token.lower()
                    if not token  in vocabularys.index and token != '':
                        vocabularys = vocabularys.append(pd.DataFrame([[1]],index=[token]))
                    elif token != '':
                        vocabularys.loc[token] += 1
    return vocabularys

In [35]:
#计算词汇表
vocabularys = count_vocabulary(x_training_file)

100.00 %

In [36]:
vocabularys.shape

(44842, 1)

In [37]:
for index, file in x_training_file.groupby(y_training_file):
    # 计算每个类别对应的单词个数
    data = count_vocabulary(file)
    vocabularys[index] = data

100.00 %

In [38]:
vocabularys = vocabularys.fillna(0)

In [39]:
#计算概率，由于考虑到概率很小，转换为log,后续连乘用加法 P(W|h)
vocabularys['neg'] = np.log((vocabularys['neg'] + 1)/ (np.sum(vocabularys['neg'])+vocabularys.shape[0]))
vocabularys['pos'] = np.log((vocabularys['pos'] + 1)/ (np.sum(vocabularys['pos'])+vocabularys.shape[0]))
vocabularys.head(5)

Unnamed: 0,0,neg,pos
ladies,23,-10.711235,-10.763216
and,28299,-3.779682,-3.647072
gentlemen,9,-11.250231,-11.941871
1997's,22,-10.557084,-11.02558
independence,53,-9.938045,-9.96087


In [40]:
result = 0
count = 0
pro = 0
# 进行预测  
for index, row in x_test_file.iterrows():
    pro+=1
    print('%.2f%%' % (int(pro*100)/x_test_file.shape[0]) ,end='\r')
    with open(row['dir']+'/'+row['file_name'], 'r') as file:
        result = np.log(total_ratio)
        for line in file:
            tokens = line.split()
            for token in tokens:
                if token in vocabularys.index:
                    result += vocabularys.loc[token][['neg', 'pos']]
        if result.idxmax(axis=0) == y_test_file.loc[index]:
            count+=1
print('\nacc:%.2f%%'% int(count*100/y_test_file.shape[0]))
        

100.00%
acc:78.00%
