In [1]:
"""生成示例数据
"""
import pandas as pd


def create_data():
    """Build the 15-row toy dataset used by the naive Bayes walkthrough.

    Returns:
        A pandas DataFrame with the columns ``labels``, ``x`` and ``y``
        (in that order), where every cell is a one-character string.
    """
    # One character per sample; the three strings are aligned by position.
    class_column = "A" * 8 + "B" * 7
    x_column = "rgrbggrrbggrbbg"
    y_column = "mslsmsmsmllsmml"
    rows = list(zip(class_column, x_column, y_column))
    return pd.DataFrame(rows, columns=["labels", "x", "y"])

In [2]:
"""加载并预览数据
"""
# Build the toy dataset and show it as this cell's output.
data = create_data()
data

Unnamed: 0,labels,x,y
0,A,r,m
1,A,g,s
2,A,r,l
3,A,b,s
4,A,g,m
5,A,g,s
6,A,r,m
7,A,r,s
8,B,b,m
9,B,g,l


In [3]:
"""P(种类) 先验概率计算
"""


def get_P_labels(labels):
    """Compute the prior probability of every class label.

    Args:
        labels: Iterable of hashable class labels (for example a pandas
            Series or a list), one entry per sample.

    Returns:
        dict mapping each distinct label to ``count(label) / len(labels)``.
        Keys appear in order of first occurrence. An empty input yields an
        empty dict.
    """
    labels = list(labels)  # materialise so the length is known
    total = float(len(labels))
    # Single counting pass instead of calling list.count() for every element,
    # which rescanned the whole list once per sample.
    counts = {}
    for label in labels:
        counts[label] = counts.get(label, 0) + 1
    # p = count(y) / count(Y)
    return {label: count / total for label, count in counts.items()}


# Prior probability of each class, computed from the "labels" column.
P_labels = get_P_labels(data["labels"])
P_labels

{'A': 0.5333333333333333, 'B': 0.4666666666666667}

In [4]:
"""导入特征数据并预览
"""
import numpy as np

# Every column after the first one, converted to a NumPy array of features.
train_data = np.array(data.iloc[:, 1:])
train_data

array([['r', 'm'],
       ['g', 's'],
       ['r', 'l'],
       ['b', 's'],
       ['g', 'm'],
       ['g', 's'],
       ['r', 'm'],
       ['r', 's'],
       ['b', 'm'],
       ['g', 'l'],
       ['g', 'l'],
       ['r', 's'],
       ['b', 'm'],
       ['b', 'm'],
       ['g', 'l']], dtype=object)

In [5]:
"""类别 A,B 索引
"""
labels = data["labels"]
# For each class key in P_labels, collect the positions of the samples that
# carry that class; enumerate() over the Series yields (position, value).
label_index = [
    [position for position, value in enumerate(labels) if value == target]
    for target in P_labels.keys()
]
label_index

[[0, 1, 2, 3, 4, 5, 6, 7], [8, 9, 10, 11, 12, 13, 14]]

In [6]:
"""特征 x 为 r 的索引
"""
# Positions of the rows whose first feature (column 0) equals 'r'; this is the
# same idea as the class-index loop, written out step by step.
x_index = []
for row_number, value in enumerate(train_data[:, 0]):
    if value == 'r':
        x_index.append(row_number)
x_index

[0, 2, 6, 7, 11]

In [7]:
# Rows that both have x == 'r' and belong to the first class in label_index.
x_label = set(x_index).intersection(label_index[0])
x_label_count = len(x_label)
class_size = float(len(label_index[0]))  # number of samples in that class
print('既符合 x = r 又是 A 类别的索引值：', x_label)
print('先验概率 P(r|A):', x_label_count / class_size)

既符合 x = r 又是 A 类别的索引值： {0, 2, 6, 7}
先验概率 P(r|A): 0.5


In [8]:
"""P(特征∣种类) 条件概率（似然）计算
"""


def get_P_fea_lab(P_label, features, data):
    """Estimate P(feature value | label) for one sample's feature values.

    Args:
        P_label: dict whose keys are the class labels to evaluate (the
            values are not used here).
        features: Sequence of feature values; ``features[j]`` is compared
            against the j-th feature column of ``data``.
        data: pandas DataFrame with a ``"labels"`` column; every other
            column is treated as a feature, in column order.

    Returns:
        dict keyed by ``"<feature>|<label>"`` whose value is the fraction of
        the samples with that label whose j-th feature equals
        ``features[j]``.

    Raises:
        ZeroDivisionError: if a key of ``P_label`` never occurs in
            ``data["labels"]`` and ``features`` is non-empty.
    """
    labels = list(data["labels"])
    # Pick the feature columns by name rather than by position, so the result
    # does not depend on where the "labels" column sits in the frame.
    feature_matrix = np.array(data.loc[:, data.columns != "labels"])

    P_fea_lab = {}
    for each_label in P_label.keys():
        # Positions of all samples that carry this label.
        label_rows = [i for i, label in enumerate(labels)
                      if label == each_label]
        n_rows = float(len(label_rows))
        for j, value in enumerate(features):
            # Only this label's rows need checking; the original rescanned
            # the whole column for every label.
            matches = sum(1 for i in label_rows
                          if feature_matrix[i, j] == value)
            key = str(value) + '|' + str(each_label)
            P_fea_lab[key] = matches / n_rows
    return P_fea_lab


# Query sample: x == 'r' and y == 'm'; show its conditional probabilities.
features = ['r', 'm']
get_P_fea_lab(P_labels, features, data)

{'m|A': 0.375,
 'm|B': 0.42857142857142855,
 'r|A': 0.5,
 'r|B': 0.14285714285714285}

In [9]:
"""朴素贝叶斯分类器
"""


def classify(data, features):
    """Classify one sample with the hand-written naive Bayes model.

    Args:
        data: Training DataFrame containing a ``'labels'`` column.
        features: Feature values of the sample to classify, in the order
            expected by ``get_P_fea_lab``.

    Returns:
        The label with the largest unnormalised posterior
        (prior times the product of the per-feature conditionals).

    Side effects:
        Prints a dict keyed by ``"<label>|<features>"`` holding that product
        for every label that had at least one feature multiplied in.
    """
    priors = get_P_labels(data['labels'])
    conditionals = get_P_fea_lab(priors, features, data)

    scores = {}
    shown = {}  # what gets printed: the numerator of the posterior
    for each_label, prior in priors.items():
        running = prior
        display_key = str(each_label) + '|' + str(features)
        for each_feature in features:
            # The denominator is the same for every label, so comparing the
            # numerators is enough.
            running = running * \
                conditionals[str(each_feature) + '|' + str(each_label)]
            shown[display_key] = running
        scores[each_label] = running
    print(shown)
    # Label whose score is the largest.
    return max(scores, key=scores.get)

In [10]:
# Classify the sample x == 'r', y == 'm' using the toy dataset.
classify(data, ['r', 'm'])

{"A|['r', 'm']": 0.1, "B|['r', 'm']": 0.02857142857142857}


'A'

In [11]:
!wget http://labfile.oss.aliyuncs.com/courses/1081/course-10-company.csv

--2018-09-24 22:16:32--  http://labfile.oss.aliyuncs.com/courses/1081/course-10-company.csv
Resolving labfile.oss.aliyuncs.com (labfile.oss.aliyuncs.com)... 118.178.161.16
Connecting to labfile.oss.aliyuncs.com (labfile.oss.aliyuncs.com)|118.178.161.16|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3999 (3.9K) [text/csv]
Saving to: 'course-10-company.csv'

     0K ...                                                   100%  600K=0.007s

2018-09-24 22:16:33 (600 KB/s) - 'course-10-company.csv' saved [3999/3999]



In [12]:
"""导入数据集并预览
"""
import pandas as pd

# Read the downloaded CSV from the working directory and preview it.
enterprise_data = pd.read_csv('course-10-company.csv')
enterprise_data.head()

Unnamed: 0,industrial_risk,management_risk,finacial_flexibility,credibility,competitiveness,operating_risk,label
0,P,P,A,A,A,P,NB
1,N,N,A,A,A,N,NB
2,A,A,A,A,A,A,NB
3,P,P,P,P,P,P,NB
4,N,N,P,P,P,N,NB


In [13]:
# Map the string codes to integers across the whole frame:
# P/A/N -> 1/2/3 and NB/B -> 0/1 (NB appears in the label column preview).
enterprise_data = enterprise_data.replace(
    {"P": 1, "A": 2, "N": 3, "NB": 0, "B": 1})  # replace element values
enterprise_data

Unnamed: 0,industrial_risk,management_risk,finacial_flexibility,credibility,competitiveness,operating_risk,label
0,1,1,2,2,2,1,0
1,3,3,2,2,2,3,0
2,2,2,2,2,2,2,0
3,1,1,1,1,1,1,0
4,3,3,1,1,1,3,0
5,2,2,1,1,1,2,0
6,1,1,2,1,1,1,0
7,1,1,1,2,2,1,0
8,1,1,2,1,2,1,0
9,1,1,2,2,1,1,0


In [14]:
"""数据集划分
"""
from sklearn.model_selection import train_test_split

# Feature columns: every column except the last one
# (industrial_risk, management_risk, ...).
feature_data = enterprise_data.iloc[:, :-1]
label_data = enterprise_data["label"]  # the "label" column is the target
x_train, x_test, y_train, y_test = train_test_split(
    feature_data, label_data, test_size=0.3, random_state=4)

x_test  # display the test-set features

Unnamed: 0,industrial_risk,management_risk,finacial_flexibility,credibility,competitiveness,operating_risk
33,3,1,2,1,2,1
213,2,3,3,3,3,3
39,2,2,2,1,2,1
6,1,1,2,1,1,1
101,2,2,2,2,2,1
206,3,2,3,3,3,1
240,3,3,3,2,3,2
71,1,1,2,1,1,1
106,2,1,3,1,2,1
11,1,1,1,2,1,1


In [15]:
"""利用 scikit-learn 构建多项式朴素贝叶斯分类器
"""
from sklearn.naive_bayes import MultinomialNB


def sk_classfy(x_train, y_train, x_test):
    """Fit a multinomial naive Bayes model and predict labels for x_test.

    Args:
        x_train: Training features.
        y_train: Training labels.
        x_test: Features to predict.

    Returns:
        The predictions produced by the fitted classifier for ``x_test``.
    """
    # alpha=1.0 is the additive smoothing strength; fit_prior=True asks the
    # model to learn the class priors from the training labels.
    model = MultinomialNB(alpha=1.0, fit_prior=True)
    predictions = model.fit(x_train, y_train).predict(x_test)
    return predictions


# Train on the training split and predict the labels of the test split.
y_predict = sk_classfy(x_train, y_train, x_test)
y_predict

array([0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0,
       1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1,
       0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0], dtype=int64)

In [16]:
"""准确率计算
"""


def get_accuracy(test_labels, pred_lables):
    """Return the fraction of predictions that equal the true labels.

    Both inputs are converted to NumPy arrays and compared position by
    position, so plain lists, arrays and pandas Series all work (a Series
    index is ignored).

    Args:
        test_labels: True labels, one per sample.
        pred_lables: Predicted labels, same length as ``test_labels``.

    Returns:
        Number of matching positions divided by the number of samples.

    Raises:
        ValueError: if the inputs differ in shape or are empty.
    """
    actual = np.asarray(test_labels)
    predicted = np.asarray(pred_lables)
    if actual.shape != predicted.shape:
        raise ValueError(
            "test_labels and pred_lables must have the same shape, got "
            "%s and %s" % (actual.shape, predicted.shape))
    n = len(actual)  # total number of test samples
    if n == 0:
        raise ValueError("cannot compute accuracy of an empty label set")
    # Element-wise comparison; on plain lists `==` would give a single bool.
    correct = np.sum(actual == predicted)  # number of correct predictions
    accur = correct / n
    return accur


# Accuracy of the scikit-learn predictions on the held-out test split.
get_accuracy(y_test, y_predict)

0.7866666666666666