# 高斯朴素贝叶斯分类器

贝叶斯定理建立在条件概率的基础之上。条件概率可以帮助我们根据已经发生的事件来计算另一事件发生的概率。高斯朴素贝叶斯算法是朴素贝叶斯（NB）算法的一种特殊类型，专门用于特征取连续值的情形。它同时假定所有特征都服从高斯分布，即正态分布。

# Census Income Dataset

The Census Income dataset is used to predict whether a person's income is greater than $50K/yr (>$50K/yr) or at most $50K/yr (<=$50K/yr). The data was collected by Barry Becker from the 1994 Census [dataset](https://archive.ics.uci.edu/ml/datasets/Adult).

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import Imputer
from sklearn import preprocessing
from sklearn.cross_validation import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn import metrics

In [None]:
# Load the raw data file, which has no header row. The ' *, *' separator regex
# also consumes the spaces surrounding each value.
DATA_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
COLUMN_NAMES = [
    'age', 'workclass', 'fnlwgt', 'education', 'education_num',
    'marital_status', 'occupation', 'relationship', 'race', 'sex',
    'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
    'income',
]
adult_df = pd.read_csv(DATA_URL, header=None, sep=' *, *', engine='python')
adult_df.columns = COLUMN_NAMES

In [None]:
# Preview the first rows of the freshly loaded frame.
adult_df.head()

# 处理缺失数据

In [None]:
adult_df.isnull().sum()  # count the null values in each column of the data set

In [None]:
# Check whether any categorical attribute contains the "?" placeholder and
# print the number of occurrences per column.
# The count uses the vectorised Series.sum() instead of the builtin sum(), and
# print() is called with a single pre-formatted string so the output is the
# same ("<column> : <count>") on both Python 2 and Python 3.
for value in ['workclass', 'education', 'marital_status', 'occupation', 'relationship',
              'race', 'sex', 'native_country', 'income']:
    print("%s : %d" % (value, (adult_df[value] == '?').sum()))

In [None]:
# NOTE: plain assignment binds a second name to the same DataFrame object; it
# does not make a copy. In-place edits made through adult_df_rev are therefore
# also visible through adult_df.
adult_df_rev = adult_df
# Summary statistics for every column, numeric and non-numeric alike.
adult_df_rev.describe(include='all')

In [None]:
# Replace the '?' placeholder in each categorical column with that column's
# most frequent value, taken from the 'top' row of describe().
categorical_columns = ['workclass', 'education', 'marital_status', 'occupation',
                       'relationship', 'race', 'sex', 'native_country', 'income']
# describe() is computed once instead of once per column, and the 'top' row is
# looked up by label rather than by its position in the summary.
column_summary = adult_df_rev[categorical_columns].describe(include='all')
for value in categorical_columns:
    # Assign the result back to the frame instead of using inplace='True':
    # `inplace` must be a real bool (a string is rejected by current pandas),
    # and an in-place replace on a selected column is not guaranteed to write
    # through to the parent frame.
    adult_df_rev[value] = adult_df_rev[value].replace('?', column_summary.loc['top', value])

In [None]:
# Preview the first rows after the '?' replacement step.
adult_df_rev.head()

# 对所有标签进行编码，其值介于0和n_classes-1之间

In [None]:
# A single LabelEncoder is refit for each categorical column in turn;
# fit_transform returns that column's integer codes. After this cell `le`
# holds the fit of the last column processed (native_country).
le = preprocessing.LabelEncoder()
(workclass_cat,
 education_cat,
 marital_cat,
 occupation_cat,
 relationship_cat,
 race_cat,
 sex_cat,
 native_country_cat) = [
    le.fit_transform(adult_df[source_column])
    for source_column in ('workclass', 'education', 'marital_status',
                          'occupation', 'relationship', 'race', 'sex',
                          'native_country')
]

In [None]:
# Append every encoded array to the frame as a new '<name>_cat' column, in the
# same order in which the arrays were produced.
encoded_columns = [
    ('workclass_cat', workclass_cat),
    ('education_cat', education_cat),
    ('marital_cat', marital_cat),
    ('occupation_cat', occupation_cat),
    ('relationship_cat', relationship_cat),
    ('race_cat', race_cat),
    ('sex_cat', sex_cat),
    ('native_country_cat', native_country_cat),
]
for encoded_name, encoded_values in encoded_columns:
    adult_df_rev[encoded_name] = encoded_values

In [None]:
# Drop the original string-valued columns now that their encoded '_cat'
# counterparts exist. drop() returns a new frame, which is rebound to
# adult_df_rev.
dummy_fields = [
    'workclass',
    'education',
    'marital_status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native_country',
]
adult_df_rev = adult_df_rev.drop(labels=dummy_fields, axis='columns')

In [None]:
# Preview the first rows after the string-valued columns were dropped.
adult_df_rev.head()

In [None]:
# Put the columns back into the original attribute order, with each encoded
# '_cat' column in the position of the column it replaces and 'income' last.
# reindex(columns=...) is used because DataFrame.reindex_axis is deprecated and
# no longer exists in current pandas releases.
adult_df_rev = adult_df_rev.reindex(columns=['age', 'workclass_cat', 'fnlwgt',
                                             'education_cat', 'education_num', 'marital_cat',
                                             'occupation_cat', 'relationship_cat', 'race_cat',
                                             'sex_cat', 'capital_gain', 'capital_loss',
                                             'hours_per_week', 'native_country_cat', 'income'])

In [None]:
# Preview the first rows after the columns were reordered.
adult_df_rev.head()

# 数据标准化

In [None]:
# Standardise every feature column to zero mean and unit standard deviation.
num_features = ['age', 'workclass_cat', 'fnlwgt', 'education_cat', 'education_num', 'marital_cat',
                'occupation_cat', 'relationship_cat', 'race_cat', 'sex_cat', 'capital_gain',
                'capital_loss', 'hours_per_week', 'native_country_cat']
# scaled_features maps each column name to the [mean, std] pair used to scale
# it, so the transformation can be reproduced or inverted later.
scaled_features = {}
for each in num_features:
    mean, std = adult_df_rev[each].mean(), adult_df_rev[each].std()
    scaled_features[each] = [mean, std]
    # Replace the whole column instead of writing through .loc[:, each]:
    # .loc setting tries to store the float results into the existing
    # (integer) column in place, which is a dtype-incompatible assignment in
    # current pandas. Plain column assignment swaps in a new float column.
    adult_df_rev[each] = (adult_df_rev[each] - mean) / std

In [None]:
# Preview the first rows after standardisation.
adult_df_rev.head()

# 数据切片

In [None]:
# Separate the first 14 columns (the features) from column 14 (the target).
# The columns are selected with iloc *before* calling .values: calling .values
# on the whole frame first would mix the string-valued target column with the
# numeric ones and produce a dtype=object feature array.
features = adult_df_rev.iloc[:, :14].values
target = adult_df_rev.iloc[:, 14].values
# Hold out 33% of the rows for testing; random_state fixes the shuffle so the
# split is reproducible.
features_train, features_test, target_train, target_test = train_test_split(
    features, target, test_size=0.33, random_state=10)

# 高斯朴素贝叶斯实现

In [None]:
# Fit a Gaussian naive Bayes model on the training split (fit() returns the
# estimator itself), then predict a label for every test row.
clf = GaussianNB().fit(features_train, target_train)
target_pred = clf.predict(features_test)

# 高斯朴素贝叶斯模型的准确性

In [None]:
# Accuracy on the test split; normalize=True returns the fraction of correctly
# classified samples rather than their count.
accuracy_score(target_test, target_pred, normalize=True)

In [None]:
# Class-membership probabilities for every test row; the columns follow the
# order of clf.classes_.
target_pred_score = clf.predict_proba(features_test)
# Look up the probability column of the positive class instead of hard-coding
# index 1, so the ROC curve stays correct whatever the class ordering is.
# list.index raises ValueError if the label was never seen during fitting.
positive_label = '>50K'
positive_column = list(clf.classes_).index(positive_label)
fpr, tpr, thresholds = metrics.roc_curve(target_test,
                                         target_pred_score[:, positive_column],
                                         pos_label=positive_label)
# Area under the ROC curve.
auc = metrics.auc(fpr, tpr)
auc

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
plt.title('ROC')
plt.plot(fpr, tpr, color='darkorange', label="AUC = %0.2f" % auc)
plt.plot([0, 1], [0, 1], color='navy', linestyle='--')
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.xlim([-0.02,1.0])
plt.ylim([0.0,1.02])
plt.legend(loc="lower right", prop={'size':8});