In [150]:
import os, sys, time
import pandas as pd
import numpy as np

attr_dict = {"age":0, "workclass":1, "fnlwgt":0, "education":1, "education-num":0, "marital-status":1, "occupation":1, "relationship":1, "race":1, "sex":1, "capital-gain":0, "capital-loss":0, "hours-per-week":0, "native-country":1, "salary":0} # 0: continuous, 1: discrete

train_data = pd.read_csv("adult.data",names=attr_dict.keys(),index_col=False)
test_data = pd.read_csv("adult.test",names=attr_dict.keys(),index_col=False,header=0)

def preprocessing(data):
    """
    Select some useful attributes
    """
    # attributes = ["workclass","education","marital-status","occupation","relationship","race","sex","native-country","salary"] # discrete
    attributes = list(attr_dict.keys())
    attributes.remove("fnlwgt")
    attributes.remove("capital-gain")
    attributes.remove("capital-loss")
    return data[attributes]

def fill_data(data,flag=1):
    """
    Fill in missing data (?)
    """
    if flag == 0: # directly remove missing data
        for a in data.columns.values:
            if attr_dict[a]: # discrete
                data = data[data[a] != " ?"] # remove unknown
        return data
    else: # fill data with the most value
        for a in data.columns.values:
            if attr_dict[a]: # discrete
                data.loc[data[a] == " ?",a] = data[a].value_counts().argmax() # view or copy? Use loc!
            else: # continuous
                pass
        return data

# Data cleaning
train_data = preprocessing(train_data)
test_data = preprocessing(test_data)
train_data = fill_data(train_data,1)
test_data = fill_data(test_data,1)

In [151]:
class NaiveBayesClassifier():
    """
    A Naive Bayes Classifier
    """
    def __init__(self,train_data,attr_dict):
        """
        Initialize and calculate the aprior probability
        """
        self.train_data = train_data
        self.attr_dict = attr_dict

        # calculate the aprior probability P(x_i|y)
        self.prob = {}
        self.prob[" >50K"] = train_data["salary"].value_counts(normalize=True)[" >50K"]
        self.prob[" <=50K"] = 1 - self.prob[" >50K"]
        self.attributes = train_data.columns.values[train_data.columns.values != "salary"]
        less_than_50k = train_data[train_data["salary"] == " <=50K"]
        greater_than_50k = train_data[train_data["salary"] == " >50K"]
        for a in self.attributes:
            if self.attr_dict[a]: # discrete
                count_a_less_than_50k = less_than_50k[a].value_counts()
                count_a_greater_than_50k = greater_than_50k[a].value_counts()
                V = len(train_data[a].unique())
                for xi in train_data[a].unique():
                    # laplacian smoothing
                    self.prob[(xi," <=50K")] = (count_a_less_than_50k.get(xi,0) + 1) / (len(less_than_50k) + V)
                    self.prob[(xi," >50K")] = (count_a_greater_than_50k.get(xi,0) + 1) / (len(greater_than_50k) + V)
            else: # continuous
                # use Gaussian aprior
                mu_less_than_50k = np.mean(less_than_50k[a])
                sigma_less_than_50k = np.var(less_than_50k[a])
                self.prob[(a," <=50K")] = lambda x: 1 / np.sqrt(2*np.pi*sigma_less_than_50k) * np.exp(-(x-mu_less_than_50k)**2/(2*sigma_less_than_50k)) # use anonymous function
                mu_greater_than_50k = np.mean(greater_than_50k[a])
                sigma_greater_than_50k = np.var(greater_than_50k[a])
                self.prob[(a," >50K")] = lambda x: 1 / np.sqrt(2*np.pi*sigma_greater_than_50k) * np.exp(-(x-mu_greater_than_50k)**2/(2*sigma_greater_than_50k))

    def predict(self,test_data):
        """
        Predict the salary of test data
        """
        acc = 0
        for i, row in test_data.iterrows():
            # calculate P(y|x_1,...,x_n)
            prod = np.array([self.prob[" <=50K"],self.prob[" >50K"]])
            for a in self.attributes:
                xi = row[a]
                if self.attr_dict[a]: # discrete
                    prod[0] *= self.prob[(xi," <=50K")]
                    prod[1] *= self.prob[(xi," >50K")]
                else: # continuous
                    prod[0] *= self.prob[(a," <=50K")](xi)
                    prod[1] *= self.prob[(a," >50K")](xi)

            # find the catagory with the max probability
            catagory = " <=50K" if prod.argmax() == 0 else " >50K"
            if catagory == row["salary"][:-1]: # be careful of "."
                acc += 1
            if i % 1000 == 0:
                print("Finish {}/{}".format(i,len(test_data)))

        acc /= len(test_data)
        print("Accurary: {:.2f}%".format(acc * 100))
        return acc

In [152]:
nb = NaiveBayesClassifier(train_data,attr_dict)
nb.predict(test_data)

Finish 0/16281
Finish 1000/16281
Finish 2000/16281
Finish 3000/16281
Finish 4000/16281
Finish 5000/16281
Finish 6000/16281
Finish 7000/16281
Finish 8000/16281
Finish 9000/16281
Finish 10000/16281
Finish 11000/16281
Finish 12000/16281
Finish 13000/16281
Finish 14000/16281
Finish 15000/16281
Finish 16000/16281
Accurary: 82.18%


0.8217554204287206