In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import Imputer
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB 
from sklearn.metrics import accuracy_score

In [2]:
adult_df = pd.read_csv("adult.csv")

In [3]:
adult_df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [4]:
adult_df.isna().sum()

age               0
workclass         0
fnlwgt            0
education         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
native_country    0
income            0
dtype: int64

In [5]:
adult_df['income'].value_counts()

 <=50K    24720
 >50K      7841
Name: income, dtype: int64

In [6]:
adult_df['income'].unique()  #Array has first value as highest one here <=50K

array([' <=50K', ' >50K'], dtype=object)

In [7]:
adult_df['income'] = np.where(adult_df['income'] == ' <=50K',0,1)

In [8]:
print(list(adult_df.columns))

['age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'capital_gain', 'capital_loss', 'hours_per_week', 'native_country', 'income']


In [9]:
for value in ['workclass','education','marital_status','occupation', 'relationship', 'race', 'sex','native_country']:
    print(value,":",sum(adult_df[value]=='?'))   #we can check for # or some other as well  #type error as income 

workclass : 0
education : 0
marital_status : 0
occupation : 0
relationship : 0
race : 0
sex : 0
native_country : 0


In [10]:
adult_df.dtypes

age                int64
workclass         object
fnlwgt             int64
education         object
education_num      int64
marital_status    object
occupation        object
relationship      object
race              object
sex               object
capital_gain       int64
capital_loss       int64
hours_per_week     int64
native_country    object
income             int32
dtype: object

In [11]:
le = preprocessing.LabelEncoder()  #based on the class it will put labels say 5 class -->0 to 4
workclass_cat = le.fit_transform(adult_df.workclass)  #categorical to numeric
education_cat = le.fit_transform(adult_df.education)
marital_cat = le.fit_transform(adult_df.marital_status)
occupation_cat = le.fit_transform(adult_df.occupation)
relationship_cat = le.fit_transform(adult_df.relationship)
race_cat = le.fit_transform(adult_df.race)
sex_cat = le.fit_transform(adult_df.sex)
native_country_cat = le.fit_transform(adult_df.native_country)


In [12]:
adult_df['workclass_cat'] = workclass_cat
adult_df['education_cat'] = education_cat
adult_df['marital_cat'] = marital_cat
adult_df['occupation_cat'] = occupation_cat
adult_df['relationship_cat'] = relationship_cat
adult_df['race_cat'] = race_cat
adult_df['sex_cat'] = sex_cat
adult_df['native_country_cat'] = native_country_cat

In [13]:
dummy_fields = ['workclass','education','marital_status','occupation', 'relationship', 'race', 'sex','native_country']
adult_df = adult_df.drop(dummy_fields,axis=1)

In [14]:
adult_df = adult_df.reindex_axis(['age', 'workclass_cat', 'fnlwgt', 'education_cat', 'education_num',
                                  'marital_cat', 'occupation_cat', 'relationship_cat', 'race_cat', 'sex_cat', 
                                  'capital_gain', 'capital_loss', 'hours_per_week', 'native_country_cat', 'income'],axis=1)

  This is separate from the ipykernel package so we can avoid doing imports until


In [15]:
adult_df[:3]

Unnamed: 0,age,workclass_cat,fnlwgt,education_cat,education_num,marital_cat,occupation_cat,relationship_cat,race_cat,sex_cat,capital_gain,capital_loss,hours_per_week,native_country_cat,income
0,39,7,77516,9,13,4,1,1,4,1,2174,0,40,39,0
1,50,6,83311,9,13,2,4,0,4,1,0,0,13,39,0
2,38,4,215646,11,9,0,6,1,4,1,0,0,40,39,0


In [16]:
adult_df.columns

Index(['age', 'workclass_cat', 'fnlwgt', 'education_cat', 'education_num',
       'marital_cat', 'occupation_cat', 'relationship_cat', 'race_cat',
       'sex_cat', 'capital_gain', 'capital_loss', 'hours_per_week',
       'native_country_cat', 'income'],
      dtype='object')

In [18]:
num_features = ['age', 'workclass_cat', 'fnlwgt', 'education_cat', 'education_num',
       'marital_cat', 'occupation_cat', 'relationship_cat', 'race_cat',
       'sex_cat', 'capital_gain', 'capital_loss', 'hours_per_week',
       'native_country_cat']

scaled_features = {}
for each in num_features:
    mean,std = adult_df[each].mean(),adult_df[each].std()
    scaled_features[each] = [mean,std]
    adult_df.loc[:,each] = (adult_df[each] - mean)/std #==>which is the formula for standard scaler = xi - mu / std

In [20]:
features = adult_df.values[:,:14]
target = adult_df.values[:,14]
features_train,features_test,target_train,target_test = train_test_split(features,target,test_size=0.3,random_state=123)

In [21]:
clf = GaussianNB()
clf.fit(features_train,target_train)
target_pred = clf.predict(features_test)

In [22]:
accuracy_score(target_test,target_pred)

0.8048930289691882