In [412]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
import seaborn as sns; sns.set(color_codes=True)
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.naive_bayes import GaussianNB

df1 = pd.read_csv('adult-training1.csv', parse_dates=True)

In [413]:
df = pd.read_csv('adult-training1.csv', parse_dates=True)

# Data Cleaning

## Removing rows with garbage value

In [414]:
df1.shape

(48842, 15)

In [415]:
df1=df1.replace(' ?',np.NaN)
df1=df1.dropna()

In [416]:
df1.shape

(45222, 15)

## Dropping unrequired columns

In [417]:
df1=df1.drop('education-num',axis=1)
df1=df1.drop('capital-gain',axis=1)
df1=df1.drop('capital-loss',axis=1)

In [418]:
df1.head(5)

Unnamed: 0,age,workclass,fnlwgt,education,marital-status,occupation,relationship,race,sex,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,13,United-States,<=50K
2,38,Private,215646,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,40,United-States,<=50K
3,53,Private,234721,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,40,United-States,<=50K
4,28,Private,338409,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,40,Cuba,<=50K


## Creating bins for Age

In [419]:
df1['age'].unique()

array([39, 50, 38, 53, 28, 37, 49, 52, 31, 42, 30, 23, 32, 34, 25, 43, 40,
       54, 35, 59, 56, 19, 20, 45, 22, 48, 21, 24, 57, 44, 41, 29, 47, 46,
       36, 79, 27, 18, 33, 76, 55, 61, 70, 64, 71, 66, 51, 58, 26, 17, 60,
       90, 75, 65, 77, 62, 63, 67, 74, 72, 69, 68, 73, 81, 78, 88, 80, 84,
       83, 85, 82, 86, 89, 87], dtype=int64)

In [420]:
ages = df1['age']
bins = [0,25,45,65,100]
labels = ["Young", "Middle", "Senior", "Old"]
df1['age'] = pd.cut(ages, bins, labels = labels,include_lowest = True)


In [421]:
df1.head(5)

Unnamed: 0,age,workclass,fnlwgt,education,marital-status,occupation,relationship,race,sex,hours-per-week,native-country,salary
0,Middle,State-gov,77516,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,40,United-States,<=50K
1,Senior,Self-emp-not-inc,83311,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,13,United-States,<=50K
2,Middle,Private,215646,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,40,United-States,<=50K
3,Senior,Private,234721,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,40,United-States,<=50K
4,Middle,Private,338409,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,40,Cuba,<=50K


In [422]:
df1.groupby('age').count()

Unnamed: 0_level_0,workclass,fnlwgt,education,marital-status,occupation,relationship,race,sex,hours-per-week,native-country,salary
age,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Young,8441,8441,8441,8441,8441,8441,8441,8441,8441,8441,8441
Middle,23546,23546,23546,23546,23546,23546,23546,23546,23546,23546,23546
Senior,11891,11891,11891,11891,11891,11891,11891,11891,11891,11891,11891
Old,1344,1344,1344,1344,1344,1344,1344,1344,1344,1344,1344


## Categorizing Education

In [423]:
df1.education.unique()

array([' Bachelors', ' HS-grad', ' 11th', ' Masters', ' 9th',
       ' Some-college', ' Assoc-acdm', ' 7th-8th', ' Doctorate',
       ' Assoc-voc', ' Prof-school', ' 5th-6th', ' 10th', ' Preschool',
       ' 12th', ' 1st-4th'], dtype=object)

In [424]:
def Job2Num(Job_String):
    if Job_String == ' 11th' or Job_String == ' 12th' or Job_String == ' Preschool' or Job_String ==' 9th' or Job_String ==' 7th-8th' or Job_String ==' 5th-6th' or Job_String ==' 10th' or Job_String ==' 1st-4th' :
        return 'Dropout'
    elif Job_String == ' Assoc-acdm' or Job_String == ' Assoc-voc':
        return 'Associates'
    elif Job_String == '  HS-grad' or Job_String == ' Some-college' or Job_String == ' Prof-school' :
        return 'some-college'
    else:
        return Job_String

In [425]:
education=df1.education
df1['education'] = education.apply(Job2Num)

In [426]:
df1.education.unique()

array([' Bachelors', ' HS-grad', 'Dropout', ' Masters', 'some-college',
       'Associates', ' Doctorate'], dtype=object)

## Categorizing Marital Status

In [427]:
df1['marital-status'].unique()

array([' Never-married', ' Married-civ-spouse', ' Divorced',
       ' Married-spouse-absent', ' Separated', ' Married-AF-spouse',
       ' Widowed'], dtype=object)

In [428]:
def MaritalStatus(Job_String):
    if Job_String == ' Divorced' or Job_String == ' Married-spouse-absent' or Job_String == ' Separated':
        return 'Not-married'
    elif Job_String == ' Married-civ-spouse' or Job_String == ' Married-AF-spouse':
        return 'Married'
    else:
        return Job_String

In [429]:
maritalStatus=df1['marital-status']
df1['marital-status'] = maritalStatus.apply(MaritalStatus)

In [430]:
df1['marital-status'].unique()

array([' Never-married', 'Married', 'Not-married', ' Widowed'],
      dtype=object)

In [431]:
df1.head(5)

Unnamed: 0,age,workclass,fnlwgt,education,marital-status,occupation,relationship,race,sex,hours-per-week,native-country,salary
0,Middle,State-gov,77516,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,40,United-States,<=50K
1,Senior,Self-emp-not-inc,83311,Bachelors,Married,Exec-managerial,Husband,White,Male,13,United-States,<=50K
2,Middle,Private,215646,HS-grad,Not-married,Handlers-cleaners,Not-in-family,White,Male,40,United-States,<=50K
3,Senior,Private,234721,Dropout,Married,Handlers-cleaners,Husband,Black,Male,40,United-States,<=50K
4,Middle,Private,338409,Bachelors,Married,Prof-specialty,Wife,Black,Female,40,Cuba,<=50K


## Categorizing Occupation

In [432]:
df1.occupation.unique()

array([' Adm-clerical', ' Exec-managerial', ' Handlers-cleaners',
       ' Prof-specialty', ' Other-service', ' Sales', ' Transport-moving',
       ' Farming-fishing', ' Machine-op-inspct', ' Tech-support',
       ' Craft-repair', ' Protective-serv', ' Armed-Forces',
       ' Priv-house-serv'], dtype=object)

In [433]:
def occupationJob(Job_String):
    if Job_String == ' Craft-repair' or Job_String == ' Farming-fishing' or Job_String == ' Handlers-cleaners' or Job_String ==' Machine-op-inspct' or Job_String== ' Transport-moving' :
        return ' Blue-Collar'
    elif Job_String == ' Adm-clerical':
        return ' Admin'
    elif Job_String == ' Exec-managerial':
        return ' White-collar'
    elif Job_String == ' Other-service' or Job_String == ' Priv-house-serv' :
        return ' Service'
    elif Job_String == ' Prof-specialty':
        return ' Professional'
    elif Job_String == ' Protective-serv' or Job_String == ' Tech-support' :
        return ' Other-Occupations'
    else:
        return Job_String

In [434]:
occupation=df1['occupation']
df1['occupation'] = occupation.apply(occupationJob)

In [435]:
df1.occupation.unique()

array([' Admin', ' White-collar', ' Blue-Collar', ' Professional',
       ' Service', ' Sales', ' Other-Occupations', ' Armed-Forces'],
      dtype=object)

In [436]:
df1.head(5)

Unnamed: 0,age,workclass,fnlwgt,education,marital-status,occupation,relationship,race,sex,hours-per-week,native-country,salary
0,Middle,State-gov,77516,Bachelors,Never-married,Admin,Not-in-family,White,Male,40,United-States,<=50K
1,Senior,Self-emp-not-inc,83311,Bachelors,Married,White-collar,Husband,White,Male,13,United-States,<=50K
2,Middle,Private,215646,HS-grad,Not-married,Blue-Collar,Not-in-family,White,Male,40,United-States,<=50K
3,Senior,Private,234721,Dropout,Married,Blue-Collar,Husband,Black,Male,40,United-States,<=50K
4,Middle,Private,338409,Bachelors,Married,Professional,Wife,Black,Female,40,Cuba,<=50K


## Categorizing Native country

In [437]:
df1['native-country'].unique()

array([' United-States', ' Cuba', ' Jamaica', ' India', ' Mexico',
       ' Puerto-Rico', ' Honduras', ' England', ' Canada', ' Germany',
       ' Iran', ' Philippines', ' Poland', ' Columbia', ' Cambodia',
       ' Thailand', ' Ecuador', ' Laos', ' Taiwan', ' Haiti', ' Portugal',
       ' Dominican-Republic', ' El-Salvador', ' France', ' Guatemala',
       ' Italy', ' China', ' South', ' Japan', ' Yugoslavia', ' Peru',
       ' Outlying-US(Guam-USVI-etc)', ' Scotland', ' Trinadad&Tobago',
       ' Greece', ' Nicaragua', ' Vietnam', ' Hong', ' Ireland',
       ' Hungary', ' Holand-Netherlands'], dtype=object)

In [438]:
df1.loc[(df1['native-country']==' Cambodia'),'native-country'] = 'Asia'
df1.loc[(df1['native-country']==' Canada'),'native-country'] = 'British-Commonwealth'   
df1.loc[(df1['native-country']==' China'),'native-country'] = 'China'      
df1.loc[(df1['native-country']==' Columbia'),'native-country'] = 'South-America'   
df1.loc[(df1['native-country']==' Cuba'),'native-country'] = 'Other'       
df1.loc[(df1['native-country']==' Dominican-Republic'),'native-country'] = 'Latin-America'
df1.loc[(df1['native-country']==' Ecuador'),'native-country'] = 'South-America'    
df1.loc[(df1['native-country']==' El-Salvador'),'native-country'] = 'South-America'
df1.loc[(df1['native-country']==' England'),'native-country'] = 'British-Commonwealth'
df1.loc[(df1['native-country']==' France'),'native-country'] = 'Euro'
df1.loc[(df1['native-country']==' Germany'),'native-country'] = 'Euro'
df1.loc[(df1['native-country']==' Greece'),'native-country'] = 'Euro'
df1.loc[(df1['native-country']==' Guatemala'),'native-country'] = 'Latin-America'
df1.loc[(df1['native-country']==' Haiti'),'native-country'] = 'Latin-America'
df1.loc[(df1['native-country']==' Holand-Netherlands'),'native-country'] = 'Euro'
df1.loc[(df1['native-country']==' Honduras'),'native-country'] = 'Latin-America'
df1.loc[(df1['native-country']==' Hong'),'native-country'] = 'China'
df1.loc[(df1['native-country']==' Hungary'),'native-country'] = 'Euro'
df1.loc[(df1['native-country']==' India'),'native-country'] = 'British-Commonwealth'
df1.loc[(df1['native-country']==' Iran'),'native-country'] = 'Other'
df1.loc[(df1['native-country']==' Ireland'),'native-country'] = 'British-Commonwealth'
df1.loc[(df1['native-country']==' Italy'),'native-country'] = 'Euro'
df1.loc[(df1['native-country']==' Jamaica'),'native-country'] = 'Latin-America'
df1.loc[(df1['native-country']==' Japan'),'native-country'] = 'Other'
df1.loc[(df1['native-country']==' Laos'),'native-country'] = 'Asia'
df1.loc[(df1['native-country']==' Mexico'),'native-country'] = 'Latin-America'
df1.loc[(df1['native-country']==' Nicaragua'),'native-country'] = 'Latin-America'
df1.loc[(df1['native-country']==' Outlying-US(Guam-USVI-etc)'),'native-country'] = 'Latin-America'
df1.loc[(df1['native-country']==' Peru'),'native-country'] = 'South-America'
df1.loc[(df1['native-country']==' Philippines'),'native-country'] = 'Asia'
df1.loc[(df1['native-country']==' Poland'),'native-country'] = 'Euro'
df1.loc[(df1['native-country']==' Portugal'),'native-country'] = 'Euro'
df1.loc[(df1['native-country']==' Puerto-Rico'),'native-country'] = 'Latin-America'
df1.loc[(df1['native-country']==' Scotland'),'native-country'] = 'British-Commonwealth'
df1.loc[(df1['native-country']==' South'),'native-country'] = 'Euro'
df1.loc[(df1['native-country']==' Taiwan'),'native-country'] = 'China'
df1.loc[(df1['native-country']==' Thailand'),'native-country'] = 'Asia'
df1.loc[(df1['native-country']==' Trinadad&Tobago'),'native-country'] = 'Latin-America'
df1.loc[(df1['native-country']==' United-States'),'native-country'] = 'United-States'
df1.loc[(df1['native-country']==' Vietnam'),'native-country'] = 'Asia'
df1.loc[(df1['native-country']==' Yugoslavia'),'native-country'] = 'Euro'

In [439]:
df1.groupby('native-country').count()

Unnamed: 0_level_0,age,workclass,fnlwgt,education,marital-status,occupation,relationship,race,sex,hours-per-week,salary
native-country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Asia,442,442,442,442,442,442,442,442,442,442,442
British-Commonwealth,485,485,485,485,485,485,485,485,485,485,485
China,196,196,196,196,196,196,196,196,196,196,196
Euro,664,664,664,664,664,664,664,664,664,664,664
Latin-America,1548,1548,1548,1548,1548,1548,1548,1548,1548,1548,1548
Other,278,278,278,278,278,278,278,278,278,278,278
South-America,317,317,317,317,317,317,317,317,317,317,317
United-States,41292,41292,41292,41292,41292,41292,41292,41292,41292,41292,41292


In [440]:
df1.head(5)

Unnamed: 0,age,workclass,fnlwgt,education,marital-status,occupation,relationship,race,sex,hours-per-week,native-country,salary
0,Middle,State-gov,77516,Bachelors,Never-married,Admin,Not-in-family,White,Male,40,United-States,<=50K
1,Senior,Self-emp-not-inc,83311,Bachelors,Married,White-collar,Husband,White,Male,13,United-States,<=50K
2,Middle,Private,215646,HS-grad,Not-married,Blue-Collar,Not-in-family,White,Male,40,United-States,<=50K
3,Senior,Private,234721,Dropout,Married,Blue-Collar,Husband,Black,Male,40,United-States,<=50K
4,Middle,Private,338409,Bachelors,Married,Professional,Wife,Black,Female,40,Other,<=50K


## Categorizing workclass

In [441]:
df1.workclass.unique()

array([' State-gov', ' Self-emp-not-inc', ' Private', ' Federal-gov',
       ' Local-gov', ' Self-emp-inc', ' Without-pay'], dtype=object)

In [442]:
def WorkClass(Job_String):
    if Job_String == ' State-gov' or Job_String == ' Local-gov' :
        return 'State-gov'
    
    if Job_String == ' Self-emp-not-inc' or Job_String == ' Self-emp-inc' :
        return ' Self-emp-inc'
    else:
        return Job_String

In [443]:
workclass=df1['workclass']
df1['workclass'] = workclass.apply(WorkClass)

In [444]:
df1.workclass.unique()

array(['State-gov', ' Self-emp-inc', ' Private', ' Federal-gov',
       ' Without-pay'], dtype=object)

## Cleaning Salary

In [445]:
df1.salary.unique()

array([' <=50K', ' >50K'], dtype=object)

In [446]:
def Salary(Job_String):
    if Job_String == ' <=50K' or Job_String == ' <=50K.' :
        return '<=50K'
    elif Job_String == ' >50K' or Job_String == ' >50K.' :
        return '>50K'
    else:
        return Job_String

In [447]:
salary=df1['salary']
df1['salary'] = salary.apply(Salary)


In [448]:
salary1=df['salary']
df['salary'] = salary1.apply(Salary)

In [449]:
df1.salary.unique()

array(['<=50K', '>50K'], dtype=object)

# ML Algorithms using scikit-learn

#### Used the following dataframes for the different ML models:

1. df1 for Categorical Naive Bayes
2. dfg for Gaussian Naive Bayes
3. dfm for Multinomial Naive Bayes

Since Categorical and Multinomial Naive Bayes needed same type of cleaned data before processing for individual models we have used dfm=df1, and then manipulated both the data frames differently.

In [450]:
dfm=df1.copy()

## 1. Categorical Naive Bayes

### Converting to categorical integer data for Categorical Naive Bayes

In [451]:
df1.head()

Unnamed: 0,age,workclass,fnlwgt,education,marital-status,occupation,relationship,race,sex,hours-per-week,native-country,salary
0,Middle,State-gov,77516,Bachelors,Never-married,Admin,Not-in-family,White,Male,40,United-States,<=50K
1,Senior,Self-emp-inc,83311,Bachelors,Married,White-collar,Husband,White,Male,13,United-States,<=50K
2,Middle,Private,215646,HS-grad,Not-married,Blue-Collar,Not-in-family,White,Male,40,United-States,<=50K
3,Senior,Private,234721,Dropout,Married,Blue-Collar,Husband,Black,Male,40,United-States,<=50K
4,Middle,Private,338409,Bachelors,Married,Professional,Wife,Black,Female,40,Other,<=50K


In [452]:
print(df1.dtypes)

age               category
workclass           object
fnlwgt               int64
education           object
marital-status      object
occupation          object
relationship        object
race                object
sex                 object
hours-per-week       int64
native-country      object
salary              object
dtype: object


In [453]:
df1.workclass = pd.Categorical(pd.factorize(df1.workclass)[0])

In [454]:
df1.education = pd.Categorical(pd.factorize(df1.education)[0])

In [455]:
df1["marital-status"] = pd.Categorical(pd.factorize(df1["marital-status"])[0])

In [456]:
df1.occupation = pd.Categorical(pd.factorize(df1.occupation)[0])

In [457]:
df1.relationship = pd.Categorical(pd.factorize(df1.relationship)[0])

In [458]:
df1.race = pd.Categorical(pd.factorize(df1.race)[0])

In [459]:
df1.sex = pd.Categorical(pd.factorize(df1.sex)[0])

In [460]:
df1["hours-per-week"] = pd.Categorical(pd.factorize(df1["hours-per-week"])[0])

In [461]:
df1["native-country"] = pd.Categorical(pd.factorize(df1["native-country"])[0])

In [462]:
df1["salary"]= df1["salary"].astype(str) 
# df1.salary = pd.Categorical(pd.factorize(df1.salary)[0])

In [463]:
df1.age = pd.Categorical(pd.factorize(df1.age)[0])

In [464]:
df1.head(5)

Unnamed: 0,age,workclass,fnlwgt,education,marital-status,occupation,relationship,race,sex,hours-per-week,native-country,salary
0,0,0,77516,0,0,0,0,0,0,0,0,<=50K
1,1,1,83311,0,1,1,1,0,0,1,0,<=50K
2,0,2,215646,1,2,2,0,0,0,0,0,<=50K
3,1,2,234721,2,1,2,1,1,0,0,0,<=50K
4,0,2,338409,0,1,3,2,1,1,0,1,<=50K


In [465]:
print(df1.dtypes)

age               category
workclass         category
fnlwgt               int64
education         category
marital-status    category
occupation        category
relationship      category
race              category
sex               category
hours-per-week    category
native-country    category
salary              object
dtype: object


### Categorical Naive Bayes Model Fitting and Implementation

In [466]:
from sklearn.naive_bayes import CategoricalNB
from sklearn.model_selection import train_test_split
X_train, X_test = train_test_split(df1, test_size=0.05, random_state=42)

In [467]:
used_features=["age","fnlwgt","workclass",
"education",
"marital-status",
"occupation",
"relationship",
"race",
"sex",
"hours-per-week",
"native-country"]

In [468]:
clf = CategoricalNB()
clf.fit(X_train[used_features].values, X_train["salary"])
y_pred = clf.predict(X_test[used_features])

CategoricalNB(alpha=1.0, class_prior=None, fit_prior=True)

### Model Evaluation

In [469]:
from sklearn.metrics import confusion_matrix
print("The Confusion Matrix for Categorical Naive Bayes: ")
print(confusion_matrix(X_test["salary"], y_pred))

The Confusion Matrix for Categorical Naive Bayes: 
[[1432  259]
 [ 162  409]]


In [470]:
accuracy = clf.score(X_test[used_features],X_test["salary"])
print("The Accuracy for Categorical Naive Bayes is: " + str(100*accuracy))

The Accuracy for Categorical Naive Bayes is: 81.38815207780725


In [471]:
from sklearn.metrics import classification_report

print(classification_report(X_test["salary"], y_pred))

              precision    recall  f1-score   support

       <=50K       0.90      0.85      0.87      1691
        >50K       0.61      0.72      0.66       571

    accuracy                           0.81      2262
   macro avg       0.76      0.78      0.77      2262
weighted avg       0.83      0.81      0.82      2262



## 2. Gaussian Naive Bayes

### Data Manipulation for Gaussian Naive Bayes

In [472]:
dfg = pd.read_csv('adult-training1.csv', parse_dates=True)

In [473]:
dfg=dfg.drop('education-num',axis=1)
dfg=dfg.drop('capital-gain',axis=1)
dfg=dfg.drop('capital-loss',axis=1)

In [474]:
dfg.head(5)

Unnamed: 0,age,workclass,fnlwgt,education,marital-status,occupation,relationship,race,sex,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,13,United-States,<=50K
2,38,Private,215646,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,40,United-States,<=50K
3,53,Private,234721,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,40,United-States,<=50K
4,28,Private,338409,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,40,Cuba,<=50K


In [475]:
dfg.workclass = pd.factorize(dfg.workclass)[0]+1
dfg.education = pd.factorize(dfg.education)[0]+1
dfg["marital-status"] = pd.factorize(dfg["marital-status"])[0]+1
dfg.occupation = pd.factorize(dfg.occupation)[0]+1
dfg.relationship = pd.factorize(dfg.relationship)[0]+1
dfg.race = pd.factorize(dfg.race)[0]+1
dfg.sex = pd.factorize(dfg.sex)[0]+1
dfg["hours-per-week"] = pd.factorize(dfg["hours-per-week"])[0]+1
dfg["native-country"] = pd.factorize(dfg["native-country"])[0]+1
dfg.salary = pd.factorize(dfg.salary)[0]+1
dfg.age = pd.factorize(dfg.age)[0]+1

In [476]:
dfg.head()

Unnamed: 0,age,workclass,fnlwgt,education,marital-status,occupation,relationship,race,sex,hours-per-week,native-country,salary
0,1,1,77516,1,1,1,1,1,1,1,1,1
1,2,2,83311,1,2,2,2,1,1,2,1,1
2,3,3,215646,2,3,3,1,1,1,1,1,1
3,4,3,234721,3,2,3,2,2,1,1,1,1
4,5,3,338409,1,2,4,3,2,2,1,2,1


### Gaussian Naive Bayes Model Fitting and Implementation

In [477]:
from sklearn.model_selection import train_test_split
X_train, X_test = train_test_split(df1, test_size=0.2)
used_features=["age","fnlwgt","workclass",
"education",
"marital-status",
"occupation",
"relationship",
"race",
"sex",
"hours-per-week",
"native-country"]

In [478]:
gnb = GaussianNB(var_smoothing=0)
gnb.fit(X_train[used_features].values, X_train["salary"])
y_predg = gnb.predict(X_test[used_features])

GaussianNB(priors=None, var_smoothing=0)

### Model Evaluation 

In [479]:
from sklearn.metrics import confusion_matrix
print("The Confusion Matrix for Gaussian Naive Bayes: ")
print(confusion_matrix(X_test["salary"], y_predg))

The Confusion Matrix for Gaussian Naive Bayes: 
[[5122 1691]
 [ 676 1556]]


In [480]:
accuracy_g = gnb.score(X_test[used_features],X_test["salary"])
print("The Accuracy for Categorical Naive Bayes is: " + str(100*accuracy_g))

The Accuracy for Categorical Naive Bayes is: 73.83084577114428


In [481]:
from sklearn.metrics import classification_report

print(classification_report(X_test["salary"], y_predg))

              precision    recall  f1-score   support

       <=50K       0.88      0.75      0.81      6813
        >50K       0.48      0.70      0.57      2232

    accuracy                           0.74      9045
   macro avg       0.68      0.72      0.69      9045
weighted avg       0.78      0.74      0.75      9045



## 3. Multinomial Naive Bayes

### Data Manipulation for Multinomial Naive Bayes

In [482]:
hpw = dfm['hours-per-week']
bins = [0,20,40,60,100]
labels = ["Part-Time", "Full-Time", "Over-Time", "Too much"]
dfm['hours-per-week'] = pd.cut(hpw, bins, labels = labels,include_lowest = True)

In [483]:
dfm.head()

Unnamed: 0,age,workclass,fnlwgt,education,marital-status,occupation,relationship,race,sex,hours-per-week,native-country,salary
0,Middle,State-gov,77516,Bachelors,Never-married,Admin,Not-in-family,White,Male,Full-Time,United-States,<=50K
1,Senior,Self-emp-inc,83311,Bachelors,Married,White-collar,Husband,White,Male,Part-Time,United-States,<=50K
2,Middle,Private,215646,HS-grad,Not-married,Blue-Collar,Not-in-family,White,Male,Full-Time,United-States,<=50K
3,Senior,Private,234721,Dropout,Married,Blue-Collar,Husband,Black,Male,Full-Time,United-States,<=50K
4,Middle,Private,338409,Bachelors,Married,Professional,Wife,Black,Female,Full-Time,Other,<=50K


### One-hot encoding

In [484]:
dfm.shape

(45222, 12)

In [485]:
dumDF = pd.get_dummies(dfm, columns=['age'])
d1m=pd.get_dummies(dumDF, columns=['workclass'])
d2=pd.get_dummies(d1m, columns=['education'])
d3=pd.get_dummies(d2, columns=['marital-status'])
d4=pd.get_dummies(d3, columns=['occupation'])
d5=pd.get_dummies(d4, columns=['relationship'])
d6=pd.get_dummies(d5, columns=['race'])
d7=pd.get_dummies(d6, columns=['sex'])
d8=pd.get_dummies(d7, columns=['hours-per-week'])
d9=pd.get_dummies(d8, columns=['native-country'])

In [486]:
d9.shape
d9.head()

(45222, 55)

Unnamed: 0,fnlwgt,salary,age_Young,age_Middle,age_Senior,age_Old,workclass_ Federal-gov,workclass_ Private,workclass_ Self-emp-inc,workclass_ Without-pay,...,hours-per-week_Over-Time,hours-per-week_Too much,native-country_Asia,native-country_British-Commonwealth,native-country_China,native-country_Euro,native-country_Latin-America,native-country_Other,native-country_South-America,native-country_United-States
0,77516,<=50K,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,83311,<=50K,0,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
2,215646,<=50K,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
3,234721,<=50K,0,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
4,338409,<=50K,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0


In [487]:
d9.columns

Index(['fnlwgt', 'salary', 'age_Young', 'age_Middle', 'age_Senior', 'age_Old',
       'workclass_ Federal-gov', 'workclass_ Private',
       'workclass_ Self-emp-inc', 'workclass_ Without-pay',
       'workclass_State-gov', 'education_ Bachelors', 'education_ Doctorate',
       'education_ HS-grad', 'education_ Masters', 'education_Associates',
       'education_Dropout', 'education_some-college',
       'marital-status_ Never-married', 'marital-status_ Widowed',
       'marital-status_Married', 'marital-status_Not-married',
       'occupation_ Admin', 'occupation_ Armed-Forces',
       'occupation_ Blue-Collar', 'occupation_ Other-Occupations',
       'occupation_ Professional', 'occupation_ Sales', 'occupation_ Service',
       'occupation_ White-collar', 'relationship_ Husband',
       'relationship_ Not-in-family', 'relationship_ Other-relative',
       'relationship_ Own-child', 'relationship_ Unmarried',
       'relationship_ Wife', 'race_ Amer-Indian-Eskimo',
       'race_ Asian

### Multinomial Naive Bayes Model Fitting and Implementation

In [488]:
from sklearn.naive_bayes import MultinomialNB
mnb = MultinomialNB()
X_train, X_test = train_test_split(d9, test_size=0.2,random_state=42)

In [489]:
usdfeat= ['age_Young', 'age_Middle', 'age_Senior', 'age_Old',
       'workclass_ Federal-gov', 'workclass_ Private',
       'workclass_ Self-emp-inc', 'workclass_ Without-pay',
       'workclass_State-gov', 'education_ Bachelors', 'education_ Doctorate',
       'education_ HS-grad', 'education_ Masters', 'education_Associates',
       'education_Dropout', 'education_some-college',
       'marital-status_ Never-married', 'marital-status_ Widowed',
       'marital-status_Married', 'marital-status_Not-married',
       'occupation_ Admin', 'occupation_ Armed-Forces',
       'occupation_ Blue-Collar', 'occupation_ Other-Occupations',
       'occupation_ Professional', 'occupation_ Sales', 'occupation_ Service',
       'occupation_ White-collar', 'relationship_ Husband',
       'relationship_ Not-in-family', 'relationship_ Other-relative',
       'relationship_ Own-child', 'relationship_ Unmarried',
       'relationship_ Wife', 'race_ Amer-Indian-Eskimo',
       'race_ Asian-Pac-Islander', 'race_ Black', 'race_ Other', 'race_ White',
       'sex_ Female', 'sex_ Male', 'hours-per-week_Part-Time',
       'hours-per-week_Full-Time', 'hours-per-week_Over-Time',
       'hours-per-week_Too much', 'native-country_Asia',
       'native-country_British-Commonwealth', 'native-country_China',
       'native-country_Euro', 'native-country_Latin-America',
       'native-country_Other', 'native-country_South-America',
       'native-country_United-States']

In [490]:
mnb.fit(X_train[usdfeat].values,X_train["salary"])
y_predm = mnb.predict(X_test[usdfeat])

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

### Model Evaluation

In [491]:
from sklearn.metrics import confusion_matrix
print("The Confusion Matrix for Multinomial Naive Bayes: ")
print(confusion_matrix(X_test["salary"], y_predm))

The Confusion Matrix for Multinomial Naive Bayes: 
[[5512 1233]
 [ 599 1701]]


In [492]:
accuracyf=1-((X_test["salary"]!=y_predm).sum())/X_test.shape[0]
print("The Accuracy for Categorical Naive Bayes is: " + str(accuracyf))

The Accuracy for Categorical Naive Bayes is: 0.7974571586511885


In [494]:
from sklearn.metrics import classification_report

print(classification_report(X_test["salary"], y_predm))

              precision    recall  f1-score   support

       <=50K       0.90      0.82      0.86      6745
        >50K       0.58      0.74      0.65      2300

    accuracy                           0.80      9045
   macro avg       0.74      0.78      0.75      9045
weighted avg       0.82      0.80      0.80      9045

