In [42]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder

In [43]:
# Column labels applied to the CSV, in file order. They are passed through
# `names=`, so pandas treats the first line of the file as a data row.
column_names = [
    "Age",
    "Workclass",
    "Fnlwgt",
    "Education",
    "Education-Num",
    "Marital-Status",
    "Occupation",
    "Relationship",
    "Race",
    "Sex",
    "Capital-Gain",
    "Capital-Loss",
    "Hours-Per-Week",
    "Native-Country",
    "Income",
]
data = pd.read_csv("./adult.csv", names=column_names)
data

Unnamed: 0,Age,Workclass,Fnlwgt,Education,Education-Num,Marital-Status,Occupation,Relationship,Race,Sex,Capital-Gain,Capital-Loss,Hours-Per-Week,Native-Country,Income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [44]:
# Data transformation
#
# Turns the raw frame into an all-numeric frame:
#   * "Income" is mapped to a binary target (0 for ' <=50K', 1 for ' >50K').
#   * Every categorical column is replaced by LabelEncoder integer codes.
#   * Numeric columns are copied through unchanged.
#
# NOTE: this cell rebinds `data` from the raw frame to the encoded frame.
# Running it again without first re-running the loading cell would make the
# income mapping produce only NaN (the values are then already 0/1); the check
# below turns that into an immediate, explicit error.

# Feature columns in the order they appear in the resulting frame.
feature_columns = [
    "Age",
    "Workclass",
    "Fnlwgt",
    "Education",
    "Education-Num",
    "Marital-Status",
    "Occupation",
    "Relationship",
    "Race",
    "Sex",
    "Capital-Gain",
    "Capital-Loss",
    "Hours-Per-Week",
    "Native-Country",
]

# Columns holding string categories that must be integer-encoded.
categorical_columns = [
    "Workclass",
    "Education",
    "Marital-Status",
    "Occupation",
    "Relationship",
    "Race",
    "Sex",
    "Native-Country",
]

# The keys keep the leading space, matching how the labels are matched here.
income = data['Income'].map({' <=50K': 0, ' >50K': 1})

# Series.map yields NaN for any value missing from the dict; stop here instead
# of letting NaN targets reach the models.
unmapped = income.isna()
if unmapped.any():
    raise ValueError(
        "Unexpected values in 'Income' (expected ' <=50K' or ' >50K'): "
        f"{data.loc[unmapped, 'Income'].unique().tolist()}"
    )

# One fitted encoder per column, kept so the integer codes can be translated
# back to the original categories with `inverse_transform`.
label_encoders = {}

data_obj = {}
for column in feature_columns:
    if column in categorical_columns:
        encoder = LabelEncoder()
        data_obj[column] = encoder.fit_transform(data[column])
        label_encoders[column] = encoder
    else:
        data_obj[column] = np.array(data[column])
data_obj["Income"] = income

data = pd.DataFrame(data_obj)
data

Unnamed: 0,Age,Workclass,Fnlwgt,Education,Education-Num,Marital-Status,Occupation,Relationship,Race,Sex,Capital-Gain,Capital-Loss,Hours-Per-Week,Native-Country,Income
0,39,7,77516,9,13,4,1,1,4,1,2174,0,40,39,0
1,50,6,83311,9,13,2,4,0,4,1,0,0,13,39,0
2,38,4,215646,11,9,0,6,1,4,1,0,0,40,39,0
3,53,4,234721,1,7,2,6,0,2,1,0,0,40,39,0
4,28,4,338409,9,13,2,10,5,2,0,0,0,40,5,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,4,257302,7,12,2,13,5,4,0,0,0,38,39,0
32557,40,4,154374,11,9,2,7,0,4,1,0,0,40,39,1
32558,58,4,151910,11,9,6,1,4,4,0,0,0,40,39,0
32559,22,4,201490,11,9,4,1,3,4,1,0,0,20,39,0


In [45]:
# Separate the target column from the features: `y` is the "Income" column,
# `X` is every other column of `data`.
target_column = "Income"
y = data[target_column]
X = data.drop(columns=target_column)

In [46]:
# Split data into train data and test data.
# test_size=0.2 holds out 20% of the rows; random_state fixes the shuffle so
# the same split is produced on every run.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

In [47]:
# Get model: one instance of each Naive Bayes variant, all constructed with
# their default hyper-parameters.
gaussian_nb, multinomial_nb, bernoulli_nb = (
    GaussianNB(),
    MultinomialNB(),
    BernoulliNB(),
)

In [49]:
# Training model: fit every classifier on the same training split, in the
# order Gaussian, Multinomial, Bernoulli.
for estimator in (gaussian_nb, multinomial_nb, bernoulli_nb):
    estimator.fit(X_train, y_train)

In [50]:
# Prediction: each fitted classifier predicts on the held-out test features.
gaussian_nb_preds, multinomial_nb_preds, bernoulli_nb_preds = (
    estimator.predict(X_test)
    for estimator in (gaussian_nb, multinomial_nb, bernoulli_nb)
)

In [51]:
def evaluate_model(y_true, y_pred, model_name):
    """Print accuracy, precision, recall and F1 score for one model.

    Args:
        y_true: Ground-truth labels.
        y_pred: Labels predicted by the model.
        model_name: Label used in the header line of the printed report.

    Returns:
        None. The report is written to standard output, followed by a blank
        line.
    """
    # Scorers are called in the order the results are reported.
    scorers = (accuracy_score, precision_score, recall_score, f1_score)
    accuracy, precision, recall, f1 = (
        scorer(y_true, y_pred) for scorer in scorers
    )

    report_lines = [
        f"{model_name} Metrics:",
        f"Accuracy: {accuracy:.4f}",
        f"Precision: {precision:.4f}",
        f"Recall: {recall:.4f}",
        f"F1 Score: {f1:.4f}",
        "",  # trailing blank line separating consecutive reports
    ]
    print("\n".join(report_lines))

# Report the test-set metrics of every classifier, one report per model.
model_reports = [
    ("Gaussian Naive Bayes", gaussian_nb_preds),
    ("Multinomial Naive Bayes", multinomial_nb_preds),
    ("Bernoulli_nb Naive Bayes", bernoulli_nb_preds),
]
for report_title, predictions in model_reports:
    evaluate_model(y_test, predictions, report_title)

Gaussian Naive Bayes Metrics:
Accuracy: 0.7990
Precision: 0.6747
Recall: 0.3221
F1 Score: 0.4360

Multinomial Naive Bayes Metrics:
Accuracy: 0.7840
Precision: 0.6306
Recall: 0.2521
F1 Score: 0.3602

Bernoulli_nb Naive Bayes Metrics:
Accuracy: 0.7368
Precision: 0.4704
Recall: 0.7225
F1 Score: 0.5698

