In [2]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB


In [3]:
# Read the comma-separated records from the input file into a 2-D numpy array of strings.
# Records containing '?' (a missing value marker) are skipped, and so are blank lines.
input_file = 'income_data.txt'
X,y = [],[]

with open(input_file,'r') as f:
    lines = f.readlines()
    for line in lines:
        # Strip only the trailing newline; slicing with line[:-1] would chop a real
        # character off the final record when the file does not end with '\n'.
        record = line.rstrip('\n')
        # Blank lines (presumably a trailing empty line at the end of the file) would
        # otherwise become a bogus one-field row, so filter them out here instead of
        # unconditionally discarding the last row afterwards.
        if not record.strip() or '?' in record:
            continue
        X.append(record.split(', '))


# Every remaining entry is a complete record; nothing is dropped by position.
X_data = np.array(X)
print(X_data.shape)

(30162, 15)


In [4]:
# Build an all-numeric copy of the data. A column is treated as numeric when its value
# in the first row consists only of digits; every other column gets its own LabelEncoder,
# stored in label_encoder in column order so it can be reused on new samples.
label_encoder = []
X_encoded = np.empty(X_data.shape)
for col, sample in enumerate(X_data[0]):
    column = X_data[:, col]
    if not sample.isdigit():
        encoder = preprocessing.LabelEncoder()
        label_encoder.append(encoder)
        column = encoder.fit_transform(column)
    X_encoded[:, col] = column

# All columns except the last are the features; the last column is the target.
XX = X_encoded[:, :-1].astype(int)
yy = X_encoded[:, -1].astype(int)

In [5]:
# Hold out 20% of the samples for testing (fixed random_state for a repeatable split),
# fit a Gaussian Naive Bayes classifier on the rest, then predict the held-out samples.
X_train, X_test, y_train, y_test = train_test_split(
    XX,
    yy,
    test_size=0.2,
    random_state=42,
)
classifier = GaussianNB().fit(X_train, y_train)
y_test_pred = classifier.predict(X_test)

In [42]:
# Score the fitted classifier on the held-out test split (X_test / y_test).
classifier.score(X_test, y_test)

0.7908171722194597

In [32]:
# Perform 5-fold cross-validation over the full encoded dataset (XX, yy):
# first the per-fold accuracy, then the mean weighted F1 score as a percentage.
cv_accuracy = cross_val_score(classifier, XX, yy, scoring='accuracy', cv=5)
print("CV Accuracy:")
print(cv_accuracy)

f1 = cross_val_score(classifier, XX, yy, scoring='f1_weighted', cv=5)
print("F1 score: " + str(round(100*f1.mean(), 2)) + "%")


CV Accuracy:
[0.78783358 0.78418697 0.79210875 0.78498011 0.79409814]
F1 score: 75.9%


In [35]:
# Compute the confusion matrix for the test-split predictions once and reuse it
# for both printouts instead of recomputing it.
cm = confusion_matrix(y_true=y_test, y_pred=y_test_pred)
print(cm)

print(cm.ravel()) #this will print true negative, false positive, false negative, true positive

[[4280  223]
 [1039  491]]
[4280  223 1039  491]


In [40]:
# Per-class precision / recall / F1 for the test-split predictions.
# The display names are read from the encoder fitted on the label column (the last
# encoder in label_encoder), so they always match the integer codes in y_test
# rather than relying on a hard-coded ordering.
target_names = list(label_encoder[-1].classes_)
print(classification_report(y_test, y_test_pred, target_names = target_names))

              precision    recall  f1-score   support

       <=50K       0.80      0.95      0.87      4503
        >50K       0.69      0.32      0.44      1530

    accuracy                           0.79      6033
   macro avg       0.75      0.64      0.65      6033
weighted avg       0.77      0.79      0.76      6033



In [36]:
# Encode a single raw sample (features only, no label) the same way as the training data.
input_data = ['39', 'State-gov', '77516', 'Masters', '13', 'Never-married', 'Adm-clerical', 'Not-in-family', 'White', 'Male', '2174', '0', '40', 'United-States'] 
# count indexes into label_encoder: the encoders were appended one per non-numeric
# column in column order, so it advances once for every non-numeric feature seen here.
count = 0
input_data_encoded = [-1] * len(input_data)

for i,item in enumerate(input_data):
    if item.isdigit():
        input_data_encoded[i] = int(item)
    else:
        # transform() returns a 1-element array; take element [0] explicitly, because
        # calling int() on an array with ndim > 0 is deprecated in recent NumPy releases.
        input_data_encoded[i] = int(label_encoder[count].transform([item])[0])
        count = count + 1 

input_data_encoded = np.array(input_data_encoded)


In [37]:
# Show the numeric encoding of the sample (one integer per feature).
print(input_data_encoded)


[   39     5 77516    12    13     4     0     1     4     1  2174     0
    40    38]


In [38]:
# The sample is a single row, so reshape it to (1, n_features) before handing it to predict().
input_data_encoded = np.array(input_data_encoded).reshape(1, -1)
# Classify the sample, then map the numeric class back to its original string
# using the last encoder in label_encoder (the one fitted on the label column).
output_class = classifier.predict(input_data_encoded)
predicted_label = label_encoder[-1].inverse_transform(output_class)[0]
print(predicted_label)


<=50K
