In [1]:
from IPython.core.display import HTML
# Read both stylesheets through context managers so the file handles are
# closed deterministically instead of being left to the garbage collector.
css_parts = []
for css_path in ('style-table.css', 'style-notebook.css'):
    with open(css_path) as css_file:
        css_parts.append(css_file.read())
css = ''.join(css_parts)
# Last expression of the cell: renders the combined CSS inside a <style> tag.
HTML('<style>{}</style>'.format(css))

#### importing a couple of packages

In [2]:
from sklearn import preprocessing
from sklearn.naive_bayes import GaussianNB
import numpy as np

#### load the dataset

In [3]:
# Path of the text file the data is read from
input_file = 'adult_data.txt'

# Containers for the samples and labels, plus one row counter per class
X, y = [], []
count_lessthan50k = count_morethan50k = 0

# Maximum number of rows kept for each class (despite the name, the file
# reader applies this cap to data rows, not to images)
num_images_threshold = 10000

In [4]:
# Collect at most num_images_threshold rows per income class from the input file.
# Iterating over the file object streams it line by line, so the early `break`
# below avoids reading the remainder of the file.
with open(input_file, 'r') as data_file:
    for line in data_file:
        # Skip any row containing '?' (presumably the marker for a missing value)
        if '?' in line:
            continue
        # strip() rather than line[:-1]: a final row without a trailing newline
        # would otherwise lose its last character, and a CRLF file would keep
        # a stray '\r' that makes the label comparisons below fail.
        data = line.strip().split(', ')
        if data[-1] == '<=50K' and count_lessthan50k < num_images_threshold:
            X.append(data)
            count_lessthan50k += 1
        elif data[-1] == '>50K' and count_morethan50k < num_images_threshold:
            X.append(data)
            count_morethan50k += 1
        # Stop as soon as both classes have reached the cap
        if count_lessthan50k >= num_images_threshold and count_morethan50k >= num_images_threshold:
            break
# Convert the list of rows into a 2-D array of strings
X = np.array(X)

#### converting string attributes to numerical data while retaining the original numerical data

In [5]:
# Turn every column into numbers. Whether a column is numeric is decided from
# the first row only: digit-only values are copied as-is, anything else gets
# its own LabelEncoder (stored in column order in `label_encoder`).
label_encoder = []
X_encoded = np.empty(X.shape)
for col_idx, first_value in enumerate(X[0]):
    column = X[:, col_idx]
    if not first_value.isdigit():
        encoder = preprocessing.LabelEncoder()
        label_encoder.append(encoder)
        X_encoded[:, col_idx] = encoder.fit_transform(column)
    else:
        X_encoded[:, col_idx] = column
# The last column is the target; every column before it is a feature
X, y = X_encoded[:, :-1].astype(int), X_encoded[:, -1].astype(int)

#### Training the Classifier

In [6]:
# Build a classifier: fit a Gaussian Naive Bayes model on all of X and y.
# fit() is the last expression of the cell, so the notebook displays the
# fitted estimator as the cell output.
classifier_gaussiannb = GaussianNB()
classifier_gaussiannb.fit(X, y)

GaussianNB(priors=None)

#### splitting the data into training and testing sets

In [7]:
# Hold-out evaluation: keep 25% of the samples for testing and fit a fresh
# model on the remaining 75% (random_state fixes the shuffle for repeatability)
from sklearn import model_selection
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y,
    test_size=0.25,
    random_state=5,
)
classifier_gaussiannb = GaussianNB().fit(X_train, y_train)
# Predictions for the held-out samples
y_test_pred = classifier_gaussiannb.predict(X_test)

#### extracting performance metrics

In [9]:
# Weighted F1 score of the classifier from 5-fold cross-validation on X, y,
# reported as the mean over the folds in percent
f1 = model_selection.cross_val_score(
    classifier_gaussiannb, X, y,
    scoring='f1_weighted',
    cv=5,
)
mean_f1_percent = round(100 * f1.mean(), 2)
print("F1 score: " + str(mean_f1_percent) + "%")

F1 score: 63.06%


#### classifying a single datapoint

In [10]:
# Testing encoding on single data instance
input_data = ['39', 'State-gov', '77516', 'Bachelors', '13',
'Never-married', 'Adm-clerical', 'Not-in-family', 'White',
'Male', '2174', '0', '40', 'United-States']

# Encode the datapoint the same way the training data was encoded:
# digit-only values are used as integers, every other value goes through the
# fitted LabelEncoder of its column. `label_encoder` holds one encoder per
# non-numeric column in column order, so `count` walks through it in step
# with the non-numeric values met here.
count = 0
input_data_encoded = [-1] * len(input_data)
for i, item in enumerate(input_data):
    if item.isdigit():
        input_data_encoded[i] = int(item)
    else:
        # Previously these features were left at the -1 placeholder, so the
        # classifier never saw the categorical values of the datapoint.
        input_data_encoded[i] = int(label_encoder[count].transform([item])[0])
        count = count + 1
# predict() expects shape (n_samples, n_features): a single row holding all
# features, not one single-feature row per value.
input_data_encoded = np.array(input_data_encoded).reshape(1, -1)

In [11]:
# Classify the encoded datapoint, then map the numeric class back to its
# original string label with the last encoder (the one fitted on the target)
output_class = classifier_gaussiannb.predict(input_data_encoded)
predicted_labels = label_encoder[-1].inverse_transform(output_class)
print(predicted_labels[0])

<=50K
