In [1]:
import pandas as pd
import numpy as np

from sklearn import preprocessing, tree
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn import neighbors
from sklearn.neural_network import MLPClassifier

%matplotlib inline

In [2]:
# Load the raw census-income training data (the file has no header row) and
# give the 15 positional columns descriptive names.
column_names = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital-status', 'occupation', 'relationship', 'race', 'sex',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
    'label',
]
main_df = pd.read_csv("data/CencusIncome.data.txt", header = None)
main_df = main_df.rename(columns=dict(enumerate(column_names)))

# Drop rows whose value is the '?' missing-value marker in any of these columns.
# NOTE(review): 'occupation' is not filtered here -- confirm whether it can
# also contain '?'.
has_value = main_df[['workclass', 'marital-status', 'native-country']].ne('?').all(axis=1)
main_df = main_df[has_value]

# 'fnlwgt' is not used as a feature.
main_df = main_df.drop(['fnlwgt'], axis=1)

# Preview only a handful of rows; dumping 1000 rows bloats the notebook output.
main_df.head(10)

Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,label
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
5,37,Private,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
6,49,Private,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,<=50K
7,52,Self-emp-not-inc,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K
8,31,Private,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K
9,42,Private,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,>50K


In [3]:
# Label-encode every column of main_df in place (the loop covers all columns,
# numeric ones included), replacing each value with an integer code.
# One fitted encoder is kept per column in `encoders` so the same mapping can
# be re-applied to other data later; a single shared encoder would only
# remember the mapping of the last column it was fitted on.
encoders = {}
for column in main_df.columns.values:
    le = preprocessing.LabelEncoder()
    main_df[column] = le.fit_transform(main_df[column])
    encoders[column] = le

# Split into target vector and feature matrix.
# `axis` is passed by keyword: the positional form is rejected by newer pandas.
y = np.array(main_df['label'])
x = np.array(main_df.drop(['label'], axis=1))

In [4]:
# Gaussian Naive Bayes, evaluated with 10-fold cross-validation on (x, y).
gnb = GaussianNB()
score = cross_val_score(gnb, x, y, cv=10)

# Summary statistics over the folds: mean accuracy and a +/- 2 std band.
mean_score = score.mean()
two_std = score.std() * 2

print()
for fold_no, fold_score in enumerate(score, start=1):
    print("Fold-" + str(fold_no) + ":", "%0.6f" % fold_score)
print()
print("Mean: %0.6f" % mean_score)
print("Accuration: %0.6f (+/- %0.6f)" % (mean_score, two_std))


Fold-1: 0.808151
Fold-2: 0.813391
Fold-3: 0.812396
Fold-4: 0.809082
Fold-5: 0.818031
Fold-6: 0.821014
Fold-7: 0.808088
Fold-8: 0.814385
Fold-9: 0.818302
Fold-10: 0.818966

Mean: 0.814181
Accuration: 0.814181 (+/- 0.009050)


In [5]:
# Entropy-criterion decision tree, evaluated with 10-fold cross-validation.
# random_state is fixed so that re-running the cell reproduces the same scores;
# without it the tree's tie-breaking between equally good splits is random.
ID3learn = tree.DecisionTreeClassifier(criterion="entropy", random_state=0)

score = cross_val_score(ID3learn, x, y, cv=10)

print()
# Iterate over the scores themselves so the loop follows the number of folds.
for fold_no, fold_score in enumerate(score, start=1):
    print("Fold-" + str(fold_no) + ":", "%0.6f" % fold_score)
print()
print("Mean: %0.6f" % score.mean())
print("Accuration: %0.6f (+/- %0.6f)" % (score.mean(), score.std() * 2))


Fold-1: 0.808482
Fold-2: 0.818031
Fold-3: 0.816042
Fold-4: 0.805104
Fold-5: 0.820020
Fold-6: 0.810739
Fold-7: 0.816374
Fold-8: 0.824329
Fold-9: 0.823939
Fold-10: 0.810676

Mean: 0.815374
Accuration: 0.815374 (+/- 0.012331)


In [6]:
# k-nearest-neighbours with uniform weighting, evaluated with 10-fold
# cross-validation on (x, y).
n_neighbors = 61
KNNlearn = neighbors.KNeighborsClassifier(n_neighbors, weights='uniform')
score = cross_val_score(KNNlearn, x, y, cv=10)

# Summary statistics over the folds: mean accuracy and a +/- 2 std band.
mean_score = score.mean()
two_std = score.std() * 2

print()
for fold_no, fold_score in enumerate(score, start=1):
    print("Fold-" + str(fold_no) + ":", "%0.6f" % fold_score)
print()
print("Mean: %0.6f" % mean_score)
print("Accuration: %0.6f (+/- %0.6f)" % (mean_score, two_std))


Fold-1: 0.820742
Fold-2: 0.829632
Fold-3: 0.824992
Fold-4: 0.824660
Fold-5: 0.829632
Fold-6: 0.829632
Fold-7: 0.823666
Fold-8: 0.833610
Fold-9: 0.829244
Fold-10: 0.832891

Mean: 0.827870
Accuration: 0.827870 (+/- 0.007910)


In [7]:
# Multi-layer perceptron (hidden layers of 5 and 2 units, L-BFGS solver),
# evaluated with 10-fold cross-validation on (x, y).
# random_state pins the weight initialisation so the scores are reproducible.
# NOTE(review): x is fed in unscaled; MLPs are sensitive to feature scale, so
# consider standardising the features before training -- confirm with a run.
MLPlearn = MLPClassifier(solver='lbfgs', hidden_layer_sizes=(5, 2), random_state=0)

score = cross_val_score(MLPlearn, x, y, cv=10)

print()
# Iterate over the scores themselves so the loop follows the number of folds.
for fold_no, fold_score in enumerate(score, start=1):
    print("Fold-" + str(fold_no) + ":", "%0.6f" % fold_score)
print()
print("Mean: %0.6f" % score.mean())
# Six decimals, matching the summary line printed for the other classifiers.
print("Accuration: %0.6f (+/- %0.6f)" % (score.mean(), score.std() * 2))


Fold-1: 0.807488
Fold-2: 0.832615
Fold-3: 0.813722
Fold-4: 0.751077
Fold-5: 0.751077
Fold-6: 0.751077
Fold-7: 0.802453
Fold-8: 0.751077
Fold-9: 0.797414
Fold-10: 0.823607

Mean: 0.788161
Accuration: 0.79 (+/- 0.063404)


In [8]:
# Train the final KNN model on the complete (x, y) data; fit() returns the
# estimator itself, so the calls can be chained.
KNNlearn = neighbors.KNeighborsClassifier(n_neighbors=n_neighbors, weights='uniform').fit(x, y)

# Show the fitted estimator as the cell output.
KNNlearn

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=61, p=2,
           weights='uniform')

In [9]:
# `sklearn.externals.joblib` was removed from newer scikit-learn releases;
# prefer the standalone package and fall back to the bundled copy on old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Persist the fitted KNN model to disk; dump() returns the list of files written.
joblib.dump(KNNlearn, 'KNN.model')

['KNN.model']

In [None]:
# Evaluate the persisted KNN model on the separate test file.
# `sklearn.externals.joblib` was removed from newer scikit-learn releases;
# prefer the standalone package and fall back to the bundled copy on old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
from sklearn import preprocessing
import numpy as np
import pandas as pd

# NOTE: joblib.load unpickles the file -- only load model files you trust.
KNNlearn = joblib.load('KNN.model')

# The first line of the test file is skipped; the remaining rows have no header.
test_df = pd.read_csv("data/CencusIncome.test.txt", header=None, skiprows=1)

test_df = test_df.rename(columns={0: 'age', 1: 'workclass', 2: 'fnlwgt', 3: 'education', 4: 'education-num', 5: 'marital-status', 6: 'occupation',7: 'relationship', 8: 'race',9: 'sex', 10: 'capital-gain', 11: 'capital-loss', 12: 'hours-per-week', 13: 'native-country', 14: 'label'})

test_df = test_df.drop(['fnlwgt'], axis=1)

# NOTE(review): the encoder is re-fitted on the test data here, so the integer
# codes are not guaranteed to match the ones the model was trained on (columns
# with a different set of unique values get different codes). For a valid
# evaluation, reuse the encoders that were fitted on the training data.
le = preprocessing.LabelEncoder()

# Iterate over the test frame's own columns rather than depending on a
# training frame left over from an earlier cell.
for column in test_df.columns.values:
    le.fit(test_df[column])
    test_df[column] = le.transform(test_df[column])

# `axis` is passed by keyword: the positional form is rejected by newer pandas.
y = np.array(test_df['label'])
x = np.array(test_df.drop(['label'], axis=1))

score = KNNlearn.score(x,y)
# Convert the float to str before concatenating; str + float raises TypeError.
print("Accuracy : " + str(score * 100) + "%")