In [2]:
# All data cleaning and transformation described in:
# http://blog.districtdatalabs.com/building-a-classifier-from-census-data

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os

In [4]:
header = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'martial-status',
         'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss',
          'hours-per-week', 'native-country', 'income']
data = pd.read_csv('data/adult.data', names=header)
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,martial-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [5]:
import json

meta = {
    'target_names': list(data.income.unique()),
    'feature_names': list(data.columns),
    'categorical_features': {
        column: list(data[column].unique())
        for column in data.columns
        if data[column].dtype == 'object'
    },
}

with open('data/meta.json', 'w') as f:
    json.dump(meta, f, indent=2)

In [6]:
from sklearn.datasets.base import Bunch

def load_data(root='data'):
    # Load the meta data from the file
    with open(os.path.join(root, 'meta.json'), 'r') as f:
        meta = json.load(f)

    names = meta['feature_names']
    
    # Load the training and test data, skipping the bad row in the test data
    train = pd.read_csv(os.path.join(root, 'adult.data'), names=names)
    test  = pd.read_csv(os.path.join(root, 'adult.test'), names=names, skiprows=1)

    # Remove the target from the categorical features
    meta['categorical_features'].pop('income')

    # Return the bunch with the appropriate data chunked apart
    return Bunch(
        data = train[names[:-1]],
        target = train[names[-1]],
        data_test = test[names[:-1]],
        target_test = test[names[-1]],
        target_names = meta['target_names'],
        feature_names = meta['feature_names'],
        categorical_features = meta['categorical_features']
    )

dataset = load_data()

In [7]:
from sklearn.base import BaseEstimator, TransformerMixin

class EncodeCategorical(BaseEstimator, TransformerMixin):
    """
    Encodes a specified list of columns or all columns if None.
    """

    def __init__(self, columns=None):
        self.columns  = columns
        self.encoders = None

    def fit(self, data, target=None):
        """
        Expects a data frame with named columns to encode.
        """
        # Encode all columns if columns is None
        if self.columns is None:
            self.columns = data.columns

        # Fit a label encoder for each column in the data frame
        self.encoders = {
            column: LabelEncoder().fit(data[column])
            for column in self.columns
        }
        return self

    def transform(self, data):
        """
        Uses the encoders to transform a data frame.
        """
        output = data.copy()
        for column, encoder in self.encoders.items():
            output[column] = encoder.transform(data[column])

        return output

In [8]:
from sklearn.preprocessing import Imputer

class ImputeCategorical(BaseEstimator, TransformerMixin):
    """
    Encodes a specified list of columns or all columns if None.
    """

    def __init__(self, columns=None):
        self.columns = columns
        self.imputer = None

    def fit(self, data, target=None):
        """
        Expects a data frame with named columns to impute.
        """
        # Encode all columns if columns is None
        if self.columns is None:
            self.columns = data.columns

        # Fit an imputer for each column in the data frame
        self.imputer = Imputer(missing_values=0, strategy='most_frequent')
        self.imputer.fit(data[self.columns])

        return self

    def transform(self, data):
        """
        Uses the encoders to transform a data frame.
        """
        output = data.copy()
        output[self.columns] = self.imputer.transform(output[self.columns])

        return output

In [9]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
yencode = LabelEncoder().fit(dataset.target)
Y = yencode.transform([y.rstrip(".") for y in dataset.target])

Y_TEST = yencode.transform([y.rstrip(".") for y in dataset.target_test])

In [10]:
census = Pipeline([
        ('encoder',  EncodeCategorical(dataset.categorical_features.keys())),
        ('imputer', ImputeCategorical(['workclass', 'native-country', 'occupation']))
])
X = census.fit_transform(dataset.data)
X_TEST = census.fit_transform(dataset.data_test)


In [175]:
from sklearn.model_selection import KFold
kf = KFold(n_splits=5)
number_of_samples = Y.shape[0]

X_map = {0: [], 1: [], 2: [], 3: []}
Y_map = {0: [], 1: [], 2: [], 3: []}

for fold_idx, indexes in enumerate(kf.split(X, Y)):
    train_index, test_index = indexes[0], indexes[1]

    print ("TRAIN:", train_index, "TEST:", test_index)
    first_test_idx, last_test_idx = test_index[0], test_index[-1]
    
    train_data = X[:first_test_idx].append(X[last_test_idx:number_of_samples-1])
    #test_data  = Y[first_test_idx:last_test_idx]
    
    #test_data = Y[:first_test_idx] + Y[last_test_idx:number_of_samples-1]
    test_data = np.concatenate((Y[:first_test_idx], Y[last_test_idx:number_of_samples-1]), axis=0)

    X_map[fold_idx] = train_data
    Y_map[fold_idx] = test_data



TRAIN: [ 6513  6514  6515 ..., 32558 32559 32560] TEST: [   0    1    2 ..., 6510 6511 6512]
TRAIN: [    0     1     2 ..., 32558 32559 32560] TEST: [ 6513  6514  6515 ..., 13022 13023 13024]
TRAIN: [    0     1     2 ..., 32558 32559 32560] TEST: [13025 13026 13027 ..., 19534 19535 19536]
TRAIN: [    0     1     2 ..., 32558 32559 32560] TEST: [19537 19538 19539 ..., 26046 26047 26048]
TRAIN: [    0     1     2 ..., 26046 26047 26048] TEST: [26049 26050 26051 ..., 32558 32559 32560]


In [192]:
# train 4 different classifiers
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.naive_bayes import GaussianNB
first_KNN = KNeighborsClassifier(n_neighbors=10)
first_KNN.fit(X_map[0], Y_map[0])

Y_predicted = first_KNN.predict(X_TEST)
print(accuracy_score(Y_TEST, Y_predicted))


#################################################################################
second_NN = MLPClassifier(hidden_layer_sizes=(2, 2, 2), solver='lbfgs', alpha=0.01, verbose=True)
print(second_NN)
second_NN.fit(X_map[1], Y_map[1])
Y_predicted = second_NN.predict(X_TEST)
print(accuracy_score(Y_TEST, Y_predicted))

#######################################################################################
third_NN = LinearSVC(C=1.0, dual=False)
third_NN.fit(X_map[2], Y_map[2])
Y_predicted = third_NN.predict(X_TEST)
print (accuracy_score(Y_TEST, Y_predicted))

########################################################################################
fourth_NN = SVC(kernel='rbf', C=0.1)
fourth_NN.fit(X_map[3], Y_map[3])
Y_predicted = fourth_NN.predict(X_TEST)
print (accuracy_score(Y_TEST, Y_predicted))

##################################
fifth_NN = KNeighborsClassifier(n_neighbors=25)
fifth_NN.fit(X_map[4], Y_map[4])
Y_predicted = fifth_NN.predict(X_TEST)
print (accuracy_score(Y_TEST, Y_predicted))



0.797309747559
MLPClassifier(activation='relu', alpha=0.01, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(2, 2, 2), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='lbfgs', tol=0.0001, validation_fraction=0.1,
       verbose=True, warm_start=False)
0.236226276027
0.802284871937
0.763773723973
0.801854922916


In [193]:
from collections import Counter
class SimpleVotingClassifier():
    predicted = {}
    
    def __init__(self, classifiers):
        self.classifiers = classifiers

    def predict(self, X):
        number_of_elems = len(X)
        print (number_of_elems)
        for cl_idx, classifier in enumerate(self.classifiers):
            result = classifier.predict(X)
            self.predicted[cl_idx] = result
        final_results = self.vote()
        return final_results
    
    def vote_for_elem(self, keys):
        counter = Counter(keys)
        common = counter.most_common()
        return counter.most_common()[0][0]
    
    def vote(self):
        print ("Voting!")
        number_of_elems = len(self.predicted[0])
        partial_results = []
        for idx in range(number_of_elems):
            keys = []
            for key in self.predicted.keys():
                keys.append(self.predicted[key][idx])
            voting_result = self.vote_for_elem(keys)
            partial_results.append(voting_result)
            keys = []
        return partial_results

In [194]:
vc = SimpleVotingClassifier([first_KNN, second_NN, third_NN, fourth_NN, fifth_NN])
Y_predicted = vc.predict(np.array(X_TEST))
print (accuracy_score(Y_TEST, Y_predicted))

16281
Voting!
0.806707204717
