In [83]:
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize
from sklearn.neural_network import MLPClassifier
from sklearn import linear_model
from sklearn import ensemble

In [19]:
# Reading in adult.data from the UCI Machine Learning Repository.
# header=None: the first line of the file is treated as data, not as column names.
adult_data_raw = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data', header=None)


# Renaming columns and stripping strings of whitespace
cols = ['Age','Workclass','fnlwgt','Education','Education_num','Marital Status','Occupation','Relationship','Race',
       'Sex','Capital Gain','Capital Loss','Hours per Week','Country','Salary']
adult_data_raw.columns = cols


def _strip_if_str(value):
    """Return value without surrounding whitespace if it is a string, else unchanged."""
    return value.strip() if isinstance(value, str) else value


# Element-wise strip done column by column with Series.map. This replaces
# DataFrame.applymap, which is deprecated since pandas 2.1, while still
# running on older pandas releases that lack DataFrame.map.
adult_data_raw = adult_data_raw.apply(lambda column: column.map(_strip_if_str))

# Dropping all data entries with unknown values ('?') and entries whose
# country has no GDP information available.
no_gdp_countries = ['South', 'Outlying-US(Guam-USVI-etc)', 'Laos', 'Taiwan']

# Vectorized masks replace a per-row loop that re-read each row with .iloc
# five times. The country test is applied to the 'Country' column only,
# which is the column the GDP filter is about.
has_unknown_value = adult_data_raw.isin(['?']).any(axis=1)
has_unknown_country = adult_data_raw['Country'].isin(no_gdp_countries)

# Row labels of the affected entries, kept under their original names in
# case a later cell inspects them.
na_indices = adult_data_raw.index[has_unknown_value].tolist()
unknown_countries = adult_data_raw.index[has_unknown_country].tolist()

dropped_indices = na_indices + unknown_countries
# Keep only rows flagged by neither mask, then renumber the index from 0.
adult_data_raw = adult_data_raw[~(has_unknown_value | has_unknown_country)].reset_index(drop=True)

# Age must be numeric for the models.
adult_data_raw['Age'] = pd.to_numeric(adult_data_raw['Age'])

# Workclass is consolidated into four groups and encoded as:
# Not-working: 0, Private: 1, Self-emp: 2, Government: 3
# Each raw label (and each consolidated group name) maps straight to its code.
workclass_codes = {
    'Never-worked': 0, 'Without-pay': 0, 'Not-working': 0,
    'Private': 1,
    'Self-emp-not-inc': 2, 'Self-emp-inc': 2, 'Self-emp': 2,
    'Local-gov': 3, 'State-gov': 3, 'Federal-gov': 3, 'Government': 3,
}
workclass_encoded = adult_data_raw['Workclass'].astype(str).replace(workclass_codes)
adult_data_raw['Workclass'] = pd.to_numeric(workclass_encoded)

# fnlwgt is removed because it does not appear to provide value to classification.
adult_data_raw = adult_data_raw.drop(['fnlwgt'], axis='columns')

# Education is consolidated and encoded as:
# Dropout: 0, HS-grad: 1, Prof-school: 2, Associates: 3, Bachelors: 4, Masters: 5, Doctorate: 6
# Every grade below HS-grad, plus 'Some-college', is counted as Dropout.
education_codes = {
    'Some-college': 0, '12th': 0, '11th': 0, '10th': 0, '9th': 0, '7th-8th': 0,
    '5th-6th': 0, '1st-4th': 0, 'Preschool': 0, 'Dropout': 0,
    'HS-grad': 1,
    'Prof-school': 2,
    'Assoc-voc': 3, 'Assoc-acdm': 3, 'Associates': 3,
    'Bachelors': 4,
    'Masters': 5,
    'Doctorate': 6,
}
education_encoded = adult_data_raw['Education'].astype(str).replace(education_codes)
adult_data_raw['Education'] = pd.to_numeric(education_encoded)

# Education_num is removed in favor of the encoded Education column.
adult_data_raw = adult_data_raw.drop(['Education_num'], axis='columns')


# Marital Status is consolidated and encoded as:
# Married: 0, Single: 1, Divorced: 2, Separated: 3, Widowed: 4
# All three 'Married-*' variants collapse into Married.
marital_status_codes = {
    'Married-civ-spouse': 0, 'Married-spouse-absent': 0, 'Married-AF-spouse': 0, 'Married': 0,
    'Never-married': 1, 'Single': 1,
    'Divorced': 2,
    'Separated': 3,
    'Widowed': 4,
}
marital_status_encoded = adult_data_raw['Marital Status'].astype(str).replace(marital_status_codes)
adult_data_raw['Marital Status'] = pd.to_numeric(marital_status_encoded)


# Occupation is consolidated and encoded as:
# White-collar: 0, Blue-collar: 1, Pink-collar: 2, Other-service: 3, Armed-Forces: 5
# NOTE(review): Armed-Forces is coded 5 and code 4 is unused - confirm this gap is intended.
occupation_codes = {
    'Prof-specialty': 0, 'Exec-managerial': 0, 'Adm-clerical': 0, 'White-collar': 0,
    'Craft-repair': 1, 'Machine-op-inspct': 1, 'Transport-moving': 1,
    'Handlers-cleaners': 1, 'Farming-fishing': 1, 'Priv-house-serv': 1, 'Blue-collar': 1,
    'Sales': 2, 'Tech-support': 2, 'Protective-serv': 2, 'Pink-collar': 2,
    'Other-service': 3,
    'Armed-Forces': 5,
}
occupation_encoded = adult_data_raw['Occupation'].astype(str).replace(occupation_codes)
adult_data_raw['Occupation'] = pd.to_numeric(occupation_encoded)

# Relationship is encoded (no consolidation) as:
# Not-in-family: 0, Unmarried: 1, Other-relative: 2, Own-child: 3, Wife: 4, Husband: 5
relationship_codes = {
    'Not-in-family': 0,
    'Unmarried': 1,
    'Other-relative': 2,
    'Own-child': 3,
    'Wife': 4,
    'Husband': 5,
}
relationship_encoded = adult_data_raw['Relationship'].astype(str).replace(relationship_codes)
adult_data_raw['Relationship'] = pd.to_numeric(relationship_encoded)

# Race is encoded (no consolidation) as:
# Other: 0, Amer-Indian-Eskimo: 1, Asian-Pac-Islander: 2, Black: 3, White: 4
race_codes = {
    'Other': 0,
    'Amer-Indian-Eskimo': 1,
    'Asian-Pac-Islander': 2,
    'Black': 3,
    'White': 4,
}
race_encoded = adult_data_raw['Race'].astype(str).replace(race_codes)
adult_data_raw['Race'] = pd.to_numeric(race_encoded)

# Sex becomes a binary flag: 1 for 'Male', 0 for every other value.
adult_data_raw['Sex'] = (adult_data_raw['Sex'] == 'Male').astype('int64')

# Capital Gain, Capital Loss and Hours per Week must all be numeric.
for numeric_column in ('Capital Gain', 'Capital Loss', 'Hours per Week'):
    adult_data_raw[numeric_column] = pd.to_numeric(adult_data_raw[numeric_column])

# Renaming / consolidating certain countries so their names coincide with
# the names used in the GDP-per-country information.
country_renames = {
    'Scotland': 'United Kingdom',
    'England': 'United Kingdom',
    'Trinadad&Tobago': 'Trinidad and Tobago',
    'Hong': 'Hong Kong SAR, China',
    'Holand-Netherlands': 'Netherlands',
    'Columbia': 'Colombia',
}
adult_data_raw['Country'] = adult_data_raw['Country'].astype(str).replace(country_renames)

# Salary becomes the binary target: 1 for '>50K', 0 for every other value.
adult_data_raw['Salary'] = (adult_data_raw['Salary'] == '>50K').astype('int64')

# Final column order, chosen for readability (target last).
new_cols = ['Age','Workclass','Education','Marital Status','Occupation','Relationship','Race',
            'Sex','Capital Gain','Capital Loss','Hours per Week','Country','Salary']
adult_data_raw = adult_data_raw.loc[:, new_cols]

In [20]:
# Preview the first rows of the cleaned and encoded frame.
adult_data_raw.head()

Unnamed: 0,Age,Workclass,Education,Marital Status,Occupation,Relationship,Race,Sex,Capital Gain,Capital Loss,Hours per Week,Country,Salary
0,39,3,4,1,0,0,4,1,2174,0,40,United-States,0
1,50,2,4,0,0,5,4,1,0,0,13,United-States,0
2,38,1,1,2,1,0,4,1,0,0,40,United-States,0
3,53,1,0,0,1,5,3,1,0,0,40,United-States,0
4,28,1,4,0,0,4,3,0,0,0,40,Cuba,0


In [124]:
# Columns used for modelling: the predictors plus the 'Salary' target.
# 'Country' is not included.
features = ['Age', 'Workclass', 'Education', 'Marital Status', 'Occupation', 'Relationship', 'Race', 'Sex', 
            'Capital Gain', 'Capital Loss', 'Hours per Week', 'Salary']
data = adult_data_raw.loc[:, features]

# Optional experiment (disabled): uncomment to replace each category code
# with the fraction of its rows that have Salary == 1.
# # Likelihood encoding for categorical variables
# categorical_variables = ['Workclass','Education','Marital Status','Occupation','Relationship','Race']
# large_salary = data.loc[data['Salary'] == 1] 
# for category in categorical_variables:
#     unique_vals = list(data[category].unique())
#     x_value_counts = dict(large_salary[category].value_counts())
#     missing_vals = list(set(unique_vals).difference(list(x_value_counts.keys())))
#     for missing in missing_vals:
#         x_value_counts.update({missing: 0})
#     y_value_counts = dict(data[category].value_counts())
#     likelihood = {k: x_value_counts[k] / y_value_counts[k] for k in y_value_counts if k in x_value_counts}
#     data[category].replace(likelihood, inplace=True)

# Split into the target (Y) and the predictor columns (X).
Y = data['Salary']
X = data[[name for name in features if name != 'Salary']]
# Optional experiment (disabled): z-score normalization of every predictor.
# X = X.apply(stats.zscore)
display(X.head())

Unnamed: 0,Age,Workclass,Education,Marital Status,Occupation,Relationship,Race,Sex,Capital Gain,Capital Loss,Hours per Week
0,39,3,4,1,0,0,4,1,2174,0,40
1,50,2,4,0,0,5,4,1,0,0,13
2,38,1,1,2,1,0,4,1,0,0,40
3,53,1,0,0,1,5,3,1,0,0,40
4,28,1,4,0,0,4,3,0,0,0,40


In [131]:
# Neural network: one hidden layer of 100 units with logistic (sigmoid) activation.
# NOTE(review): the model is fit on all of X, Y - no rows are held out here.
mlp_params = {'activation': 'logistic', 'hidden_layer_sizes': (100,)}
mlp = MLPClassifier(**mlp_params)
mlp.fit(X, Y)

MLPClassifier(activation='logistic', alpha=0.0001, batch_size='auto',
       beta_1=0.9, beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [132]:
# Score the fitted network on X, Y (the value reported as accuracy in the analysis).
# NOTE(review): this is the same X, Y passed to mlp.fit, so it is an in-sample score.
mlp.score(X,Y)

0.8233060163901659

In [138]:
# Gradient-boosted trees: 1000 boosting stages of depth-3 trees.
# The loss is deliberately left at the estimator's default, which is the
# same binomial deviance / log-loss the former 'loss': 'deviance' entry chose.
# Naming it explicitly is not portable: the 'deviance' alias was deprecated
# in scikit-learn 1.1 and removed in 1.3 (renamed 'log_loss'), while older
# releases do not accept 'log_loss'.
params = {'n_estimators': 1000,
          'max_depth': 3}

# NOTE(review): the model is fit on all of X, Y - no rows are held out here.
clf = ensemble.GradientBoostingClassifier(**params)
clf.fit(X, Y)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=1000,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

In [139]:
# Score the fitted boosted-tree model on X, Y (the value reported as accuracy in the analysis).
# NOTE(review): this is the same X, Y passed to clf.fit, so it is an in-sample score.
clf.score(X, Y)

0.8779399027250316

### Analysis
Using the Adult data set from the UCI Machine Learning Repository, this notebook classifies whether an adult earns more than 50K a year based on the feature values.

Features being used: Age, Workclass, Education, Marital Status, Occupation, Relationship, Race, Sex, Capital Gain, Capital Loss, and Hours per Week

Note: Impact (Likelihood) Encoding and z-score normalization were introduced to observe performance changes

Algorithms Run: Neural Network and Boosted Tree Classifier

Results:
    - Cleaned Data 
        - Neural Network; Logistic; (100,)                  ->   0.8233
        - Neural Network; Logistic; (50, 75, 25,)           ->   0.8262
        
        - Neural Network; Hyperbolic Tangent; (100,)        ->   0.8240
        - Neural Network; Hyperbolic Tangent; (50, 75, 25,) ->   0.8054

        - Boosted Tree Classifier; Deviance; Depth=3        ->   0.8779
    - Z-Score 
        - Neural Network; Logistic; (100,)                  ->   0.8415
        - Neural Network; Logistic; (50, 75, 25,)           ->   0.8422
        
        - Neural Network; Hyperbolic Tangent; (100,)        ->   0.8489
        - Neural Network; Hyperbolic Tangent; (50, 75, 25,) ->   0.8798

        - Boosted Tree Classifier; Deviance; Depth=3        ->   0.8779
    - Impact Encoding 
        - Neural Network; Logistic; (100,)                  ->   0.8293
        - Neural Network; Logistic; (50, 75, 25,)           ->   0.8301
        
        - Neural Network; Hyperbolic Tangent; (100,)        ->   0.8254
        - Neural Network; Hyperbolic Tangent; (50, 75, 25,) ->   0.8268

        - Boosted Tree Classifier; Deviance; Depth=3        ->   0.8796
    - Impact Encoding and Z-Score
        - Neural Network; Logistic; (100,)                  ->   0.8463
        - Neural Network; Logistic; (50, 75, 25,)           ->   0.8472
        
        - Neural Network; Hyperbolic Tangent; (100,)        ->   0.8503
        - Neural Network; Hyperbolic Tangent; (50, 75, 25,) ->   0.8645

        - Boosted Tree Classifier; Deviance; Depth=3        ->   0.8779

#### Notes:
The boosted tree classifier with a depth of 3 and the deviance loss function consistently produced accuracy greater than 87%, making it the better model for this data.  However, the algorithm took slightly longer to run.

The neural networks also produced accuracy above 80% in all cases, showing them to be a strong algorithm as well.  Depending on the shape of the network, either the hyperbolic tangent or the sigmoid activation function outperformed the other by a slight margin.

As for the best way to prepare the data, z-score normalization, and the combination of impact encoding with z-score normalization, produced the highest results across all models and variations.

The variations in run time were as follows: in most cases, the boosted tree classifier took slightly more time than the neural networks.  Using the hyperbolic tangent activation function instead of the sigmoid function took more time in some cases but not all.  The sigmoid activation function was, if not the fastest, close to it in all cases.  As a final note, the additional hidden layers took more time to run.