# Baseline model - census income prediction

Task: predict whether a person earns more or less than $50k.

In [58]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from collections import Counter

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score


sns.set(style='white', context='notebook', palette='deep')

## 2. Load Data

In [59]:
train_df = pd.read_csv("data/census_income/adult_data", sep=', ')   
test_df = pd.read_csv("data/census_income/adult.test", sep=', ')

dataset = pd.concat([train_df, test_df])

  train_df = pd.read_csv("data/census_income/adult_data", sep=', ')
  test_df = pd.read_csv("data/census_income/adult.test", sep=', ')


In [60]:
# Reformat Column We Are Predicting
dataset['income']=dataset['income'].map({'<=50K': 0, '>50K': 1, '<=50K.': 0, '>50K.': 1})
dataset.head(4)

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0


## 3- Analyze Data

In [61]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 48842 entries, 0 to 16280
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             48842 non-null  int64 
 1   workclass       48842 non-null  object
 2   fnlwgt          48842 non-null  int64 
 3   education       48842 non-null  object
 4   education.num   48842 non-null  int64 
 5   marital.status  48842 non-null  object
 6   occupation      48842 non-null  object
 7   relationship    48842 non-null  object
 8   race            48842 non-null  object
 9   sex             48842 non-null  object
 10  capital.gain    48842 non-null  int64 
 11  capital.loss    48842 non-null  int64 
 12  hours.per.week  48842 non-null  int64 
 13  native.country  48842 non-null  object
 14  income          48842 non-null  int64 
dtypes: int64(7), object(8)
memory usage: 6.0+ MB


**Missing values** are flagged with "?"

In [62]:
dataset.apply(lambda x: x[x == '?'].count())

age                  0
workclass         2799
fnlwgt               0
education            0
education.num        0
marital.status       0
occupation        2809
relationship         0
race                 0
sex                  0
capital.gain         0
capital.loss         0
hours.per.week       0
native.country     857
income               0
dtype: int64

In [63]:
# Identify Numeric features
numeric_features = ['age','fnlwgt','education.num','capital.gain','capital.loss','hours.per.week','income']

# Identify Categorical features
cat_features = ['workclass','education','marital.status', 'occupation', 'relationship', 'race', 'sex', 'native.country']

## Feature Engineering

**Sex**

In [64]:
# Convert Sex value to 0 and 1
dataset["sex"] = dataset["sex"].map({"Male": 0, "Female":1})

**Marital Status**

In [65]:
dataset['marital.status'].value_counts()

marital.status
Married-civ-spouse       22379
Never-married            16117
Divorced                  6633
Separated                 1530
Widowed                   1518
Married-spouse-absent      628
Married-AF-spouse           37
Name: count, dtype: int64

In [66]:
# Create Married Column - Binary Yes(1) or No(0)
dataset["marital.status"] = dataset["marital.status"].replace(['Never-married','Divorced','Separated','Widowed'], 'Single')
dataset["marital.status"] = dataset["marital.status"].replace(['Married-civ-spouse','Married-spouse-absent','Married-AF-spouse'], 'Married')
dataset["marital.status"] = dataset["marital.status"].map({"Married":1, "Single":0})
dataset["marital.status"] = dataset["marital.status"].astype(int)

**Education**

In [67]:
dummies_ed = pd.get_dummies(dataset['education'], prefix='education')

dataset = pd.concat([dataset, dummies_ed], axis=1)

# If you want to drop the original 'native.country' column
dataset = dataset.drop('education', axis=1)

**Workclass**

In [68]:
dataset['workclass'] = dataset['workclass'].str.replace('?', 'Unemployed')   # Missing values

In [69]:
dummies_w = pd.get_dummies(dataset['workclass'], prefix='workclass')

dataset = pd.concat([dataset, dummies_w], axis=1)

dataset = dataset.drop('workclass', axis=1)

**Occupation**

In [70]:
dataset['occupation'] = dataset['occupation'].str.replace('?', 'Unemployed')   # Missing values

dummies_o = pd.get_dummies(dataset['occupation'], prefix='occupation')

dataset = pd.concat([dataset, dummies_o], axis=1)

dataset = dataset.drop('occupation', axis=1)

**Race**

In [71]:
dummies_r = pd.get_dummies(dataset['race'], prefix='race')

dataset = pd.concat([dataset, dummies_r], axis=1)

dataset = dataset.drop('race', axis=1)

**Relationship**

In [72]:
dummies_re = pd.get_dummies(dataset['relationship'], prefix='relationship')

dataset = pd.concat([dataset, dummies_re], axis=1)

dataset = dataset.drop('relationship', axis=1)

**Native Country and fnlwgt** dropped

In [73]:
dataset.drop(labels=["native.country", "fnlwgt"], axis = 1, inplace = True)
print('Dataset with Dropped Labels')

Dataset with Dropped Labels


In [74]:
dataset

Unnamed: 0,age,education.num,marital.status,sex,capital.gain,capital.loss,hours.per.week,income,education_10th,education_11th,...,race_Asian-Pac-Islander,race_Black,race_Other,race_White,relationship_Husband,relationship_Not-in-family,relationship_Other-relative,relationship_Own-child,relationship_Unmarried,relationship_Wife
0,39,13,0,0,2174,0,40,0,False,False,...,False,False,False,True,False,True,False,False,False,False
1,50,13,1,0,0,0,13,0,False,False,...,False,False,False,True,True,False,False,False,False,False
2,38,9,0,0,0,0,40,0,False,False,...,False,False,False,True,False,True,False,False,False,False
3,53,7,1,0,0,0,40,0,False,True,...,False,True,False,False,True,False,False,False,False,False
4,28,13,1,1,0,0,40,0,False,False,...,False,True,False,False,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16276,39,13,0,1,0,0,36,0,False,False,...,False,False,False,True,False,True,False,False,False,False
16277,64,9,0,0,0,0,40,0,False,False,...,False,True,False,False,False,False,True,False,False,False
16278,38,13,1,0,0,0,50,0,False,False,...,False,False,False,True,True,False,False,False,False,False
16279,44,13,0,0,5455,0,40,0,False,False,...,True,False,False,False,False,False,False,True,False,False


In [75]:
for col in dataset.columns:
    if dataset[col].dtype == 'bool':
        dataset[col] = dataset[col].astype(int)

## Modeling

In [76]:
X_train, Y_train = dataset.iloc[:32561].drop('income', axis=1), dataset.iloc[:32561]['income']
X_test, Y_test = dataset.iloc[32561:].drop('income', axis=1), dataset.iloc[32561:]['income']

models = []
models.append(('LR', LogisticRegression()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('RF', RandomForestClassifier(n_estimators=100, max_features=3)))
models.append(('XGB', XGBClassifier()))

names = []
accuracies = []
aucs = []
precisions = []
recalls = []

for name, model in models:

    model.fit(X_train, Y_train)
    
    y_pred = model.predict(X_test)
    predictions = [round(value) for value in y_pred]

    accuracy = accuracy_score(Y_test, predictions)
    auc = roc_auc_score(Y_test, predictions)
    precision = precision_score(Y_test, predictions)
    recall = recall_score(Y_test, predictions)

    names.append(name)
    accuracies.append(accuracy)
    aucs.append(auc)
    precisions.append(precision_score(Y_test, predictions))
    recalls.append(recall_score(Y_test, predictions))

    msg = "%s: accuracy %f - AUC %f" % (name, accuracy, auc)
    print(msg)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LR: accuracy 0.826301 - AUC 0.717203
KNN: accuracy 0.848842 - AUC 0.777127
CART: accuracy 0.819299 - AUC 0.747460
NB: accuracy 0.829187 - AUC 0.790211
RF: accuracy 0.840612 - AUC 0.759257
XGB: accuracy 0.871568 - AUC 0.796404


In [77]:
metrics = pd.DataFrame({'Accuracy': accuracies, 'AUC': aucs, 'Precision': precisions, 'Recall': recalls}, index=names).transpose()
metrics['Best Value'] = metrics.max(axis=1)
metrics['Best Model'] = metrics.idxmax(axis=1)
metrics

Unnamed: 0,LR,KNN,CART,NB,RF,XGB,Best Value,Best Model
Accuracy,0.826301,0.848842,0.819299,0.829187,0.840612,0.871568,0.871568,XGB
AUC,0.717203,0.777127,0.74746,0.790211,0.759257,0.796404,0.796404,XGB
Precision,0.675034,0.695235,0.61901,0.619798,0.683808,0.767939,0.767939,XGB
Recall,0.5104,0.641186,0.611284,0.716329,0.605044,0.653926,0.716329,NB
