In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from IPython.display import display

## Dataset
<table>
<thead>
<tr>
<th>Column Name</th>
<th>Type</th>
<th>Description</th>
</tr>
</thead>
<tbody>
<tr>
<td>age</td>
<td>Continuous</td>
<td>The age of the individual</td>
</tr>
<tr>
<td>workclass</td>
<td>Categorical</td>
<td>The type of employer the  individual has (government,  military, private, etc.).</td>
</tr>
<tr>
<td>fnlwgt</td>
<td>Continuous</td>
<td>The number of people the census  takers believe that observation  represents (sample weight). This  variable will not be used.</td>
</tr>
<tr>
<td>education</td>
<td>Categorical</td>
<td>The highest level of education  achieved for that individual.</td>
</tr>
<tr>
<td>education_num</td>
<td>Continuous</td>
<td>The highest level of education in  numerical form.</td>
</tr>
<tr>
<td>marital_status</td>
<td>Categorical</td>
<td>Marital status of the individual.</td>
</tr>
<tr>
<td>occupation</td>
<td>Categorical</td>
<td>The occupation of the individual.</td>
</tr>
<tr>
<td>relationship</td>
<td>Categorical</td>
<td>Wife, Own-child, Husband,  Not-in-family, Other-relative,  Unmarried.</td>
</tr>
<tr>
<td>race</td>
<td>Categorical</td>
<td>White, Asian-Pac-Islander,  Amer-Indian-Eskimo, Other, Black.</td>
</tr>
<tr>
<td>gender</td>
<td>Categorical</td>
<td>Female, Male.</td>
</tr>
<tr>
<td>capital_gain</td>
<td>Continuous</td>
<td>Capital gains recorded.</td>
</tr>
<tr>
<td>capital_loss</td>
<td>Continuous</td>
<td>Capital Losses recorded.</td>
</tr>
<tr>
<td>hours_per_week</td>
<td>Continuous</td>
<td>Hours worked per week.</td>
</tr>
<tr>
<td>native_country</td>
<td>Categorical</td>
<td>Country of origin of the  individual.</td>
</tr>
<tr>
<td>income</td>
<td>Categorical</td>
<td>"&gt;50K" or "&lt;=50K", meaning  whether the person makes more  than \$50,000 annually.</td>
</tr>
</tbody>
</table>

In [2]:
data = pd.read_csv("census_data.csv")

display(data.info())
display(data.head())
display(data.describe())

display(data['income_bracket'].value_counts())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 14 columns):
age               32561 non-null int64
workclass         32561 non-null object
education         32561 non-null object
education_num     32561 non-null int64
marital_status    32561 non-null object
occupation        32561 non-null object
relationship      32561 non-null object
race              32561 non-null object
gender            32561 non-null object
capital_gain      32561 non-null int64
capital_loss      32561 non-null int64
hours_per_week    32561 non-null int64
native_country    32561 non-null object
income_bracket    32561 non-null object
dtypes: int64(5), object(9)
memory usage: 3.5+ MB


None

Unnamed: 0,age,workclass,education,education_num,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,hours_per_week,native_country,income_bracket
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


Unnamed: 0,age,education_num,capital_gain,capital_loss,hours_per_week
count,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,10.080679,1077.648844,87.30383,40.437456
std,13.640433,2.57272,7385.292085,402.960219,12.347429
min,17.0,1.0,0.0,0.0,1.0
25%,28.0,9.0,0.0,0.0,40.0
50%,37.0,10.0,0.0,0.0,40.0
75%,48.0,12.0,0.0,0.0,45.0
max,90.0,16.0,99999.0,4356.0,99.0


 <=50K    24720
 >50K      7841
Name: income_bracket, dtype: int64

## Feature Engineering

In [3]:
# Label Fix
def label_fix(label):
    if label==' <=50K':
        return 0
    else:
        return 1
    
data['income_bracket'] = data['income_bracket'].apply(label_fix)

display(data['income_bracket'].value_counts())

0    24720
1     7841
Name: income_bracket, dtype: int64

## Train/Test Split

In [4]:
X = data.drop('income_bracket',axis=1)
y = data['income_bracket']

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,random_state=101)

## Model

In [5]:
import tensorflow as tf

data.columns

Index(['age', 'workclass', 'education', 'education_num', 'marital_status',
       'occupation', 'relationship', 'race', 'gender', 'capital_gain',
       'capital_loss', 'hours_per_week', 'native_country', 'income_bracket'],
      dtype='object')

In [6]:
# Categorical Features
gender = tf.feature_column.categorical_column_with_vocabulary_list("gender", ["Female", "Male"])
occupation = tf.feature_column.categorical_column_with_hash_bucket("occupation", hash_bucket_size=1000)
marital_status = tf.feature_column.categorical_column_with_hash_bucket("marital_status", hash_bucket_size=1000)
relationship = tf.feature_column.categorical_column_with_hash_bucket("relationship", hash_bucket_size=1000)
education = tf.feature_column.categorical_column_with_hash_bucket("education", hash_bucket_size=1000)
workclass = tf.feature_column.categorical_column_with_hash_bucket("workclass", hash_bucket_size=1000)
native_country = tf.feature_column.categorical_column_with_hash_bucket("native_country", hash_bucket_size=1000)

# Numeric features
age = tf.feature_column.numeric_column("age")
education_num = tf.feature_column.numeric_column("education_num")
capital_gain = tf.feature_column.numeric_column("capital_gain")
capital_loss = tf.feature_column.numeric_column("capital_loss")
hours_per_week = tf.feature_column.numeric_column("hours_per_week")

# Feature Col
feat_cols = [
    gender,
    occupation,
    marital_status,
    relationship,
    education,
    workclass,
    native_country,
    age,
    education_num,
    capital_gain,
    capital_loss,
    hours_per_week]

In [7]:
# Linear Classifier Model
model = tf.estimator.LinearClassifier(
    feature_columns=feat_cols)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmpr8hmlqcb', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f023403f0b8>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [8]:
# Train Input Function
train_input_func = tf.estimator.inputs.pandas_input_fn(
    x=X_train, y=y_train,
    batch_size=100, num_epochs=1, shuffle=True)

In [9]:
# Verbose = False
tf.logging.set_verbosity(tf.logging.WARN)

# Train Estimator
model.train(
    input_fn=train_input_func,
    steps=5000)

<tensorflow.python.estimator.canned.linear.LinearClassifier at 0x7f01e5abbc88>

## Evaluate Train

In [10]:
results = model.evaluate(train_input_func)

results



{'accuracy': 0.81532997,
 'accuracy_baseline': 0.75833625,
 'auc': 0.85623074,
 'auc_precision_recall': 0.62531924,
 'average_loss': 4.254148,
 'label/mean': 0.24166374,
 'loss': 425.26553,
 'precision': 0.64277864,
 'prediction/mean': 0.23018228,
 'recall': 0.5308642,
 'global_step': 228}

## Evaluate Test

In [11]:
# Test Input Function
test_input_func = tf.estimator.inputs.pandas_input_fn(
      x=X_test, y=y_test,
      batch_size=10,
      num_epochs=1,
      shuffle=False)

In [12]:
results = model.evaluate(test_input_func)

results



{'accuracy': 0.81676733,
 'accuracy_baseline': 0.7611833,
 'auc': 0.85741645,
 'auc_precision_recall': 0.6324147,
 'average_loss': 3.8695955,
 'label/mean': 0.23881666,
 'loss': 38.691994,
 'precision': 0.63774735,
 'prediction/mean': 0.23049027,
 'recall': 0.53879124,
 'global_step': 228}

## Predict

In [13]:
# Predict Input Function
pred_input_func = tf.estimator.inputs.pandas_input_fn(
      x=X_test,
      batch_size=len(X_test),
      shuffle=False)

In [14]:
y_test_pred = model.predict(pred_input_func)

In [15]:
y_test_pred_list = []
for pred in y_test_pred:
    y_test_pred_list.append(pred['class_ids'][0])

In [16]:
from sklearn.metrics import confusion_matrix

print(confusion_matrix(y_test, y_test_pred_list))

[[6722  714]
 [1076 1257]]


In [17]:
from sklearn.metrics import accuracy_score

print(accuracy_score(y_test, y_test_pred_list))

0.8167673252124066


In [18]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_test_pred_list))

             precision    recall  f1-score   support

          0       0.86      0.90      0.88      7436
          1       0.64      0.54      0.58      2333

avg / total       0.81      0.82      0.81      9769

