California Census Data. Try to predict what class of income one has with the help of various features (job, gender, adress etc.)

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf
from sklearn.metrics import classification_report

# function to classify the income
def func(num): 
    if num == ' <=50K': 
        return 0
    else: 
        return 1

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
dataset = pd.read_csv('census_data.csv')
dataset.head()

Unnamed: 0,age,workclass,education,education_num,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,hours_per_week,native_country,income_bracket
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [3]:
income = dataset['income_bracket']
new_income = income.apply(func) # Transform string to bool

In [4]:
# Create new dataset
newdataset = dataset.drop(['income_bracket'], axis=1)
newdataset['income'] = new_income

#Declare Features and Labes
x_data = newdataset.drop('income',axis=1)
labels = newdataset['income']

In [5]:
# Perform train test split 70/30
x_train, x_test, y_train, y_test = train_test_split(x_data, labels, test_size=0.30)

In [6]:
# Adding the feature columns to Tensorflow

# Using hash buckets since idk the real lenghs
workclass = tf.feature_column.categorical_column_with_hash_bucket('workclass', hash_bucket_size=1000)
education = tf.feature_column.categorical_column_with_hash_bucket('education', hash_bucket_size=1000)
marital_status = tf.feature_column.categorical_column_with_hash_bucket('marital_status', hash_bucket_size=1000)
occupation = tf.feature_column.categorical_column_with_hash_bucket('occupation', hash_bucket_size=1000)
native_country = tf.feature_column.categorical_column_with_hash_bucket('native_country', hash_bucket_size=1000)

# Using vocabulary list since I know the unique values 
relationship = tf.feature_column.categorical_column_with_vocabulary_list('relationship',['Wife', 'Own-child', 'Husband', 
                                                                                         'Not-in-family', 'Other-relative', 
                                                                                         'Unmarried'])                                                                                     
race = tf.feature_column.categorical_column_with_vocabulary_list('race', ['White', 'Asian-Pac-Islander','Amer-Indian-Eskimo',
                                                                          'Other', 'Black'])
gender = tf.feature_column.categorical_column_with_vocabulary_list('gender', ['Female','Male'])

# Using numeric column because these are continuus values
age = tf.feature_column.numeric_column('age')
education_num = tf.feature_column.numeric_column('education_num')
capital_gain = tf.feature_column.numeric_column('capital_gain')
capital_loss = tf.feature_column.numeric_column('capital_loss')
hours_per_week  = tf.feature_column.numeric_column('hours_per_week')

# Combine all feature columns
feat_cols = [workclass,education,marital_status,occupation,native_country,relationship, race,gender,age,education_num,
            capital_gain,capital_loss,hours_per_week]

In [7]:
# Create input function
input_func = tf.estimator.inputs.pandas_input_fn(x = x_train,y= y_train, batch_size=25, num_epochs=None,shuffle=True)

# Set up linear classifier model and train
model = tf.estimator.LinearClassifier(feature_columns=feat_cols)
model.train(input_fn = input_func, steps=5000)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_keep_checkpoint_max': 5, '_log_step_count_steps': 100, '_save_checkpoints_steps': None, '_keep_checkpoint_every_n_hours': 10000, '_model_dir': 'C:\\Users\\Florian\\AppData\\Local\\Temp\\tmp_uyzss63', '_session_config': None, '_save_checkpoints_secs': 600, '_tf_random_seed': 1, '_save_summary_steps': 100}
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 1 into C:\Users\Florian\AppData\Local\Temp\tmp_uyzss63\model.ckpt.
INFO:tensorflow:loss = 17.32868, step = 1
INFO:tensorflow:global_step/sec: 376.503
INFO:tensorflow:loss = 6.7214203, step = 101 (0.266 sec)
INFO:tensorflow:global_step/sec: 457.161
INFO:tensorflow:loss = 25.03404, step = 201 (0.219 sec)
INFO:tensorflow:global_step/sec: 457.163
INFO:tensorflow:loss = 92.927376, step = 301 (0.219 sec)
INFO:tensorflow:global_step/sec: 457.162
INFO:tensorflow:loss = 9.239565, step = 401 (0.219 sec)
INFO:tensorflow:global_step/sec: 457.161


<tensorflow.python.estimator.canned.linear.LinearClassifier at 0x2052914c048>

In [8]:
# Set up an prediction input func  for the test data
pred_input_func = tf.estimator.inputs.pandas_input_fn(x = x_test, batch_size=len(x_test), shuffle=False)
predictions_gen = model.predict(input_fn=pred_input_func) # Returns a generator 

# Get prediciton list
predictions = list(predictions_gen)

# Only get the predicted label values for calculating the model score
final_preds = [pred['class_ids'][0] for pred in predictions]

INFO:tensorflow:Restoring parameters from C:\Users\Florian\AppData\Local\Temp\tmp_uyzss63\model.ckpt-5000


In [9]:
print(classification_report(y_true = y_test, y_pred=final_preds))

              precision    recall  f1-score   support

           0       0.88      0.90      0.89      7424
           1       0.67      0.62      0.65      2345

    accuracy                           0.84      9769
   macro avg       0.78      0.76      0.77      9769
weighted avg       0.83      0.84      0.83      9769

