In [66]:
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
df = pd.read_csv("census_data.csv")

In [67]:
df.head()

Unnamed: 0,age,workclass,education,education_num,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,hours_per_week,native_country,income_bracket
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [68]:
df["income_bracket"].unique()

array([' <=50K', ' >50K'], dtype=object)

In [69]:
#income bracket will be used as label
df["income_bracket"] = [0 if income == " <=50K" else 1 for income in df["income_bracket"]]


In [70]:
#extract numerical features
cols_to_norm = ["age","education_num","capital_gain","capital_loss","hours_per_week"]
df[cols_to_norm] = normalize(df[cols_to_norm])

In [71]:
#create function to create feature columns from numeric data
def convert_numeric_column(column_name):
    data = tf.feature_column.numeric_column(column_name)
    return data
    
education_num_feature = convert_numeric_column("education_num")
capital_gain_feature = convert_numeric_column("capital_gain")
capital_loss_feature = convert_numeric_column("capital_loss")
hours_feature = convert_numeric_column("hours_per_week")
age = convert_numeric_column("age")

In [72]:
#create function to create feature columns from categoric data
def convert_categoric_column(column_name,hash_bucket_size):
    data = tf.feature_column.categorical_column_with_hash_bucket(column_name,hash_bucket_size=hash_bucket_size)
    return data

education = convert_categoric_column("education",16)
workclass = convert_categoric_column("workclass",10)
marital = convert_categoric_column("marital_status",7)
occupation = convert_categoric_column("occupation",14)
relationship = convert_categoric_column("relationship",6)
race = convert_categoric_column("race",5)
gender = convert_categoric_column("gender",2)
native_country = convert_categoric_column("native_country",60)


In [73]:
feature_columns = [education_num_feature,capital_gain_feature,capital_loss_feature,hours_feature,
                  age,education,workclass,marital,occupation,relationship,race,gender,native_country]

In [74]:
feature_columns

[_NumericColumn(key='education_num', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 _NumericColumn(key='capital_gain', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 _NumericColumn(key='capital_loss', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 _NumericColumn(key='hours_per_week', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 _NumericColumn(key='age', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 _HashedCategoricalColumn(key='education', hash_bucket_size=16, dtype=tf.string),
 _HashedCategoricalColumn(key='workclass', hash_bucket_size=10, dtype=tf.string),
 _HashedCategoricalColumn(key='marital_status', hash_bucket_size=7, dtype=tf.string),
 _HashedCategoricalColumn(key='occupation', hash_bucket_size=14, dtype=tf.string),
 _HashedCategoricalColumn(key='relationship', hash_bucket_size=6, dtype=tf.string),
 _HashedCategoricalColumn(key='race', hash_bucket_size

In [75]:
#one third of the data is used as a test data
X = df.drop("income_bracket", axis = 1)
Y = df["income_bracket"]
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size = 0.33, random_state = 42)

In [76]:
#define input function
input_function = tf.estimator.inputs.pandas_input_fn(x=X_train,
                                                     y=Y_train,
                                                     batch_size=5,
                                                     num_epochs=1000,
                                                     shuffle=True)

In [77]:
#Linear Classification
model = tf.estimator.LinearClassifier(feature_columns=feature_columns,n_classes=2)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/var/folders/5q/q6__ntnx3ts0_fpp4phm75d00000gn/T/tmpx0u96n0t', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x1a2598e978>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [78]:
model.train(input_fn=input_function,steps=1000)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 1 into /var/folders/5q/q6__ntnx3ts0_fpp4phm75d00000gn/T/tmpx0u96n0t/model.ckpt.
INFO:tensorflow:loss = 3.465736, step = 1
INFO:tensorflow:global_step/sec: 113.618
INFO:tensorflow:loss = 1.7902493, step = 101 (0.884 sec)
INFO:tensorflow:global_step/sec: 264.072
INFO:tensorflow:loss = 1.7846215, step = 201 (0.376 sec)
INFO:tensorflow:global_step/sec: 267.293
INFO:tensorflow:loss = 2.789328, step = 301 (0.379 sec)
INFO:tensorflow:global_step/sec: 271.537
INFO:tensorflow:loss = 2.9357781, step = 401 (0.366 sec)
INFO:tensorflow:global_step/sec: 323.162
INFO:tensorflow:loss = 1.0743244, step = 501 (0.309 sec)
INFO:tensorflow:global_step/sec: 320.368
INFO:tensorflow:loss = 0.74770945, step = 601 (0.315 sec)
INFO:tensorflo

<tensorflow.python.estimator.canned.linear.LinearClassifier at 0x1a287496d8>

In [79]:
prediction_input_func = tf.estimator.inputs.pandas_input_fn(x=X_test,
                                                            y=Y_test,
                                                            batch_size=5,
                                                            num_epochs=1,
                                                            shuffle=False)

In [80]:
predictions = list(model.predict(input_fn=prediction_input_func))

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /var/folders/5q/q6__ntnx3ts0_fpp4phm75d00000gn/T/tmpx0u96n0t/model.ckpt-1000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


In [81]:
predictions[5]

{'logits': array([-0.20581904], dtype=float32),
 'logistic': array([0.44872612], dtype=float32),
 'probabilities': array([0.5512739 , 0.44872612], dtype=float32),
 'class_ids': array([0]),
 'classes': array([b'0'], dtype=object)}

In [82]:
final_predictions = [pred["class_ids"][0] for pred in predictions]

In [83]:
print(classification_report(Y_test,final_predictions))

              precision    recall  f1-score   support

           0       0.83      0.93      0.88      8196
           1       0.65      0.41      0.50      2550

   micro avg       0.81      0.81      0.81     10746
   macro avg       0.74      0.67      0.69     10746
weighted avg       0.79      0.81      0.79     10746

