In [1]:
from __future__ import absolute_import, division, print_function, unicode_literals

import numpy as np 
import pandas as pd

import tensorflow as tf

from sklearn.model_selection import train_test_split

In [2]:
URL = 'https://storage.googleapis.com/applied-dl/heart.csv'

dataframe = pd.read_csv(URL)
dataframe.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,1,145,233,1,2,150,0,2.3,3,0,fixed,0
1,67,1,4,160,286,0,2,108,1,1.5,2,3,normal,1
2,67,1,4,120,229,0,2,129,1,2.6,2,2,reversible,0
3,37,1,3,130,250,0,0,187,0,3.5,3,0,normal,0
4,41,0,2,130,204,0,2,172,0,1.4,1,0,normal,0


In [3]:
train, test = train_test_split(dataframe, test_size=0.2)
train, val = train_test_split(train, test_size=0.2)

In [4]:
print(f"{len(train)} train examples")
print(f"{len(val)} val examples")
print(f"{len(test)} test examples")

193 train examples
49 val examples
61 test examples


In [5]:
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
    dataframe = dataframe.copy()
    labels = dataframe.pop('target')
    ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))
    if(shuffle):
        ds = ds.shuffle(buffer_size=len(dataframe))
    ds = ds.batch(batch_size)
    return ds

In [6]:
batch_size = 5
train_ds = df_to_dataset(train, batch_size=batch_size)
val_ds = df_to_dataset(val, batch_size=batch_size)
test_ds = df_to_dataset(test, batch_size=batch_size)


In [7]:
for feature_batch, label_batch in train_ds.take(1):
    print(f"Every feature: {list(feature_batch.keys())}")
    print(f"A batch of ages: {feature_batch['age']}")
    print(f"A batch of targets: {label_batch}")

Every feature: ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal']
A batch of ages: [63 53 65 45 71]
A batch of targets: [1 0 0 0 0]


In [17]:
for feature_batch, label_batch in val_ds.take(1):
  print('Every feature:', list(feature_batch.keys()))
  print('A batch of ages:', feature_batch['age'])
  print('A batch of targets:', label_batch )

Every feature: ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal']
A batch of ages: tf.Tensor([50 54 62 48 53], shape=(5,), dtype=int32)
A batch of targets: tf.Tensor([0 1 0 0 1], shape=(5,), dtype=int32)


In [19]:
example_batch = next(iter(train_ds))[0]
example_batch

{'age': <tf.Tensor: shape=(5,), dtype=int32, numpy=array([59, 62, 41, 59, 54], dtype=int32)>,
 'sex': <tf.Tensor: shape=(5,), dtype=int32, numpy=array([1, 0, 1, 1, 0], dtype=int32)>,
 'cp': <tf.Tensor: shape=(5,), dtype=int32, numpy=array([4, 4, 2, 4, 2], dtype=int32)>,
 'trestbps': <tf.Tensor: shape=(5,), dtype=int32, numpy=array([164, 140, 110, 140, 132], dtype=int32)>,
 'chol': <tf.Tensor: shape=(5,), dtype=int32, numpy=array([176, 268, 235, 177, 288], dtype=int32)>,
 'fbs': <tf.Tensor: shape=(5,), dtype=int32, numpy=array([1, 0, 0, 0, 1], dtype=int32)>,
 'restecg': <tf.Tensor: shape=(5,), dtype=int32, numpy=array([2, 2, 0, 0, 2], dtype=int32)>,
 'thalach': <tf.Tensor: shape=(5,), dtype=int32, numpy=array([ 90, 160, 153, 162, 159], dtype=int32)>,
 'exang': <tf.Tensor: shape=(5,), dtype=int32, numpy=array([0, 0, 0, 1, 1], dtype=int32)>,
 'oldpeak': <tf.Tensor: shape=(5,), dtype=float32, numpy=array([1. , 3.6, 0. , 0. , 0. ], dtype=float32)>,
 'slope': <tf.Tensor: shape=(5,), dtype=in

In [26]:
def demo(feature_column):
    feature_layer = tf.keras.layers.DenseFeatures(feature_column)
    print(feature_layer(example_batch).numpy())

In [27]:
age = tf.feature_column.numeric_column('age')
demo(age)

[[59.]
 [62.]
 [41.]
 [59.]
 [54.]]


### Bucketized Columns


In [30]:
age_buckets = tf.feature_column.bucketized_column(
    age,
    boundaries = [18, 25, 30, 35, 40, 45, 50, 55, 60, 65]
)
demo(age_buckets)

[[0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]]


### Categorical Columns: OHE, One Hot Encoding

In [35]:
thal = tf.feature_column.categorical_column_with_vocabulary_list(
    'thal', ['fixed', 'normal', 'reversible']
)
thal_one_hot = tf.feature_column.indicator_column(thal)
demo(thal_one_hot)

Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.
Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.
[[1. 0. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [0. 1. 0.]]


### Categorical Columns: Embeddings

In [37]:
thal_embedding = tf.feature_column.embedding_column(thal, dimension=9)
demo(thal_embedding)

[[ 5.7823944e-01  2.3441516e-01 -1.1780685e-01  3.6928627e-01
   2.1831822e-01  2.7618557e-01 -4.5954311e-01 -1.7218904e-01
  -7.4731797e-05]
 [ 3.1367791e-01  1.4578246e-01  2.6094120e-02  2.5095624e-01
  -1.6969848e-01 -1.7715278e-01 -1.6077013e-01  2.6549144e-02
   4.4841453e-01]
 [ 3.1367791e-01  1.4578246e-01  2.6094120e-02  2.5095624e-01
  -1.6969848e-01 -1.7715278e-01 -1.6077013e-01  2.6549144e-02
   4.4841453e-01]
 [-1.6825844e-02  4.2055368e-02 -1.2867145e-01 -1.0961540e-01
  -1.8707359e-01 -3.3634040e-01 -1.8041520e-01  4.0318066e-01
  -1.0506474e-01]
 [ 3.1367791e-01  1.4578246e-01  2.6094120e-02  2.5095624e-01
  -1.6969848e-01 -1.7715278e-01 -1.6077013e-01  2.6549144e-02
   4.4841453e-01]]


### Categorical Columns: Hashed Features

In [40]:
thal_hashed = tf.feature_column.categorical_column_with_hash_bucket('thal', hash_bucket_size=1000)
demo(tf.feature_column.indicator_column(thal_hashed))

Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


### Categorical Columns: Crossed Feature Columns

In [42]:
crossed_feature = tf.feature_column.crossed_column([age_buckets, thal], hash_bucket_size=1000)
demo(tf.feature_column.indicator_column(crossed_feature))

Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


## Feature Selection

In [46]:
feature_columns = []
# Numeric Column
for header in []:
    feature_columns.append(tf.feature_column.numeric_column(header))
    
# Bucketized Column
age_buckets = tf.feature_column.bucketized_column(
    age,
    boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65]
)

# Indicator Columns
thal = tf.feature_column.categorical_column_with_vocabulary_list(
    'thal', ['fixed', 'normal', 'reversible']
)
thal_one_hot = tf.feature_column.indicator_column(thal)
feature_columns.append(thal_one_hot)

# Embedding Columns
thal_embedding =tf.feature_column.embedding_column(thal, dimension=8)
feature_columns.append(thal_embedding)

# Crossed Columns
crossed_feature = tf.feature_column.crossed_column([age_buckets, thal], hash_bucket_size=1000)
crossed_feature = tf.feature_column.indicator_column(crossed_feature)
feature_columns.append(crossed_feature)

### Feature Layer

In [51]:
feature_layer = tf.keras.layers.DenseFeatures(feature_columns)
batch_size = 32
train_ds = df_to_dataset(train, batch_size=batch_size)
test_ds = df_to_dataset(test, shuffle=False, batch_size=batch_size)
val_ds = df_to_dataset(val, shuffle=False, batch_size=batch_size)

## Model

In [52]:
model = tf.keras.Sequential([
    feature_layer,
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(1)
])
model.compile(
    optimizer='adam', 
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    metrics=['accuracy']
)
model.fit(train_ds, validation_data=val_ds, epochs=5)

Train for 7 steps, validate for 2 steps
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x13f4c3850>

In [53]:
loss, accuracy = model.evaluate(test_ds)
print("Accuracy", accuracy)

Accuracy 0.73770493
