## Agenda 

1. Exploring sample data for schema extraction
2. Large data processing using TensorFlow Data Pipeline
3. Understanding feature_columns for Data Transformation
4. Build Model
5. Model Training

<hr>

### Import modules

In [19]:
import tensorflow as tf
import pandas as pd
from tensorflow import feature_column
from tensorflow.keras import layers

### Get data schema & column names
* Pandas can read a small sample of the file to discover the column names and dtypes
* We need to prepare the schema before loading large data

In [11]:
# Only the first 10 rows are read: the sample is used to learn the column
# names and dtypes, not to load the dataset itself.
data = pd.read_csv('data/heart.csv', nrows=10)

In [13]:
# Column names in file order; later zipped with the decoded CSV fields.
columns = data.columns.tolist()

In [17]:
# Display the extracted column names.
columns

['age',
 'sex',
 'cp',
 'trestbps',
 'chol',
 'fbs',
 'restecg',
 'thalach',
 'exang',
 'oldpeak',
 'slope',
 'ca',
 'thal',
 'target']

In [14]:
# Display the dtypes pandas inferred from the 10-row sample.
data.dtypes

age           int64
sex           int64
cp            int64
trestbps      int64
chol          int64
fbs           int64
restecg       int64
thalach       int64
exang         int64
oldpeak     float64
slope         int64
ca            int64
thal         object
target        int64
dtype: object

### Building Data Pipeline for multiple & large data sources
* Schema information obtained needs to be used for decoding csv
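* interleave - reads lines from multiple input files, mixing their records
* shuffle - randomly shuffles the dataset using a fixed-size buffer
* batch - groups consecutive records into batches of a given size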

In [16]:
# One default per CSV column, in column order, derived from the dtypes pandas
# inferred for the sample rather than from hard-coded positional counts. This
# keeps the list aligned with `columns` if the file's column order or count
# changes. An empty constant marks the field as required for tf.io.decode_csv
# and fixes the dtype the field is decoded to.
# Mapping: integer -> tf.int64, float -> tf.float64, anything else -> tf.string.
record_defaults = [
    tf.constant([], dtype=(
        tf.int64 if pd.api.types.is_integer_dtype(dtype)
        else tf.float64 if pd.api.types.is_float_dtype(dtype)
        else tf.string))
    for dtype in data.dtypes
]

In [4]:
@tf.function
def preprocess(line, record_defaults=record_defaults, columns=columns):
    """Decode one CSV line into a ``(features, label)`` pair.

    Args:
        line: A single CSV record, as produced by the text-line dataset.
        record_defaults: Per-column defaults passed to ``tf.io.decode_csv``.
            Bound to the module-level list at definition time.
        columns: Column names, in the same order as the CSV fields.
            Bound to the module-level list at definition time.

    Returns:
        ``(features, label)`` where ``features`` maps every column name
        except ``'target'`` to its decoded field and ``label`` is the
        decoded ``'target'`` field.
    """
    decoded = tf.io.decode_csv(line, record_defaults=record_defaults)
    features = {name: value for name, value in zip(columns, decoded)}
    # Split the label off so the remaining dict holds only model inputs.
    label = features.pop('target')
    return features, label

def csv_reader_dataset(filepaths, repeat=1, n_readers=5,
                       n_read_threads=None, shuffle_buffer_size=10000,
                       n_parse_threads=5, batch_size=32):
    """Build a batched ``tf.data`` pipeline over one or more CSV files.

    Args:
        filepaths: File path(s) or pattern(s) passed to ``Dataset.list_files``.
        repeat: Repeat count applied to the list of files.
        n_readers: ``cycle_length`` of the interleave, i.e. how many files
            are read from at once.
        n_read_threads: ``num_parallel_calls`` for the interleave step.
        shuffle_buffer_size: Buffer size of the record-level shuffle.
        n_parse_threads: ``num_parallel_calls`` for the ``preprocess`` map.
        batch_size: Number of records per batch.

    Returns:
        A dataset of ``(features, label)`` batches with a prefetch of 1.
    """
    def read_lines(filepath):
        # skip(1) drops the first line (the header row) of every file.
        return tf.data.TextLineDataset(filepath).skip(1)

    files = tf.data.Dataset.list_files(filepaths).repeat(repeat)
    lines = files.interleave(
        read_lines,
        cycle_length=n_readers,
        num_parallel_calls=n_read_threads)
    batches = (
        lines
        .shuffle(shuffle_buffer_size)
        .map(preprocess, num_parallel_calls=n_parse_threads)
        .batch(batch_size)
    )
    return batches.prefetch(1)

### Getting data from multiple files & creating batches of data

In [18]:
# Pipeline over a single file; batch_size=3 keeps the printed batches small.
# The same `train_set` is reused by the later cells (demo batch and model.fit).
train_set = csv_reader_dataset(['data/heart.csv'], batch_size=3)
# Print the first two (features, label) batches to check the decoding.
for X_batch, y_batch in train_set.take(2):
    print("X =", X_batch)
    print("y =", y_batch)

X = {'age': <tf.Tensor: id=146, shape=(3,), dtype=int64, numpy=array([61, 57, 61])>, 'sex': <tf.Tensor: id=154, shape=(3,), dtype=int64, numpy=array([1, 0, 1])>, 'cp': <tf.Tensor: id=149, shape=(3,), dtype=int64, numpy=array([3, 2, 4])>, 'trestbps': <tf.Tensor: id=158, shape=(3,), dtype=int64, numpy=array([150, 130, 120])>, 'chol': <tf.Tensor: id=148, shape=(3,), dtype=int64, numpy=array([243, 236, 260])>, 'fbs': <tf.Tensor: id=151, shape=(3,), dtype=int64, numpy=array([1, 0, 0])>, 'restecg': <tf.Tensor: id=153, shape=(3,), dtype=int64, numpy=array([0, 2, 0])>, 'thalach': <tf.Tensor: id=157, shape=(3,), dtype=int64, numpy=array([137, 174, 140])>, 'exang': <tf.Tensor: id=150, shape=(3,), dtype=int64, numpy=array([1, 0, 1])>, 'oldpeak': <tf.Tensor: id=152, shape=(3,), dtype=float64, numpy=array([1. , 0. , 3.6])>, 'slope': <tf.Tensor: id=155, shape=(3,), dtype=int64, numpy=array([2, 2, 2])>, 'ca': <tf.Tensor: id=147, shape=(3,), dtype=int64, numpy=array([0, 1, 1])>, 'thal': <tf.Tensor: id

### Feature Columns
* Feature columns act as intermediaries between raw data and Estimators (or, as in this notebook, a Keras model).
* Feature columns are very rich, enabling you to transform a diverse range of raw data into formats that Estimators can use, allowing easy experimentation.
* What kind of data can a deep neural network operate on? The answer is, of course, numbers (for example, tf.float32). 
* Some examples are `numeric_column`, `bucketized_column`, `categorical_column_with_vocabulary_list`, etc.

In [23]:
# Unit-testing transformation
# Feature dict (element 0) of the first batch; every demo() call reuses it.
example_batch = next(iter(train_set))[0]
def demo(feature_column):
    """Print what ``feature_column`` produces for ``example_batch``.

    The column is wrapped in a ``DenseFeatures`` layer, applied to the
    module-level ``example_batch`` and the resulting array is printed.

    Note: inside this function the parameter name shadows the imported
    ``feature_column`` module.
    """
    dense_output = layers.DenseFeatures(feature_column)(example_batch)
    print(dense_output.numpy())

In [24]:
# Numeric column over 'age'; the demo output shows the raw ages as floats.
# `age` is reused below as the source of the bucketized column.
age = feature_column.numeric_column("age")
demo(age)

[[61.]
 [57.]
 [61.]]


In [25]:
# 10 boundaries split age into 11 ranges; each row is shown as a one-hot
# vector with 11 slots, marking the range its age falls into.
age_buckets = feature_column.bucketized_column(age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
demo(age_buckets)

[[0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]]


In [26]:
# Categorical column over the string feature 'thal' with a fixed
# three-entry vocabulary.
thal = feature_column.categorical_column_with_vocabulary_list(
      'thal', ['fixed', 'normal', 'reversible'])

# Indicator column: one-hot view of `thal`, one slot per vocabulary entry.
thal_one_hot = feature_column.indicator_column(thal)
demo(thal_one_hot)

W0727 15:43:53.053673 140600608700224 deprecation.py:323] From /home/awantik/anaconda3/lib/python3.7/site-packages/tensorflow/python/feature_column/feature_column_v2.py:2655: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
W0727 15:43:53.346688 140600608700224 deprecation.py:323] From /home/awantik/anaconda3/lib/python3.7/site-packages/tensorflow/python/feature_column/feature_column_v2.py:4215: IndicatorColumn._variable_shape (from tensorflow.python.feature_column.feature_column_v2) is deprecated and will be removed in a future version.
Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.
W0727 15:43:53.348438 140600608700224 deprecation.py:323] From /home/awantik/anaconda3/lib/python3.7/site-packages/tensorflow/python/feature_column/feature

[[0. 1. 0.]
 [0. 1. 0.]
 [0. 0. 1.]]


In [27]:
# Embedding view of `thal`: each row becomes an 8-value dense vector, and
# rows sharing a thal value share the same vector (see the output).
thal_embedding = feature_column.embedding_column(thal, dimension=8)
demo(thal_embedding)

[[-0.28974992  0.2767802  -0.289907    0.16144833  0.4364657   0.3567376
  -0.0526059   0.05884144]
 [-0.28974992  0.2767802  -0.289907    0.16144833  0.4364657   0.3567376
  -0.0526059   0.05884144]
 [-0.06115608 -0.22891812  0.08867108 -0.4072269  -0.25880584  0.10197378
  -0.16998404  0.0012828 ]]


In [28]:
# numeric cols: one numeric_column per listed header
feature_columns = [
    feature_column.numeric_column(header)
    for header in ['age', 'trestbps', 'chol', 'thalach', 'oldpeak', 'slope', 'ca']
]

# bucketized col: age ranges (uses the `age` numeric column defined in an earlier cell)
age_buckets = feature_column.bucketized_column(
    age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])

# categorical source shared by the indicator, embedding and crossed columns
thal = feature_column.categorical_column_with_vocabulary_list(
    'thal', ['fixed', 'normal', 'reversible'])

# indicator col: one-hot view of thal
thal_one_hot = feature_column.indicator_column(thal)

# embedding col: 8-dimensional dense view of thal
thal_embedding = feature_column.embedding_column(thal, dimension=8)

# crossed col: age bucket x thal, hashed into 1000 buckets and wrapped as an indicator
crossed_feature = feature_column.indicator_column(
    feature_column.crossed_column([age_buckets, thal], hash_bucket_size=1000))

# Append in order: bucketized, indicator, embedding, crossed.
feature_columns.extend(
    [age_buckets, thal_one_hot, thal_embedding, crossed_feature])

In [29]:
# Display the assembled list of feature columns.
feature_columns

[NumericColumn(key='age', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='trestbps', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='chol', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='thalach', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='oldpeak', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='slope', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='ca', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 BucketizedColumn(source_column=NumericColumn(key='age', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), boundaries=(18, 25, 30, 35, 40, 45, 50, 55, 60, 65)),
 IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='thal', vocabulary_list=('fixed', 'normal', 'reversi

### Build Model
* The first layer (`DenseFeatures`) performs the feature preprocessing
* The remaining layers are ordinary neural-network (`Dense`) layers

In [30]:
# Preprocessing layer: applies every column in `feature_columns` to the
# incoming feature dict and produces the dense input of the network.
feature_layer = layers.DenseFeatures(feature_columns)

In [31]:
# Feature preprocessing layer, two 128-unit ReLU hidden layers, and a single
# sigmoid unit for the binary `target` label.
model = tf.keras.Sequential([
  feature_layer,
  layers.Dense(128, activation='relu'),
  layers.Dense(128, activation='relu'),
  layers.Dense(1, activation='sigmoid')
])

# NOTE(review): run_eagerly=True disables graph execution of the train step;
# presumably a workaround for this TensorFlow version. Confirm it is still needed.
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'],
              run_eagerly=True)

# NOTE(review): validation_data is the training set itself, so the val_*
# metrics measure fit rather than generalisation; use a held-out file when
# one is available. `train_set` was also built with batch_size=3.
# The returned History is kept in `history` so the per-epoch metrics can be
# inspected later, instead of being discarded and shown only as a bare repr.
history = model.fit(train_set,
                    validation_data=train_set,
                    epochs=5)

Epoch 1/5


W0727 15:46:20.501655 140600608700224 deprecation.py:323] From /home/awantik/anaconda3/lib/python3.7/site-packages/tensorflow/python/feature_column/feature_column_v2.py:4270: CrossedColumn._num_buckets (from tensorflow.python.feature_column.feature_column_v2) is deprecated and will be removed in a future version.
Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7fdfd44a1da0>