In [1]:
import os
import tempfile
import glob

import pandas as pd
import numpy as np

import tensorflow as tf

import tensorflow_transform as tft
from tensorflow_transform.beam import impl as beam_impl
from tensorflow_transform.beam.tft_beam_io import transform_fn_io
from tensorflow_transform.tf_metadata import dataset_metadata, dataset_schema

import apache_beam as beam
from apache_beam.io import tfrecordio

In [2]:
# Set TensorFlow's logging threshold to ERROR for the rest of the notebook.
tf.logging.set_verbosity(tf.logging.ERROR)

In [3]:
!rm -Rf data/transform_fn
!rm -Rf data/transformed_metadata

### Convert CSV to TFRecords

In [4]:
def _row_to_example(row, dtypes):
    """Build a tf.train.Example from one CSV row.

    Columns whose dtype is 'int64' go to int64_list, 'float64' to float_list,
    and every other column is stringified and stored UTF-8 encoded in bytes_list.
    """
    example = tf.train.Example()
    features = example.features.feature
    for name, value in row.items():
        dtype = dtypes[name]
        if dtype == 'int64':
            features[name].int64_list.value.append(value)
        elif dtype == 'float64':
            features[name].float_list.value.append(value)
        else:
            features[name].bytes_list.value.append(str(value).encode('utf-8'))
    return example


# Read the CSV once; keep per-column dtypes so each value lands in the right feature list.
csv = pd.read_csv('data/leads.csv')
field_types = dict(csv.dtypes)
csv_records = csv.to_dict(orient='records')

# Serialize one Example per CSV row into a single TFRecord file.
with tf.python_io.TFRecordWriter('data/leads.tfrecords') as writer:
    for record in csv_records:
        writer.write(_row_to_example(record, field_types).SerializeToString())

### Use TFT/Beam to transform data for model

In [5]:
# Feature spec of the raw TFRecords: one string 'dx' and one int64 'enrolled' per example.
RAW_DATA_FEATURE = dict(
    dx=tf.FixedLenFeature(shape=[1], dtype=tf.string),
    enrolled=tf.FixedLenFeature(shape=[1], dtype=tf.int64),
)

# tf.Transform metadata wrapping the schema derived from the feature spec above.
_raw_schema = dataset_schema.from_feature_spec(RAW_DATA_FEATURE)
RAW_DATA_METADATA = dataset_metadata.DatasetMetadata(_raw_schema)

In [6]:
# Analyze and transform the raw records with tf.Transform on Beam, then write both the
# transformed records and the transform function under data/.
def preprocessing_fn(inputs):
    """Split each 'dx' string on '|' and pass 'enrolled' through unchanged.

    'dx' is flattened to 1-D first because tf.string_split expects a vector of strings.
    """
    dx_flat = tf.reshape(inputs['dx'], [-1])
    return {
        'dx': tf.string_split(dx_flat, '|'),
        'enrolled': inputs['enrolled'],
    }


with beam.Pipeline() as pipeline, beam_impl.Context(temp_dir=tempfile.mkdtemp()):
    coder = tft.coders.ExampleProtoCoder(RAW_DATA_METADATA.schema)

    # Serialized examples on disk -> decoded feature dicts.
    serialized = pipeline | 'Read' >> tfrecordio.ReadFromTFRecord('data/leads.tfrecords')
    data = serialized | 'Decode' >> beam.Map(coder.decode)

    raw_dataset = (data, RAW_DATA_METADATA)
    transformed_dataset, transform_fn = (
        raw_dataset
        | 'AnalyzeAndTransform' >> beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))
    transformed_data, transformed_metadata = transformed_dataset

    # The transformed data has its own schema, so it needs its own coder.
    transformed_data_coder = tft.coders.ExampleProtoCoder(transformed_metadata.schema)

    encoded = transformed_data | 'Encode' >> beam.Map(transformed_data_coder.encode)
    _ = encoded | 'Write' >> tfrecordio.WriteToTFRecord('data/leads_transformed.tfrecords')

    _ = transform_fn | 'WriteTransformFn' >> transform_fn_io.WriteTransformFn('data')



In [7]:
# load data
def fetch_tf_records(input_file_pattern, feature_spec):
    """Read every TFRecord matching a glob pattern as one batch of parsed features.

    Args:
        input_file_pattern: glob pattern (or plain path) selecting the TFRecord files.
        feature_spec: dict of feature name -> parsing spec, handed to
            tf.parse_single_example for every record.

    Returns:
        The get_next() op of a one-shot iterator over the dataset; evaluating it once
        yields a dict of parsed features containing all records in a single batch.

    Raises:
        IOError: if no file matches input_file_pattern.
        ValueError: if the matching files contain no records.
    """
    # glob returns matches in arbitrary order; sort so the record order is reproducible.
    input_filenames = sorted(glob.glob(input_file_pattern))
    if not input_filenames:
        raise IOError('No TFRecord files match pattern: %r' % (input_file_pattern,))

    # Count the records up front so the whole dataset fits in exactly one batch.
    n = sum(
        1
        for f in input_filenames
        for _ in tf.python_io.tf_record_iterator(f))
    if n == 0:
        # batch(0) would otherwise fail later with an opaque TensorFlow error.
        raise ValueError('No records found in files matching: %r' % (input_file_pattern,))

    ds = tf.data.TFRecordDataset(input_filenames)
    ds = ds.map(lambda x: tf.parse_single_example(x, feature_spec))
    # A single pass over the data is the default, so no explicit repeat(1) is needed.
    ds = ds.batch(n)

    return ds.make_one_shot_iterator().get_next()


# Load the raw and the transformed records and print them for comparison.
transformed_feature_spec = transformed_metadata.schema.as_feature_spec()

ds_pre = fetch_tf_records('data/leads.tfrecords', RAW_DATA_FEATURE)
ds_post = fetch_tf_records('data/leads_transformed.tfrecords*', transformed_feature_spec)

with tf.Session() as sess:
    for batch in (ds_pre, ds_post):
        print(sess.run(batch))

{'dx': array([['A|B|C'],
       ['D|A'],
       ['E|B|C']], dtype=object), 'enrolled': array([[1],
       [0],
       [1]])}
{'dx': SparseTensorValue(indices=array([[0, 0],
       [0, 1],
       [0, 2],
       [1, 0],
       [1, 1],
       [2, 0],
       [2, 1],
       [2, 2]]), values=array(['A', 'B', 'C', 'D', 'A', 'E', 'B', 'C'], dtype=object), dense_shape=array([3, 3])), 'enrolled': array([[1],
       [0],
       [1]])}
