# House price prediciton regression - TFX

In [1]:
import os
import pathlib
import pprint
import tempfile
import urllib

import absl
import tensorflow as tf
import tensorflow_model_analysis as tfma
tf.get_logger().propagate = False
pp = pprint.PrettyPrinter()

import tfx
from tfx.components import CsvExampleGen
from tfx.components import Evaluator
from tfx.components import ExampleValidator
from tfx.components import Pusher
from tfx.components import ResolverNodehttps
from tfx.components import SchemaGen
from tfx.components import StatisticsGen
from tfx.components import Trainer
from tfx.components import Transform
from tfx.components.base import executor_spec
from tfx.components.trainer.executor import GenericExecutor
from tfx.dsl.experimental import latest_blessed_model_resolver
from tfx.orchestration import metadata
from tfx.orchestration import pipeline
from tfx.orchestration.experimental.interactive.interactive_context import InteractiveContext
from tfx.proto import pusher_pb2
from tfx.proto import trainer_pb2
from tfx.types import Channel
from tfx.types.standard_artifacts import Model
from tfx.types.standard_artifacts import ModelBlessing
from tfx.utils.dsl_utils import external_input


SyntaxError: invalid syntax (<ipython-input-1-5b58e9f01123>, line 18)

In [None]:
print('TensorFlow version: {}'.format(tf.__version__))
print('TFX version: {}'.format(tfx.__version__))

## Path initialization

In [None]:
# This is the root directory for your TFX pip package installation.
_tfx_root = pathlib.Path().parent.absolute().parent

# This is the directory containing the TFX Chicago Taxi Pipeline example.
_housepred_root = _tfx_root.joinpath('tfx_metadata_store')

# This is the path where your model will be pushed for serving.
_serving_model_dir = _housepred_root.joinpath('housepred_model')

# Path to the data storage
_data_root = _tfx_root.joinpath('data')

# Set up logging.
absl.logging.set_verbosity(absl.logging.INFO)

The data is coming from the [Melbourne house price prediction Kaggle](https://www.kaggle.com/anthonypino/melbourne-housing-market#Melbourne_housing_FULL.csv)

In [None]:
RAWDATA_PATH = 'https://raw.githubusercontent.com/fabiansd/AI-workshop/master/data/melb_data.csv'

_rawdata_filepath = _data_root.joinpath('melbourne.csv')

urllib.request.urlretrieve(RAWDATA_PATH, _rawdata_filepath)

In [None]:
!head {_rawdata_filepath}

In [None]:
# Here, we create an InteractiveContext using default parameters. This will
# use a temporary directory with an ephemeral ML Metadata database instance.
# To use your own pipeline root or database, the optional properties
# `pipeline_root` and `metadata_connection_config` may be passed to
# InteractiveContext. Calls to InteractiveContext are no-ops outside of the
# notebook.
context = InteractiveContext()

## ExampleGen

This component takes the input data path to your data source

In [2]:
example_gen = CsvExampleGen(input=external_input(_data_root))
context.run(example_gen)

NameError: name 'CsvExampleGen' is not defined

This component produces training and evaluation examples

In [None]:
artifact = example_gen.outputs['examples'].get()[0]
print(artifact.split_names, artifact.uri)

Here we can see the first 3 training examples

In [None]:
# Get the URI of the output artifact representing the training examples, which is a directory
train_uri = os.path.join(example_gen.outputs['examples'].get()[0].uri, 'train')

# Get the list of files in this directory (all compressed TFRecord files)
tfrecord_filenames = [os.path.join(train_uri, name)
                      for name in os.listdir(train_uri)]

# Create a `TFRecordDataset` to read these files
dataset = tf.data.TFRecordDataset(tfrecord_filenames, compression_type="GZIP")

# Iterate over the first 3 records and decode them.
for tfrecord in dataset.take(3):
  serialized_example = tfrecord.numpy()
  example = tf.train.Example()
  example.ParseFromString(serialized_example)
  pp.pprint(example)

## StatisticsGen

This comp cpmputes statistics over yout dataset for data analysis. It uses the [TensorFlow Data Validation](https://www.tensorflow.org/tfx/data_validation/get_started) library. After it has been run we can display the statistics.

In [None]:
statistics_gen = StatisticsGen(
    examples=example_gen.outputs['examples'])
context.run(statistics_gen)

In [None]:
context.show(statistics_gen.outputs['statistics'])

## SchemaGen

Creates a schema based on your data statistics. It takes the statistics generated by StatisticsGen as input looking at the training split by default (which is train: 2/3 and eval: 1/3 by default)

In [None]:
schema_gen = SchemaGen(
    statistics=statistics_gen.outputs['statistics'],
    infer_feature_shape=False)
context.run(schema_gen)

SchemaGen displays all your feature in a column alongside its properties and the domain of categorical values. Learn more about schemas in the [SchemaGen doc](https://www.tensorflow.org/tfx/guide/schemagen)

In [None]:
context.show(schema_gen.outputs['schema'])

## ExampleValidator

Detects anomalies in the data, based on the expectations defined by the schema. The ExampleValidator component will input the statistics from SchemaGen and the schema from SchemaGen.

By default, it compares the statistics from the evaluation split to the schema from the training split.

In [None]:
example_validator = ExampleValidator(
    statistics=statistics_gen.outputs['statistics'],
    schema=schema_gen.outputs['schema'])
context.run(example_validator)

We can then visualize anomalies as a table

In [None]:
context.show(example_validator.outputs['anomalies'])

## Transform

Performs feature engineering for both training and serving. It uses the [TensorFlow Transform](https://www.tensorflow.org/tfx/transform/get_started).

Transform takes the data from ExampleGen, the schema from SchemaGen and a python module that contains user-defined Transform code.

First, we define a few constants for feature engineering in the prediction_constants.py file. Note: The %%writefile cell magic will save the contents of the cell as a .py file on disk. This allows the Transform component to load your code as a module.

In [6]:
%%writefile {_melbourne_constants_module_file}

# Categorical features are assumed to each have a maximum value in the dataset.
MAX_CATEGORICAL_FEATURE_VALUES = [24, 31, 12]

CATEGORICAL_FEATURE_KEYS = [
    'Suburb',
    'Address',
    'Type',
    'Method',
    'SellerG', 
    'Date',
    'CouncilArea',
    'Regionname'
]

# Numerical features
DENSE_FLOAT_FEATURE_KEYS = ['Rooms',
                            'Price', 
                            'Distance',
                            'Postcode',
                            'Bedroom2',
                            'Bathroom',
                            'Car',
                            'Landsize',
                            'BuildingArea',
                            'YearBuilt']

# Number of buckets used by tf.transform for encoding numerical features into a bucket generaliation
FEATURE_BUCKET_COUNT = 10

BUCKET_FEATURE_KEYS = [
    'Lattitude', 
    'Longtitude'
]

Writing {_melbourne_constants_module_file}


In [7]:
_melbourne_transform_module_file = 'melbourne_transform.py'