### Preprocess

Preprocess does two things:
* Convert raw data format (csv in this case) into a TensorFlow friendly format such as TFRecord, so TF graph can consume it.
* Transform the data in the way you want.

Once you have gathered your data and decided how to preprocess it (a featureset class is already defined), you can preprocess the data. One way to do so is to use DataFlow. If your data is large, DataFlow can run in the cloud in a distributed fashion. If it is not large, you can also run DataFlow locally. <br><br>

CloudML provides a preprocess DataFlow transformation (ml.Preprocess) so it can be easily plugged into the pipeline.

Datalab provides a generated code template through the "%mlalpha preprocess" command, so you don't have to author your DataFlow pipeline from scratch.

Preprocessing requires a featureset class. We defined one in the previous "1.Feature" notebook, but we need to define it again here, in this notebook's scope.
Note that we choose to scale all numeric feature columns to [-1, 1] by removing the .identity() transform, so that the default transform (scaling to [-1, 1]) is used.

In [1]:
import google.cloud.ml.features as features

class CensusFeatures(object):
  """Feature set for the census data, with HINCP as the regression target.

  This class is generated from command line:
        %%mlalpha features
        path: /content/datalab/tmp/ml/census/data_train.csv
        headers: SERIALNO,PUMA,NP,ACCESS,ACR,AGS,BATH,BDSP,BLD,BROADBND,BUS,COMPOTHX,CONP,DIALUP,DSL,ELEP,FIBEROP,FS,FULP,GASP,HANDHELD,HFL,INSP,LAPTOP,MHP,MODEM,MRGI,MRGP,MRGT,MRGX,OTHSVCEX,REFR,RMSP,RNTM,RNTP,RWAT,SATELLITE,SINK,SMP,STOV,TEL,TEN,TOIL,VALP,VEH,WATP,YBL,FES,FPARC,GRNTP,HHL,HHT,HINCP,HUGCL,HUPAC,HUPAOC,HUPARC,KIT,LNGI,MULTG,MV,NOC,NPF,NPP,NR,NRC,PARTNER,PLM,PSF,R18,R60,R65,RESMODE,SMOCP,SMX,SRNT,SSMC,SVAL,TAXP,WIF,WKEXREL,WORKSTAT
        target: HINCP
        id: SERIALNO
     Please modify it as appropriate!!!

  Modification from the generated template: the explicit .identity()
  transform has been removed from every numeric input so that each column
  falls back to the SDK's default numeric transform. Per this notebook's
  text that default is scaling to [-1, 1] -- NOTE(review): confirm against
  the installed google.cloud.ml SDK version.
  """
  # Column order of the raw CSV files; used to build the CSV coder and the
  # 'headers' format metadata in the preprocessing pipeline.
  csv_columns = ('SERIALNO','PUMA','NP','ACCESS','ACR','AGS','BATH','BDSP','BLD','BROADBND','BUS','COMPOTHX','CONP','DIALUP','DSL','ELEP','FIBEROP','FS','FULP','GASP','HANDHELD','HFL','INSP','LAPTOP','MHP','MODEM','MRGI','MRGP','MRGT','MRGX','OTHSVCEX','REFR','RMSP','RNTM','RNTP','RWAT','SATELLITE','SINK','SMP','STOV','TEL','TEN','TOIL','VALP','VEH','WATP','YBL','FES','FPARC','GRNTP','HHL','HHT','HINCP','HUGCL','HUPAC','HUPAOC','HUPARC','KIT','LNGI','MULTG','MV','NOC','NPF','NPP','NR','NRC','PARTNER','PLM','PSF','R18','R60','R65','RESMODE','SMOCP','SMX','SRNT','SSMC','SVAL','TAXP','WIF','WKEXREL','WORKSTAT')
  # Continuous target column.
  target = features.target('HINCP').continuous()
  # Column used as the key of each row.
  key = features.key('SERIALNO')
  # Every remaining column is a numeric input. No explicit transform is
  # given, so the default numeric transform applies (see class docstring).
  inputs = [
      features.numeric('CONP'),
      features.numeric('WATP'),
      features.numeric('FS'),
      features.numeric('SMX'),
      features.numeric('PSF'),
      features.numeric('STOV'),
      features.numeric('MULTG'),
      features.numeric('WKEXREL'),
      features.numeric('BATH'),
      features.numeric('INSP'),
      features.numeric('ACR'),
      features.numeric('NPF'),
      features.numeric('YBL'),
      features.numeric('HFL'),
      features.numeric('TAXP'),
      features.numeric('GASP'),
      features.numeric('GRNTP'),
      features.numeric('MODEM'),
      features.numeric('AGS'),
      features.numeric('FIBEROP'),
      features.numeric('RESMODE'),
      features.numeric('SATELLITE'),
      features.numeric('DIALUP'),
      features.numeric('TEL'),
      features.numeric('TEN'),
      features.numeric('R18'),
      features.numeric('BUS'),
      features.numeric('HUPAC'),
      features.numeric('SMOCP'),
      features.numeric('HANDHELD'),
      features.numeric('HUPARC'),
      features.numeric('ELEP'),
      features.numeric('RMSP'),
      features.numeric('R60'),
      features.numeric('VEH'),
      features.numeric('NP'),
      features.numeric('NR'),
      features.numeric('SRNT'),
      features.numeric('RNTM'),
      features.numeric('OTHSVCEX'),
      features.numeric('RNTP'),
      features.numeric('MRGI'),
      features.numeric('WIF'),
      features.numeric('LAPTOP'),
      features.numeric('REFR'),
      features.numeric('TOIL'),
      features.numeric('DSL'),
      features.numeric('FPARC'),
      features.numeric('MRGX'),
      features.numeric('FES'),
      features.numeric('HHT'),
      features.numeric('MRGT'),
      features.numeric('BLD'),
      features.numeric('SMP'),
      features.numeric('MRGP'),
      features.numeric('WORKSTAT'),
      features.numeric('MHP'),
      features.numeric('FULP'),
      features.numeric('HUGCL'),
      features.numeric('SSMC'),
      features.numeric('PUMA'),
      features.numeric('LNGI'),
      features.numeric('VALP'),
      features.numeric('NRC'),
      features.numeric('BDSP'),
      features.numeric('HUPAOC'),
      features.numeric('KIT'),
      features.numeric('ACCESS'),
      features.numeric('R65'),
      features.numeric('NOC'),
      features.numeric('MV'),
      features.numeric('COMPOTHX'),
      features.numeric('SVAL'),
      features.numeric('RWAT'),
      features.numeric('BROADBND'),
      features.numeric('PARTNER'),
      features.numeric('PLM'),
      features.numeric('HHL'),
      features.numeric('NPP'),
      features.numeric('SINK'),
  ]


Run %mlalpha preprocess, and it generates the input cell for you to fill out.

In [None]:
%mlalpha preprocess

### Local Preprocessing

Fill in the cell input, like:
```
%%mlalpha preprocess
train_data_path: /content/datalab/tmp/ml/census/data_train.csv
eval_data_path: /content/datalab/tmp/ml/census/data_eval.csv
data_format: CSV
output_dir: /content/datalab/tmp/ml/census/preprocessed
feature_set_class_name: CensusFeatures
```

It generates a local DataFlow pipeline. You can run the pipeline directly, or extend it with more DataFlow transforms.

In [2]:

# header
"""
Following code is generated from command line:
%%mlalpha preprocess
train_data_path: /content/datalab/tmp/ml/census/data_train.csv
eval_data_path: /content/datalab/tmp/ml/census/data_eval.csv
data_format: CSV
output_dir: /content/datalab/tmp/ml/census/preprocessed
feature_set_class_name: CensusFeatures

Please modify as appropriate!!!
"""

# imports
import apache_beam as beam
import google.cloud.ml as ml
import google.cloud.ml.io as io
import os

# defines
feature_set = CensusFeatures()
# Input and output locations, named once so they are easy to change.
TRAIN_DATA_PATH = '/content/datalab/tmp/ml/census/data_train.csv'
EVAL_DATA_PATH = '/content/datalab/tmp/ml/census/data_eval.csv'
OUTPUT_DIR = '/content/datalab/tmp/ml/census/preprocessed'
# 'DirectPipelineRunner' is the runner used for the local run in this notebook.
pipeline = beam.Pipeline('DirectPipelineRunner')


# preprocessing
# Each source decodes CSV lines with a coder built from the feature set and
# the CSV column order declared on it.
training_data = beam.io.TextFileSource(
    TRAIN_DATA_PATH,
    strip_trailing_newlines=True,
    coder=io.CsvCoder.from_feature_set(feature_set, feature_set.csv_columns))
train_set = pipeline | beam.Read('ReadTrainingData', training_data)

eval_data = beam.io.TextFileSource(
    EVAL_DATA_PATH,
    strip_trailing_newlines=True,
    coder=io.CsvCoder.from_feature_set(feature_set, feature_set.csv_columns))
# Named eval_set (not eval) so this cell does not shadow the builtin eval()
# for the rest of the notebook session.
eval_set = pipeline | beam.Read('ReadEvalData', eval_data)

# ml.Preprocess takes the (train, eval) pair and yields three outputs:
# metadata, transformed training features and transformed eval features.
(metadata, train_features, eval_features) = ((train_set, eval_set) | 'Preprocess'
    >> ml.Preprocess(feature_set, input_format='csv',
                  format_metadata={'headers': feature_set.csv_columns}))

# Write all three outputs under OUTPUT_DIR.
(metadata        | 'SaveMetadata'
    >> io.SaveMetadata(os.path.join(OUTPUT_DIR, 'metadata.yaml')))

(train_features  | 'SaveTrain'
    >> io.SaveFeatures(os.path.join(OUTPUT_DIR, 'features_train'), shard_name_template=''))

(eval_features   | 'SaveEval'
    >> io.SaveFeatures(os.path.join(OUTPUT_DIR, 'features_eval'), shard_name_template=''))

# run pipeline
pipeline.run()


<apache_beam.runners.direct_runner.DirectPipelineResult at 0x7f3ba97b8a50>

In [6]:
!ls /content/datalab/tmp/ml/census/preprocessed

features_eval.tfrecord.gz  features_train.tfrecord.gz  metadata.yaml


### Cloud Preprocessing
You can also generate a Cloud DataFlow pipeline. Just add "--cloud" to "%mlalpha preprocess". <br>
Note that to run it in the cloud, you need to: <br>
1. Sign in using the sign-in button at the top right, if you have not done so. <br>
2. Set a default project by running '%projects set Your-Project-Id'. <br>
3. Make sure your data is in Cloud Storage.

Define variables that will be used later.

In [7]:
import os

# Cloud Storage locations used by the cloud preprocessing cells: a
# per-project sample-data bucket plus the census files inside it.
bucket = 'gs://' + datalab_project_id() + '-sampledata'
train_data_path, eval_data_path, output_dir = (
    os.path.join(bucket, 'census', leaf)
    for leaf in ('data_train.csv', 'data_eval.csv', 'preprocessed'))

Copy data files to Cloud Storage because Cloud DataFlow only works with data in Cloud Storage.

In [7]:
%%storage create --bucket $bucket

In [6]:
!gsutil cp gs://cloud-datalab/sampledata/ml/census/data_train.csv $train_data_path
!gsutil cp gs://cloud-datalab/sampledata/ml/census/data_eval.csv $eval_data_path

Copying gs://cloud-datalab/sampledata/ml/census/data_train.csv [Content-Type=text/csv]...
Copying     ...-test-automated-sampledata/census/data_train.csv: 4.24 MiB/4.24 MiB    
Copying gs://cloud-datalab/sampledata/ml/census/data_eval.csv [Content-Type=text/csv]...
Copying     ...l-test-automated-sampledata/census/data_eval.csv: 482.12 KiB/482.12 KiB    


The input is like:
```
%%mlalpha preprocess --cloud
train_data_path: $train_data_path
eval_data_path: $eval_data_path 
data_format: CSV
output_dir: $output_dir
feature_set_class_name: CensusFeatures
```

Run it and it becomes the following cell input.

In [11]:

# header
"""
Following code is generated from command line:
%%mlalpha preprocess --cloud
train_data_path: $train_data_path
eval_data_path: $eval_data_path 
data_format: CSV
output_dir: $output_dir
feature_set_class_name: CensusFeatures

Please modify as appropriate!!!
"""

# imports
import apache_beam as beam
import google.cloud.ml as ml
import google.cloud.ml.io as io
import os

import datetime

# defines
feature_set = CensusFeatures()
# Input and output locations, named once so they are easy to change.
TRAIN_DATA_PATH = 'gs://cloud-ml-test-automated-sampledata/census/data_train.csv'
EVAL_DATA_PATH = 'gs://cloud-ml-test-automated-sampledata/census/data_eval.csv'
OUTPUT_DIR = 'gs://cloud-ml-test-automated-sampledata/census/preprocessed'
options = {
    # Staging and temp files are kept under OUTPUT_DIR/tmp.
    'staging_location': os.path.join(OUTPUT_DIR, 'tmp', 'staging'),
    'temp_location': os.path.join(OUTPUT_DIR, 'tmp'),
    # The timestamp suffix (to the second) gives each submission its own name.
    'job_name': 'preprocess-censusfeatures' + '-' + datetime.datetime.now().strftime('%y%m%d-%H%M%S'),
    'project': 'cloud-ml-test-automated',
    'extra_packages': ['gs://cloud-ml/sdk/cloudml-0.1.6-alpha.tar.gz'],
    'teardown_policy': 'TEARDOWN_ALWAYS',
    'no_save_main_session': True
}
opts = beam.pipeline.PipelineOptions(flags=[], **options)
# 'DataflowPipelineRunner' submits the pipeline as a Cloud DataFlow job.
pipeline = beam.Pipeline('DataflowPipelineRunner', options=opts)


# preprocessing
# Each source decodes CSV lines with a coder built from the feature set and
# the CSV column order declared on it.
training_data = beam.io.TextFileSource(
    TRAIN_DATA_PATH,
    strip_trailing_newlines=True,
    coder=io.CsvCoder.from_feature_set(feature_set, feature_set.csv_columns))
train_set = pipeline | beam.Read('ReadTrainingData', training_data)

eval_data = beam.io.TextFileSource(
    EVAL_DATA_PATH,
    strip_trailing_newlines=True,
    coder=io.CsvCoder.from_feature_set(feature_set, feature_set.csv_columns))
# Named eval_set (not eval) so this cell does not shadow the builtin eval()
# for the rest of the notebook session.
eval_set = pipeline | beam.Read('ReadEvalData', eval_data)

# ml.Preprocess takes the (train, eval) pair and yields three outputs:
# metadata, transformed training features and transformed eval features.
(metadata, train_features, eval_features) = ((train_set, eval_set) | 'Preprocess'
    >> ml.Preprocess(feature_set, input_format='csv',
                  format_metadata={'headers': feature_set.csv_columns}))

# Write all three outputs under OUTPUT_DIR.
(metadata        | 'SaveMetadata'
    >> io.SaveMetadata(os.path.join(OUTPUT_DIR, 'metadata.yaml')))

(train_features  | 'SaveTrain'
    >> io.SaveFeatures(os.path.join(OUTPUT_DIR, 'features_train'), shard_name_template=''))

(eval_features   | 'SaveEval'
    >> io.SaveFeatures(os.path.join(OUTPUT_DIR, 'features_eval'), shard_name_template=''))

# run pipeline
pipeline.run()


<DataflowPipelineResult <Job
 id: u'2016-09-27_12_11_15-12127342684134936569'
 projectId: u'cloud-ml-test-automated'
 steps: []
 tempFiles: []
 type: TypeValueValuesEnum(JOB_TYPE_BATCH, 1)> at 0x7f3b507e2910>

After you run the above generated code, you can go to the Developer Console to see the DataFlow job: https://console.cloud.google.com/dataflow (and select the right project). After the job finishes, run the following to make sure the preprocessed files were generated.

In [7]:
!gsutil list $output_dir

gs://cloud-ml-test-automated-sampledata/census/preprocessed/features_eval.tfrecord.Z
gs://cloud-ml-test-automated-sampledata/census/preprocessed/features_train.tfrecord.Z
gs://cloud-ml-test-automated-sampledata/census/preprocessed/metadata.yaml
gs://cloud-ml-test-automated-sampledata/census/preprocessed/tmp/
