In [1]:
# Install the cloudml package from the tarball at /content/cloudml.tar.gz.
! pip install --force --upgrade /content/cloudml.tar.gz

Processing /content/cloudml.tar.gz
Collecting oauth2client==2.2.0 (from cloudml==0.1.2)
Collecting six>=1.10.0 (from cloudml==0.1.2)
  Using cached six-1.10.0-py2.py3-none-any.whl
Collecting protobuf>=3.0.0b2.post2 (from cloudml==0.1.2)
  Using cached protobuf-3.0.0-py2.py3-none-any.whl
Collecting google-cloud-dataflow>=0.4.0 (from cloudml==0.1.2)
Collecting bs4>=0.0.1 (from cloudml==0.1.2)
Collecting numpy>=1.10.4 (from cloudml==0.1.2)
  Using cached numpy-1.11.1-cp27-cp27mu-manylinux1_x86_64.whl
Collecting pillow>=3.2.0 (from cloudml==0.1.2)
  Using cached Pillow-3.3.1-cp27-cp27mu-manylinux1_x86_64.whl
Collecting dpkt>=1.8.7 (from cloudml==0.1.2)
  Using cached dpkt-1.8.8-py2-none-any.whl
Collecting nltk>=3.2.1 (from cloudml==0.1.2)
Collecting httplib2>=0.9.1 (from oauth2client==2.2.0->cloudml==0.1.2)
Collecting rsa>=3.1.4 (from oauth2client==2.2.0->cloudml==0.1.2)
  Using cached rsa-3.4.2-py2.py3-none-any.whl
Collecting pyasn1>=0.1.7 (from oauth2client==2.2.0->cloudml==0.1.2)
  Usin

In [1]:
import apache_beam as beam
import os
import google.cloud.ml as ml
import google.cloud.ml.features as features
import google.cloud.ml.io as io
# CSV file that is handed to the preprocessing pipeline as its input.
IMAGES_FILE = 'gs://cloud-ml-data/img/small_flower_photos_tst/small_data.csv'
# Passed as `save_dir` to the image feature transform.
# NOTE(review): presumably None means processed images are not saved -- confirm.
SAVE_DIR = None  # set to your own local/cloud bucket URI if the images are going to be saved
# e.g. gs://uri/to/your/bucket/ or /path/to/my/dir

First we define the fields of our dataset. In this case, it is only a label and a path to an image (or image bytes).

Optionally save pre-processed images locally or on the cloud

In [2]:
class ImageDatasetFeatures(object):
  """Feature definitions for the image dataset.

  Every input row carries an image URI and a label (see `csv_columns`).
  """

  # Column names handed to the CSV coder when the input file is read.
  csv_columns = ('image_uri', 'label')

  # The 'label' column is the classification target.
  target = features.target('label').classification()

  # The 'image_uri' column goes through the image transform: grayscale,
  # target size 200x200, aspect ratio kept; save_dir comes from SAVE_DIR.
  raw_img = features.image('image_uri').image(
      grayscale=True,
      target_size=(200, 200),
      keep_aspect_ratio=True,
      save_dir=SAVE_DIR)

Then we define our preprocess wrapper. It feeds the data into ml.Preprocess and prints the resulting metadata.

In [3]:
def preprocess(pipeline, training_path):
  """Attach the read, preprocess and metadata-printing steps to `pipeline`.

  Args:
    pipeline: the Beam pipeline the steps are added to.
    training_path: location of the CSV file to read; its rows are decoded
      with a CsvCoder built from ImageDatasetFeatures.

  Returns:
    A (metadata, train_features) tuple as produced by ml.Preprocess, where
    the metadata has additionally been passed through print_fn_metadata.
  """
  image_features = ImageDatasetFeatures()

  # Source: text file decoded line by line with a CSV coder for our columns.
  csv_coder = io.CsvCoder.from_feature_set(image_features,
                                           image_features.csv_columns)
  csv_source = beam.io.TextFileSource(training_path,
                                      strip_trailing_newlines=True,
                                      coder=csv_coder)
  examples = pipeline | beam.Read('ReadTrainingData', csv_source)

  # Preprocessing yields both the metadata and the transformed features.
  preprocess_step = ml.Preprocess('Preprocess', image_features)
  metadata, train_features = examples | preprocess_step

  # Echo the metadata as it flows through the pipeline.
  metadata |= beam.Map('print meta', print_fn_metadata)
  return metadata, train_features

def print_fn_metadata(values):
  """Print `values` and return it unchanged.

  Used as a pass-through step so the metadata can be inspected while it
  flows through the pipeline.

  Args:
    values: the object to print.

  Returns:
    The very same `values` object that was passed in.
  """
  # With a single argument, print(...) produces the same output under the
  # Python 2 print statement and the Python 3 print function, whereas the
  # bare `print values` form is a SyntaxError on Python 3.
  print(values)
  return values

We then create a pipeline and run it, expecting the metadata to be printed out

In [4]:
# Create a pipeline using the 'DirectPipelineRunner' runner, attach the
# preprocessing steps to it and run it.
pipeline = beam.Pipeline('DirectPipelineRunner')
# NOTE(review): despite the names, preprocess returns (metadata, train_features)
# rather than paths; 'traning_path' also looks like a typo for 'training_path'.
metadata_path, traning_path = preprocess(pipeline, IMAGES_FILE)
pipeline.run()

{'stats': {'labels': 1}, 'features': {'raw_img': {'dtype': 'float', 'type': 'dense', 'name': 'raw_img', 'columns': ['image_uri'], 'size': 1}, 'target': {'dtype': 'int64', 'type': 'dense', 'name': 'target', 'columns': ['label'], 'size': 1}}, 'columns': {'image_uri': {'image': {'keep_aspect_ratio': True, 'save_format': 'JPEG', 'grayscale': True, 'resize_method': 1, 'is_base64_str': False, 'target_size': (200, 200), 'save_dir': 'gs://cloud-ml-data/img/image_preprocessing_sample/processed/'}, 'type': 'image', 'name': 'image_uri', 'transform': 'image'}, 'label': {'items': {'daisy': 0}, 'type': 'target', 'name': 'label', 'scenario': 'discrete', 'transform': 'lookup'}}}


<apache_beam.runners.direct_runner.DirectPipelineResult at 0x7f14600acf90>

If you specified a path in SAVE_DIR, visit the Google Cloud Storage bucket or your local directory after running the pipeline to see the resulting preprocessed images.