In [1]:
! pip install --upgrade /content/cloudml.tar.gz


Processing /content/cloudml.tar.gz
Requirement already up-to-date: oauth2client==2.2.0 in /usr/local/lib/python2.7/dist-packages (from cloudml==0.1.2)
Requirement already up-to-date: six>=1.10.0 in /usr/local/lib/python2.7/dist-packages (from cloudml==0.1.2)
Requirement already up-to-date: protobuf>=3.0.0b2.post2 in /usr/local/lib/python2.7/dist-packages (from cloudml==0.1.2)
Requirement already up-to-date: google-cloud-dataflow>=0.4.0 in /usr/local/lib/python2.7/dist-packages (from cloudml==0.1.2)
Requirement already up-to-date: bs4>=0.0.1 in /usr/local/lib/python2.7/dist-packages (from cloudml==0.1.2)
Requirement already up-to-date: numpy>=1.10.4 in /usr/local/lib/python2.7/dist-packages (from cloudml==0.1.2)
Requirement already up-to-date: pillow>=3.2.0 in /usr/local/lib/python2.7/dist-packages (from cloudml==0.1.2)
Requirement already up-to-date: dpkt>=1.8.7 in /usr/local/lib/python2.7/dist-packages (from cloudml==0.1.2)
Requirement already up-to-date: nltk>=3.2.1 in /usr/local/lib

In [2]:
import os
import random
import google.cloud.ml as ml
from google.cloud.ml.dataflow.io import tfrecordio
import apache_beam as beam

# Directory holding the demo input files; the files generated by this notebook
# are written here as well.
data_dir = '/content/preprocessing_demo/geolocation/'

In this sample, we explain how a user can introduce her own feature type. The running example is a geo-location feature (latitude, longitude).

For this example we need a helper function that determines whether a point lies inside a polygon. You can consider this an external dependency.


In [5]:
"""Utilities for geo-location data operations."""

def point_in_polygon(polygon_points, p):
  """Point in polygon function, returns True if p is inside the polygon.

    Uses the Ray Casting (even-odd) Method: a horizontal ray is cast from p
    towards +x and the polygon edges it crosses are counted; an odd number of
    crossings means that the point is inside.
    Originally translated from the C code in skimage/_shared/geometry.pyx.

    Every edge is treated as half-open in y, i.e. it owns only one of its two
    end points. This way a ray that passes exactly through a vertex shared by
    two edges is counted once instead of twice, and horizontal edges are never
    counted.

    Args:
      polygon_points: list of (x, y) tuples corresponding to the coordinates of
        vertices. They must be in clock-wise or counter-clockwise order.
      p: (x, y) tuple corresponding to the coordinates of point to check.
    Returns:
      True if the point is inside the polygon or coincides with one of its
      vertices, False otherwise (including for an empty polygon). Points lying
      on an edge without being a vertex get no special treatment and may be
      classified either way.
  """
  x, y = p
  num_points = len(polygon_points)
  inside = False
  for i in range(num_points):
    xp1, yp1 = polygon_points[i]
    xp2, yp2 = polygon_points[(i + 1) % num_points]  # next vertex, wrapping
    # Compare coordinate-wise so that list and tuple points both match.
    if x == xp1 and y == yp1:  # point is exactly on a vertex, count as true
      return True
    # Half-open rule: the edge is crossed only when exactly one of its end
    # points lies strictly above the ray. This also rules out yp1 == yp2, so
    # the division below is always safe.
    if (yp1 > y) != (yp2 > y):
      # float() guards against truncating integer division under Python 2.
      x_intersect = (xp2 - xp1) * (y - yp1) / float(yp2 - yp1) + xp1
      if x <= x_intersect:
        inside = not inside

  return inside

First, the user defines a **Feature Column**.


In [4]:
import re
import google.cloud.ml.features as features

class GeoFeatureColumn(features.FeatureColumn):
  """Feature column holding geo-location values.

    Each value is essentially a longitude-latitude pair. Besides its name, the
    column receives the path of the file in which the polygon information is
    stored; the polygons are loaded eagerly and attached to the column's
    'reverse-geo-code' transformation.
  """

  def __init__(self, name, path, make_categorical=False):
    super(GeoFeatureColumn, self).__init__(name, 'geo-location')
    transform_args = {
        'polygons': self._load_polygons(path),
        'make_categorical': make_categorical,
    }
    self._set_transformation('reverse-geo-code', transform_args)

  def _load_polygons(self, path):
    """Reads the polygon definitions stored in the file at path.

    Only lines starting with '"MULTIPOLYGON' are used. For each of them, every
    'number number' pair found on the line becomes a tuple of floats, and the
    text following the ')))",' terminator is split on commas to form the info
    fields.

    Args:
      path: path of the polygon file.
    Returns:
      A list of dicts, each with the keys 'points' (list of float tuples) and
      'info' (list of strings).
    """
    coordinate_pair = r'[\-0-9\.]+ [\-0-9\.]+'
    loaded = []
    with open(path, 'r') as polygon_file:
      for record in polygon_file:
        if not record.startswith('"MULTIPOLYGON'):
          continue
        vertices = [tuple(float(token) for token in pair.split())
                    for pair in re.findall(coordinate_pair, record)]
        info = record.split(')))",')[1].split(',')
        loaded.append({'points': vertices, 'info': info})
    return loaded

Then the user *optionally* defines an Analyzer (the Image Sample shows an example without one).
In this case the analyzer is a function that runs over the whole feature column, which allows us to create some useful metadata.
The metadata defined here are the four extreme points that form a bounding box around all the other locations.

In [3]:
import google.cloud.ml.features._registries as registries
import google.cloud.ml.features._analysis as analysis

@registries.register_analyzer('geo-location')
class GeoColumnAnalyzer(analysis.ColumnAnalyzer):
  """Analyzer for Geo-Location values.

    Counts the data points of the column and finds the four extreme points
    (lowest / highest first coordinate and lowest / highest second coordinate)
    that bound all the given points. It also assigns an integer index to every
    polygon name known to the column's 'reverse-geo-code' transformation.
    Missing values and points with a zero coordinate are filtered out before
    the aggregation.

    NOTE(review): the code calls the first coordinate of a point 'lat' and the
    second one 'lon'; confirm that this matches the order of the input data.
  """

  def __init__(self, column):
    super(GeoColumnAnalyzer, self).__init__(column)
    self._aggregator = GeoColumnAnalyzer.Aggregator(self._get_column_metadata())

  def apply(self, values):
    """Filtering of bad input values, and aggregation of column info.

    Args:
      values: collection of whitespace separated coordinate strings; None
        entries are dropped.
    Returns:
      The result of globally combining the parsed points with the Aggregator.
    """
    return (values | beam.Filter(lambda x: x is not None)
            # split() (any whitespace) matches the parsing done by
            # GeoTransform.transform.
            | beam.Map('makefloat', lambda x: tuple(map(float, x.split())))
            # A parsed point is a tuple, so comparing it to 0 never filtered
            # anything. Drop points that have a zero coordinate instead, the
            # same notion of a null location that is_null_location uses.
            | beam.Filter('remove null',
                          lambda point: all(coord != 0 for coord in point))
            | beam.CombineGlobally('Analysis',
                                   self._aggregator).without_defaults())

  class Aggregator(beam.core.CombineFn):
    """Aggregator to combine values within a geo-location column.

       Calculates the 4 border points (points with min/max first and second
       coordinate) together with the number of points seen.

       The accumulator is the tuple (min_lat, min_lon, max_lat, max_lon, count)
       in which the first four entries are whole points, not single
       coordinates.
    """

    def __init__(self, column):
      # Column metadata dict; extract_output stores its results in it.
      self._column = column

    def create_accumulator(self):
      """Returns an accumulator whose extremes are replaced by any real point."""
      lowest = (float('+inf'), float('+inf'))
      highest = (float('-inf'), float('-inf'))
      return (lowest, lowest, highest, highest, 0)

    def add_input(self, stats, point):
      """Folds a single point into the accumulator; None points are ignored."""
      (min_lat, min_lon, max_lat, max_lon, count) = stats
      if point is None:
        return (min_lat, min_lon, max_lat, max_lon, count)

      lat = point[0]
      lon = point[1]
      if lat < min_lat[0]:
        min_lat = point
      if lon < min_lon[1]:
        min_lon = point
      if lat > max_lat[0]:
        max_lat = point
      if lon > max_lon[1]:
        max_lon = point

      return (min_lat, min_lon, max_lat, max_lon, count + 1)

    def merge_accumulators(self, accumulators):
      """Merges several accumulators into a single one."""
      accumulators = list(accumulators)
      if not accumulators:
        # Nothing to merge: unpacking zip() of an empty list would fail.
        return self.create_accumulator()
      min_lats, min_lons, max_lats, max_lons, counts = zip(*accumulators)
      # min() / max() keep the first of several equal candidates.
      return (min(min_lats, key=lambda p: p[0]),
              min(min_lons, key=lambda p: p[1]),
              max(max_lats, key=lambda p: p[0]),
              max(max_lons, key=lambda p: p[1]),
              sum(counts))

    def extract_output(self, stats):
      """Stores the statistics in the column metadata.

      Returns:
        A (column name, column metadata) tuple.
      """
      (min_lat, min_lon, max_lat, max_lon, count) = stats
      column = self._column
      column['min_lat'] = min_lat
      column['min_lon'] = min_lon
      column['max_lat'] = max_lat
      column['max_lon'] = max_lon
      column['count'] = count
      column['polygons_dict'] = self._enumerate_polygons(column[
          'reverse-geo-code']['polygons'])
      return (self._column['name'], column)

    def _enumerate_polygons(self, polygons):
      """Maps each distinct polygon name to its index in sorted name order.

      The name of a polygon is the fourth entry of its 'info' list.
      """
      polygon_names = sorted(set(p['info'][3] for p in polygons))
      return dict((name, index) for index, name in enumerate(polygon_names))

Then the user defines a Transform.
This is a function that runs on each value in the column independently, with access to the metadata produced by the Analyzer.

In [6]:
import google.cloud.ml.features._transforms as transforms
import google.cloud.ml.features._registries as registries

@registries.register_transformation('geo-location', 'reverse-geo-code')
class GeoTransform(transforms.ColumnTransform):
  """Column Transform Class for geo-location.

     Reverse geo-codes a 'first second' coordinate string into the name of the
     polygon that contains the point, based on a polygon map. The name is
     returned either as a string or, when make_categorical is set, as a
     one-hot list.
  """

  @staticmethod
  def from_metadata(column):
    """Builds the transform from the analyzed metadata of a column.

    Args:
      column: column metadata mapping holding the analyzer results
        ('min_lat', 'max_lat', 'min_lon', 'max_lon', 'polygons_dict') and the
        'reverse-geo-code' transformation arguments.
    Returns:
      A GeoTransform instance.
    """
    # 'area' is not produced by every analyzer version; pass None when it is
    # missing so that the remaining arguments stay aligned with __init__.
    area = column['area'] if 'area' in column else None
    return GeoTransform(column['min_lat'], column['max_lat'], column['min_lon'],
                        column['max_lon'], area, column['polygons_dict'],
                        column['reverse-geo-code']['polygons'],
                        column['reverse-geo-code']['make_categorical'])

  def __init__(self, min_lat, max_lat, min_lon, max_lon, area, polygons_dict,
               polygons, make_categorical):
    # The four extremes are whole points (tuples), not single coordinates.
    self._min_lat = min_lat
    self._max_lat = max_lat
    self._min_lon = min_lon
    self._max_lon = max_lon
    self._area = area  # kept for completeness, not used by the transform
    self._polygons = polygons
    self._polygon_dict = polygons_dict
    self._make_categorical = make_categorical

  def transform(self, value):
    """Maps a whitespace separated coordinate string to its category.

    The category is 'Outside' for points outside the bounding box found by
    the analyzer, the name of the first polygon containing the point, or
    'Inside_Unknown' when no polygon contains it.
    """
    point = tuple(map(float, value.split()))
    if not self._in_area(point):
      cat_str = 'Outside'
    else:
      cat_str = 'Inside_Unknown'
      for polygon in self._polygons:
        if point_in_polygon(polygon['points'], point):
          cat_str = polygon['info'][3]
          break
    return self._convert_neighborhood(cat_str)

  def _convert_neighborhood(self, cat_str):
    """Returns [cat_str], or its one-hot encoding when make_categorical is set.

    Categories without an index ('Outside', 'Inside_Unknown') are encoded as
    an all-zero list.
    """
    if not self._make_categorical:
      return [cat_str]
    size = len(self._polygon_dict)
    cat_array = [0] * size
    category = self._polygon_dict.get(cat_str, None)
    # Compare with None explicitly: index 0 is a valid category but is falsy.
    if category is not None:
      cat_array[category] = 1

    return cat_array

  def _in_area(self, point):
    """Returns True if point lies within the bounding box of the column."""
    if point[0] < self._min_lat[0] or point[0] > self._max_lat[0]:
      return False
    if point[1] < self._min_lon[1] or point[1] > self._max_lon[1]:
      return False
    return True

This is how a user can define their own data type and write the methods that should be applied to that data type.

To illustrate an example usage, we go through some code that uses it. We are going to use a sample of a publicly available dataset, the NY Taxi Data.

First, we define the fields in our dataset. 

In [10]:
import google.cloud.ml.features as features
import os
# File with the polygon definitions that are loaded by GeoFeatureColumn.
POLYGONS_PATH = os.path.join(data_dir, 'ny_polygons.csv')

class NYTaxiDataFeatures(object):
  """Class that defines the features in the NYTaxiDataset."""

  # Names of the columns of every prepared CSV line, in file order. The last
  # two, 'pickup' and 'dropoff', are the combined location columns that
  # prepare_data appends to each row.
  csv_columns = ('VendorID', 'lpep_pickup_datetime', 'Lpep_dropoff_datetime',
                 'Store_and_fwd_flag', 'RateCodeID', 'Pickup_longitude',
                 'Pickup_latitude', 'Dropoff_longitude', 'Dropoff_latitude',
                 'Passenger_count', 'Trip_distance', 'Fare_amount', 'Extra',
                 'MTA_tax', 'Tip_amount', 'Tolls_amount', 'Ehail_fee',
                 'Total_amount', 'Payment_type', 'Trip_type', 'empty', 'empty2',
                 'pickup', 'dropoff')

  target = features.target('Total_amount').regression()
  # Both geo features are plain column objects. A trailing comma after the
  # call would silently wrap the column in a one-element tuple.
  pickup = GeoFeatureColumn('pickup', POLYGONS_PATH)
  dropoff = GeoFeatureColumn('dropoff', POLYGONS_PATH)

  other_data = [
      features.numeric('Passenger_count'), features.numeric('Trip_distance'),
      features.numeric('Tip_amount')
  ]




Then we define our preprocessing wrapper. This particular one prints the metadata from the analyzer and saves the preprocessed data into files (so that we can view them later in this demo).
We also define some helper functions to help us print the results.

In [13]:
import google.cloud.ml.io as io
def preprocess(pipeline, training_path, testing_path):
  """Adds the preprocessing steps for the taxi data to pipeline.

  Reads the training and testing CSV files, preprocesses them with the
  NYTaxiDataFeatures feature set, prints the resulting metadata and writes
  both sets of features to files. The pipeline is only built here, not run.

  Args:
    pipeline: the pipeline the steps are added to.
    training_path: path of the training CSV file.
    testing_path: path of the testing CSV file.
  Returns:
    A (metadata, training features path, testing features path) tuple.
  """
  feature_set = NYTaxiDataFeatures()

  def csv_source(path):
    """Returns a text source that decodes CSV lines with the feature set."""
    csv_coder = io.CsvCoder.from_feature_set(feature_set,
                                             feature_set.csv_columns)
    return beam.io.TextFileSource(
        path, strip_trailing_newlines=True, coder=csv_coder)

  training_data = csv_source(training_path)
  test_data = csv_source(testing_path)

  train = pipeline | beam.Read('ReadTrainingData', training_data)
  test = pipeline | beam.Read('ReadTestingData', test_data)

  preprocessed = (train, test) | ml.Preprocess('Preprocess', feature_set)
  metadata, train_features, test_features = preprocessed

  metadata |= beam.Map('print meta', print_fn_metadata)

  train_features_path = write_file(
      train_features, 'WriteTraining', 'features_train')
  test_features_path = write_file(
      test_features, 'WritePredict', 'features_predict')
  return (metadata, train_features_path, test_features_path)
  
    
class ExampleProtoCoder(beam.coders.Coder):
  """Coder used to write TensorFlow example protos; only encoding is defined."""

  def encode(self, x):
    """Returns the serialized form of the proto message x."""
    serialized = x.SerializeToString()
    return serialized

def write_file(pcollection, label, output_name):
  path = os.path.join(data_dir, output_name)

  print 'Writing to', path
  pcollection | beam.Write(label, tfrecordio.TFRecordSink(
      path,
      coder = ExampleProtoCoder(),
      shard_name_template='',
      compression_type=beam.io.fileio.CompressionTypes.ZLIB))

  return path

def print_results(path):
  """Reads the examples stored at path and prints the interesting ones.

  Builds and runs its own pipeline that reads the ZLIB compressed TFRecord
  file, parses every record into a TensorFlow example and hands it to
  print_fn_train.

  Args:
    path: path of the TFRecord file to read.
  Returns:
    The collection of parsed examples.
  """
  def parse_example_from_string(serialized):
    import tensorflow as tf  # pylint: disable=g-import-not-at-top
    example = tf.train.Example()
    example.ParseFromString(serialized)
    return example

  pipeline = beam.Pipeline('DirectPipelineRunner')
  source = tfrecordio.TFRecordSource(
      path, compression_type=beam.io.fileio.CompressionTypes.ZLIB)
  a = pipeline | source | 'Deserialize' >> beam.Map(parse_example_from_string)
  a |= 'print' >> beam.Map(print_fn_train)
  pipeline.run()
  return a

def print_fn_metadata(values):
  print 'Metadata Info'
  print values.keys()
  print values['columns'].keys()
  print values['columns']['pickup'].keys()
  print 'min lat:', values['columns']['pickup']['min_lat'],
  print 'max_lat:', values['columns']['pickup']['max_lat']
  print values['features'].keys()

  return values

def print_fn_train(values):
  a = str(values)
  if 'MN13' in a and 'BK73' in a:
    print a
  return values


Unfortunately, our data does not come in the format we want, but it's very easy to do some data preparation steps with Dataflow.
We define the method here.

In [8]:
def prepare_data(path, test_fraction=0.2, seed=None):
  """Prepare raw data into a format that is ready to be used for preprocessing.

     This includes, combining two columns to form a geo-location, filtering out
     lines with null input and non full entries. The prepared lines are then
     split randomly into a training and a testing file.

     Args:
       path: path of the raw CSV file.
       test_fraction: probability with which a prepared line goes to the
         testing file instead of the training file.
       seed: optional seed that makes the split reproducible. With the default
         of None the module level random generator is used.
     Returns:
       A (training path, testing path) tuple; both files are created in
       data_dir.
  """

  output_path = os.path.join(data_dir, 'prepared_data.csv')
  training_path = os.path.join(data_dir, 'training_data.csv')
  testing_path = os.path.join(data_dir, 'testing_data.csv')

  p = beam.Pipeline('DirectPipelineRunner')
  data = (p | beam.io.Read(
      'ReadFromText',
      beam.io.TextFileSource(path)))

  data = (data
          | beam.Map('split columns', lambda x: x.strip().split(','))
          | beam.Filter('remove non full entries', lambda row: len(row) >= 22)
          # Despite its name, is_null_location is True for rows to keep.
          | beam.Filter('remove null locations', is_null_location))

  # Columns 5-6 and 7-8 hold the pickup and dropoff coordinates; each pair is
  # appended to the row as one space separated column.
  data |= beam.Map('Combine Pickup', concatenate, [5, 6])
  data |= beam.Map('Combine Dropoff', concatenate, [7, 8])

  # save to file as strings again
  _ = (data | beam.Map('Make strings', lambda x: ','.join(x))
       | beam.Write(
           'Save prepared',
           beam.io.TextFileSink(
               output_path, shard_name_template='')))
  p.run()

  # A dedicated generator keeps a seeded split independent of the global one.
  rng = random if seed is None else random.Random(seed)
  # The with statement closes all three files even if the split fails midway.
  with open(training_path, 'w') as training, \
       open(testing_path, 'w') as testing, \
       open(output_path, 'r') as prepared:
    for line in prepared:
      if rng.random() < test_fraction:
        testing.write(line)
      else:
        training.write(line)

  return training_path, testing_path

def concatenate(data, idx_list):
  """Returns a copy of data with one extra, combined column at the end.

  The new column consists of the columns of data selected by idx_list, joined
  with single spaces in the given order. data itself is not modified.
  """
  selected = [data[idx] for idx in idx_list]
  combined = ' '.join(selected)
  return data + [combined]


def is_null_location(row):
  """Returns True when the product of the location coordinates is non-zero.

  The coordinates are columns 5 to 8 of row, converted to floats. Note that,
  despite the name, a row with a zero coordinate yields False.
  """
  coordinates = [float(row[idx]) for idx in (5, 6, 7, 8)]
  product = 1.
  for coordinate in coordinates:
    product *= coordinate
  return product != 0

Now we are finally ready to run our preprocessing pipeline.

In [14]:
# Step 1: clean the raw CSV sample and split it into training / testing files.
print 'Preparing data..',
training_path, testing_path = prepare_data(
  '/content/preprocessing_demo/geolocation/ny_taxi_sample.csv')

print 'Done\nPreprocessing',

# Step 2: build the preprocessing pipeline and run it. preprocess() only adds
# the steps; nothing is executed before pipeline.run().
pipeline = beam.Pipeline('DirectPipelineRunner')
meta, train, test = preprocess(pipeline, training_path, testing_path)
pipeline.run()
print 'Done'
# Step 3: read the written training features back (train is their file path)
# and print the examples selected by print_fn_train.
_ = print_results(train)

Preparing data.. Done
Preprocessing Writing to /content/preprocessing_demo/geolocation/features_train
Writing to /content/preprocessing_demo/geolocation/features_predict
Metadata Info
['stats', 'features', 'columns']
['Trip_distance', 'dropoff', 'pickup', 'Passenger_count', 'Tip_amount', 'Total_amount']
['reverse-geo-code', 'count', 'name', 'area', 'min_lat', 'transform', 'max_lon', 'max_lat', 'min_lon', 'type', 'polygons_dict']
min lat: (-74.07369232177734, 40.64374923706055) max_lat: (-73.74129486083984, 40.65449142456055)
['dropoff', 'pickup', 'target', 'other_data']
Done
features {
  feature {
    key: "dropoff"
    value {
      bytes_list {
        value: "MN13"
      }
    }
  }
  feature {
    key: "other_data"
    value {
      float_list {
        value: -1.0
        value: -0.446228712797
        value: -0.442896932364
      }
    }
  }
  feature {
    key: "pickup"
    value {
      bytes_list {
        value: "BK73"
      }
    }
  }
  feature {
    key: "target"
    value