In [None]:
class PipelineRunner:
    """Boilerplate code for running a Dataflow pipeline.

    Bundles the arguments handed to ``create_pipeline`` together with the
    GCS locations of the extra wheel files that are staged with the job.
    """

    def __init__(self, train_dataset, output_dir, project, pipeline_options=None):
        """Store the pipeline configuration; no work is done until ``run``.

        Args:
          train_dataset: forwarded unchanged to ``create_pipeline``.
          output_dir: forwarded to ``create_pipeline`` and to
              ``util.repackage_to_staging``.
          project: forwarded to ``create_pipeline`` and used to build the
              Dataflow console URL shown to the user.
          pipeline_options: optional mapping of additional pipeline options.
              It is copied in ``run`` before ``extra_packages`` is added, so
              the caller's object is not mutated.
        """
        self._tf_gcs_url = 'gs://cloud-datalab/deploy/tf/tensorflow-1.2.0-cp27-none-linux_x86_64.whl'
        self._protobuf_gcs_url = 'gs://cloud-datalab/deploy/tf/protobuf-3.1.0-py2.py3-none-any.whl'
        self._train_dataset = train_dataset
        self._output_dir = output_dir
        self._project = project
        self._pipeline_options = pipeline_options

    def run(self):
        """Preprocess data in Cloud with DataFlow.

        Copies the extra packages into a local temporary directory, builds
        the pipeline with ``create_pipeline``, submits it, displays a link to
        the Dataflow console, and blocks until the job finishes.

        The root logger is set to ERROR while this method runs and is put
        back to its previous level afterwards, even if the run fails.
        """
        import mltoolbox.image.classification._util as util
        from tensorflow.python.lib.io import file_io

        tmpdir = tempfile.mkdtemp()
        root_logger = logging.getLogger()
        original_level = root_logger.getEffectiveLevel()
        root_logger.setLevel(logging.ERROR)
        try:
            # Workaround for DataFlow 2.0, which doesn't work well with extra packages in GCS.
            # Remove when the issue is fixed and new version of DataFlow is included in Datalab.
            # NOTE(review): repackage_to_staging presumably returns the URL of
            # the staged package -- confirm against mltoolbox.
            staging_package_url = util.repackage_to_staging(self._output_dir)
            extra_packages = [staging_package_url, self._tf_gcs_url, self._protobuf_gcs_url]
            local_packages = [os.path.join(tmpdir, os.path.basename(url))
                              for url in extra_packages]
            for source, dest in zip(extra_packages, local_packages):
                file_io.copy(source, dest, overwrite=True)

            # Copy so that adding 'extra_packages' never mutates the caller's options.
            if self._pipeline_options is None:
                additional_options = {}
            else:
                additional_options = dict(self._pipeline_options)
            additional_options['extra_packages'] = local_packages

            pipeline = create_pipeline(self._train_dataset,
                                       self._output_dir,
                                       self._project,
                                       additional_options)
            job_results = pipeline.run()
            dataflow_url = 'https://console.developers.google.com/dataflow?project=%s' % self._project
            html = 'Job "%s" submitted.' % pipeline.options.get_all_options()['job_name']
            html += '<p>Click <a href="%s" target="_blank">here</a> to track preprocessing job. <br/>' % dataflow_url
            IPython.display.display_html(html, raw=True)
            job_results.wait_until_finish()
        finally:
            # Restore the log level first so that a failure while deleting the
            # temp dir can neither leave the root logger stuck at ERROR nor
            # mask an exception raised by the pipeline itself.
            root_logger.setLevel(original_level)
            shutil.rmtree(tmpdir, ignore_errors=True)