# Setup

In [1]:
# Use the latest version of pip.
!pip install --upgrade pip
# Install tfx and kfp Python packages.
!pip install -q --upgrade tfx[kfp]==1.0.0rc1 



### Automatically restart kernel after installs

In [2]:
import sys

# Restart the kernel so the freshly installed packages are picked up.
# The restart is skipped when the google.colab module is loaded.
if 'google.colab' not in sys.modules:
  import IPython
  IPython.Application.instance().kernel.do_shutdown(True)

## Import packages

In [1]:
import os
import urllib
import urllib.request

import kfp
import tensorflow as tf
import tfx
from kfp.v2.google import client
from tfx.orchestration.kubeflow.v2 import kubeflow_v2_dag_runner
from tfx.proto import trainer_pb2

from pipeline import configs, pipeline



In [2]:
# Report the versions of the core libraries loaded in this kernel.
for _label, _module in (('TensorFlow', tf), ('TFX', tfx), ('KFP', kfp)):
    print(f'{_label} version: {_module.__version__}')

TensorFlow version: 2.5.0
TFX version: 1.0.0-rc1
KFP version: 1.6.1


**If you are on AI Platform Notebooks**, authenticate with Google Cloud before running the next section, by running

`gcloud auth login`

**in the Terminal window** (which you can open via **File** > **New** in the menu). You only need to do this once per notebook instance.

In [3]:
# Point the gcloud CLI at the project named by configs.GOOGLE_CLOUD_PROJECT.
# The {...} expression is replaced with the Python value before the shell
# command runs.
!gcloud config set project {configs.GOOGLE_CLOUD_PROJECT}

Updated property [core/project].


### Download data

In [4]:
# Source URL of the IMDB CSV and the file name it is stored under locally.
data_csv_url = 'https://raw.githubusercontent.com/dimitreOliveira/bert-as-a-service_TFX/main/Data/IMDB_5k_dataset.csv'
data_csv_filename = 'IMDB_dataset.csv'

_data_dir = 'data/'
# exist_ok=True makes directory creation idempotent, so re-running the cell
# needs no separate existence check.
os.makedirs(_data_dir, exist_ok=True)

# Download the CSV into the local data directory. urlretrieve returns a
# (filename, headers) tuple, which is displayed as the cell output.
urllib.request.urlretrieve(data_csv_url, os.path.join(_data_dir, data_csv_filename))

('data/IMDB_dataset.csv', <http.client.HTTPMessage at 0x7f9f6116a4d0>)

In [5]:
# File names of the transform and trainer modules.
_transform_module_file, _trainer_module_file = 'transform_utils.py', 'train_utils.py'

# Location of each module inside the local `pipeline/` directory.
_transform_module_path = 'pipeline/' + _transform_module_file
_trainer_module_path = 'pipeline/' + _trainer_module_file

Copy the data and module files to GCS so that the pipeline components can access them.

In [6]:
# Move data to GCS
!gsutil cp data/IMDB_dataset.csv {configs.DATA_ROOT}/

# Move modelues to GCS
!gsutil cp {_transform_module_path} {configs.MODULE_ROOT}/
!gsutil cp {_trainer_module_path} {configs.MODULE_ROOT}/

Copying file://data/IMDB_dataset.csv [Content-Type=text/csv]...
/ [1 files][  6.3 MiB/  6.3 MiB]                                                
Operation completed over 1 objects/6.3 MiB.                                      
Copying file://pipeline/transform_utils.py [Content-Type=text/x-python]...
/ [1 files][  2.5 KiB/  2.5 KiB]                                                
Operation completed over 1 objects/2.5 KiB.                                      
Copying file://pipeline/train_utils.py [Content-Type=text/x-python]...
/ [1 files][ 10.0 KiB/ 10.0 KiB]                                                
Operation completed over 1 objects/10.0 KiB.                                     


In [7]:
# Runner configured to emit its output to configs.PIPELINE_DEFINITION_FILE.
_runner_config = kubeflow_v2_dag_runner.KubeflowV2DagRunnerConfig()
runner = kubeflow_v2_dag_runner.KubeflowV2DagRunner(
    config=_runner_config,
    output_filename=configs.PIPELINE_DEFINITION_FILE,
)

# The module files are addressed under configs.MODULE_ROOT, the location the
# notebook copies them to with gsutil.
_transform_module_uri = os.path.join(configs.MODULE_ROOT, _transform_module_file)
_trainer_module_uri = os.path.join(configs.MODULE_ROOT, _trainer_module_file)

# Assemble the pipeline from the settings in the configs module.
_pipeline_definition = pipeline.create_pipeline(
    pipeline_name=configs.PIPELINE_NAME,
    pipeline_root=configs.PIPELINE_ROOT,
    data_root=configs.DATA_ROOT,
    transform_module_path=_transform_module_uri,
    train_module_path=_trainer_module_uri,
    train_args=trainer_pb2.TrainArgs(num_steps=configs.TRAIN_NUM_STEPS),
    eval_args=trainer_pb2.EvalArgs(num_steps=configs.EVAL_NUM_STEPS),
    eval_accuracy_threshold=configs.EVAL_ACCURACY_THRESHOLD,
    serving_model_dir=configs.SERVING_MODEL_DIR,
    enable_tuning=configs.ENABLE_TUNNING,
)

# Running the runner writes the pipeline definition to PIPELINE_DEFINITION_FILE;
# the return value is discarded.
_ = runner.run(_pipeline_definition)

In [8]:
# Client bound to the project and region taken from the configs module.
pipelines_client = client.AIPlatformClient(
    region=configs.GOOGLE_CLOUD_REGION,
    project_id=configs.GOOGLE_CLOUD_PROJECT,
)

# Create a run from the job spec file named by configs.PIPELINE_DEFINITION_FILE;
# the return value is discarded.
_job_spec_file = configs.PIPELINE_DEFINITION_FILE
_ = pipelines_client.create_run_from_job_spec(_job_spec_file)