In [7]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense

import numpy as np
import json

import pickle
import os 
import time
import tempfile

from googleapiclient import discovery
from googleapiclient import errors

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from google.cloud import bigquery



In [30]:
REGION = 'us-central1'
ARTIFACT_STORE = 'gs://benazirsproject-demo'

PROJECT_ID = !(gcloud config get-value core/project)
PROJECT_ID = PROJECT_ID[0]
DATA_ROOT='{}/data'.format(ARTIFACT_STORE)
JOB_DIR_ROOT='{}/jobs'.format(ARTIFACT_STORE)
TRAINING_FILE_PATH='{}/{}/{}'.format(DATA_ROOT, 'training', 'dataset.csv')
VALIDATION_FILE_PATH='{}/{}/{}'.format(DATA_ROOT, 'validation', 'dataset.csv')


In [38]:
## load data into bigquery ... 

%%bigquery
DATASET_LOCATION=US
DATASET_ID=covertype_dataset2
TABLE_ID=covertype
DATA_SOURCE=gs://workshop-datasets/covertype/small/dataset.csv
SCHEMA=Elevation:INTEGER,\
Aspect:INTEGER,\
Slope:INTEGER,\
Horizontal_Distance_To_Hydrology:INTEGER,\
Vertical_Distance_To_Hydrology:INTEGER,\
Horizontal_Distance_To_Roadways:INTEGER,\
Hillshade_9am:INTEGER,\
Hillshade_Noon:INTEGER,\
Hillshade_3pm:INTEGER,\
Horizontal_Distance_To_Fire_Points:INTEGER,\
Wilderness_Area:STRING,\
Soil_Type:STRING,\
Cover_Type:INTEGER

bq --location=$DATASET_LOCATION --project_id=$PROJECT_ID mk --dataset $DATASET_ID

bq --project_id=$PROJECT_ID --dataset_id=$DATASET_ID load \
--source_format=CSV \
--skip_leading_rows=1 \
--replace \
$TABLE_ID \
$DATA_SOURCE \
$SCHEMA

SyntaxError: invalid syntax (<ipython-input-38-e4ad1f808a40>, line 7)

In [39]:
%%bigquery
SELECT *
FROM `covertype_dataset.covertype`

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,Wilderness_Area,Soil_Type,Cover_Type
0,2085,256,18,150,27,738,176,248,208,914,Cache,C2702,5
1,2125,256,20,30,12,871,169,248,215,300,Cache,C2702,2
2,2146,256,34,150,62,1253,122,237,239,511,Cache,C2702,2
3,2186,256,38,210,102,1294,109,232,244,552,Cache,C2702,2
4,2831,256,25,277,183,1706,153,246,225,1485,Commanche,C2705,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,3136,254,12,319,60,5734,193,248,193,2467,Rawah,C7746,1
99996,3242,254,12,636,148,3551,193,248,193,2010,Commanche,C7757,0
99997,2071,255,12,234,63,342,192,247,193,247,Cache,C2706,2
99998,3248,255,12,730,113,725,192,247,193,2724,Commanche,C7756,1


In [40]:
## create training split
!bq query \
-n 0 \
--destination_table covertype_dataset.training \
--replace \
--use_legacy_sql=false \
'SELECT * \
FROM `covertype_dataset.covertype` AS cover \
WHERE \
MOD(ABS(FARM_FINGERPRINT(TO_JSON_STRING(cover))), 10) IN (1, 2, 3, 4)' 

Waiting on bqjob_r2075c7d5d71d0385_000001742be3a92d_1 ... (1s) Current status: DONE   


In [41]:
## save trainign split at this location
!bq extract \
--destination_format CSV \
covertype_dataset.training \
$TRAINING_FILE_PATH

Waiting on bqjob_r7b5644a868249e85_000001742be6efe4_1 ... (0s) Current status: DONE   


In [42]:
!bq query \
-n 0 \
--destination_table covertype_dataset.validation \
--replace \
--use_legacy_sql=false \
'SELECT * \
FROM `covertype_dataset.covertype` AS cover \
WHERE \
MOD(ABS(FARM_FINGERPRINT(TO_JSON_STRING(cover))), 10) IN (8)' 

Waiting on bqjob_r7cb6be8a932f43f6_000001742bf78b80_1 ... (1s) Current status: DONE   


In [43]:
!bq extract \
--destination_format CSV \
covertype_dataset.validation \
$VALIDATION_FILE_PATH

Waiting on bqjob_re87973667e9b28c_000001742bf79a1b_1 ... (0s) Current status: DONE   


In [44]:
df_train = pd.read_csv(TRAINING_FILE_PATH)
df_validation = pd.read_csv(VALIDATION_FILE_PATH)
print(df_train.shape)
print(df_validation.shape)

(40009, 13)
(9836, 13)


In [46]:
numeric_feature_indexes = slice(0, 10)
categorical_feature_indexes = slice(10, 12)
num_features_type_map = {feature: 'float64' for feature in df_train.columns[numeric_feature_indexes]}

df_train = df_train.astype(num_features_type_map)
df_validation = df_validation.astype(num_features_type_map)

In [47]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_feature_indexes),
        ('cat', OneHotEncoder(), categorical_feature_indexes) 
    ])

In [48]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', SGDClassifier(loss='log', tol=1e-3))
])

In [49]:
X_train = df_train.drop('Cover_Type', axis=1)
y_train = df_train['Cover_Type']
X_validation = df_validation.drop('Cover_Type', axis=1)
y_validation = df_validation['Cover_Type']

pipeline.set_params(classifier__alpha=0.001, classifier__max_iter=200)
pipeline.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num', StandardScaler(),
                                                  slice(0, 10, None)),
                                                 ('cat', OneHotEncoder(),
                                                  slice(10, 12, None))])),
                ('classifier',
                 SGDClassifier(alpha=0.001, loss='log', max_iter=200))])

In [50]:
accuracy = pipeline.score(X_validation, y_validation)
print(accuracy)

0.6969296461976413


## run the mpdel and hyperparameter job on ai platform 

In [51]:
TRAINING_APP_FOLDER = 'training_app'
os.makedirs(TRAINING_APP_FOLDER, exist_ok=True)

In [52]:
%%writefile {TRAINING_APP_FOLDER}/train.py

# Copyright 2019 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#            http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import subprocess
import sys

import fire
import pickle
import numpy as np
import pandas as pd

import hypertune

from sklearn.compose import ColumnTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder


def train_evaluate(job_dir, training_dataset_path, validation_dataset_path, alpha, max_iter, hptune):
    
    df_train = pd.read_csv(training_dataset_path)
    df_validation = pd.read_csv(validation_dataset_path)

    if not hptune:
        df_train = pd.concat([df_train, df_validation])

    numeric_feature_indexes = slice(0, 10)
    categorical_feature_indexes = slice(10, 12)

    preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_feature_indexes),
        ('cat', OneHotEncoder(), categorical_feature_indexes) 
    ])

    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', SGDClassifier(loss='log',tol=1e-3))
    ])

    num_features_type_map = {feature: 'float64' for feature in df_train.columns[numeric_feature_indexes]}
    df_train = df_train.astype(num_features_type_map)
    df_validation = df_validation.astype(num_features_type_map) 

    print('Starting training: alpha={}, max_iter={}'.format(alpha, max_iter))
    X_train = df_train.drop('Cover_Type', axis=1)
    y_train = df_train['Cover_Type']

    pipeline.set_params(classifier__alpha=alpha, classifier__max_iter=max_iter)
    pipeline.fit(X_train, y_train)

    if hptune:
        X_validation = df_validation.drop('Cover_Type', axis=1)
        y_validation = df_validation['Cover_Type']
        accuracy = pipeline.score(X_validation, y_validation)
        print('Model accuracy: {}'.format(accuracy))
        # Log it with hypertune
        hpt = hypertune.HyperTune()
        hpt.report_hyperparameter_tuning_metric(
          hyperparameter_metric_tag='accuracy',
          metric_value=accuracy
        )

    # Save the model
    if not hptune:
        model_filename = 'model.pkl'
        with open(model_filename, 'wb') as model_file:
            pickle.dump(pipeline, model_file)
        gcs_model_path = "{}/{}".format(job_dir, model_filename)
        subprocess.check_call(['gsutil', 'cp', model_filename, gcs_model_path], stderr=sys.stdout)
        print("Saved model in: {}".format(gcs_model_path)) 
    
if __name__ == "__main__":
    fire.Fire(train_evaluate)

Writing training_app/train.py


In [53]:
%%writefile {TRAINING_APP_FOLDER}/Dockerfile

FROM gcr.io/deeplearning-platform-release/base-cpu
RUN pip install -U fire cloudml-hypertune scikit-learn==0.20.4 pandas==0.24.2
WORKDIR /app
COPY train.py .

ENTRYPOINT ["python", "train.py"]

Writing training_app/Dockerfile


In [54]:
IMAGE_NAME='trainer_image'
IMAGE_TAG='latest'
IMAGE_URI='gcr.io/{}/{}:{}'.format(PROJECT_ID, IMAGE_NAME, IMAGE_TAG)

In [55]:
!gcloud builds submit --tag $IMAGE_URI $TRAINING_APP_FOLDER

Creating temporary tarball archive of 3 file(s) totalling 6.2 KiB before compression.
Uploading tarball of [training_app] to [gs://benazirsproject_cloudbuild/source/1598467503.89-7e3f979ba7d9400f91639c7954a031c3.tgz]
Created [https://cloudbuild.googleapis.com/v1/projects/benazirsproject/builds/1a765ac6-e896-42e7-a23d-d128b07d5521].
Logs are available at [https://console.cloud.google.com/cloud-build/builds/1a765ac6-e896-42e7-a23d-d128b07d5521?project=981930454113].
----------------------------- REMOTE BUILD OUTPUT ------------------------------
starting build "1a765ac6-e896-42e7-a23d-d128b07d5521"

FETCHSOURCE
Fetching storage object: gs://benazirsproject_cloudbuild/source/1598467503.89-7e3f979ba7d9400f91639c7954a031c3.tgz#1598467504428362
Copying gs://benazirsproject_cloudbuild/source/1598467503.89-7e3f979ba7d9400f91639c7954a031c3.tgz#1598467504428362...
/ [1 files][  1.6 KiB/  1.6 KiB]                                                
Operation completed over 1 objects/1.6 KiB.         

In [56]:
%%writefile {TRAINING_APP_FOLDER}/hptuning_config.yaml

# Copyright 2019 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#            http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

trainingInput:
  hyperparameters:
    goal: MAXIMIZE
    maxTrials: 4
    maxParallelTrials: 4
    hyperparameterMetricTag: accuracy
    enableTrialEarlyStopping: TRUE 
    params:
    - parameterName: max_iter
      type: DISCRETE
      discreteValues: [
          200,
          500
          ]
    - parameterName: alpha
      type: DOUBLE
      minValue:  0.00001
      maxValue:  0.001
      scaleType: UNIT_LINEAR_SCALE

Writing training_app/hptuning_config.yaml


In [59]:
JOB_NAME = "JOB_{}".format(time.strftime("%Y%m%d_%H%M%S"))
JOB_DIR = "{}/{}".format(JOB_DIR_ROOT, JOB_NAME)
SCALE_TIER = "BASIC"

!gcloud ai-platform jobs submit training $JOB_NAME \
--region=$REGION \
--job-dir=$JOB_DIR \
--master-image-uri=$IMAGE_URI \
--scale-tier=$SCALE_TIER \
--config $TRAINING_APP_FOLDER/hptuning_config.yaml \
-- \
--training_dataset_path=$TRAINING_FILE_PATH \
--validation_dataset_path=$VALIDATION_FILE_PATH \
--hptune

Job [JOB_20200826_185451] submitted successfully.
Your job is still active. You may view the status of your job with the command

  $ gcloud ai-platform jobs describe JOB_20200826_185451

or continue streaming the logs with the command

  $ gcloud ai-platform jobs stream-logs JOB_20200826_185451
jobId: JOB_20200826_185451
state: QUEUED


In [60]:
!gcloud ai-platform jobs describe $JOB_NAME

createTime: '2020-08-26T18:54:53Z'
etag: TDFdtRAnFOg=
jobId: JOB_20200826_185451
startTime: '2020-08-26T18:54:55Z'
state: RUNNING
trainingInput:
  args:
  - --training_dataset_path=gs://benazirsproject-demo/data/training/dataset.csv
  - --validation_dataset_path=gs://benazirsproject-demo/data/validation/dataset.csv
  - --hptune
  hyperparameters:
    enableTrialEarlyStopping: true
    goal: MAXIMIZE
    hyperparameterMetricTag: accuracy
    maxParallelTrials: 4
    maxTrials: 4
    params:
    - discreteValues:
      - 200.0
      - 500.0
      parameterName: max_iter
      type: DISCRETE
    - maxValue: 0.001
      minValue: 1e-05
      parameterName: alpha
      scaleType: UNIT_LINEAR_SCALE
      type: DOUBLE
  jobDir: gs://benazirsproject-demo/jobs/JOB_20200826_185451
  masterConfig:
    imageUri: gcr.io/benazirsproject/trainer_image:latest
  region: us-central1
trainingOutput:
  hyperparameterMetricTag: accuracy
  isHyperparameterTuningJob: true

View job in the Cloud Console at:
h

In [61]:
!gcloud ai-platform jobs stream-logs $JOB_NAME

^C


Command killed by keyboard interrupt



In [65]:
ml = discovery.build('ml', 'v1')

job_id = 'projects/{}/jobs/{}'.format(PROJECT_ID, JOB_NAME)
request = ml.projects().jobs().get(name=job_id)

try:
    response = request.execute()
except errors.HttpError as err:
    print(err)
except:
    print("Unexpected error")
    
response

{'jobId': 'JOB_20200826_185451',
 'trainingInput': {'args': ['--training_dataset_path=gs://benazirsproject-demo/data/training/dataset.csv',
   '--validation_dataset_path=gs://benazirsproject-demo/data/validation/dataset.csv',
   '--hptune'],
  'hyperparameters': {'goal': 'MAXIMIZE',
   'params': [{'parameterName': 'max_iter',
     'type': 'DISCRETE',
     'discreteValues': [200, 500]},
    {'parameterName': 'alpha',
     'minValue': 1e-05,
     'maxValue': 0.001,
     'type': 'DOUBLE',
     'scaleType': 'UNIT_LINEAR_SCALE'}],
   'maxTrials': 4,
   'maxParallelTrials': 4,
   'hyperparameterMetricTag': 'accuracy',
   'enableTrialEarlyStopping': True},
  'region': 'us-central1',
  'jobDir': 'gs://benazirsproject-demo/jobs/JOB_20200826_185451',
  'masterConfig': {'imageUri': 'gcr.io/benazirsproject/trainer_image:latest'}},
 'createTime': '2020-08-26T18:54:53Z',
 'startTime': '2020-08-26T18:54:55Z',
 'endTime': '2020-08-26T19:05:24Z',
 'state': 'SUCCEEDED',
 'trainingOutput': {'completedTri

In [66]:
response['trainingOutput']['trials'][0]

{'trialId': '3',
 'hyperparameters': {'alpha': '0.00027200708548161929', 'max_iter': '200'},
 'finalMetric': {'trainingStep': '1', 'objectiveValue': 0.7015046766978447},
 'startTime': '2020-08-26T18:55:32.497038944Z',
 'endTime': '2020-08-26T19:03:55Z',
 'state': 'SUCCEEDED'}

# retrain with best hparameters


In [67]:
alpha = response['trainingOutput']['trials'][0]['hyperparameters']['alpha']
max_iter = response['trainingOutput']['trials'][0]['hyperparameters']['max_iter']

In [68]:
JOB_NAME = "JOB_{}".format(time.strftime("%Y%m%d_%H%M%S"))
JOB_DIR = "{}/{}".format(JOB_DIR_ROOT, JOB_NAME)
SCALE_TIER = "BASIC"

!gcloud ai-platform jobs submit training $JOB_NAME \
--region=$REGION \
--job-dir=$JOB_DIR \
--master-image-uri=$IMAGE_URI \
--scale-tier=$SCALE_TIER \
-- \
--training_dataset_path=$TRAINING_FILE_PATH \
--validation_dataset_path=$VALIDATION_FILE_PATH \
--alpha=$alpha \
--max_iter=$max_iter \
--nohptune

Job [JOB_20200826_190645] submitted successfully.
Your job is still active. You may view the status of your job with the command

  $ gcloud ai-platform jobs describe JOB_20200826_190645

or continue streaming the logs with the command

  $ gcloud ai-platform jobs stream-logs JOB_20200826_190645
jobId: JOB_20200826_190645
state: QUEUED


In [None]:
!gcloud ai-platform jobs stream-logs $JOB_NAME

## deploy model to ai platform 

In [70]:
## create a model resource 

model_name = 'amyris'
labels = "task=classifier,domain=forestry"
filter = 'name:{}'.format(model_name)
# models = !(gcloud ai-platform models list --filter={filter} --format='value(name)')

# if not models:
!gcloud ai-platform models create  $model_name \
--regions=$REGION \
--labels=$labels
# else:
#     print("Model: {} already exists.".format(models[0]))

Using endpoint [https://ml.googleapis.com/]
Created ml engine model [projects/benazirsproject/models/amyris].


In [73]:
# create a model version
model_version = 'v01'
filter = 'name:{}'.format(model_version)
# versions = !(gcloud ai-platform versions list --model={model_name} --format='value(name)' --filter={filter})

# if not versions:
!gcloud ai-platform versions create {model_version} \
    --model={model_name} \
    --origin=$JOB_DIR \
    --runtime-version=1.15 \
    --framework=scikit-learn \
    --python-version=3.7
# else:
#     print("Model version: {} already exists.".format(versions[0]))

Using endpoint [https://ml.googleapis.com/]
Creating version (this might take a few minutes)......done.                    


In [None]:
input_file = 'serving_instances.json'

with open(input_file, 'w') as f:
    for index, row in X_validation.head().iterrows():
        f.write(json.dumps(list(row.values)))
        f.write('\n')

In [None]:
!cat $input_file

In [None]:
!gcloud ai-platform predict \
--model $model_name \
--version $model_version \
--json-instances $input_file

In [None]:
print("a")

# lab 3

1. create a pipeline folder 

In [3]:
%%writefile ./pipeline/covertype_training_pipeline.py
# Copyright 2019 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""KFP pipeline orchestrating BigQuery and Cloud AI Platform services."""

import os

from helper_components import evaluate_model
from helper_components import retrieve_best_run
from jinja2 import Template
import kfp
from kfp.components import func_to_container_op
from kfp.dsl.types import Dict
from kfp.dsl.types import GCPProjectID
from kfp.dsl.types import GCPRegion
from kfp.dsl.types import GCSPath
from kfp.dsl.types import String
from kfp.gcp import use_gcp_secret

# Defaults and environment settings
BASE_IMAGE = os.getenv('BASE_IMAGE')
TRAINER_IMAGE = os.getenv('TRAINER_IMAGE')
RUNTIME_VERSION = os.getenv('RUNTIME_VERSION')
PYTHON_VERSION = os.getenv('PYTHON_VERSION')
COMPONENT_URL_SEARCH_PREFIX = os.getenv('COMPONENT_URL_SEARCH_PREFIX')
USE_KFP_SA = os.getenv('USE_KFP_SA')

TRAINING_FILE_PATH = 'datasets/training/data.csv'
VALIDATION_FILE_PATH = 'datasets/validation/data.csv'
TESTING_FILE_PATH = 'datasets/testing/data.csv'

# Parameter defaults
SPLITS_DATASET_ID = 'splits'
HYPERTUNE_SETTINGS = """
{
    "hyperparameters":  {
        "goal": "MAXIMIZE",
        "maxTrials": 6,
        "maxParallelTrials": 3,
        "hyperparameterMetricTag": "accuracy",
        "enableTrialEarlyStopping": True,
        "params": [
            {
                "parameterName": "max_iter",
                "type": "DISCRETE",
                "discreteValues": [500, 1000]
            },
            {
                "parameterName": "alpha",
                "type": "DOUBLE",
                "minValue": 0.0001,
                "maxValue": 0.001,
                "scaleType": "UNIT_LINEAR_SCALE"
            }
        ]
    }
}
"""


# Helper functions
def generate_sampling_query(source_table_name, num_lots, lots):
    """Prepares the data sampling query."""

    sampling_query_template = """
         SELECT *
         FROM 
             `{{ source_table }}` AS cover
         WHERE 
         MOD(ABS(FARM_FINGERPRINT(TO_JSON_STRING(cover))), {{ num_lots }}) IN ({{ lots }})
         """
    query = Template(sampling_query_template).render(
        source_table=source_table_name, num_lots=num_lots, lots=str(lots)[1:-1])

    return query


# Create component factories
component_store = kfp.components.ComponentStore(
    local_search_paths=None, url_search_prefixes=[COMPONENT_URL_SEARCH_PREFIX])

bigquery_query_op = component_store.load_component('bigquery/query')
mlengine_train_op = component_store.load_component('ml_engine/train')
mlengine_deploy_op = component_store.load_component('ml_engine/deploy')
retrieve_best_run_op = func_to_container_op(
    retrieve_best_run, base_image=BASE_IMAGE)
evaluate_model_op = func_to_container_op(evaluate_model, base_image=BASE_IMAGE)


@kfp.dsl.pipeline(
    name='Covertype Classifier Training',
    description='The pipeline training and deploying the Covertype classifierpipeline_yaml'
)
def covertype_train(project_id,
                    region,
                    source_table_name,
                    gcs_root,
                    dataset_id,
                    evaluation_metric_name,
                    evaluation_metric_threshold,
                    model_id,
                    version_id,
                    replace_existing_version,
                    hypertune_settings=HYPERTUNE_SETTINGS,
                    dataset_location='US'):
    """Orchestrates training and deployment of an sklearn model."""

    # Create the training split
    query = generate_sampling_query(
        source_table_name=source_table_name, num_lots=10, lots=[1, 2, 3, 4])

    training_file_path = '{}/{}'.format(gcs_root, TRAINING_FILE_PATH)

    create_training_split = bigquery_query_op(
        query=query,
        project_id=project_id,
        dataset_id=dataset_id,
        table_id='',
        output_gcs_path=training_file_path,
        dataset_location=dataset_location)

    # Create the validation split
    query = generate_sampling_query(
        source_table_name=source_table_name, num_lots=10, lots=[8])

    validation_file_path = '{}/{}'.format(gcs_root, VALIDATION_FILE_PATH)

    create_validation_split = bigquery_query_op(
        query=query,
        project_id=project_id,
        dataset_id=dataset_id,
        table_id='',
        output_gcs_path=validation_file_path,
        dataset_location=dataset_location)

    # Create the testing split
    query = generate_sampling_query(
        source_table_name=source_table_name, num_lots=10, lots=[9])

    testing_file_path = '{}/{}'.format(gcs_root, TESTING_FILE_PATH)

    create_testing_split = bigquery_query_op(
        query=query,
        project_id=project_id,
        dataset_id=dataset_id,
        table_id='',
        output_gcs_path=testing_file_path,
        dataset_location=dataset_location)

    # Tune hyperparameters
    tune_args = [
        '--training_dataset_path',
        create_training_split.outputs['output_gcs_path'],
        '--validation_dataset_path',
        create_validation_split.outputs['output_gcs_path'], '--hptune', 'True'
    ]

    job_dir = '{}/{}/{}'.format(gcs_root, 'jobdir/hypertune',
                                kfp.dsl.RUN_ID_PLACEHOLDER)

    hypertune = mlengine_train_op(
        project_id=project_id,
        region=region,
        master_image_uri=TRAINER_IMAGE,
        job_dir=job_dir,
        args=tune_args,
        training_input=hypertune_settings)

    # Retrieve the best trial
    get_best_trial = retrieve_best_run_op(
            project_id, hypertune.outputs['job_id'])

    # Train the model on a combined training and validation datasets
    job_dir = '{}/{}/{}'.format(gcs_root, 'jobdir', kfp.dsl.RUN_ID_PLACEHOLDER)

    train_args = [
        '--training_dataset_path',
        create_training_split.outputs['output_gcs_path'],
        '--validation_dataset_path',
        create_validation_split.outputs['output_gcs_path'], '--alpha',
        get_best_trial.outputs['alpha'], '--max_iter',
        get_best_trial.outputs['max_iter'], '--hptune', 'False'
    ]

    train_model = mlengine_train_op(
        project_id=project_id,
        region=region,
        master_image_uri=TRAINER_IMAGE,
        job_dir=job_dir,
        args=train_args)

    # Evaluate the model on the testing split
    eval_model = evaluate_model_op(
        dataset_path=str(create_testing_split.outputs['output_gcs_path']),
        model_path=str(train_model.outputs['job_dir']),
        metric_name=evaluation_metric_name)

    # Deploy the model if the primary metric is better than threshold
    with kfp.dsl.Condition(eval_model.outputs['metric_value'] > evaluation_metric_threshold):
        deploy_model = mlengine_deploy_op(
        model_uri=train_model.outputs['job_dir'],
        project_id=project_id,
        model_id=model_id,
        version_id=version_id,
        runtime_version=RUNTIME_VERSION,
        python_version=PYTHON_VERSION,
        replace_existing_version=replace_existing_version)

    # Configure the pipeline to run using the service account defined
    # in the user-gcp-sa k8s secret
    if USE_KFP_SA == 'True':
        kfp.dsl.get_pipeline_conf().add_op_transformer(
              use_gcp_secret('user-gcp-sa'))

Overwriting ./pipeline/covertype_training_pipeline.py


In [4]:
%%writefile ./base_image/Dockerfile
FROM gcr.io/deeplearning-platform-release/base-cpu
RUN pip install -U fire scikit-learn==0.20.4 pandas==0.24.2 kfp==0.2.5

Overwriting ./base_image/Dockerfile


In [5]:
REGION = 'us-central1'
ENDPOINT = '19a5aed0f754a516-dot-us-central2.pipelines.googleusercontent.com'
ARTIFACT_STORE_URI = 'gs://benazirsproject-demo'
PROJECT_ID = !(gcloud config get-value core/project)
PROJECT_ID = PROJECT_ID[0]

In [8]:
TRAINING_APP_FOLDER = 'training_app'
os.makedirs(TRAINING_APP_FOLDER, exist_ok=True)

In [9]:
pwd

'/home/jupyter/mlops-on-gcp/workshops/kfp-caip-sklearn/lab_02_self_test'

In [10]:
%%writefile ./$TRAINING_APP_FOLDER/Dockerfile

FROM gcr.io/deeplearning-platform-release/base-cpu
RUN pip install -U fire cloudml-hypertune scikit-learn==0.20.4 pandas==0.24.2
WORKDIR /app
COPY train.py .

ENTRYPOINT ["python", "train.py"]


Writing ./training_app/Dockerfile


2. copy /trainer_image 

In [20]:
IMAGE_NAME='trainer_image'
TAG='latest'
TRAINER_IMAGE='gcr.io/{}/{}:{}'.format(PROJECT_ID, IMAGE_NAME, TAG)

In [21]:
!gcloud builds submit --timeout 15m --tag $TRAINER_IMAGE training_app

Creating temporary tarball archive of 4 file(s) totalling 7.2 KiB before compression.
Uploading tarball of [training_app] to [gs://benazirsproject_cloudbuild/source/1598483658.08-65876e66f4a64641a6920c1c697d45da.tgz]
Created [https://cloudbuild.googleapis.com/v1/projects/benazirsproject/builds/d4a52430-4eab-4df4-a17b-f6939f419eba].
Logs are available at [https://console.cloud.google.com/cloud-build/builds/d4a52430-4eab-4df4-a17b-f6939f419eba?project=981930454113].
----------------------------- REMOTE BUILD OUTPUT ------------------------------
starting build "d4a52430-4eab-4df4-a17b-f6939f419eba"

FETCHSOURCE
Fetching storage object: gs://benazirsproject_cloudbuild/source/1598483658.08-65876e66f4a64641a6920c1c697d45da.tgz#1598483658638891
Copying gs://benazirsproject_cloudbuild/source/1598483658.08-65876e66f4a64641a6920c1c697d45da.tgz#1598483658638891...
/ [1 files][  1.9 KiB/  1.9 KiB]                                                
Operation completed over 1 objects/1.9 KiB.         

# copy base image folder 


In [22]:
IMAGE_NAME='base_image'
TAG='latest'
BASE_IMAGE='gcr.io/{}/{}:{}'.format(PROJECT_ID, IMAGE_NAME, TAG)

In [23]:
!gcloud builds submit --timeout 15m --tag $BASE_IMAGE base_image

Creating temporary tarball archive of 2 file(s) totalling 244 bytes before compression.
Uploading tarball of [base_image] to [gs://benazirsproject_cloudbuild/source/1598483933.72-711e0acfe0cd4f17b1b8d0961b8e896e.tgz]
Created [https://cloudbuild.googleapis.com/v1/projects/benazirsproject/builds/983bd107-1259-44a5-9b9f-a0c03a49a2d1].
Logs are available at [https://console.cloud.google.com/cloud-build/builds/983bd107-1259-44a5-9b9f-a0c03a49a2d1?project=981930454113].
----------------------------- REMOTE BUILD OUTPUT ------------------------------
starting build "983bd107-1259-44a5-9b9f-a0c03a49a2d1"

FETCHSOURCE
Fetching storage object: gs://benazirsproject_cloudbuild/source/1598483933.72-711e0acfe0cd4f17b1b8d0961b8e896e.tgz#1598483934175368
Copying gs://benazirsproject_cloudbuild/source/1598483933.72-711e0acfe0cd4f17b1b8d0961b8e896e.tgz#1598483934175368...
/ [1 files][  290.0 B/  290.0 B]                                                
Operation completed over 1 objects/290.0 B.         

## compile pipeline 


In [24]:
USE_KFP_SA = False

COMPONENT_URL_SEARCH_PREFIX = 'https://raw.githubusercontent.com/kubeflow/pipelines/0.2.5/components/gcp/'
RUNTIME_VERSION = '1.15'
PYTHON_VERSION = '3.7'

%env USE_KFP_SA={USE_KFP_SA}
%env BASE_IMAGE={BASE_IMAGE}
%env TRAINER_IMAGE={TRAINER_IMAGE}
%env COMPONENT_URL_SEARCH_PREFIX={COMPONENT_URL_SEARCH_PREFIX}
%env RUNTIME_VERSION={RUNTIME_VERSION}
%env PYTHON_VERSION={PYTHON_VERSION}

env: USE_KFP_SA=False
env: BASE_IMAGE=gcr.io/benazirsproject/base_image:latest
env: TRAINER_IMAGE=gcr.io/benazirsproject/trainer_image:latest
env: COMPONENT_URL_SEARCH_PREFIX=https://raw.githubusercontent.com/kubeflow/pipelines/0.2.5/components/gcp/
env: RUNTIME_VERSION=1.15
env: PYTHON_VERSION=3.7


## copy helper_componenets 

In [25]:
!dsl-compile --py pipeline/covertype_training_pipeline.py --output covertype_training_pipeline.yaml 

In [26]:
!head covertype_training_pipeline.yaml ## where is this file . ?


apiVersion: argoproj.io/v1alpha1
kind: Workflow
metadata:
  generateName: covertype-classifier-training-
  annotations: {pipelines.kubeflow.org/kfp_sdk_version: 1.0.0, pipelines.kubeflow.org/pipeline_compilation_time: '2020-08-26T23:24:58.156040',
    pipelines.kubeflow.org/pipeline_spec: '{"description": "The pipeline training
      and deploying the Covertype classifierpipeline_yaml", "inputs": [{"name": "project_id"},
      {"name": "region"}, {"name": "source_table_name"}, {"name": "gcs_root"}, {"name":
      "dataset_id"}, {"name": "evaluation_metric_name"}, {"name": "evaluation_metric_threshold"},
      {"name": "model_id"}, {"name": "version_id"}, {"name": "replace_existing_version"},


## deploy the pipeline package 

In [28]:
PIPELINE_NAME='amyris_pipeline'

!kfp --endpoint $ENDPOINT pipeline upload \
-p $PIPELINE_NAME \
covertype_training_pipeline.yaml

Pipeline b6b2b054-e524-4acf-b3fc-0e7e3ba271bd has been submitted

Pipeline Details
------------------
ID           b6b2b054-e524-4acf-b3fc-0e7e3ba271bd
Name         amyris_pipeline
Description
Uploaded at  2020-08-26T23:27:54+00:00
+-----------------------------+--------------------------------------------------+
| Parameter Name              | Default Value                                    |
| project_id                  |                                                  |
+-----------------------------+--------------------------------------------------+
| region                      |                                                  |
+-----------------------------+--------------------------------------------------+
| source_table_name           |                                                  |
+-----------------------------+--------------------------------------------------+
| gcs_root                    |                                                  |
+--------------------

In [29]:
!kfp --endpoint $ENDPOINT pipeline list

+--------------------------------------+-------------------------------------------------+---------------------------+
| Pipeline ID                          | Name                                            | Uploaded at               |
| b6b2b054-e524-4acf-b3fc-0e7e3ba271bd | amyris_pipeline                                 | 2020-08-26T23:27:54+00:00 |
+--------------------------------------+-------------------------------------------------+---------------------------+
| 4ffefa9f-c1aa-4b11-9b4b-ad70c223226e | covertype_continuous_training_self              | 2020-08-26T23:25:58+00:00 |
+--------------------------------------+-------------------------------------------------+---------------------------+
| 1b5be90d-6020-43fc-8126-b6d979da2e39 | [Tutorial] DSL - Control structures             | 2020-08-26T22:17:35+00:00 |
+--------------------------------------+-------------------------------------------------+---------------------------+
| b162aff5-6673-41eb-8619-221ccfaaa71c | [Tutori

In [30]:
PIPELINE_ID='b6b2b054-e524-4acf-b3fc-0e7e3ba271bd'

In [33]:
EXPERIMENT_NAME = 'Amyris TRaining'
RUN_ID = 'Run_001'
SOURCE_TABLE = 'covertype_dataset.covertype'
DATASET_ID = 'splits'
EVALUATION_METRIC = 'accuracy'
EVALUATION_METRIC_THRESHOLD = '0.69'
MODEL_ID = 'covertype_classifier'
VERSION_ID = 'v01'
REPLACE_EXISTING_VERSION = 'True'

GCS_STAGING_PATH = '{}/staging'.format(ARTIFACT_STORE_URI)

In [34]:
!kfp --endpoint $ENDPOINT run submit \
-e $EXPERIMENT_NAME \
-r $RUN_ID \
-p $PIPELINE_ID \
project_id=$PROJECT_ID \
gcs_root=$GCS_STAGING_PATH \
region=$REGION \
source_table_name=$SOURCE_TABLE \
dataset_id=$DATASET_ID \
evaluation_metric_name=$EVALUATION_METRIC \
evaluation_metric_threshold=$EVALUATION_METRIC_THRESHOLD \
model_id=$MODEL_ID \
version_id=$VERSION_ID \
replace_existing_version=$REPLACE_EXISTING_VERSION

dictionary update sequence element #0 has length 1; 2 is required
