## lab 1 

In [291]:
import pandas as pd
import numpy as np
import json

from googleapiclient import discovery
from googleapiclient import errors

import time 
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDClassifier
from sklearn.decomposition import PCA

import plotly.graph_objects as go
import plotly.express as px


import os

## upload data in GCS

In [235]:
REGION = 'us-central1'
ARTIFACT_STORE = 'gs://benazirsproject-demo'

PROJECT_ID = !(gcloud config get-value core/project)
PROJECT_ID = PROJECT_ID[0]
DATA_ROOT='{}/data'.format(ARTIFACT_STORE)
JOB_DIR_ROOT='{}/jobs'.format(ARTIFACT_STORE)
TRAINING_FILE_PATH='{}/{}/{}'.format(DATA_ROOT, 'training', 'Anonymized_Fermentation_Data_final.xlsx')
# VALIDATION_FILE_PATH='{}/{}/{}'.format(DATA_ROOT, 'validation', 'dataset.csv')

In [263]:
TRAINING_FILE_PATH

'gs://benazirsproject-demo/data/training/Anonymized_Fermentation_Data_final.xlsx'

In [236]:
data = pd.read_excel(TRAINING_FILE_PATH,sheet_name='data')
meta_data = pd.read_excel(TRAINING_FILE_PATH, sheet_name='meta data')

## they need to upload data in bigquery - and then 

In [40]:
%%bigquery
SELECT *
FROM `amyris.amyris_fermentation_data`

Unnamed: 0,Product,Purpose,experiment,run,Project_Name,run_label,Strain,strain_key,Feedstock_Parent1,Start_Time,...,Cap_Oil_Em_End__percent,Cap_PCV_End__percent,Cap_Dead_Cell_Layer_End__percent,Zeex9ieJAlt_end__g_L,Zeex9ieJ_mAU_sec_end__area,Zeex9ieJ_Screening_end__g_L,Zeex9ieJ_end__g_L,Zeex9ieJ_end__g_L_121,Zeex9ieJ_end__g_L_122,interval_type
0,Zeex9ieJ,MF,F9FD5EC4C1,15,Zeex9ieJ for All,11408-15,F9FD5EC4C1,9215897,m1098919,1/20/21 2:25 PM,...,16.32145,21.04034,,7.455144,1.3341,9.923562,0.606429,0.675183,0,Cumulative
1,Zeex9ieJ,MF,F9FD5EC4C1,15,Zeex9ieJ for All,11408-15,F9FD5EC4C1,9215897,m1098919,1/20/21 2:25 PM,...,16.32145,21.04034,,7.455144,1.3341,9.923562,0.606429,0.675183,0,Curated
2,Zeex9ieJ,MF,F9FD5EC4C1,15,Zeex9ieJ for All,11408-15,F9FD5EC4C1,9215897,m1098919,1/20/21 2:25 PM,...,16.32145,21.04034,,7.455144,1.3341,9.923562,0.606429,0.675183,0,Cumulative (Day 3-)
3,Zeex9ieJ,MF,F9FD5EC4C1,15,Zeex9ieJ for All,11408-15,F9FD5EC4C1,9215897,m1098919,1/20/21 2:25 PM,...,16.32145,21.04034,,7.455144,1.3341,9.923562,0.606429,0.675183,0,Single
4,Zeex9ieJ,MF,F9FD5EC4C1,16,Zeex9ieJ for All,11409-16,F9FD5EC4C1,9215897,m1098919,1/20/21 2:24 PM,...,15.21805,20.34157,,4.546885,2.179946,7.52725,0.342935,0.455659,1.684056,Cumulative
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1563,Zeex9ieJ,MF,B4EDDA67F0,8,Zeex9ieJ for All,16209-8,B4EDDA67F0,8770794,m1252714,9/30/21 2:32 PM,...,19.50613,22.02205,,17.845346,0,0,0,0,,Single
1564,Zeex9ieJ,MF,B4EDDA67F0,8,Zeex9ieJ for All,16209-8,B4EDDA67F0,8770794,m1252714,9/30/21 2:32 PM,...,16.0144,18.78522,,31.702724,0,0,0,0,,Cumulative (Day 3-)
1565,Zeex9ieJ,MF,B4EDDA67F0,8,Zeex9ieJ for All,16209-8,B4EDDA67F0,8770794,m1252714,9/30/21 2:32 PM,...,16.5879,18.74606,,38.355477,0,0,0,0,,Cumulative (Day 3-)
1566,Zeex9ieJ,MF,B4EDDA67F0,8,Zeex9ieJ for All,16209-8,B4EDDA67F0,8770794,m1252714,9/30/21 2:32 PM,...,16.0144,18.78522,,31.702724,0,0,0,0,,Single


In [43]:
!bq query \
-n 0 \
--destination_table amyris.training \
--replace \
--use_legacy_sql=false \
'SELECT * \
FROM `amyris.amyris_fermentation_data` AS training \
WHERE \
MOD(ABS(FARM_FINGERPRINT(TO_JSON_STRING(training))), 10) IN (1, 2, 3, 4)' 

Waiting on bqjob_r7286b0f3cf393f52_000001744bd1a546_1 ... (1s) Current status: DONE   


In [31]:
meta_data.head()

print('The data contains {} samples and {} variables'.format(data.shape[0],data.shape[1]))
print('There are {} unique strains which are replicated or measured under different fermentation conditions.'.format(data['Strain'].nunique()))
print('The variables are comprised of a variety of fermentation process (meta data), physiological and biochemical parameters (independent):')
display(meta_data['variable type'].value_counts())
print('include: {}'.format(meta_data.query('target == 1').name.values))

The data contains 1568 samples and 124 variables
There are 20 unique strains which are replicated or measured under different fermentation conditions.
The variables are comprised of a variety of fermentation process (meta data), physiological and biochemical parameters (independent):


independent    63
metadata       53
dependent       5
category        3
Name: variable type, dtype: int64

include: ['Run_Execution' 'Run_Performance' 'Product_Produced__g'
 'Titer_End__g_over_kg']


In [104]:
#prepare data for analysis
#split out numeric from categorical varibles
numeric_vars = ((data.dtypes == 'float64') | (data.dtypes == 'int64')) & (meta_data['variable type'] == 'independent').values
numeric_x_data = data[data.columns[numeric_vars]]

#things to try to predict
y_data = data[['Run_Performance']]#data[data.columns[(meta_data['target'] == 1).values]]


In [None]:
#meta data about variables
meta_data = meta_data.set_index('name')

In [106]:
y_data.Run_Performance.replace(('delta', 'gamma'), (1, 0), inplace=True)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [107]:
y_data.head(2)

Unnamed: 0,Run_Performance
0,1
1,1


In [144]:
X_train = numeric_x_data[:1400]
y_train = y_data[:1400]
X_validation = numeric_x_data[1400:]
y_validation = y_data[1400:]

In [145]:
#impute missing with median
imputer = SimpleImputer(missing_values=np.nan, strategy='median')

#auto scale
scaler = StandardScaler()
pca = PCA(n_components=3)
pipe = Pipeline([('imputer',imputer),
                 ('scaler', scaler),
                 ('pca', pca),
                 ('classifier', SGDClassifier(loss='log', tol=1e-3))
                ])
pipe.set_params(classifier__alpha=0.001, classifier__max_iter=200)
pca_result = pipe.fit(X_train, y_train.values.ravel())

In [148]:
accuracy = pipe.score(X_validation, y_validation)
print(accuracy)

0.8869047619047619


## steps to create an image

In [192]:
TRAINING_APP_FOLDER = 'training_app'
os.makedirs(TRAINING_APP_FOLDER, exist_ok=True)

In [225]:
%%writefile {TRAINING_APP_FOLDER}/train.py

# Copyright 2019 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#            http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import subprocess
import sys

import fire
import pickle
import numpy as np
import pandas as pd

import hypertune

from sklearn.compose import ColumnTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA

def train_evaluate(job_dir, training_dataset_path, alpha, max_iter, hptune):
    data = pd.read_excel(training_dataset_path,sheet_name='data')
    meta_data = pd.read_excel(training_dataset_path, sheet_name='meta data')
    
    numeric_vars = ((data.dtypes == 'float64') | (data.dtypes == 'int64')) & (meta_data['variable type'] == 'independent').values
    numeric_x_data = data[data.columns[numeric_vars]]

    #things to try to predict
    y_data = data[['Run_Performance']]
    meta_data = meta_data.set_index('name')
    y_data.Run_Performance.replace(('delta', 'gamma'), (1, 0), inplace=True)

    X_train = numeric_x_data[:1400]
    y_train = y_data[:1400]
    X_validation = numeric_x_data[1400:]
    y_validation = y_data[1400:]
    
    if not hptune:
        X_train = pd.concat([X_train, X_validation])
        y_train = pd.concat([y_train, y_validation])

    #impute missing with median
    imputer = SimpleImputer(missing_values=np.nan, strategy='median')

    #auto scale
    scaler = StandardScaler()
    pca = PCA(n_components=3)
    pipe = Pipeline([('imputer',imputer),
                     ('scaler', scaler),
                     ('pca', pca),
                     ('classifier', SGDClassifier(loss='log', tol=1e-3))
                    ])

    

    print('Starting training: alpha={}, max_iter={}'.format(alpha, max_iter))

    pipe.set_params(classifier__alpha=alpha, classifier__max_iter=max_iter)
    pipe.fit(X_train, y_train.values.ravel())

    if hptune:
        accuracy = pipe.score(X_validation, y_validation)
        print('Model accuracy: {}'.format(accuracy))
        # Log it with hypertune
        hpt = hypertune.HyperTune()
        hpt.report_hyperparameter_tuning_metric(
          hyperparameter_metric_tag='accuracy',
          metric_value=accuracy
        )

    # Save the model
    if not hptune:
        model_filename = 'model.pkl'
        with open(model_filename, 'wb') as model_file:
            pickle.dump(pipe, model_file)
        gcs_model_path = "{}/{}".format(job_dir, model_filename)
        subprocess.check_call(['gsutil', 'cp', model_filename, gcs_model_path], stderr=sys.stdout)
        print("Saved model in: {}".format(gcs_model_path)) 
    
if __name__ == "__main__":
    fire.Fire(train_evaluate)

Overwriting training_app/train.py


## package script into a docker image 

In [226]:
%%writefile {TRAINING_APP_FOLDER}/Dockerfile

FROM gcr.io/deeplearning-platform-release/base-cpu
RUN pip install -U fire cloudml-hypertune scikit-learn==0.20.4 pandas==0.24.2
WORKDIR /app
COPY train.py .

ENTRYPOINT ["python", "train.py"]

Overwriting training_app/Dockerfile


# buidl the docker image 

In [333]:
IMAGE_NAME='amyris_trainer_image'
IMAGE_TAG='latest'
TRAINER_IMAGE='gcr.io/{}/{}:{}'.format(PROJECT_ID, IMAGE_NAME, IMAGE_TAG)

In [334]:
TRAINER_IMAGE

'gcr.io/benazirsproject/amyris_trainer_image:latest'

In [328]:
!gcloud builds submit --tag $TRAINER_IMAGE $TRAINING_APP_FOLDER

Creating temporary tarball archive of 4 file(s) totalling 4.6 KiB before compression.
Uploading tarball of [training_app] to [gs://benazirsproject_cloudbuild/source/1599102700.08-554cb713b31b4a1683a6ad530f16f96a.tgz]
Created [https://cloudbuild.googleapis.com/v1/projects/benazirsproject/builds/e5278675-b531-4d53-ac36-f7d23ce0704b].
Logs are available at [https://console.cloud.google.com/cloud-build/builds/e5278675-b531-4d53-ac36-f7d23ce0704b?project=981930454113].
----------------------------- REMOTE BUILD OUTPUT ------------------------------
starting build "e5278675-b531-4d53-ac36-f7d23ce0704b"

FETCHSOURCE
Fetching storage object: gs://benazirsproject_cloudbuild/source/1599102700.08-554cb713b31b4a1683a6ad530f16f96a.tgz#1599102700514485
Copying gs://benazirsproject_cloudbuild/source/1599102700.08-554cb713b31b4a1683a6ad530f16f96a.tgz#1599102700514485...
/ [1 files][  2.0 KiB/  2.0 KiB]                                                
Operation completed over 1 objects/2.0 KiB.         

## create hyperparameter file 

In [311]:
%%writefile {TRAINING_APP_FOLDER}/hptuning_config.yaml

# Copyright 2019 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#            http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

trainingInput:
  hyperparameters:
    goal: MAXIMIZE
    maxTrials: 4
    maxParallelTrials: 4
    hyperparameterMetricTag: accuracy
    enableTrialEarlyStopping: TRUE 
    params:
    - parameterName: max_iter
      type: DISCRETE
      discreteValues: [
          200,
          500
          ]
    - parameterName: alpha
      type: DOUBLE
      minValue:  0.00001
      maxValue:  0.001
      scaleType: UNIT_LINEAR_SCALE

Overwriting training_app/hptuning_config.yaml


## start hyper parameter tuning job 

In [211]:
JOB_NAME = "JOB_{}".format(time.strftime("%Y%m%d_%H%M%S"))
JOB_DIR = "{}/{}".format(JOB_DIR_ROOT, JOB_NAME)
SCALE_TIER = "BASIC"

!gcloud ai-platform jobs submit training $JOB_NAME \
--region=$REGION \
--job-dir=$JOB_DIR \
--master-image-uri=$TRAINER_IMAGE \
--scale-tier=$SCALE_TIER \
--config $TRAINING_APP_FOLDER/hptuning_config.yaml \
-- \
--training_dataset_path=$TRAINING_FILE_PATH \
--hptune

Job [JOB_20200902_203548] submitted successfully.
Your job is still active. You may view the status of your job with the command

  $ gcloud ai-platform jobs describe JOB_20200902_203548

or continue streaming the logs with the command

  $ gcloud ai-platform jobs stream-logs JOB_20200902_203548
jobId: JOB_20200902_203548
state: QUEUED


In [212]:
ml = discovery.build('ml', 'v1')

job_id = 'projects/{}/jobs/{}'.format(PROJECT_ID, JOB_NAME)
request = ml.projects().jobs().get(name=job_id)

try:
    response = request.execute()
except errors.HttpError as err:
    print(err)
except:
    print("Unexpected error")
    
response

{'jobId': 'JOB_20200902_203548',
 'trainingInput': {'args': ['--training_dataset_path=gs://benazirsproject-demo/data/training/Anonymized_Fermentation_Data_final.xlsx',
   '--hptune'],
  'hyperparameters': {'goal': 'MAXIMIZE',
   'params': [{'parameterName': 'max_iter',
     'type': 'DISCRETE',
     'discreteValues': [200, 500]},
    {'parameterName': 'alpha',
     'minValue': 1e-05,
     'maxValue': 0.001,
     'type': 'DOUBLE',
     'scaleType': 'UNIT_LINEAR_SCALE'}],
   'maxTrials': 4,
   'maxParallelTrials': 4,
   'hyperparameterMetricTag': 'accuracy',
   'enableTrialEarlyStopping': True},
  'region': 'us-central1',
  'jobDir': 'gs://benazirsproject-demo/jobs/JOB_20200902_203548',
  'masterConfig': {'imageUri': 'gcr.io/benazirsproject/trainer_image:latest'}},
 'createTime': '2020-09-02T20:35:50Z',
 'startTime': '2020-09-02T20:35:51Z',
 'endTime': '2020-09-02T20:46:04Z',
 'state': 'SUCCEEDED',
 'trainingOutput': {'completedTrialCount': '4',
  'trials': [{'trialId': '2',
    'hyperpar

In [229]:
response['trainingOutput']['trials'][0]

{'trialId': '2',
 'hyperparameters': {'max_iter': '200', 'alpha': '0.00028790154168083194'},
 'finalMetric': {'trainingStep': '1', 'objectiveValue': 0.9523809523809523},
 'startTime': '2020-09-02T20:36:28.436283106Z',
 'endTime': '2020-09-02T20:45:00Z',
 'state': 'SUCCEEDED'}

In [230]:
alpha = response['trainingOutput']['trials'][0]['hyperparameters']['alpha']
max_iter = response['trainingOutput']['trials'][0]['hyperparameters']['max_iter']

In [330]:
JOB_NAME = "JOB_{}".format(time.strftime("%Y%m%d_%H%M%S"))
JOB_DIR = "{}/{}".format(JOB_DIR_ROOT, JOB_NAME)
SCALE_TIER = "BASIC"

!gcloud ai-platform jobs submit training $JOB_NAME \
--region=$REGION \
--job-dir=$JOB_DIR \
--master-image-uri=$TRAINER_IMAGE \
--scale-tier=$SCALE_TIER \
-- \
--training_dataset_path=$TRAINING_FILE_PATH \
--alpha=$alpha \
--max_iter=$max_iter \
--nohptune

Job [JOB_20200903_031958] submitted successfully.
Your job is still active. You may view the status of your job with the command

  $ gcloud ai-platform jobs describe JOB_20200903_031958

or continue streaming the logs with the command

  $ gcloud ai-platform jobs stream-logs JOB_20200903_031958
jobId: JOB_20200903_031958
state: QUEUED


In [313]:
model_name = 'amyris_endtoend2'
labels = "task=classifier,domain=healthcare"
filter = 'name:{}'.format(model_name)
# models = !(gcloud ai-platform models list --filter={filter} --format='value(name)')

# if not models:
!gcloud ai-platform models create  $model_name \
    --regions=$REGION \
    --labels=$labels
# else:
#     print("Model: {} already exists.".format(models[0]))

Using endpoint [https://ml.googleapis.com/]
Created ml engine model [projects/benazirsproject/models/amyris_endtoend2].


In [314]:
model_version = 'v01'
filter = 'name:{}'.format(model_version)
# versions = !(gcloud ai-platform versions list --model={model_name} --format='value(name)' --filter={filter})

# if not versions:
!gcloud ai-platform versions create {model_version} \
    --model={model_name} \
    --origin=$JOB_DIR \
    --runtime-version=1.15 \
    --framework=scikit-learn \
    --python-version=3.7
# else:
#     print("Model version: {} already exists.".format(versions[0]))

Using endpoint [https://ml.googleapis.com/]
Creating version (this might take a few minutes)......done.                    


In [292]:
input_file = 'serving_instances.json'

with open(input_file, 'w') as f:
    for index, row in X_validation.head().iterrows():
        f.write(json.dumps(list(row.values)))
        f.write('\n')

In [293]:
!cat $input_file

[30.5899229207696, 0.0652723072295666, 0.0264099131499111, 0.004874694806509, 0.00644991314991109, 2.22779261068584, 1.683714, 1.56741742827044, 5.7118868998969, 1.18461774905092, NaN, NaN, NaN, 4.0, 4.0, 0.0, 0.00297938144329896, 10.42988, 9.67653, NaN, 50.63335, 70.73976, 4.6589, 70.73976, 50.63335, 0.01996, 0.01156, 0.0493312963547756, 140.0, 174.72618, 4.29832181463847, NaN, NaN, NaN, NaN, NaN, NaN, 100000000.0, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, 0.57698, NaN, 0.68327, 0.0, 27.39144, 17.67385, NaN, 10.127133, 948.005266, 14.459386]
[30.5899229207696, 0.0652723072295666, 0.0264099131499111, 0.004874694806509, 0.00644991314991109, 2.22779261068584, 1.683714, 1.56741742827044, 5.7118868998969, 1.18461774905092, NaN, NaN, NaN, 4.0, 4.0, 0.0, 0.00297938144329896, 10.42988, 9.67653, NaN, 50.63335, 70.73976, 4.6589, 70.73976, 50.63335, 0.01996, 0.01156, 0.0493312963547756, 140.0, 174.72618, 4.29832181463847, NaN, NaN, NaN, NaN, NaN, NaN, 100000000.0, NaN, NaN

In [294]:
!gcloud ai-platform predict \
--model $model_name \
--version $model_version \
--json-instances $input_file

Using endpoint [https://ml.googleapis.com/]
[1, 1, 1, 1, 1]


In [332]:
TRAINER_IMAGE

'gcr.io/benazirsproject/trainer_image:latest'

In [351]:
%%writefile ./pipeline/amyris_pipeline.py
# Copyright 2019 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""KFP pipeline orchestrating BigQuery and Cloud AI Platform services."""

import os

from helper_components import evaluate_model
from helper_components import retrieve_best_run
from jinja2 import Template
import kfp
from kfp.components import func_to_container_op
from kfp.dsl.types import Dict
from kfp.dsl.types import GCPProjectID
from kfp.dsl.types import GCPRegion
from kfp.dsl.types import GCSPath
from kfp.dsl.types import String
from kfp.gcp import use_gcp_secret

# Defaults and environment settings
BASE_IMAGE = os.getenv('BASE_IMAGE')
TRAINER_IMAGE = os.getenv('TRAINER_IMAGE')
RUNTIME_VERSION = os.getenv('RUNTIME_VERSION')
PYTHON_VERSION = os.getenv('PYTHON_VERSION')
COMPONENT_URL_SEARCH_PREFIX = os.getenv('COMPONENT_URL_SEARCH_PREFIX')
USE_KFP_SA = os.getenv('USE_KFP_SA')
TRAINING_FILE_PATH = 'gs://benazirsproject-demo/data/training/Anonymized_Fermentation_Data_final.xlsx'
# VALIDATION_FILE_PATH = 'datasets/validation/data.csv'
# TESTING_FILE_PATH = 'datasets/testing/data.csv'

# Parameter defaults
# SPLITS_DATASET_ID = 'splits'
HYPERTUNE_SETTINGS = """
{
    "hyperparameters":  {
        "goal": "MAXIMIZE",
        "maxTrials": 3,
        "maxParallelTrials": 3,
        "hyperparameterMetricTag": "accuracy",
        "enableTrialEarlyStopping": True,
        "params": [
            {
                "parameterName": "max_iter",
                "type": "DISCRETE",
                "discreteValues": [500, 1000]
            },
            {
                "parameterName": "alpha",
                "type": "DOUBLE",
                "minValue": 0.0001,
                "maxValue": 0.001,
                "scaleType": "UNIT_LINEAR_SCALE"
            }
        ]
    }
}
"""

# # Helper functions
# def generate_sampling_query(source_table_name, num_lots, lots):
#     """Prepares the data sampling query."""

#     sampling_query_template = """
#          SELECT *
#          FROM 
#              `{{ source_table }}` AS cover
#          WHERE 
#          MOD(ABS(FARM_FINGERPRINT(TO_JSON_STRING(cover))), {{ num_lots }}) IN ({{ lots }})
#          """
#     query = Template(sampling_query_template).render(
#         source_table=source_table_name, num_lots=num_lots, lots=str(lots)[1:-1])

#     return query


# Create component factories
component_store = kfp.components.ComponentStore(
    local_search_paths=None, url_search_prefixes=[COMPONENT_URL_SEARCH_PREFIX])

# bigquery_query_op = component_store.load_component('bigquery/query')
mlengine_train_op = component_store.load_component('ml_engine/train')
mlengine_deploy_op = component_store.load_component('ml_engine/deploy')
retrieve_best_run_op = func_to_container_op(
    retrieve_best_run, base_image=BASE_IMAGE)
evaluate_model_op = func_to_container_op(evaluate_model, base_image=BASE_IMAGE)


@kfp.dsl.pipeline(
    name='Amyris Classifier Training',
    description='The pipeline training and deploying the Amyris classifierpipeline_yaml'
)
def amyris_train(project_id,
                    region,
                    gcs_root,
                    evaluation_metric_name,
                    evaluation_metric_threshold,
                    model_id,
                    version_id,
                    replace_existing_version,
                    hypertune_settings=HYPERTUNE_SETTINGS,
                    dataset_location='US'):
    """Orchestrates training and deployment of an sklearn model."""

    # Create the training split
#     query = generate_sampling_query(
#         source_table_name=source_table_name, num_lots=10, lots=[1, 2, 3, 4])

#     training_file_path = '{}/{}'.format(gcs_root, TRAINING_FILE_PATH)

#     create_training_split = bigquery_query_op(
#         query=query,
#         project_id=project_id,
#         dataset_id=dataset_id,
#         table_id='',
#         output_gcs_path=training_file_path,
#         dataset_location=dataset_location)

#     # Create the validation split
#     query = generate_sampling_query(
#         source_table_name=source_table_name, num_lots=10, lots=[8])

#     validation_file_path = '{}/{}'.format(gcs_root, VALIDATION_FILE_PATH)

#     create_validation_split = bigquery_query_op(
#         query=query,
#         project_id=project_id,
#         dataset_id=dataset_id,
#         table_id='',
#         output_gcs_path=validation_file_path,
#         dataset_location=dataset_location)

    # Create the testing split
#     query = generate_sampling_query(
#         source_table_name=source_table_name, num_lots=10, lots=[9])

#     testing_file_path = '{}/{}'.format(gcs_root, TESTING_FILE_PATH)

#     create_testing_split = bigquery_query_op(
#         query=query,
#         project_id=project_id,
#         dataset_id=dataset_id,
#         table_id='',
#         output_gcs_path=testing_file_path,
#         dataset_location=dataset_location)

    # Tune hyperparameters
    tune_args = [
        '--training_dataset_path',
        TRAINING_FILE_PATH,
         '--hptune', 'True'
    ]

    job_dir = '{}/{}/{}'.format(gcs_root, 'jobdir/hypertune',
                                kfp.dsl.RUN_ID_PLACEHOLDER)

    hypertune = mlengine_train_op(
        project_id=project_id,
        region=region,
        master_image_uri='gcr.io/benazirsproject/amyris_trainer_image:latest',
        job_dir=job_dir,
        args=tune_args,
        training_input=hypertune_settings)

    # Retrieve the best trial
    get_best_trial = retrieve_best_run_op(
            project_id, hypertune.outputs['job_id'])

    # Train the model on a combined training and validation datasets
    job_dir = '{}/{}/{}'.format(gcs_root, 'jobdir', kfp.dsl.RUN_ID_PLACEHOLDER)

    train_args = [
        '--training_dataset_path',
       TRAINING_FILE_PATH,
         '--alpha',
        get_best_trial.outputs['alpha'], '--max_iter',
        get_best_trial.outputs['max_iter'], '--hptune', 'False'
    ]

    train_model = mlengine_train_op(
        project_id=project_id,
        region=region,
        master_image_uri='gcr.io/benazirsproject/amyris_trainer_image:latest',
        job_dir=job_dir,
        args=train_args)

    # Evaluate the model on the testing split
    eval_model = evaluate_model_op(
        dataset_path=TRAINING_FILE_PATH,
        model_path=str(train_model.outputs['job_dir']),
        metric_name=evaluation_metric_name)

    # Deploy the model if the primary metric is better than threshold
    with kfp.dsl.Condition(eval_model.outputs['metric_value'] > evaluation_metric_threshold):
        deploy_model = mlengine_deploy_op(
        model_uri=train_model.outputs['job_dir'],
        project_id=project_id,
        model_id=model_id,
        version_id=version_id,
        runtime_version=RUNTIME_VERSION,
        python_version=PYTHON_VERSION,
        replace_existing_version=replace_existing_version)

    # Configure the pipeline to run using the service account defined
      # in the user-gcp-sa k8s secret
    if USE_KFP_SA == 'True':
        kfp.dsl.get_pipeline_conf().add_op_transformer(
              use_gcp_secret('user-gcp-sa'))

Overwriting ./pipeline/amyris_pipeline.py


In [336]:
TRAINER_IMAGE

'gcr.io/benazirsproject/amyris_trainer_image:latest'

In [338]:
%%writefile ./pipeline/helper_components.py

# Copyright 2019 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
"""Helper components."""

from typing import NamedTuple


def retrieve_best_run(
    project_id: str, job_id: str
) -> NamedTuple('Outputs', [('metric_value', float), ('alpha', float),
                            ('max_iter', int)]):
  """Retrieves the parameters of the best Hypertune run."""

  from googleapiclient import discovery
  from googleapiclient import errors

  ml = discovery.build('ml', 'v1')

  job_name = 'projects/{}/jobs/{}'.format(project_id, job_id)
  request = ml.projects().jobs().get(name=job_name)

  try:
    response = request.execute()
  except errors.HttpError as err:
    print(err)
  except:
    print('Unexpected error')

  print(response)

  best_trial = response['trainingOutput']['trials'][0]

  metric_value = best_trial['finalMetric']['objectiveValue']
  alpha = float(best_trial['hyperparameters']['alpha'])
  max_iter = int(best_trial['hyperparameters']['max_iter'])

  return (metric_value, alpha, max_iter)


def evaluate_model(
    dataset_path: str, model_path: str, metric_name: str
) -> NamedTuple('Outputs', [('metric_name', str), ('metric_value', float),
                            ('mlpipeline_metrics', 'Metrics')]):
  """Evaluates a trained sklearn model."""
  #import joblib
  import pickle
  import json
  import pandas as pd
  import subprocess
  import sys

  from sklearn.metrics import accuracy_score, recall_score

  df_test = pd.read_csv(dataset_path)

  X_test = df_test.drop('Cover_Type', axis=1)
  y_test = df_test['Cover_Type']

  # Copy the model from GCS
  model_filename = 'model.pkl'
  gcs_model_filepath = '{}/{}'.format(model_path, model_filename)
  print(gcs_model_filepath)
  subprocess.check_call(['gsutil', 'cp', gcs_model_filepath, model_filename],
                        stderr=sys.stdout)

  with open(model_filename, 'rb') as model_file:
    model = pickle.load(model_file)

  y_hat = model.predict(X_test)

  if metric_name == 'accuracy':
    metric_value = accuracy_score(y_test, y_hat)
  elif metric_name == 'recall':
    metric_value = recall_score(y_test, y_hat)
  else:
    metric_name = 'N/A'
    metric_value = 0

  # Export the metric
  metrics = {
      'metrics': [{
          'name': metric_name,
          'numberValue': float(metric_value)
      }]
  }

  return (metric_name, metric_value, json.dumps(metrics))


Overwriting ./pipeline/helper_components.py


## create an empty folder base_image

In [299]:
%%writefile ./base_image/Dockerfile
FROM gcr.io/deeplearning-platform-release/base-cpu
RUN pip install -U fire scikit-learn==0.20.4 pandas==0.24.2 kfp==0.2.5

Overwriting ./base_image/Dockerfile


In [300]:
IMAGE_NAME='base_image'
TAG='latest'
BASE_IMAGE='gcr.io/{}/{}:{}'.format(PROJECT_ID, IMAGE_NAME, TAG)

In [246]:
!gcloud builds submit --timeout 15m --tag $BASE_IMAGE base_image

Creating temporary tarball archive of 1 file(s) totalling 122 bytes before compression.
Uploading tarball of [base_image] to [gs://benazirsproject_cloudbuild/source/1599084515.31-1c7fb82a1b4d46b0af6be2e2e95ba3e9.tgz]
Created [https://cloudbuild.googleapis.com/v1/projects/benazirsproject/builds/fc39fdfd-4636-4d6e-b387-05acd1e075bb].
Logs are available at [https://console.cloud.google.com/cloud-build/builds/fc39fdfd-4636-4d6e-b387-05acd1e075bb?project=981930454113].
----------------------------- REMOTE BUILD OUTPUT ------------------------------
starting build "fc39fdfd-4636-4d6e-b387-05acd1e075bb"

FETCHSOURCE
Fetching storage object: gs://benazirsproject_cloudbuild/source/1599084515.31-1c7fb82a1b4d46b0af6be2e2e95ba3e9.tgz#1599084515781863
Copying gs://benazirsproject_cloudbuild/source/1599084515.31-1c7fb82a1b4d46b0af6be2e2e95ba3e9.tgz#1599084515781863...
/ [1 files][  227.0 B/  227.0 B]                                                
Operation completed over 1 objects/227.0 B.         

In [339]:
REGION = 'us-central1'
ENDPOINT = '19a5aed0f754a516-dot-us-central2.pipelines.googleusercontent.com'
ARTIFACT_STORE_URI = 'gs://benazirsproject-demo'
PROJECT_ID = !(gcloud config get-value core/project)
PROJECT_ID = PROJECT_ID[0]

## compile pipeline 

In [340]:
USE_KFP_SA = False

COMPONENT_URL_SEARCH_PREFIX = 'https://raw.githubusercontent.com/kubeflow/pipelines/0.2.5/components/gcp/'
RUNTIME_VERSION = '1.15'
PYTHON_VERSION = '3.7'

%env USE_KFP_SA={USE_KFP_SA}
%env BASE_IMAGE={BASE_IMAGE}
%env TRAINER_IMAGE={TRAINER_IMAGE}
%env COMPONENT_URL_SEARCH_PREFIX={COMPONENT_URL_SEARCH_PREFIX}
%env RUNTIME_VERSION={RUNTIME_VERSION}
%env PYTHON_VERSION={PYTHON_VERSION}

env: USE_KFP_SA=False
env: BASE_IMAGE=gcr.io/benazirsproject/base_image:latest
env: TRAINER_IMAGE=gcr.io/benazirsproject/amyris_trainer_image:latest
env: COMPONENT_URL_SEARCH_PREFIX=https://raw.githubusercontent.com/kubeflow/pipelines/0.2.5/components/gcp/
env: RUNTIME_VERSION=1.15
env: PYTHON_VERSION=3.7


## copy helper components 

In [341]:
!dsl-compile --py pipeline/amyris_pipeline.py --output amyris_pipeline.yaml 

In [342]:
!head amyris_pipeline.yaml 

apiVersion: argoproj.io/v1alpha1
kind: Workflow
metadata:
  generateName: amyris-classifier-training-
  annotations: {pipelines.kubeflow.org/kfp_sdk_version: 1.0.0, pipelines.kubeflow.org/pipeline_compilation_time: '2020-09-03T03:27:50.570494',
    pipelines.kubeflow.org/pipeline_spec: '{"description": "The pipeline training
      and deploying the Amyris classifierpipeline_yaml", "inputs": [{"name": "project_id"},
      {"name": "region"}, {"name": "gcs_root"}, {"name": "evaluation_metric_name"},
      {"name": "evaluation_metric_threshold"}, {"name": "model_id"}, {"name": "version_id"},
      {"name": "replace_existing_version"}, {"default": "\n{\n    \"hyperparameters\":  {\n        \"goal\":


## deploy pipeline package 

In [343]:
PIPELINE_NAME='amyris_pipeline_september'

!kfp --endpoint $ENDPOINT pipeline upload \
-p $PIPELINE_NAME \
amyris_pipeline.yaml

Pipeline c67cf5a4-db2b-4aa3-bb04-c947aba35230 has been submitted

Pipeline Details
------------------
ID           c67cf5a4-db2b-4aa3-bb04-c947aba35230
Name         amyris_pipeline_september
Description
Uploaded at  2020-09-03T03:28:08+00:00
+-----------------------------+--------------------------------------------------+
| Parameter Name              | Default Value                                    |
| project_id                  |                                                  |
+-----------------------------+--------------------------------------------------+
| region                      |                                                  |
+-----------------------------+--------------------------------------------------+
| gcs_root                    |                                                  |
+-----------------------------+--------------------------------------------------+
| evaluation_metric_name      |                                                  |
+----------

In [344]:
!kfp --endpoint $ENDPOINT pipeline list

+--------------------------------------+-------------------------------------------------+---------------------------+
| Pipeline ID                          | Name                                            | Uploaded at               |
| c67cf5a4-db2b-4aa3-bb04-c947aba35230 | amyris_pipeline_september                       | 2020-09-03T03:28:08+00:00 |
+--------------------------------------+-------------------------------------------------+---------------------------+
| c2171aa5-2680-4a96-b5de-516679f24c07 | amyris_pipeline_sept                            | 2020-09-02T22:26:04+00:00 |
+--------------------------------------+-------------------------------------------------+---------------------------+
| 3367f9e4-6e1c-4ece-b517-a347c8d42690 | covertype_continuous_training_test              | 2020-08-27T16:36:33+00:00 |
+--------------------------------------+-------------------------------------------------+---------------------------+
| b6b2b054-e524-4acf-b3fc-0e7e3ba271bd | amyris_

In [345]:
PIPELINE_ID='c67cf5a4-db2b-4aa3-bb04-c947aba35230'

In [349]:
EXPERIMENT_NAME = 'amyris1'
RUN_ID = 'Run_001'
SOURCE_TABLE = 'covertype_dataset.covertype'
DATASET_ID = 'splits'
EVALUATION_METRIC = 'accuracy'
EVALUATION_METRIC_THRESHOLD = '0.69'
MODEL_ID = 'covertype_classifier'
VERSION_ID = 'v01'
REPLACE_EXISTING_VERSION = 'True'

GCS_STAGING_PATH = '{}/staging'.format(ARTIFACT_STORE_URI)

In [350]:
!kfp --endpoint $ENDPOINT run submit \
-e $EXPERIMENT_NAME \
-r $RUN_ID \
-p $PIPELINE_ID \
project_id=$PROJECT_ID \
gcs_root=$GCS_STAGING_PATH \
region=$REGION \
source_table_name=$SOURCE_TABLE \
dataset_id=$DATASET_ID \
evaluation_metric_name=$EVALUATION_METRIC \
evaluation_metric_threshold=$EVALUATION_METRIC_THRESHOLD \
model_id=$MODEL_ID \
version_id=$VERSION_ID \
replace_existing_version=$REPLACE_EXISTING_VERSION

Creating experiment amyris1.
(400)
Reason: Bad Request
HTTP response headers: HTTPHeaderDict({'Content-Length': '1451', 'Content-Type': 'text/html; charset=utf-8', 'Date': 'Thu, 03 Sep 2020 03:29:41 GMT', 'X-Content-Type-Options': 'nosniff', 'X-Frame-Options': 'SAMEORIGIN', 'X-Powered-By': 'Express', 'X-Xss-Protection': '0', 'Set-Cookie': 'S=cloud_datalab_tunnel=Ync7COYNtFPrG88i7p-vVy1MbOKKQCWGkU75ZcTGjWo; Path=/; Max-Age=3600'})
HTTP response body: 
<!DOCTYPE html>
<html lang=en>
  <meta charset=utf-8>
  <meta name=viewport content="initial-scale=1, minimum-scale=1, width=device-width">
  <title>Error 400 (Bad Request)!!1</title>
  <style>
    *{margin:0;padding:0}html,code{font:15px/22px arial,sans-serif}html{background:#fff;color:#222;padding:15px}body{margin:7% auto 0;max-width:390px;min-height:180px;padding:30px 0 15px}* > body{background:url(//www.google.com/images/errors/robot.png) 100% 5px no-repeat;padding-right:205px}p{margin:11px 0 22px;overflow:hidden}ins{color:#777;text-de