In [None]:
# ==============================================================================
# Copyright 2020 Google LLC. This software is provided as-is, without warranty
# or representation for any use or purpose. Your use of it is subject to your
# agreement with Google.
# ==============================================================================


In [None]:
# Uninstall old packages
# Removes every package listed in requirements_uninstall.txt; -y answers the confirmation prompt.
!pip3 uninstall -r requirements_uninstall.txt -y


In [None]:
# Install packages
# https://cloud.google.com/ai-platform/training/docs/runtime-version-list
# NOTE(review): the requirements file name suggests it is pinned for runtime 2.1 (cf. RUNTIME_VERSION in the Setup section) — confirm.
# NOTE(review): --use-feature=2020-resolver targets pip 20.x; newer pip releases may warn about or reject it — confirm against the installed pip.
!pip3 install -r requirements-rt2.1.txt --user --no-cache-dir --use-feature=2020-resolver


In [None]:
# Import packages
import warnings
warnings.filterwarnings("ignore")

import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 
#0 = all messages are logged (default behavior)
#1 = INFO messages are not printed
#2 = INFO and WARNING messages are not printed
#3 = INFO, WARNING, and ERROR messages are not printed

import numpy as np
from google.cloud import storage
import tensorflow as tf
#import matplotlib.pyplot as plt
#from tensorflow.keras import models

print("TF Version= ", tf.__version__)
print("Keras Version= ", tf.keras.__version__)


In [None]:
# Utility functions

#------
def find_best_model_dir(model_dir, offset=1, maxFlag=1):
    # Get a list of model directories
    all_models = ! gsutil ls $model_dir
    print("")
    print("All Models = ")
    print(*all_models, sep='\n')

    # Check if model dirs exist
    if (("CommandException" in all_models[0]) or (len(all_models) <= 1)):
        print("Create the models first.")
        return ""

    # Find the best model from checkpoints
    import re
    best_acc = -np.Inf
    if (maxFlag != 1):
        best_acc = np.Inf
    best_model_dir = ""
    tup_list = []
    for i in range(1,len(all_models)):
        all_floats = re.findall(r"[-+]?\d*\.\d+|\d+", all_models[i]) #Find the floats in the string
        cur_acc = -float(all_floats[-offset]) #which item is the model optimization metric
        tup_list.append([all_models[i],cur_acc])
        if (maxFlag*(cur_acc > best_acc) or (1-maxFlag)*(cur_acc < best_acc)):
            best_acc = cur_acc
            best_model_dir = all_models[i]
    if maxFlag:
        tup_list.sort(key=lambda tup: tup[1], reverse=False)
    else:
        tup_list.sort(key=lambda tup: tup[1], reverse=True)
    for i in range(len(tup_list)):
        print(tup_list[i][0])
    print("Best Accuracy  from Checkpoints = ", best_acc)
    print("Best Model Dir from Checkpoints = ", best_model_dir)
    
    return best_model_dir


In [None]:
from oauth2client.client import GoogleCredentials
from googleapiclient import discovery
from googleapiclient import errors
import json

#------
# Python module to get the best hypertuned model parameters
def pyth_get_hypertuned_parameters(project_name, job_name, maxFlag):
    """Fetch an AI Platform hyperparameter-tuning job and return its best trial.

    The objective, trial id, state and hyperparameters of every trial are printed
    as a side effect.

    Args:
        project_name: GCP project id that owns the job.
        job_name: Name of the hyperparameter-tuning job.
        maxFlag: Truthy when a larger objective value is better, falsy when a
            smaller one is better.

    Returns:
        The trial dict, as returned by the jobs.get API, that is judged best.

    Raises:
        googleapiclient.errors.HttpError: If the job cannot be fetched; the reason
            is logged before the error is re-raised.
        KeyError: If the response has no ``trainingOutput``/``trials`` entry.
    """
    # Define the credentials for the service account
    #credentials = service_account.Credentials.from_service_account_file(<PATH TO CREDENTIALS JSON>)
    credentials = GoogleCredentials.get_application_default()

    # Define the project id and the job id and format it for the api request
    project_id = 'projects/{}'.format(project_name)
    job_id = '{}/jobs/{}'.format(project_id, job_name)

    # Build the service
    cloudml = discovery.build('ml', 'v1', cache_discovery=False, credentials=credentials)

    # Execute the request and pass in the job id
    request = cloudml.projects().jobs().get(name=job_id)

    try:
        response = request.execute()
    except errors.HttpError as err:
        tf.compat.v1.logging.error('There was an error getting the hyperparameters. Check the details:')
        tf.compat.v1.logging.error(err._get_reason())
        # Nothing below can run without a response. Previously execution fell
        # through and failed with NameError on `response`, hiding the real cause.
        raise

    trials = response['trainingOutput']['trials']

    # Get just the best hp values. The choice is made on the reported objective
    # of the trials that succeeded, so it neither depends on the order in which
    # the API lists trials nor returns a trial that has no finalMetric.
    succeeded = [t for t in trials
                 if t.get('state') == 'SUCCEEDED' and 'finalMetric' in t]
    if succeeded:
        choose = max if maxFlag else min
        best_model = choose(succeeded, key=lambda t: t['finalMetric']['objectiveValue'])
    else:
        # No successful trial: keep the historical positional choice
        best_model = trials[0] if maxFlag else trials[-1]
    #print('Best Hyperparameters:')
    #print(json.dumps(best_model, indent=4))

    for trial in trials:
        state = trial['state']
        trialId = trial['trialId']
        objV = -1  # printed for trials that did not succeed
        if (state == 'SUCCEEDED'):
            objV = trial['finalMetric']['objectiveValue']
        print('objective=', objV, ' trialId=', trialId, state)
        for key, value in trial['hyperparameters'].items():
            print('    ', key, value)
    return best_model


In [None]:
# Plot original image overlaid by the fault labels
# image_array is np.array(num_images, x_size, y_size)
# label_array is np.array(num_images, x_size, y_size)

import matplotlib.pyplot as plt

def plot_images(image_array, label_array):
    """Draw the first 16 images on a 4x4 grid with their labels overlaid.

    Args:
        image_array: Array indexed as [image, row, column]; shown in grayscale.
        label_array: Array indexed the same way; drawn on top with alpha=0.3.
    """
    grid_rows, grid_cols = 4, 4
    fig, axarr = plt.subplots(grid_rows, grid_cols, figsize=(16, 16))
    # axarr.flat walks the grid row by row, so idx equals 4*row + column
    for idx, panel in enumerate(axarr.flat):
        panel.set_title('Image-'+str(idx))
        panel.imshow(image_array[idx,:,:], cmap='gray')
        panel.imshow(label_array[idx,:,:], alpha=0.3)


# Setup

In [None]:
# Who is running the notebook and against which project/region
USER = 'cchatterj'
PROJECT_ID = 'cchatterjee-sandbox' #$(gcloud config list project --format "value(core.project)")
REGION = 'us-central1'
RUNTIME_VERSION = 2.1

# GCS layout: job packages and models live under gs://<bucket>/<folder>/
BUCKET_NAME = 'chanchal-sandbox'
FOLDER_NAME = 'tensorflow_results'
JOB_DIR   = f'gs://{BUCKET_NAME}/{FOLDER_NAME}/jobdir'
MODEL_DIR = f'gs://{BUCKET_NAME}/{FOLDER_NAME}/models'


In [None]:
# Show the active gcloud configuration so it can be checked against PROJECT_ID and REGION above
!gcloud config list
#!gcloud config config-helper --format "value(configuration.properties.core.project)"


In [None]:
# Clean old job logs, job packages and models
# -m runs the removals in parallel, -q suppresses progress output, "**" matches recursively.
# NOTE(review): this deletes every earlier package under JOB_DIR/packages and every
# object under MODEL_DIR whose name starts with "model" — confirm before re-running.
!gsutil -m -q rm $JOB_DIR/packages/**
!gsutil -m -q rm $MODEL_DIR/model**


# UNET-FPN32-sl1 Model

In [None]:
# Create the trainer package directory; the %%writefile cells that follow write the trainer modules into it
!mkdir -p trainer


In [None]:
%%writefile ./trainer/inputs.py

# Create the train and label lists
import math
import numpy as np
#import imageio
#from google.cloud import storage
import io
import tensorflow as tf
from PIL import Image

#------
def readImage(image_path):
    """Read an image file as a grayscale int32 array clipped to multiples of 16.

    Args:
        image_path: Path readable by ``tf.io.gfile`` (local or ``gs://``).

    Returns:
        2-D int32 array; height and width are each rounded down to the nearest
        multiple of 16 by dropping trailing rows/columns.
    """
    # The context manager closes the file handle, which the bare .read() call never did
    with tf.io.gfile.GFile(image_path, 'rb') as image_file:
        raw_bytes = image_file.read()
    # 'L' converts to single-channel 8-bit grayscale
    img = np.array(Image.open(io.BytesIO(raw_bytes)).convert('L')).astype(dtype='int32')
    height, width = img.shape
    # clipping (integer floor division avoids a float round trip)
    h = (height // 16) * 16
    w = (width // 16) * 16
    image = img[:h, :w]
    return image

#------
def load_data(data_file_path: str, label_file_path: str, rangeIndices, batch_size) -> tf.data.Dataset:
    """Load image/label PNG pairs and return them as a shuffled, batched dataset.

    For every index ``i`` in ``rangeIndices`` the file ``image_inline_i%04d.png``
    is read from both directories with ``readImage``.

    Args:
        data_file_path: Directory holding the input images; pixel values are
            divided by 255.
        label_file_path: Directory holding the label images; values are cast to
            float32 without scaling.
        rangeIndices: Iterable of integer file indices.
        batch_size: Batch size of the returned dataset.

    Returns:
        ``tf.data.Dataset`` of (image, label) pairs, shuffled with a buffer of 100
        and batched. Each image and label has a trailing channel axis of size 1.
    """
    # Materialised once because the indices are walked twice (images, then labels)
    indices = list(rangeIndices)

    def _load_stack(dir_path, scale_to_unit):
        """Read one PNG per index and stack them as float32 planes of shape (h, w, 1)."""
        planes = []
        for i in indices:
            im = readImage(dir_path+'/image_inline_i%04d.png' % i)
            im = np.array(im).astype(dtype='float32')
            if scale_to_unit:
                im = im/255
            (h,w) = im.shape
            planes.append(np.reshape(im, (h,w,1)))
        # NOTE(review): stacking assumes every image has the same clipped size — confirm
        return np.array(planes)

    seismic = _load_stack(data_file_path, True)
    label = _load_stack(label_file_path, False)
    # Report the stacked array shapes; .shape used to be called on the Python
    # list, which raised AttributeError before the dataset could be built.
    print("Data  Shape = ", seismic.shape)
    print("Label Shape = ", label.shape)

    dataset = tf.data.Dataset.from_tensor_slices((seismic, label))
    dataset = dataset.shuffle(100).batch(batch_size)

    return dataset



In [None]:
%%writefile ./trainer/model.py

import tensorflow as tf
import numpy as np
from tensorflow.keras import backend as K
from tensorflow.keras import Input, layers, models, regularizers
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Layer
from tensorflow.keras.activations import softmax


def tf_model():
    """Placeholder for the model factory; as written it returns None.

    NOTE(review): trainer/train.py calls ``model.tf_model(depth=..., dropout_rate=...)``
    and then ``compile``/``fit`` on the result, so this stub has to be replaced by a
    real implementation that accepts those keywords before training can run.
    """
    return

def custom_loss(y_true, y_pred):
    """Placeholder loss; as written it returns the function object itself, not a loss value.

    NOTE(review): trainer/train.py passes this as ``loss=`` and also references
    ``model.custom_mse``, which this module does not define — confirm the real
    implementations are supplied before training.
    """
    return custom_loss


## Package for distributed training

In [None]:
%%writefile ./setup.py

# python3

# ==============================================================================
# Copyright 2020 Google LLC. This software is provided as-is, without warranty
# or representation for any use or purpose. Your use of it is subject to your
# agreement with Google.
# ==============================================================================

from setuptools import find_packages
from setuptools import setup

#Runtime 2.2
#REQUIRED_PACKAGES = ['tensorflow==2.3.1',
#                     'pandas==1.0.4',
#                     'scikit-learn==0.23.1',
#                     'google-cloud-storage==1.29.0',
#                     'cloudml-hypertune',
#                     'pillow',
#                    ]
#Runtime 2.1
# Packages installed on the training workers together with the trainer package.
REQUIRED_PACKAGES = ['tensorflow==2.1.0',
                     'pandas==0.25.3',
                     'scikit-learn==0.22',
                     'google-cloud-storage==1.23.0',
                     'cloudml-hypertune',
                     'pillow',
                     'pytz',  # imported directly by trainer/train.py, so declared explicitly
                    ]
setup(
    name='trainer',
    version='0.1',
    install_requires=REQUIRED_PACKAGES,
    packages=find_packages(),
    include_package_data=True,
    description='Trainer package for Tensorflow Task'
)


## Training functions

In [None]:
%%writefile ./trainer/__init__.py
# python3

# ==============================================================================
# Copyright 2020 Google LLC. This software is provided as-is, without warranty
# or representation for any use or purpose. Your use of it is subject to your
# agreement with Google.
# ==============================================================================


In [None]:
%%writefile ./trainer/train.py

# python3
# ==============================================================================
# Copyright 2020 Google LLC. This software is provided as-is, without warranty
# or representation for any use or purpose. Your use of it is subject to your
# agreement with Google.
# ==============================================================================

import os
import json
import tensorflow as tf
import numpy as np
import datetime as datetime
from pytz import timezone
import hypertune
import argparse
from trainer import model
from trainer import inputs


import warnings
warnings.filterwarnings("ignore")

import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 
#0 = all messages are logged (default behavior)
#1 = INFO messages are not printed
#2 = INFO and WARNING messages are not printed
#3 = INFO, WARNING, and ERROR messages are not printed


def parse_arguments():
    """Parse the trainer's command-line flags.

    Flags that are not declared here are ignored rather than rejected, because
    the parser uses ``parse_known_args``.

    Returns:
      argparse.Namespace holding depth, dropout_rate, learning_rate, batch_size,
      epochs, num_samples, model_dir and verbosity.
    """
    option_specs = (
        ('--depth', dict(default=5, type=int,
                         help='Hyperparameter: depth of net')),
        ('--dropout_rate', dict(default=0.2, type=float,
                                help='Hyperparameter: Drop out rate')),
        ('--learning_rate', dict(default=0.00005, type=float,
                                 help='Hyperparameter: initial learning rate')),
        ('--batch_size', dict(default=1, type=int,
                              help='batch size of the deep network')),
        ('--epochs', dict(default=2, type=int,
                          help='epoch.')),
        ('--num_samples', dict(default=3000, type=int,
                               help='Number of training samples to use.')),
        ('--model_dir', dict(default="",
                             help='Directory to store models and logs.')),
        ('--verbosity', dict(choices=['DEBUG','ERROR','FATAL','INFO','WARN'],
                             default='FATAL')),
    )

    cli = argparse.ArgumentParser()
    for flag, options in option_specs:
        cli.add_argument(flag, **options)

    # The second element holds the unrecognised flags, which are dropped on purpose
    known, _unknown = cli.parse_known_args()
    return known


def get_callbacks(args, early_stop_patience: int = 3):
    """Creates Keras callbacks for model training.

    Args:
        args: Parsed arguments; only ``args.model_dir`` is read.
        early_stop_patience: Number of epochs without improvement in ``loss``
            after which training stops.

    Returns:
        List with the checkpoint, TensorBoard and early-stopping callbacks, in
        that order.
    """

    # Get trialId from the "task" section of TF_CONFIG; default to '0' when absent
    task_info = json.loads(os.environ.get("TF_CONFIG", "{}")).get("task", {})
    trialId = task_info.get("trial", "")
    if trialId == '':
        trialId = '0'
    print("trialId=", trialId)

    curTime = datetime.datetime.now(timezone('US/Pacific')).strftime('%H%M%S')

    # Modify model_dir paths to include trialId. The {custom_mse:.4f} placeholder
    # is left for ModelCheckpoint to fill in when it saves.
    model_dir = args.model_dir + "/checkpoints/cp-"+curTime+"-"+trialId+"-{custom_mse:.4f}"
    log_dir   = args.model_dir + "/log_dir"

    tensorboard_cb = tf.keras.callbacks.TensorBoard(log_dir, histogram_freq=1)
    checkpoint_cb  = tf.keras.callbacks.ModelCheckpoint(model_dir, monitor='custom_mse', mode='min',
                                                        verbose=0, save_best_only=True,
                                                        save_weights_only=False)
    # Honour the caller's patience; it used to be hard-coded to 3, which made the
    # parameter a no-op (the default keeps the old behaviour).
    earlystop_cb   = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=early_stop_patience)

    return [checkpoint_cb, tensorboard_cb, earlystop_cb]


if __name__ == "__main__":
    # Entry point of the trainer: parse flags, load data, build/compile/fit the
    # model and print the final metric.
    # NOTE: the notebook builds trainer/train_hpt.py by copying this file and
    # appending indented statements, so this block must stay the last thing in the
    # file and must keep defining final_epoch_accuracy and final_epoch_count.

    # ---------------------------------------
    # Parse Arguments
    # ---------------------------------------
    args = parse_arguments()
    #args.model_dir = MODEL_DIR + datetime.datetime.now(timezone('US/Pacific')).strftime('/model_%m%d%Y_%H%M')
    print(args)

    #tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)  # or any {DEBUG, INFO, WARN, ERROR, FATAL}

    # ---------------------------------------
    # Input Data & Preprocessing
    # ---------------------------------------
    print("Input and pre-process data ...")
    # Extract train_seismic, train_label
    # NOTE(review): these GCS paths look like placeholders — confirm the real locations
    data_dir = 'gs://../images'
    label_dir = 'gs://../labels'
    train_dataset = inputs.load_data(data_dir, label_dir, range(0,args.num_samples), args.batch_size)

    # ---------------------------------------
    # Train model
    # ---------------------------------------
    print("Creating model ...")
    tf_model = model.tf_model(depth=args.depth,
                              dropout_rate=args.dropout_rate)
    # learning_rate is the documented argument name; lr is only a deprecated alias
    tf_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=args.learning_rate),
                     loss=model.custom_loss,  # loss is the custom loss I shared with you
                     metrics=[model.custom_mse])

    print("Fitting model ...")
    callbacks = get_callbacks(args, 3)
    history = tf_model.fit(train_dataset,
                           epochs=args.epochs,
                           validation_split = 0.0,
                           callbacks=callbacks)

    # TBD save history for visualization

    # Despite the name, this is the last epoch's custom_mse
    final_epoch_accuracy = history.history['custom_mse'][-1]
    # Number of epochs actually run (early stopping can end training before args.epochs)
    final_epoch_count = len(history.history['custom_mse'])

    print('final_epoch_accuracy = %.6f' % final_epoch_accuracy)
    print('final_epoch_count = %02d' % final_epoch_count)


In [None]:
%%time
# Run the training manually
# Training parameters
from datetime import datetime
from pytz import timezone

DEPTH = 7
DROPOUT_RATE = 0.15000017803209056
#DEPTH = 5
#DROPOUT_RATE = 0.2
#N_SEG_LEVELS = 3
LEARNING_RATE = 0.00005
EPOCHS = 2
BATCH_SIZE = 1
NUM_SAMPLES = 3000

MODEL_DIR_PYTH = MODEL_DIR + datetime.now(timezone('US/Pacific')).strftime('/model_%m%d%Y_%H%M')

print('DEPTH = %02d' % DEPTH)
print('DROPOUT_RATE = %.4f' % DROPOUT_RATE)
print('LEARNING_RATE = %.6f' % LEARNING_RATE)
print('EPOCHS = %02d' % EPOCHS)
print('BATCH_SIZE = %02d' % BATCH_SIZE)
print('NUM_SAMPLES = %d' % NUM_SAMPLES)
print("MODEL_DIR =", MODEL_DIR_PYTH)

# Run training
! python3 -m trainer.train --depth=$DEPTH --dropout_rate=$DROPOUT_RATE \
    --n_base_filters=$N_BASE_FILTERS --n_segmentation_levels=$N_SEG_LEVELS \
    --learning_rate=$LEARNING_RATE \
    --epochs=$EPOCHS --batch_size=$BATCH_SIZE --num_samples=$NUM_SAMPLES \
    --model_dir=$MODEL_DIR_PYTH


In [None]:
# Test with latest saved model
# maxFlag=0 selects the checkpoint with the smallest metric; offset=1 reads the last
# number in each checkpoint name, which train.py formats from custom_mse.
best_model_dir_pyth = find_best_model_dir(MODEL_DIR_PYTH+'/checkpoints', offset=1, maxFlag=0)
#acc = test_saved_model(best_model_dir_pyth, 0)


In [None]:
%%time

from trainer import model

# Copy the model from storage to local memory
!gsutil -m cp -r $best_model_dir_pyth* ./model_dir

# Load the model
# compile=False loads the model without compiling it (no optimizer/loss are set up).
# NOTE(review): model.custom_mse is referenced here, but trainer/model.py as written
# by this notebook defines only tf_model and custom_loss — confirm the real module provides it.
loaded_model = tf.keras.models.load_model('./model_dir', compile=False, 
               custom_objects={"custom_loss": model.custom_loss, "custom_mse": model.custom_mse})
print("Signature ", loaded_model.signatures)
print("")

# Display model
tf.keras.utils.plot_model(loaded_model, show_shapes=True)


------
# Training

In [None]:
# Create the config directory and load the trainer files in it
!mkdir -p config


In [None]:
%%writefile ./config/config.yaml

# python3
# ==============================================================================
# Copyright 2020 Google LLC. This software is provided as-is, without warranty
# or representation for any use or purpose. Your use of it is subject to your
# agreement with Google.
# ==============================================================================

# https://cloud.google.com/sdk/gcloud/reference/ai-platform/jobs/submit/training#--scale-tier
# https://www.kaggle.com/c/passenger-screening-algorithm-challenge/discussion/37087
# https://cloud.google.com/ai-platform/training/docs/using-gpus

#trainingInput:
#  scaleTier: CUSTOM
#  masterType: n1-highmem-8
#  masterConfig:
#    acceleratorConfig:
#      count: 1
#      type: NVIDIA_TESLA_T4

#  masterType: n1-highcpu-16
#  workerType: cloud_tpu
#  workerCount: 1
#  workerConfig:
#    acceleratorConfig:
#      type: TPU_V3
#      count: 8

#trainingInput:
#  scaleTier: CUSTOM
#  masterType: complex_model_m
#  workerType: complex_model_m
#  parameterServerType: large_model
#  workerCount: 6
#  parameterServerCount: 1
#  scheduling:
#    maxWaitTime: 3600s
#    maxRunningTime: 7200s

#trainingInput:
#  runtimeVersion: "2.2"
#  scaleTier: CUSTOM
#  masterType: standard_gpu
#  workerCount: 9
#  workerType: standard_gpu
#  parameterServerCount: 3
#  parameterServerType: standard

#trainingInput:
#  scaleTier: BASIC-GPU
    
#trainingInput:
#  region: us-central1
#  scaleTier: CUSTOM
#  masterType: complex_model_m
#  workerType: complex_model_m_gpu
#  parameterServerType: large_model
#  workerCount: 4
#  parameterServerCount: 2

# Active configuration: a single custom master (n1-highmem-16) with two V100 GPUs;
# no worker or parameter-server nodes are requested.
trainingInput:
    scaleTier: CUSTOM
    masterType: n1-highmem-16
    masterConfig:
        acceleratorConfig:
            count: 2
            type: NVIDIA_TESLA_V100



In [None]:
from datetime import datetime
from pytz import timezone
# Job name and model directory both carry a US/Pacific timestamp so that
# repeated submissions do not collide.
JOB_CONFIG = "config/config.yaml"
JOBNAME_TRN = 'tf_train_{}_{}'.format(
    USER, datetime.now(timezone('US/Pacific')).strftime("%m%d%y_%H%M"))
MODEL_DIR_TRN = MODEL_DIR + datetime.now(timezone('US/Pacific')).strftime('/model_%m%d%Y_%H%M')
print("Job Name = ", JOBNAME_TRN)
print("Job Dir  = ", JOB_DIR)
print("MODEL_DIR =", MODEL_DIR_TRN)

# Training parameters
#DEPTH = 5
#DROPOUT_RATE = 0.2
DEPTH = 6
DROPOUT_RATE = 0.15
LEARNING_RATE = 0.00005
EPOCHS = 2
BATCH_SIZE = 1
NUM_SAMPLES = 3000

print(f'DEPTH = {DEPTH:02d}')
print(f'DROPOUT_RATE = {DROPOUT_RATE:.4f}')
print(f'LEARNING_RATE = {LEARNING_RATE:.6f}')
print(f'EPOCHS = {EPOCHS:02d}')
print(f'BATCH_SIZE = {BATCH_SIZE:02d}')
print(f'NUM_SAMPLES = {NUM_SAMPLES:d}')


In [None]:
# https://cloud.google.com/sdk/gcloud/reference/ai-platform/jobs/submit/training

# Labels attached to the job (mode and owner)
TRAIN_LABELS = "mode=train,owner="+USER

# submit the training job
# Flags before the bare "--" configure the AI Platform job itself; the flags after
# it are handed to trainer.train and read by its parse_arguments().
! gcloud ai-platform jobs submit training $JOBNAME_TRN \
  --package-path $(pwd)/trainer \
  --module-name trainer.train \
  --region $REGION \
  --python-version 3.7 \
  --runtime-version $RUNTIME_VERSION \
  --job-dir $JOB_DIR \
  --config $JOB_CONFIG \
  --labels $TRAIN_LABELS \
  -- \
  --depth=$DEPTH \
  --dropout_rate=$DROPOUT_RATE \
  --learning_rate=$LEARNING_RATE \
  --epochs=$EPOCHS \
  --batch_size=$BATCH_SIZE \
  --num_samples=$NUM_SAMPLES \
  --model_dir=$MODEL_DIR_TRN


In [None]:
# check the training job status
# Re-run this cell until the job reaches a final state; the cells after it read the job's logs and checkpoints.
! gcloud ai-platform jobs describe $JOBNAME_TRN


In [None]:
# Print Errors
# Reads the job's log entries of severity ERROR or higher and echoes only the
# lines that contain a 'message' field.
response = ! gcloud logging read "resource.labels.job_id=$JOBNAME_TRN severity>=ERROR"
for log_line in response:
    if 'message' in log_line:
        print(log_line)

In [None]:
# Test with latest saved model
# maxFlag=0 selects the checkpoint with the smallest metric (custom_mse) in its name.
best_model_dir_trn = find_best_model_dir(MODEL_DIR_TRN+'/checkpoints', offset=1, maxFlag=0)
#acc = test_saved_model(best_model_dir_trn, 0)


------
# Hyper Parameter Tuning

In [None]:
# Start train_hpt.py as a fresh copy of train.py; the next cell appends the
# hypertune reporting statements to it. Re-running this cell resets the copy.
!cp ./trainer/train.py ./trainer/train_hpt.py


In [None]:
%%writefile -a ./trainer/train_hpt.py

    # Report the final metric of this trial to the CAIP hyperparameter tuning job.
    # These statements are appended to a copy of train.py, so they run as the tail
    # of its `if __name__ == "__main__":` block once training has finished.
    #
    # In TF2.X the user must either use hypertune or a custom callback with
    # tf.summary.scalar to update CAIP HP Tuning jobs. Hypertune is used here; it
    # also works with containers, without code change.
    #
    #   hyperparameter_metric_tag: The metric being optimized. This MUST MATCH the
    #     hyperparameterMetricTag specified in the hyperparameter tuning yaml.
    #   metric_value: The value to report at the end of model training.
    #   global_step: An int value to specify the number of training steps completed
    #     at the time the metric was reported (here: the number of epochs run).

    hpt = hypertune.HyperTune()
    hpt.report_hyperparameter_tuning_metric(
        hyperparameter_metric_tag='custom_mse',
        metric_value=final_epoch_accuracy,
        global_step=final_epoch_count
    )


In [None]:
%%writefile ./config/hptuning_config.yaml

# python3
# ==============================================================================
# Copyright 2020 Google LLC. This software is provided as-is, without warranty
# or representation for any use or purpose. Your use of it is subject to your
# agreement with Google.
# ==============================================================================

# https://cloud.google.com/ai-platform/training/docs/reference/rest/v1/projects.jobs

#trainingInput:
#  scaleTier: CUSTOM
#  masterType: n1-highmem-8
#  masterConfig:
#    acceleratorConfig:
#      count: 1
#      type: NVIDIA_TESLA_T4
#
#  masterType: standard_p100
#  workerType: standard_p100
#  parameterServerType: standard_p100
#  workerCount: 8
#  parameterServerCount: 1
#  runtimeVersion: $RUNTIME_VERSION
#  pythonVersion: '3.7'

#trainingInput:
#  scaleTier: CUSTOM
#  masterType: complex_model_m
#  workerType: complex_model_m
#  parameterServerType: large_model
#  workerCount: 9
#  parameterServerCount: 3
#  scheduling:
#    maxWaitTime: 3600s
#    maxRunningTime: 7200s

#trainingInput:
#  scaleTier: BASIC-GPU

# Active configuration: each trial runs on one n1-highmem-16 master with two V100 GPUs.
trainingInput:
  scaleTier: CUSTOM
  masterType: n1-highmem-16
  masterConfig:
    acceleratorConfig:
      count: 2
      type: NVIDIA_TESLA_V100
  hyperparameters:
    # NOTE(review): the tuned metric is custom_mse, which trainer/train.py checkpoints
    # with mode='min' and which the notebook later selects with maxFlag=0 (smallest is
    # best). MAXIMIZE looks inconsistent with that — confirm the intended goal, and
    # review the notebook's best-trial selection together with any change here.
    goal: MAXIMIZE
    # Must match the tag that train_hpt.py reports through hypertune
    hyperparameterMetricTag: custom_mse
    maxTrials: 32
    maxParallelTrials: 4
    enableTrialEarlyStopping: True
    params:
    - parameterName: depth
      type: INTEGER
      minValue: 4
      maxValue: 6
      scaleType: UNIT_LINEAR_SCALE
    - parameterName: dropout_rate
      type: DOUBLE
      minValue: 0.1
      maxValue: 0.3
      scaleType: UNIT_LINEAR_SCALE
    # NOTE(review): n_segmentation_levels and n_base_filters are not declared in
    # train.py's parse_arguments(), which uses parse_known_args() and therefore drops
    # them silently — confirm the trainer is meant to consume them.
    - parameterName: n_segmentation_levels
      type: DISCRETE
      discreteValues:
      - 2
      - 3
    - parameterName: n_base_filters
      type: DISCRETE
      discreteValues:
      - 8
      - 12
      - 16


In [None]:
from datetime import datetime
from pytz import timezone

# Job name and model directory both carry a US/Pacific timestamp.
# JOB_CONFIG is re-pointed at the hyperparameter tuning configuration.
JOB_CONFIG = "./config/hptuning_config.yaml"
JOBNAME_HPT = 'tf_hptrn_{}_{}'.format(
    USER, datetime.now(timezone('US/Pacific')).strftime("%m%d%y_%H%M"))
MODEL_DIR_HPT = MODEL_DIR + datetime.now(timezone('US/Pacific')).strftime('/model_%m%d%Y_%H%M')
print("Job Name = ", JOBNAME_HPT)
print("Job Dir  = ", JOB_DIR)
print("MODEL_DIR =", MODEL_DIR_HPT)

# Training parameters
LEARNING_RATE = 0.00005
EPOCHS = 2
BATCH_SIZE = 1
NUM_SAMPLES = 3000

print(f'LEARNING_RATE = {LEARNING_RATE:.6f}')
print(f'EPOCHS = {EPOCHS:02d}')
print(f'BATCH_SIZE = {BATCH_SIZE:02d}')
print(f'NUM_SAMPLES = {NUM_SAMPLES:d}')


In [None]:
# submit the training job
# Labels attached to the job (mode and owner)
HT_LABELS = "mode=hypertrain,owner="+USER

# Runs trainer.train_hpt (train.py plus the appended hypertune reporting).
# depth and dropout_rate are not passed here; NOTE(review): presumably the tuning
# service supplies them per trial from config/hptuning_config.yaml — confirm.
! gcloud ai-platform jobs submit training $JOBNAME_HPT \
  --package-path $(pwd)/trainer \
  --module-name trainer.train_hpt \
  --python-version 3.7 \
  --runtime-version $RUNTIME_VERSION \
  --region $REGION \
  --job-dir $JOB_DIR \
  --config $JOB_CONFIG \
  --labels $HT_LABELS \
  -- \
  --learning_rate=$LEARNING_RATE \
  --epochs=$EPOCHS \
  --batch_size=$BATCH_SIZE \
  --num_samples=$NUM_SAMPLES \
  --model_dir=$MODEL_DIR_HPT


In [None]:
# check the hyperparameter training job status
# Re-run this cell until the job reaches a final state; the cells after it read the job's trials and checkpoints.
! gcloud ai-platform jobs describe $JOBNAME_HPT


In [None]:
# Print Errors
# Reads the job's log entries of severity ERROR or higher and echoes only the
# lines that contain a 'message' field.
response = ! gcloud logging read "resource.labels.job_id=$JOBNAME_HPT severity>=ERROR"
for log_line in response:
    if 'message' in log_line:
        print(log_line)

In [None]:
# Get the best model parameters from Cloud API
# The third argument (0) asks for the trial with the smallest objective.
best_model = pyth_get_hypertuned_parameters(PROJECT_ID, JOBNAME_HPT, 0)
DEPTH, DROPOUT_RATE, N_SEG_LEVELS, N_BASE_FILTERS = (
    best_model['hyperparameters'][name]
    for name in ('depth', 'dropout_rate', 'n_segmentation_levels', 'n_base_filters')
)
print('')
print('Objective=', best_model['finalMetric']['objectiveValue'])
print('DEPTH =', DEPTH)
print('DROPOUT_RATE =', DROPOUT_RATE)
print('N_SEG_LEVELS =', N_SEG_LEVELS)
print('N_BASE_FILTERS =', N_BASE_FILTERS)


In [None]:
# Find count of checkpoints
# NOTE(review): this counts the output lines of gsutil, which may include the
# directory entry itself or an error line — confirm before relying on the number.
all_models = ! gsutil ls {MODEL_DIR_HPT+'/checkpoints'}
print("Total Hypertrained Models=", len(all_models))

# Test with latest saved model
# maxFlag=0 selects the checkpoint with the smallest metric (custom_mse) in its name.
best_model_dir_hyp = find_best_model_dir(MODEL_DIR_HPT+'/checkpoints', offset=1, maxFlag=0)
#acc = test_saved_model(best_model_dir_hyp, 0)

#import keras.backend as K
#loaded_model = tf.keras.models.load_model(MODEL_DIR_PARAM+'/checkpoints')
#print("learning_rate=", K.eval(loaded_model.optimizer.lr))
#tf.keras.utils.plot_model(loaded_model, show_shapes=True)


--------
# Deploy the Model

In [None]:
## https://cloud.google.com/ai-platform/prediction/docs/machine-types-online-prediction#available_machine_types
# We need 2 versions of the same model:
# 1. Batch prediction model deployed on a mls1-c1-m2 cluster
# 2. Online prediction model deployed on a n1-standard-16 cluster
# Batch prediction does not support GPU and n1-standard-16 clusters.

# Run the Deploy Model section twice:
# 1. As a BATCH Mode version use MODEL_VERSION = MODEL_VERSION_BATCH
# 2. As an ONLINE Mode version use MODEL_VERSION = MODEL_VERSION_ONLINE


In [None]:
from googleapiclient import discovery
from googleapiclient import errors
import time
import yaml

def print_all_versions_of_model(project_id, model_name):
    """List, print and return the versions of an AI Platform model.

    Args:
        project_id: GCP project id that owns the model.
        model_name: Name of the model.

    Returns:
        The list of version dicts from the API response; an empty list when the
        response has no 'versions' key.
    """
    parent = 'projects/{}/models/{}'.format(project_id, model_name)
    ml_api = discovery.build('ml', 'v1', cache_discovery=False,
                             credentials=GoogleCredentials.get_application_default())
    listing = ml_api.projects().models().versions().list(parent=parent).execute()

    # A response without a 'versions' key is treated as "no versions"
    found_versions = listing['versions'] if 'versions' in listing else []

    # print all model versions
    print("\nVersions for Model:", model_name)
    print("Total Number of versions = ", len(found_versions))
    if found_versions:
        print("")
        print(yaml.dump(found_versions))

    return found_versions

#------------------------------------------------------------------------------#
def delete_model_versions(project_id, model_name, version_name):
    """Delete a model version, or every version when it is the default one.

    If ``version_name`` is not the model's default version, only that version is
    deleted. If it is the default version, all versions are deleted, repeating
    the requests until the model lists no versions.

    Args:
        project_id: GCP project id that owns the model.
        model_name: Name of the model.
        version_name: Short name of the version to remove.
    """
    model_id = 'projects/{}/models/{}'.format(project_id, model_name)
    credentials = GoogleCredentials.get_application_default()
    service = discovery.build('ml', 'v1', cache_discovery=False, credentials=credentials)
    versions_api = service.projects().models().versions()

    def _is_target(version):
        """True when the version's resource name ends in the requested short name."""
        return version['name'].split('/')[-1] == version_name

    # get model versions (uses the arguments; the notebook globals PROJECT_ID,
    # MODEL_NAME and MODEL_VERSION used to be read here instead)
    all_versions = print_all_versions_of_model(project_id, model_name)

    # check if current version is a default version
    cur_ver_default = 0
    for version in all_versions:
        if _is_target(version) and version.get('isDefault') is True:
            cur_ver_default = 1
    print("cur_ver_default=", cur_ver_default)

    if not all_versions:
        return

    # if not default delete this version only
    if cur_ver_default == 0:
        print("This is not the default version")
        print("Delete only this version")
        for version in all_versions:
            if _is_target(version):
                print("Deleting non default version:", version['name'])
                request = versions_api.delete(name=version['name'])
                try:
                    request.execute()
                except errors.HttpError as err:
                    print("Delete Error Reason:", err._get_reason())
        return

    print("This is the default version")
    print("Delete all versions")
    # The default version is rejected while other versions still exist, so the
    # delete requests are repeated until the listing comes back empty.
    # NOTE(review): every HttpError from delete is ignored, as before; a delete that
    # keeps failing would make this loop run indefinitely — confirm this is acceptable.
    while all_versions:
        for version in all_versions:
            try:
                versions_api.delete(name=version['name']).execute()
            except errors.HttpError:
                continue
        response = versions_api.list(parent=model_id).execute()
        time.sleep(1)
        all_versions = response.get('versions', [])

In [None]:
MODEL_NAME = "unetfpn16sl1"
MODEL_VERSION_BATCH  = "cchatterj_v1_batch"
MODEL_VERSION_ONLINE = "cchatterj_v1_online"

#Run this as Batch first then Online
# MODEL_VERSION selects which version the deploy cells that follow act on; switch
# the assignment and re-run those cells to deploy the other mode.
MODEL_VERSION = MODEL_VERSION_BATCH
#MODEL_VERSION = MODEL_VERSION_ONLINE

# List all models
print("\nList all models")
!gcloud ai-platform models list

# List all versions of the model
#!gcloud ai-platform versions list --model $MODEL_NAME
_ = print_all_versions_of_model(PROJECT_ID, MODEL_NAME)


In [None]:
#!gcloud ai-platform versions delete $MODEL_VERSION --model $MODEL_NAME -q
#!gcloud ai-platform models delete $MODEL_NAME -q


In [None]:
# create the model if it doesn't already exist
modelname = !gcloud ai-platform models list | grep -w $MODEL_NAME
print(modelname)
if len(modelname) <= 1:
    print("Creating model " + MODEL_NAME)
    ! gcloud ai-platform models create $MODEL_NAME --regions $REGION --enable-logging
else:
    print("Model " + MODEL_NAME + " exist")


In [None]:
# Check if current version is default
#    if default model delete all versions
#    if not default model delete only this version
# This clears the way for the deploy cell, which creates MODEL_VERSION again.

print("Model Version=", MODEL_VERSION)
delete_model_versions(PROJECT_ID, MODEL_NAME, MODEL_VERSION)

# List the models
print("\nList all models:")
!gcloud ai-platform models list

# List all versions of the model
_ = print_all_versions_of_model(PROJECT_ID, MODEL_NAME)
#!gcloud ai-platform versions list --model $MODEL_NAME


In [None]:
%%time

# Get a list of model directories
# The version is deployed from the best checkpoint found after hyperparameter tuning.
best_model_dir = best_model_dir_hyp  #best_model_dir_hyp
print("Best Model Dir: ", best_model_dir)

MODEL_FRAMEWORK = "TENSORFLOW"
MODEL_DESCRIPTION = "UNET_FPN_SL1"
MODEL_LABELS="team=total,phase=test,owner="+USER

# Machine type and the "mode" label follow the selected MODEL_VERSION:
# batch -> mls1-c1-m2, online -> n1-standard-16. The first assignment is the
# fallback when MODEL_VERSION matches neither name.
MACHINE_TYPE = "mls1-c1-m2"
if (MODEL_VERSION == MODEL_VERSION_BATCH):
    MACHINE_TYPE = "mls1-c1-m2"
    MODEL_LABELS = MODEL_LABELS+",mode=batch"
if (MODEL_VERSION == MODEL_VERSION_ONLINE):
    MACHINE_TYPE = "n1-standard-16"
    MODEL_LABELS = MODEL_LABELS+",mode=online"

# Deploy the model
# NOTE: MODEL_FRAMEWORK is only used by the commented-out --framework flag below.
! gcloud beta ai-platform versions create $MODEL_VERSION \
  --model $MODEL_NAME \
  --origin $best_model_dir \
  --runtime-version $RUNTIME_VERSION \
  --python-version=3.7 \
  --description=$MODEL_DESCRIPTION \
  --labels $MODEL_LABELS \
  --machine-type=$MACHINE_TYPE 
#  --framework $MODEL_FRAMEWORK \
#  --region=$REGION


In [None]:
# List all models
print("")
!gcloud ai-platform models list

# List all versions of the model (confirms that the new version was created)
_ = print_all_versions_of_model(PROJECT_ID, MODEL_NAME)


------
# Predictions with the deployed model

In [None]:
from trainer import inputs

print("Input and pre-process data ...")

# GCS locations handed to inputs.load_data for the inputs and their labels
data_dir = 'gs://codev-test-data/FAULT_TRAINING_SET/SYNTHETIC_OPTIM/seismic'
label_dir = 'gs://codev-test-data/FAULT_TRAINING_SET/SYNTHETIC_OPTIM/label'
test_dataset = inputs.load_data(data_dir, label_dir, range(0,16), 1)

# Materialise the whole dataset as a single float32 array
X_test1 = np.array(list(test_dataset.as_numpy_iterator())).astype(dtype='float32')

# Axis 1 separates images (index 0) from labels (index 1); axis 2 is fixed at 0
# -- presumably a size-1 batch dimension, TODO confirm against inputs.load_data.
X_test_images, X_test_labels = (X_test1[:, _part, 0, :, :, :] for _part in (0, 1))

print("X_test shape =", X_test_images.shape)

# Show channel 0 of every image next to channel 0 of its label
plot_images(X_test_images[:,:,:,0], X_test_labels[:,:,:,0])


In [None]:
%%time

from trainer import model

# Copy the model from storage to local memory
!gsutil -m cp -r $best_model_dir_hyp* ./model_dir

# Load the model
loaded_model = tf.keras.models.load_model('./model_dir', compile=False, 
               custom_objects={"custom_loss": model.custom_loss,"custom_mse": model.custom_mse})
print("Signature ", loaded_model.signatures)

# Check the model layers
model_layers = [layer.name for layer in loaded_model.layers]
print("")
print("Model Input  Layer=", model_layers[0])
print("Model Output Layer=", model_layers[-1])
print("")


## Batch Prediction with gcloud

In [None]:
# Write batch data to file in GCS

import shutil

# Clean current directory
DATA_DIR = './batch_data'
shutil.rmtree(DATA_DIR, ignore_errors=True)
os.makedirs(DATA_DIR)

nTotalImages = X_test_images.shape[0]
nFiles = min(100,nTotalImages)
nImagesPerFile = min(100, nTotalImages//nFiles)
print("nTotalImages =", nTotalImages)
print("nFiles =", nFiles)
print("nImagesPerFile =", nImagesPerFile)

# Create nFiles files with nImagesPerFile images each
for i in range(nFiles):
    with open(f'{DATA_DIR}/unkeyed_batch_{i}.json', "w") as file:
        for z in range(nImagesPerFile):
            print(f'{{"{model_layers[0]}": {X_test_images[i*nImagesPerFile+z].tolist()}}}', file=file)
            #key = f'key_{i}_{z}'
            #print(f'{{"image": {X_test_images[z].tolist()}, "key": "{key}"}}', file=file)

# Write batch data to file
! gsutil -m cp -r ./batch_data gs://$BUCKET_NAME/$FOLDER_NAME/


In [None]:
from datetime import datetime
from pytz import timezone

# Settings consumed by the batch prediction job submission
DATA_FORMAT = "text"  # JSON data format
_gcs_root = f'gs://{BUCKET_NAME}/{FOLDER_NAME}'
INPUT_PATHS = f'{_gcs_root}/batch_data/*'
OUTPUT_PATH = f'{_gcs_root}/batch_predictions'
PRED_LABELS = f"mode=batch,team=engineering,phase=test,owner={USER}"
SIGNATURE_NAME = "serving_default"

# Job name = fixed prefix + user + US/Pacific timestamp (minute resolution)
_job_timestamp = datetime.now(timezone('US/Pacific')).strftime("%m%d%y_%H%M")
JOBNAME_BATCH = f'tf_batch_predict_{USER}_{_job_timestamp}'

for _caption, _value in (
    ("INPUT_PATHS = ", INPUT_PATHS),
    ("OUTPUT_PATH = ", OUTPUT_PATH),
    ("Job Name    = ", JOBNAME_BATCH),
):
    print(_caption, _value)


In [None]:
# Submit batch predict job
# Use  MODEL_VERSION_BATCH not MODEL_VERSION_ONLINE
# NOTE: this rebinds the notebook-wide MODEL_VERSION, so any cell run after
# this one sees the batch version until MODEL_VERSION is reassigned.
MODEL_VERSION = MODEL_VERSION_BATCH

# The job reads every object matching INPUT_PATHS and writes its results
# under OUTPUT_PATH, calling the SIGNATURE_NAME signature of the model version.
! gcloud ai-platform jobs submit prediction $JOBNAME_BATCH \
    --model $MODEL_NAME \
    --version $MODEL_VERSION \
    --input-paths $INPUT_PATHS \
    --output-path $OUTPUT_PATH \
    --region $REGION \
    --data-format $DATA_FORMAT \
    --labels $PRED_LABELS \
    --signature-name $SIGNATURE_NAME


In [None]:
# Check the batch prediction job status (re-run this cell to refresh it)
! gcloud ai-platform jobs describe $JOBNAME_BATCH


In [None]:
# Print Errors
# Fetch the job's log entries at severity ERROR or above and echo only the
# output lines that contain the text 'message'.
response = ! gcloud logging read "resource.labels.job_id=$JOBNAME_BATCH severity>=ERROR"
for _log_line in response:
    if 'message' in _log_line:
        print(_log_line)


In [None]:
!gsutil cat $OUTPUT_PATH/prediction.results-00000-of-00016


## Online Prediction with Python

In [None]:
%%time

# Use MODEL_VERSION_ONLINE not MODEL_VERSION_BATCH
MODEL_VERSION = MODEL_VERSION_ONLINE

from oauth2client.client import GoogleCredentials
from googleapiclient import discovery
from googleapiclient import errors
import json

#tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
#tf.get_logger().setLevel('ERROR')

project_id = 'projects/{}'.format(PROJECT_ID)
model_name = '{}/models/{}'.format(project_id, MODEL_NAME)
if MODEL_VERSION is not None:
    model_name += '/versions/{}'.format(MODEL_VERSION)
credentials = GoogleCredentials.get_application_default()
service = discovery.build('ml', 'v1', cache_discovery=False, credentials=credentials)

pprobas_temp = []
batch_size = 1
n_samples = min(1000,len(X_test_images))
print("n_samples=", n_samples)
for i in range(0, n_samples, batch_size):
    j = min(i+batch_size, n_samples)
    print("Processing samples", i, j)
    request = service.projects().predict(name=model_name, \
                                         body={'instances': X_test_images[i:j].tolist()})
    try:
        response = request.execute()
    except errors.HttpError as err:
        # Something went wrong, print out some information.
        tf.compat.v1.logging.error('There was an error getting the job info, Check the details:')
        tf.compat.v1.logging.error(err._get_reason())

    pprobas_temp += response['predictions']



In [None]:
# Turn the raw predictions into an array of channel-0 maps and plot them next
# to the inputs. The two checks are independent substring tests on model_name,
# so both branches run if both version names occur in it.

# If Model Version Online is used: index the last axis of the predictions at 0
if MODEL_VERSION_ONLINE in model_name:
    print("Online Version is Used")
    pprobas = np.array(pprobas_temp)[:,:,:,0]
    print("pprobas shape=", pprobas.shape)
    plot_images(X_test_images[:,:,:,0], pprobas)

# If Model Version Batch is used: each prediction is a mapping, so take its
# values, keep the first one, and index its last axis at 0
if MODEL_VERSION_BATCH in model_name:
    print("Batch Version is Used")
    _per_sample_outputs = [list(pprobas_temp[k].values()) for k in range(n_samples)]
    pprobas = np.array(_per_sample_outputs)[:,0,:,:,0]
    print("pprobas shape=", pprobas.shape)
    plot_images(X_test_images[:,:,:,0], pprobas)
