In [2]:
import os

# The presence of this metadata file is used as the marker that the notebook
# is running on the Google Cloud Notebook product.
IS_GOOGLE_CLOUD_NOTEBOOK = os.path.exists("/opt/deeplearning/metadata/env_version")

# Google Cloud Notebook requires dependencies to be installed with '--user';
# everywhere else no extra pip flag is passed.
USER_FLAG = "--user" if IS_GOOGLE_CLOUD_NOTEBOOK else ""

In [3]:
 pip install {USER_FLAG} --upgrade google-cloud-aiplatform

Collecting google-cloud-aiplatform
  Downloading google_cloud_aiplatform-1.14.0-py2.py3-none-any.whl (1.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: google-cloud-aiplatform
[0mSuccessfully installed google-cloud-aiplatform-1.14.0
Note: you may need to restart the kernel to use updated packages.


In [4]:
 pip install {USER_FLAG} -U google-cloud-storage

Collecting google-cloud-storage
  Downloading google_cloud_storage-2.4.0-py2.py3-none-any.whl (106 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m107.0/107.0 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: google-cloud-storage
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
kfp 1.8.12 requires google-api-python-client<2,>=1.7.8, but you have google-api-python-client 2.48.0 which is incompatible.
kfp 1.8.12 requires google-cloud-storage<2,>=1.20.0, but you have google-cloud-storage 2.4.0 which is incompatible.[0m[31m
[0mSuccessfully installed google-cloud-storage-2.4.0
Note: you may need to restart the kernel to use updated packages.


In [5]:
 pip install {USER_FLAG} -U "google-cloud-bigquery[all]"

Collecting google-cloud-bigquery[all]
  Downloading google_cloud_bigquery-3.2.0-py2.py3-none-any.whl (211 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.8/211.8 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
Collecting opentelemetry-api>=1.1.0
  Downloading opentelemetry_api-1.11.1-py3-none-any.whl (49 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.7/49.7 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
Collecting geopandas<1.0dev,>=0.9.0
  Downloading geopandas-0.10.2-py2.py3-none-any.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m26.5 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hCollecting Shapely<2.0dev,>=1.6.0
  Downloading Shapely-1.8.2-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m45.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hCollecting opentelemetry-sdk>=1.1.0
  Downloading open

In [6]:
import os

# Restart the kernel so the packages installed above are picked up.
# Nothing happens when the IS_TESTING environment variable is set.
if not os.getenv("IS_TESTING"):
    import IPython

    # do_shutdown(True): shut the kernel down with the restart flag set.
    IPython.Application.instance().kernel.do_shutdown(True)


In [1]:
import os

PROJECT_ID = ""

if not os.getenv("IS_TESTING"):
    # Get your Google Cloud project ID from gcloud
    shell_output = !gcloud config list --format 'value(core.project)' 2>/dev/null
    PROJECT_ID = shell_output[0]
    print("Project ID: ", PROJECT_ID)

Project ID:  mle-creditcard-project


In [2]:
from datetime import datetime

# Second-resolution timestamp (YYYYmmddHHMMSS) of when this cell ran; appended
# to resource names further down to keep them distinct between runs.
TIMESTAMP = f"{datetime.now():%Y%m%d%H%M%S}"


In [3]:
# Region passed to aiplatform.init() as the default location.
REGION = "us-central1"  # @param {type:"string"}
# Cloud Storage bucket used as the staging bucket and as the destination of
# the exported statistics file.
BUCKET_URI = "gs://creditcard_default_project"

In [6]:
! gsutil ls -al $BUCKET_URI

# Import Vertex AI SDK for Python

In [7]:
import json
import os
import sys

import numpy as np
from google.cloud import aiplatform, bigquery
from google.cloud.aiplatform import gapic as aip

# Set the SDK-wide defaults (project, location, staging bucket) once so the
# aiplatform calls below do not have to repeat them.
aiplatform.init(
    project=PROJECT_ID,
    location=REGION,
    staging_bucket=BUCKET_URI,
)

In [8]:
# Accelerator configuration: neither an accelerator type nor a count is set,
# for training or for deployment.
TRAIN_GPU = None
TRAIN_NGPU = None

DEPLOY_GPU = None
DEPLOY_NGPU = None

In [9]:
# Image tags of the training and prediction containers.
TRAIN_VERSION = "tf-cpu.2-8"
DEPLOY_VERSION = "tf2-cpu.2-8"

# Full container image URIs built from the tags above.
# NOTE(review): task.py trains an XGBClassifier and uploads it as model.pkl;
# confirm that this TensorFlow prediction image can serve that artifact.
TRAIN_IMAGE = f"us-docker.pkg.dev/vertex-ai/training/{TRAIN_VERSION}:latest"
DEPLOY_IMAGE = f"us-docker.pkg.dev/vertex-ai/prediction/{DEPLOY_VERSION}:latest"

print("Training:", TRAIN_IMAGE, TRAIN_GPU, TRAIN_NGPU)
print("Deployment:", DEPLOY_IMAGE, DEPLOY_GPU, DEPLOY_NGPU)

Training: us-docker.pkg.dev/vertex-ai/training/tf-cpu.2-8:latest None None
Deployment: us-docker.pkg.dev/vertex-ai/prediction/tf2-cpu.2-8:latest None None


In [10]:
# Training machine, spelled "<machine family>-<vCPU count>".
MACHINE_TYPE = "n1-standard"
VCPU = "4"
TRAIN_COMPUTE = f"{MACHINE_TYPE}-{VCPU}"
print("Train machine type", TRAIN_COMPUTE)

# Deployment machine; configured separately from the training machine.
MACHINE_TYPE = "n1-standard"
VCPU = "4"
DEPLOY_COMPUTE = "-".join((MACHINE_TYPE, VCPU))
print("Deploy machine type", DEPLOY_COMPUTE)

Train machine type n1-standard-4
Deploy machine type n1-standard-4


In [11]:
# BigQuery table holding the source data, written with a "bq://" prefix.
# It is downloaded below for the statistics and is the bq_source of the
# Vertex AI tabular dataset.
BQ_SOURCE = "bq://mle-creditcard-project.project_datasets.creditcard_default_taiwan"

In [17]:
# Calculate mean and std across all rows

# NOTE(review): NA_VALUES is not referenced by the other statements visible in
# this cell; confirm whether it is still needed.
NA_VALUES = ["NA", "."]

# Set up BigQuery clients
# The client is bound to PROJECT_ID and is used by download_table() below.
bqclient = bigquery.Client(project=PROJECT_ID)


def download_table(bq_table_uri: str):
    """Download a whole BigQuery table into a DataFrame.

    Args:
        bq_table_uri: Table ID string, optionally prefixed with "bq://".

    Returns:
        The result of ``to_dataframe()`` on the rows listed by the
        module-level ``bqclient``.
    """
    scheme = "bq://"
    # TableReference.from_string() is given the ID without the "bq://" prefix.
    if bq_table_uri.startswith(scheme):
        table_id = bq_table_uri[len(scheme):]
    else:
        table_id = bq_table_uri

    table_ref = bigquery.TableReference.from_string(table_id)
    return bqclient.list_rows(table_ref).to_dataframe()


def clean_dataframe(df):
    """Return ``df`` without the rows that contain at least one missing value.

    The input frame itself is not modified.
    """
    without_missing = df.dropna()
    return without_missing


def calculate_mean_and_std(df):
    """Compute the mean and standard deviation of each numeric column.

    Only columns whose dtype is float32, float64 or the nullable Int64 are
    considered. A summary of the frame is printed via ``df.info()``.

    Args:
        df: pandas DataFrame to summarize.

    Returns:
        Dict mapping column name to ``{"mean": float, "std": float}``. The
        values are built-in floats so the dict can be passed to ``json.dump``.
    """
    # DataFrame.info() prints its report itself and returns None; wrapping it
    # in print() would emit an extra "None" line.
    df.info()

    numeric_dtypes = {"float32", "float64", "Int64"}
    mean_and_std = {}
    for column, dtype in df.dtypes.items():
        if str(dtype) not in numeric_dtypes:
            continue
        # Cast to built-in float: NumPy scalars such as float32 (returned for
        # float32 columns) are rejected by the json module.
        mean_and_std[column] = {
            "mean": float(df[column].mean()),
            "std": float(df[column].std()),
        }

    return mean_and_std


dataframe = download_table(BQ_SOURCE)
dataframe = clean_dataframe(dataframe)
mean_and_std = calculate_mean_and_std(dataframe)

print("The mean and stds for each column are: " + str(mean_and_std))

# Write to a file
MEAN_AND_STD_JSON_FILE = "mean_and_std.json"

with open(MEAN_AND_STD_JSON_FILE, "w") as outfile:
    json.dump(mean_and_std, outfile)

# Save to the staging bucket
! gsutil cp {MEAN_AND_STD_JSON_FILE} {BUCKET_URI}

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30000 entries, 0 to 29999
Data columns (total 25 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   ID         30000 non-null  Int64  
 1   LIMIT_BAL  30000 non-null  Int64  
 2   SEX        30000 non-null  object 
 3   EDUCATION  30000 non-null  object 
 4   MARRIAGE   30000 non-null  object 
 5   AGE        30000 non-null  Int64  
 6   PAY_0      30000 non-null  Int64  
 7   PAY_2      30000 non-null  Int64  
 8   PAY_3      30000 non-null  Int64  
 9   PAY_4      30000 non-null  Int64  
 10  PAY_5      30000 non-null  Int64  
 11  PAY_6      30000 non-null  Int64  
 12  BILL_AMT1  30000 non-null  Int64  
 13  BILL_AMT2  30000 non-null  Int64  
 14  BILL_AMT3  30000 non-null  Int64  
 15  BILL_AMT4  30000 non-null  Int64  
 16  BILL_AMT5  30000 non-null  Int64  
 17  BILL_AMT6  30000 non-null  Int64  
 18  PAY_AMT1   30000 non-null  Int64  
 19  PAY_AMT2   30000 non-null  Int64  
 20  PAY_AM

In [18]:
# Create a Vertex AI tabular dataset backed by the BigQuery source table.
dataset = aiplatform.TabularDataset.create(
    bq_source=BQ_SOURCE,
    display_name="ccd-ds",
)

Creating TabularDataset
Create TabularDataset backing LRO: projects/153707083586/locations/us-central1/datasets/5452377007071428608/operations/2821959883325177856
TabularDataset created. Resource name: projects/153707083586/locations/us-central1/datasets/5452377007071428608
To use this TabularDataset in another session:
ds = aiplatform.TabularDataset('projects/153707083586/locations/us-central1/datasets/5452377007071428608')


In [30]:
# Job name made distinct per run by the timestamp suffix.
JOB_NAME = f"ccd_train_job{TIMESTAMP}"

# "mirror" only when two or more GPUs are requested; otherwise "single".
TRAIN_STRATEGY = "mirror" if TRAIN_NGPU and TRAIN_NGPU >= 2 else "single"

# EPOCHS = 20
# BATCH_SIZE = 10

# Command-line arguments handed to task.py.
CMDARGS = [
    # "--epochs=" + str(EPOCHS),
    # "--batch_size=" + str(BATCH_SIZE),
    f"--distribute={TRAIN_STRATEGY}",
    f"--mean_and_std_json_file={BUCKET_URI}/{MEAN_AND_STD_JSON_FILE}",
]

In [28]:
%%writefile task.py

import argparse
import tensorflow as tf
import numpy as np
import os
from xgboost import XGBClassifier
import pandas as pd
import tensorflow as tf
import pickle
from sklearn.preprocessing import LabelEncoder
from google.cloud import bigquery
from google.cloud import storage

# Data split locations, read from the AIP_* environment variables
# (None when a variable is not set).
training_data_uri = os.environ.get("AIP_TRAINING_DATA_URI")
validation_data_uri = os.environ.get("AIP_VALIDATION_DATA_URI")
test_data_uri = os.environ.get("AIP_TEST_DATA_URI")

# Command-line arguments
parser = argparse.ArgumentParser()
# parser.add_argument('--epochs', dest='epochs',
#                     default=10, type=int,
#                     help='Number of epochs.')
# parser.add_argument('--batch_size', dest='batch_size',
#                     default=10, type=int,
#                     help='Batch size.')
parser.add_argument(
    '--distribute',
    dest='distribute',
    type=str,
    default='single',
    help='Distributed training strategy.',
)
parser.add_argument(
    '--mean_and_std_json_file',
    dest='mean_and_std_json_file',
    type=str,
    help='GCS URI to the JSON file with pre-calculated column means and standard deviations.',
)
args = parser.parse_args()

def download_blob(bucket_name, source_blob_name, destination_file_name):
    """Download one object from a Cloud Storage bucket to a local file.

    Args:
        bucket_name: Name of the bucket, e.g. "your-bucket-name".
        source_blob_name: Name of the object, e.g. "storage-object-name".
        destination_file_name: Local target path, e.g. "local/path/to/file".
    """
    # `Bucket.blob` only builds a client-side handle; unlike `Bucket.get_blob`
    # it retrieves nothing from Cloud Storage, which is all that is needed to
    # start a download.
    blob = storage.Client().bucket(bucket_name).blob(source_blob_name)
    blob.download_to_filename(destination_file_name)

    print(f"Blob {source_blob_name} downloaded to {destination_file_name}.")

def extract_bucket_and_prefix_from_gcs_path(gcs_path: str):
    """Split a Cloud Storage path into its bucket name and object prefix.

    Example:

        extract_bucket_and_prefix_from_gcs_path("gs://example-bucket/path/to/folder")
        # -> ("example-bucket", "path/to/folder")

    Args:
        gcs_path (str):
            Required. Path to a Cloud Storage folder or resource. A leading
            "gs://" and one trailing "/" are both optional and are removed.

    Returns:
        Tuple[str, Optional[str]]
            The (bucket, prefix) pair. The prefix is None when the path holds
            nothing but the bucket name.
    """
    scheme = "gs://"
    path = gcs_path[len(scheme):] if gcs_path.startswith(scheme) else gcs_path
    if path.endswith("/"):
        path = path[:-1]

    # Everything before the first "/" is the bucket, the rest is the prefix.
    bucket, separator, prefix = path.partition("/")
    return (bucket, prefix if separator else None)

def download_mean_and_std(mean_and_std_json_file):
    """Download the column statistics JSON from Cloud Storage and parse it.

    Args:
        mean_and_std_json_file: GCS URI of the JSON file.

    Returns:
        The parsed JSON content.

    Raises:
        ValueError: If the URI names only a bucket and no object.
    """
    import json

    bucket, blob_name = extract_bucket_and_prefix_from_gcs_path(mean_and_std_json_file)
    if not blob_name:
        raise ValueError(
            "Expected a GCS URI pointing to a file, got: {!r}".format(mean_and_std_json_file)
        )

    # Save under the object's base name in the working directory. Reusing the
    # full object path as the local path fails for nested objects
    # ("dir/stats.json") because the local directory does not exist.
    local_path = os.path.basename(blob_name)
    download_blob(bucket_name=bucket, source_blob_name=blob_name, destination_file_name=local_path)

    with open(local_path, 'r') as file:
        return json.load(file)

# Load the per-column statistics from the file named by the
# --mean_and_std_json_file argument.
mean_and_std = download_mean_and_std(args.mean_and_std_json_file)

# Select the tf.distribute strategy requested through --distribute.
# NOTE(review): every use of `strategy` further down in this script is
# commented out; confirm whether this selection is still needed.

# Single Machine, single compute device
if args.distribute == 'single':
    # tf.test.is_gpu_available() is deprecated; listing the physical GPU
    # devices is the documented replacement.
    if tf.config.list_physical_devices("GPU"):
        strategy = tf.distribute.OneDeviceStrategy(device="/gpu:0")
    else:
        strategy = tf.distribute.OneDeviceStrategy(device="/cpu:0")
# Single Machine, multiple compute device
elif args.distribute == 'mirror':
    strategy = tf.distribute.MirroredStrategy()
# Multiple Machine, multiple compute device
elif args.distribute == 'multi':
    strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()

# Set up training variables
# LABEL_COLUMN is popped off the frames as the target; UNUSED_COLUMNS are
# dropped by preprocess() before training.
LABEL_COLUMN = "default_"
UNUSED_COLUMNS = ['ID', 'SEX', 'EDUCATION', 'MARRIAGE']
# NOTE(review): NA_VALUES is not referenced elsewhere in the visible script.
NA_VALUES = ["NA", "."]

# Possible categorical values
# These lists are the categories of the dtypes in _CATEGORICAL_TYPES below.
MARRIAGE = ['Married', 'Single', 'Other', '0']
EDUCATION = ['University', 'Graduate school', 'High School',
             'Unknown', 'Others', '0']
SEX = ['F', 'M']


# Set up BigQuery clients
# Created without arguments; used by download_table() below.
bqclient = bigquery.Client()

def download_table(bq_table_uri: str):
    """Download a whole BigQuery table into a DataFrame.

    Args:
        bq_table_uri: Table ID string, optionally prefixed with "bq://".

    Returns:
        The result of ``to_dataframe(create_bqstorage_client=False)`` on the
        rows listed by the module-level ``bqclient``.
    """
    scheme = "bq://"
    # TableReference.from_string() is given the ID without the "bq://" prefix.
    if bq_table_uri.startswith(scheme):
        table_id = bq_table_uri[len(scheme):]
    else:
        table_id = bq_table_uri

    table_ref = bigquery.TableReference.from_string(table_id)
    return bqclient.list_rows(table_ref).to_dataframe(create_bqstorage_client=False)


# Download the three data splits, in training / validation / test order.
df_train, df_validation, df_test = (
    download_table(split_uri)
    for split_uri in (training_data_uri, validation_data_uri, test_data_uri)
)

def clean_dataframe(df):
    """Return ``df`` without the rows that contain at least one missing value.

    The input frame itself is not modified.
    """
    without_missing = df.dropna()
    return without_missing


# Drop incomplete rows from the training and validation splits; df_test is
# left exactly as downloaded.
df_train, df_validation = clean_dataframe(df_train), clean_dataframe(df_validation)

# Categorical dtype for each categorical column name, built from the lists of
# possible values defined above.
_CATEGORICAL_TYPES = {
    column_name: pd.api.types.CategoricalDtype(categories=allowed_values)
    for column_name, allowed_values in (
        ("marriage", MARRIAGE),
        ("education", EDUCATION),
        ("sex", SEX),
    )
}


def standardize(df, mean_and_std):
    """Scale float32 columns to z-scores using pre-computed statistics.

    Each float32 column that has an entry in ``mean_and_std`` has its mean
    subtracted and is divided by its standard deviation. This can help the
    model converge during training.

    Args:
      df: Pandas df. It is modified in place.
      mean_and_std: Dict mapping column name to {"mean": ..., "std": ...}.

    Returns:
      The same df object with the applicable columns scaled. float32 columns
      without statistics are left unchanged. Columns whose standard deviation
      is zero or NaN are only centered, because dividing by such a value
      would fill the column with NaN/inf.
    """
    for column, dtype in df.dtypes.items():
        if str(dtype) != "float32":
            continue
        stats = mean_and_std.get(column)
        if stats is None:
            # No statistics were exported for this column; leave it as is.
            continue

        df[column] -= stats["mean"]
        std = stats["std"]
        # `std == std` is False only for NaN.
        if std and std == std:
            df[column] /= std
    return df

def preprocess(df):
    """Converts categorical features to numeric. Removes unused columns.

    Steps: drop UNUSED_COLUMNS, drop rows containing NaN, cast every numeric
    (non-boolean) column to float32, and replace each object column with the
    integer codes of its categorical dtype from _CATEGORICAL_TYPES.

    Args:
      df: Pandas df with raw data. It is not modified.

    Returns:
      df with preprocessed data

    Raises:
      KeyError: If UNUSED_COLUMNS contains a column missing from df, or an
        object column has no entry in _CATEGORICAL_TYPES.
    """
    df = df.drop(columns=UNUSED_COLUMNS)

    # Drop rows with NaN's
    df = df.dropna()

    # Convert integer valued (numeric) columns to floating point. The dtype is
    # tested with the pandas predicates rather than a fixed list of names so
    # that int64 and the nullable Int64 columns are converted as well.
    numeric_columns = [
        column
        for column, dtype in df.dtypes.items()
        if pd.api.types.is_numeric_dtype(dtype)
        and not pd.api.types.is_bool_dtype(dtype)
    ]
    if numeric_columns:
        df = df.astype({column: "float32" for column in numeric_columns})

    # Convert categorical columns to numeric
    for column in df.select_dtypes(["object"]).columns:
        df[column] = df[column].astype(_CATEGORICAL_TYPES[column]).cat.codes
    return df


def convert_dataframe_to_dataset(
    df_train,
    df_validation,
    mean_and_std
):
    """Turn the training and validation frames into NumPy features and labels.

    Both frames go through preprocess(), LABEL_COLUMN is split off as the
    target, and the targets are label encoded.

    Args:
      df_train: Raw training DataFrame.
      df_validation: Raw validation DataFrame.
      mean_and_std: Per-column statistics. Currently unused because the
        standardization step below is commented out.

    Returns:
      Tuple (x_train, y_train, x_test, y_validation); x_test holds the
      validation features.

    Raises:
      ValueError: From LabelEncoder.transform when the validation labels
        contain a value that does not occur in the training labels.
    """
    df_train = preprocess(df_train)
    df_validation = preprocess(df_validation)

    # pop() removes the label column, leaving only features in the frames.
    df_train_y = df_train.pop(LABEL_COLUMN)
    df_validation_y = df_validation.pop(LABEL_COLUMN)

    # Join train_x and eval_x to normalize on overall means and standard
    # deviations. Then separate them again.
    all_x = pd.concat([df_train, df_validation], keys=["train", "eval"])
    # all_x = standardize(all_x, mean_and_std)
    df_train_x, df_validation_x = all_x.xs("train"), all_x.xs("eval")

    # Convert to numpy representation
    x_train = np.asarray(df_train_x)
    x_test = np.asarray(df_validation_x)

    # Label encode the target column. The encoder is fitted on the training
    # labels only and then reused for the validation labels; fitting it a
    # second time could assign different codes to the same label.
    le = LabelEncoder()
    y_train = le.fit_transform(df_train_y)
    y_validation = le.transform(df_validation_y)

    return x_train, y_train, x_test, y_validation

# Build the NumPy feature matrices and the encoded label vectors.
x_train, y_train, x_test, y_validation = convert_dataframe_to_dataset(
    df_train,
    df_validation,
    mean_and_std,
)

# # Shuffle train set
# dataset_train = dataset_train.shuffle(len(df_train))

# def create_model(num_features):
    # Create model
    # xg_model = XGBClassifier()
    # return model

# # Create the model
# with strategy.scope():
#     model = create_model(num_features=dataset_train._flat_shapes[0].dims[0].value)

# Set up datasets
# NUM_WORKERS = strategy.num_replicas_in_sync
# Here the batch size scales up by number of workers since
# `tf.data.Dataset.batch` expects the global batch size.
# GLOBAL_BATCH_SIZE = args.batch_size * NUM_WORKERS
# dataset_train = dataset_train.batch(GLOBAL_BATCH_SIZE)
# dataset_validation = dataset_validation.batch(GLOBAL_BATCH_SIZE)

# Train the model with the default XGBClassifier settings
xg_model = XGBClassifier()
xg_model.fit(x_train, y_train)

artifact_filename = 'model.pkl'

# Save model artifact to local filesystem (doesn't persist)
local_path = artifact_filename
with open(local_path, 'wb') as model_file:
    pickle.dump(xg_model, model_file)

# Upload model artifact to Cloud Storage, into the directory named by the
# AIP_MODEL_DIR environment variable
model_directory = os.environ['AIP_MODEL_DIR']
storage_path = os.path.join(model_directory, artifact_filename)
blob = storage.blob.Blob.from_string(storage_path, client=storage.Client())
blob.upload_from_filename(local_path)

Overwriting task.py


In [31]:
# Display name for the trained model, made distinct per run by the timestamp.
MODEL_DISPLAY_NAME = f"ccd-{TIMESTAMP}"

# Custom training job that runs task.py inside the training container.
# NOTE(review): task.py also imports xgboost, sklearn, pandas and
# google.cloud.storage, none of which are listed in `requirements`;
# presumably the training image provides them. Confirm.
job = aiplatform.CustomTrainingJob(
    display_name=JOB_NAME,
    script_path="task.py",
    container_uri=TRAIN_IMAGE,
    model_serving_container_image_uri=DEPLOY_IMAGE,
    requirements=["google-cloud-bigquery>=2.20.0", "db-dtypes"],
)

# Start the training on a single replica without accelerators.
# The GPU variant, kept for reference, differs only in the accelerator
# arguments:
# if TRAIN_GPU:
#     model = job.run(
#         dataset=dataset,
#         model_display_name=MODEL_DISPLAY_NAME,
#         bigquery_destination=f"bq://{PROJECT_ID}",
#         args=CMDARGS,
#         replica_count=1,
#         machine_type=TRAIN_COMPUTE,
#         accelerator_type=TRAIN_GPU.name,
#         accelerator_count=TRAIN_NGPU,
#     )
# else:
model = job.run(
    dataset=dataset,
    args=CMDARGS,
    model_display_name=MODEL_DISPLAY_NAME,
    bigquery_destination=f"bq://{PROJECT_ID}",
    machine_type=TRAIN_COMPUTE,
    replica_count=1,
    accelerator_count=0,
)

Training script copied to:
gs://creditcard_default_project/aiplatform-2022-06-22-03:18:33.896-aiplatform_custom_trainer_script-0.1.tar.gz.
Training Output directory:
gs://creditcard_default_project/aiplatform-custom-training-2022-06-22-03:18:34.045 
No dataset split provided. The service will use a default split.
View Training:
https://console.cloud.google.com/ai/platform/locations/us-central1/training/3683865605229772800?project=153707083586
CustomTrainingJob projects/153707083586/locations/us-central1/trainingPipelines/3683865605229772800 current state:
PipelineState.PIPELINE_STATE_RUNNING
CustomTrainingJob projects/153707083586/locations/us-central1/trainingPipelines/3683865605229772800 current state:
PipelineState.PIPELINE_STATE_RUNNING
CustomTrainingJob projects/153707083586/locations/us-central1/trainingPipelines/3683865605229772800 current state:
PipelineState.PIPELINE_STATE_RUNNING
CustomTrainingJob projects/153707083586/locations/us-central1/trainingPipelines/36838656052297728