## Set Variables

In [1]:
# Notebook-wide settings: GCP project, container image URIs and the staging bucket.
PROJECT_ID = 'jchavezar-demo'
STAGING_BUCKET = 'gs://vtx-staging'

# Both images live in the project's Container Registry, so build the URIs from PROJECT_ID.
TRAIN_IMAGE = f'gcr.io/{PROJECT_ID}/pytorch-custom-random-t:v2'
PREDICTION_IMAGE = f'gcr.io/{PROJECT_ID}/pytorch-custom-random-p:v2'

# Training Block

In [2]:
## Create Folder Code Files Structure
!rm -fr training
!mkdir training

## Create Training Code [PyTorch]

In [3]:
%%writefile training/train.py
#%%
import pandas as pd
from pytorch_tabular import TabularModel
from pytorch_tabular.models import CategoryEmbeddingModelConfig
from pytorch_tabular.config import DataConfig, OptimizerConfig, TrainerConfig, ExperimentConfig
from pytorch_tabular.models.common.heads import LinearHeadConfig

train = pd.read_csv('gs://vtx-datasets-public/pytorch_tabular/synthetic/train.csv')
test = pd.read_csv('gs://vtx-datasets-public/pytorch_tabular/synthetic/test.csv')
val = pd.read_csv('gs://vtx-datasets-public/pytorch_tabular/synthetic/val.csv')

cat_columns = [col for col in train.columns if 'cat' in col]
num_columns = [col for col in train.columns if 'num' in col]

data_config = DataConfig(
    target=['target'], #target should always be a list. Multi-targets are only supported for regression. Multi-Task Classification is not implemented
    continuous_cols=num_col_names,
    categorical_cols=cat_col_names,
)
trainer_config = TrainerConfig(
    auto_lr_find=True, # Runs the LRFinder to automatically derive a learning rate
    batch_size=1024,
    max_epochs=100,
    accelerator="auto", # can be 'cpu','gpu', 'tpu', or 'ipu' 
)
optimizer_config = OptimizerConfig()


head_config = LinearHeadConfig(
    layers="", # No additional layer in head, just a mapping layer to output_dim
    dropout=0.1,
    initialization="kaiming"
).__dict__ # Convert to dict to pass to the model config (OmegaConf doesn't accept objects)

model_config = CategoryEmbeddingModelConfig(
    task="classification",
    layers="32-16", # Number of nodes in each layer
    activation="LeakyReLU", # Activation between each layers
    dropout=0.1,
    initialization="kaiming",
    head = "LinearHead", #Linear Head
    head_config = head_config, # Linear Head Config
    learning_rate = 1e-3
)

tabular_model = TabularModel(
    data_config=data_config,
    model_config=model_config,
    optimizer_config=optimizer_config,
    trainer_config=trainer_config,
)

tabular_model.fit(train=train, validation=val)
tabular_model.save_model('/gcs/vtx-models/pytorch/tabular_random')

Writing training/train.py


### Build Image and Push to GCR

In [6]:
%%writefile training/Dockerfile
FROM pytorch/pytorch:1.12.1-cuda11.3-cudnn8-devel

# Install dependencies before copying the source so that editing train.py does not
# invalidate the (slow) pip layer. The extras spec is quoted so the shell never
# treats the brackets as a glob, and the pip cache is not kept in the image.
RUN pip install --no-cache-dir "pytorch_tabular[extra]" gcsfs

# Copy the training code into the image's working directory.
COPY . .

ENTRYPOINT ["python", "train.py"]

Overwriting training/Dockerfile


In [7]:
!gcloud builds submit -t $TRAIN_IMAGE training/.

Creating temporary tarball archive of 2 file(s) totalling 2.2 KiB before compression.
Uploading tarball of [training/.] to [gs://jchavezar-demo_cloudbuild/source/1679410868.018461-6def21ef3d944df5a5740e9cff97c5ff.tgz]
Created [https://cloudbuild.googleapis.com/v1/projects/jchavezar-demo/locations/global/builds/d67657c8-2139-41ae-8c31-3be5a9db90bd].
Logs are available at [ https://console.cloud.google.com/cloud-build/builds/d67657c8-2139-41ae-8c31-3be5a9db90bd?project=569083142710 ].
----------------------------- REMOTE BUILD OUTPUT ------------------------------
starting build "d67657c8-2139-41ae-8c31-3be5a9db90bd"

FETCHSOURCE
Fetching storage object: gs://jchavezar-demo_cloudbuild/source/1679410868.018461-6def21ef3d944df5a5740e9cff97c5ff.tgz#1679410868220541
Copying gs://jchavezar-demo_cloudbuild/source/1679410868.018461-6def21ef3d944df5a5740e9cff97c5ff.tgz#1679410868220541...
/ [1 files][  1.2 KiB/  1.2 KiB]                                                
Operation completed over 1 

## Run Training CustomJob using Container Image

In [8]:
from google.cloud import aiplatform

# Bind the SDK to the project and staging bucket defined in the settings cell.
aiplatform.init(staging_bucket=STAGING_BUCKET, project=PROJECT_ID)

# Hardware for the single training replica: one n1-standard-4 VM with one T4 GPU.
training_machine_spec = {
    "machine_type": "n1-standard-4",
    "accelerator_type": "NVIDIA_TESLA_T4",
    "accelerator_count": 1,
}

# The container runs with its own ENTRYPOINT; no command/args overrides are passed.
training_container_spec = {
    "image_uri": TRAIN_IMAGE,
    "command": [],
    "args": [],
}

worker_pool_specs = [
    {
        "machine_spec": training_machine_spec,
        "replica_count": 1,
        "container_spec": training_container_spec,
    }
]

my_job = aiplatform.CustomJob(
    worker_pool_specs=worker_pool_specs,
    display_name='pytorch_tabular_custom',
)

# Submit the job; the captured output shows the call polling the job state until it ends.
my_job.run()

Creating CustomJob
CustomJob created. Resource name: projects/569083142710/locations/us-central1/customJobs/2428807579300790272
To use this CustomJob in another session:
custom_job = aiplatform.CustomJob.get('projects/569083142710/locations/us-central1/customJobs/2428807579300790272')
View Custom Job:
https://console.cloud.google.com/ai/platform/locations/us-central1/training/2428807579300790272?project=569083142710
CustomJob projects/569083142710/locations/us-central1/customJobs/2428807579300790272 current state:
JobState.JOB_STATE_PENDING
CustomJob projects/569083142710/locations/us-central1/customJobs/2428807579300790272 current state:
JobState.JOB_STATE_PENDING
CustomJob projects/569083142710/locations/us-central1/customJobs/2428807579300790272 current state:
JobState.JOB_STATE_PENDING
CustomJob projects/569083142710/locations/us-central1/customJobs/2428807579300790272 current state:
JobState.JOB_STATE_PENDING
CustomJob projects/569083142710/locations/us-central1/customJobs/2428807

RuntimeError: Job failed with:
code: 3
message: "The replica workerpool0-0 exited with a non-zero status of 1. To find out more about why your job exited please check the logs: https://console.cloud.google.com/logs/viewer?project=569083142710&resource=ml_job%2Fjob_id%2F2428807579300790272&advancedFilter=resource.type%3D%22ml_job%22%0Aresource.labels.job_id%3D%222428807579300790272%22"


# Prediction Block

In [None]:
## Create Folder Code Files Structure
!rm -fr prediction
!mkdir prediction
!mkdir prediction/app
!mkdir prediction/app/tabular_random

## Create Prediction Code [Uvicorn:FastAPI PyTorch]

In [None]:
%%writefile prediction/app/main.py
import json
import os
import pandas as pd
from fastapi import Request, FastAPI
from pytorch_tabular import TabularModel

app = FastAPI()
columns = pd.read_csv('gs://vtx-datasets-public/pytorch_tabular/synthetic/train.csv', nrows=0).iloc[:,:-1].columns.to_list()
loaded_model = TabularModel.load_from_checkpoint("../../training/tabular_random_model")

@app.get('/health_check')
def health():
    return 200
if os.environ.get('AIP_PREDICT_ROUTE') is not None:
    method = os.environ['AIP_PREDICT_ROUTE']
else:
    method = '/predict'

@app.post(method)
async def predict(request: Request):
    print("----------------- PREDICTING -----------------")
    body = await request.json()
    instances = body["instances"]
    data_pred = pd.DataFrame([instances],columns=columns)
    print(data_pred)
    outputs = loaded_model.predict(data_pred)
    response = outputs['prediction'].tolist()
    print("----------------- OUTPUTS -----------------")
    return {"predictions": response}

### Build Image and Push to GCR

In [None]:
%%writefile prediction/Dockerfile
FROM pytorch/pytorch:1.12.1-cuda11.3-cudnn8-devel

WORKDIR /app

# Install dependencies before copying app/ (which also carries the model artifacts),
# so that a new model or a code edit does not invalidate the slow pip layer.
# The extras spec is quoted so the shell never treats the brackets as a glob.
RUN pip install --no-cache-dir "pytorch_tabular[extra]" uvicorn fastapi gcsfs

# Server code and model files.
COPY app /app

# Port the uvicorn server below listens on.
EXPOSE 8080

CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8080"]

## Copy Model from GCS

In [None]:
!gsutil cp -r gs://vtx-models/pytorch/tabular_random prediction/app/tabular_random

## Create Container Image and Push it

In [None]:
!gcloud builds submit -t $PREDICTION_IMAGE prediction/.

## Upload to Model Registry

In [None]:
# Register the serving container as a model: the container listens on port 8080
# and answers health probes on /health_check.
upload_settings = {
    "display_name": "synthetic_data_pytorch",
    "serving_container_image_uri": PREDICTION_IMAGE,
    "serving_container_health_route": "/health_check",
    "serving_container_ports": [8080],
}
model = aiplatform.Model.upload(**upload_settings)

In [None]:
# Deploy the uploaded model on a single, fixed-size replica (min == max == 1).
# NOTE(review): training ran on a T4; confirm an A100 machine is really needed for serving.
deploy_settings = {
    "deployed_model_display_name": 'synthetic_data_pytorch',
    "machine_type": 'a2-highgpu-1g',
    "accelerator_type": 'NVIDIA_TESLA_A100',
    "accelerator_count": 1,
    "min_replica_count": 1,
    "max_replica_count": 1,
}
endpoint = model.deploy(**deploy_settings)