# How to manage Machine Learning Models written in R with Vertex AI

In [90]:
library(reticulate)
library(glue)
library(IRdisplay)
# Bind reticulate to the python3 executable that Sys.which() locates.
use_python(Sys.which("python3"))

In [120]:
#' Run an external command and print or return its combined output.
#'
#' @param cmd Either the executable name (when `args` is supplied) or a full
#'   command line. A full command line is interpolated with `glue()` using the
#'   caller's variables and then split on single spaces into the executable
#'   and its arguments.
#' @param args Character vector of arguments handed to `system2()`. When
#'   `NULL` (the default, because `c()` is `NULL`) the arguments are taken
#'   from `cmd`.
#' @param intern If `TRUE`, return the captured output; otherwise print it.
#' @return With `intern = TRUE`, the character vector returned by `system2()`
#'   (stdout and stderr merged, one element per line); otherwise the result
#'   of `cat()`, i.e. `NULL`.
sh <- function(cmd, args = c(), intern = FALSE) {
    # Capture the caller's environment up front so {placeholders} resolve
    # against the caller's variables rather than this function's locals.
    caller_env <- parent.frame()
    if (is.null(args)) {
        cmd <- glue(cmd, .envir = caller_env)
        parts <- strsplit(cmd, " ")[[1]]
        cmd <- parts[1]
        # parts[-1] is empty for a bare command; 2:length(parts) would count
        # backwards and produce c(NA, cmd) as bogus arguments.
        args <- parts[-1]
    }
    ret <- system2(cmd, args, stdout = TRUE, stderr = TRUE)
    # system2() attaches "errmsg" to the captured output when one is available.
    if (!is.null(attr(ret, "errmsg", exact = TRUE))) {
        cat(attr(ret, "errmsg"), "\n")
    }
    if (intern) return(ret) else cat(paste(ret, collapse = "\n"))
}

#' Show a file in the notebook as syntax-highlighted HTML.
#'
#' Runs `pygmentize` on the file and passes the resulting HTML to
#' `display_html()`.
#'
#' @param filename Path of the file to render.
#' @return The value of `display_html()`; called for its display side effect.
display_file <- function(filename) {
    # Pass explicit arguments and shell-quote the path so that file names
    # containing spaces or braces are not split or re-interpolated.
    pygmentize_args <- c(
        "-g", shQuote(filename),
        "-f", "html",
        "-P", "full",
        "-O", "style=default"
    )
    html_lines <- sh("pygmentize", pygmentize_args, intern = TRUE)
    display_html(paste(html_lines, collapse = "\n"))
}

## Set variables

In [3]:
# Fixed settings: region, Artifact Registry repository, image name and tag.
location <- "us-central1"
repo_name <- "vertex-r"
image_name <- "vertex-r"
image_tag <- "latest"

# Project id as reported by the local gcloud configuration.
project_id <- sh("gcloud config list --format 'value(core.project)'", intern = TRUE)

# Names derived from the project id: the storage bucket and the full image URI.
bucket <- glue("{project_id}-vertex-r")
image_uri <- glue("{location}-docker.pkg.dev/{project_id}/{repo_name}/{image_name}:{image_tag}")

## Prerequisites

In [119]:
# Enable the Artifact Registry API (artifactregistry.googleapis.com) via gcloud.
sh("gcloud services enable artifactregistry.googleapis.com")

In [121]:
# Create a Docker-format Artifact Registry repository named {repo_name} in
# {location}. Re-running this after the repository exists reports
# ALREADY_EXISTS and the command exits with status 1.
sh("gcloud artifacts repositories create {repo_name} --repository-format=docker --location={location}")

“running command ''gcloud' artifacts repositories create vertex-r --repository-format=docker --location=us-central1 2>&1' had status 1”


Resource temporarily unavailable 
ERROR: (gcloud.artifacts.repositories.create) ALREADY_EXISTS: the repository already exists

In [122]:
# sh("gcloud auth configure-docker {location}-docker.pkg.dev") ## not required

In [123]:
# Create the Cloud Storage bucket gs://{bucket} in {location}. If the bucket
# name is already taken, gsutil reports a 409 and exits with status 1.
sh("gsutil mb -l {location} gs://{bucket}")

“running command ''gsutil' mb -l us-central1 gs://astute-ace-336608-vertex-r 2>&1' had status 1”


Resource temporarily unavailable 
Creating gs://astute-ace-336608-vertex-r/...
ServiceException: 409 A Cloud Storage bucket named 'astute-ace-336608-vertex-r' already exists. Try another name. Bucket names must be globally unique across all Google Cloud projects, including those outside of your organization.

## Build a Docker container for R

In [116]:
# Render the Dockerfile (container specification) as highlighted HTML.
display_file("Dockerfile")

0,1
1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17,"# filename: Dockerfile - container specifications for using R in Vertex AI FROM gcr.io/deeplearning-platform-release/r-cpu.4-1:latest WORKDIR /root COPY train.R /root/train.R COPY serve.R /root/serve.R # Install Fortran RUN apt-get update RUN apt-get install gfortran -yy # Install R packages RUN Rscript -e ""install.packages('plumber')"" RUN Rscript -e ""install.packages('randomForest')"" EXPOSE 8080"


In [117]:
# Render train.R, the script the container runs for training.
display_file("train.R")

0,1
1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30,"#!/usr/bin/env Rscript # filename: train.R - train a Random Forest model on Vertex AI Managed Dataset library(tidyverse) library(data.table) library(randomForest) Sys.getenv() project_id <- Sys.getenv(""CLOUD_ML_PROJECT_ID"") job_spec <- Sys.getenv(""CLOUD_MD_JOB"") location <- Sys.getenv(""CLOUD_ML_REGION"") model_dir <- Sys.getenv(""AIP_MODEL_DIR"") checkpoint_dir <- Sys.getenv(""AIP_CHECKPOINT_DIR"") dir.create(""training"") dir.create(""validation"") dir.create(""test"") system2(""gsutil"", c(""cp"", Sys.getenv(""AIP_TRAINING_DATA_URI""), ""training/"")) system2(""gsutil"", c(""cp"", Sys.getenv(""AIP_VALIDATION_DATA_URI""), ""validation/"")) system2(""gsutil"", c(""cp"", Sys.getenv(""AIP_TEST_DATA_URI""), ""test/"")) training_df <- list.files(""training"", full.names = TRUE) %>% map_df(~fread(.)) validation_df <- list.files(""validation"", full.names = TRUE) %>% map_df(~fread(.)) test_df <- list.files(""validation"", full.names = TRUE) %>% map_df(~fread(.)) print(training_df) rf <- randomForest(medianHouseValue ~ ., data=training_df, ntree=100) rf saveRDS(rf, ""rf.rds"") system2(""gsutil"", c(""cp"", ""rf.rds"", model_dir))"


In [118]:
# Render serve.R, the script the container runs for serving.
display_file("serve.R")

0,1
1  2  3  4  5  6  7  8  9 10 11 12 13 14 15,"#!/usr/bin/env Rscript # filename: serve.R - serve predictions from a Random Forest model Sys.getenv() library(plumber) #system2(""gsutil"", c(""cp"", ""-r"", Sys.getenv(""AIP_STORAGE_URI""), ""."")) predict <- function(req, res) {  return(list(predictions=list(1, 2, 3))) } pr() %>%  pr_get(Sys.getenv(""AIP_HEALTH_ROUTE""), function() ""OK"") %>%  pr_post(Sys.getenv(""AIP_PREDICT_ROUTE""), predict) %>%  pr_run(host = ""0.0.0.0"", port=as.integer(Sys.getenv(""AIP_HTTP_PORT"", 8080)))"


In [232]:
# Submit the current directory to Cloud Build in {location} and tag the
# resulting image as {image_uri}; the build is allowed up to one hour.
sh("gcloud builds submit --region={location} --tag={image_uri} --timeout=1h")

Creating temporary tarball archive of 12 file(s) totalling 8.0 KiB before compression.
Uploading tarball of [.] to [gs://astute-ace-336608_cloudbuild/source/1651149122.191425-0ff1e836506d40b3a28b21519563b13e.tgz]
Created [https://cloudbuild.googleapis.com/v1/projects/astute-ace-336608/locations/us-central1/builds/7a365a5e-0868-497c-b647-b84a6583aa89].
Logs are available at [https://console.cloud.google.com/cloud-build/builds;region=us-central1/7a365a5e-0868-497c-b647-b84a6583aa89?project=888342260584].
----------------------------- REMOTE BUILD OUTPUT ------------------------------
starting build "7a365a5e-0868-497c-b647-b84a6583aa89"

FETCHSOURCE
Fetching storage object: gs://astute-ace-336608_cloudbuild/source/1651149122.191425-0ff1e836506d40b3a28b21519563b13e.tgz#1651149122488426
Copying gs://astute-ace-336608_cloudbuild/source/1651149122.191425-0ff1e836506d40b3a28b21519563b13e.tgz#1651149122488426...
/ [1 files][  3.6 KiB/  3.6 KiB]                                                
O

## Prepare training data

In [124]:
# Destination object for the prepared CSV in the project bucket.
data_uri <- glue("gs://{bucket}/california.csv")

# Download and unpack the California Housing archive into ./CaliforniaHousing.
download.file("https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.tgz", destfile = "cal_housing.tgz")
untar("cal_housing.tgz")

# Column names are the first ':'-separated field of each line of the .domain file.
housing.cols <- read.csv("CaliforniaHousing/cal_housing.domain", sep = ":", header = FALSE)[, 1]

# The .data file has no header row (names come from the .domain file), so
# header = FALSE is required: read.csv() defaults to header = TRUE and would
# otherwise consume the first record as a header and silently drop it.
housing <- read.csv("CaliforniaHousing/cal_housing.data", header = FALSE, col.names = housing.cols)

# Write the data with a header row and upload it to Cloud Storage.
write.csv(housing, "housing.csv", row.names = FALSE)
system2("gsutil", c("cp", "housing.csv", data_uri))

In [14]:
# Load the google.cloud.aiplatform Python module through reticulate and
# initialise it with the project, region and staging bucket set earlier.
vertex <- import("google.cloud.aiplatform")
vertex$init(project = project_id, location = location, staging_bucket = bucket)

In [15]:
# Create a Vertex AI tabular dataset backed by the CSV uploaded to data_uri.
dataset <- vertex$TabularDataset$create(
    display_name = "California Housing Dataset",
    gcs_source = data_uri
)

In [16]:
# Define a custom-container training job. The same image is used for both
# phases: it runs train.R for training and serve.R for serving.
job <- vertex$CustomContainerTrainingJob(
    display_name = "vertex-r",
    container_uri = image_uri,
    command = c("Rscript", "train.R"),
    model_serving_container_command = c("Rscript", "serve.R"),
    model_serving_container_image_uri = image_uri
)

In [17]:
# Run the training job on the dataset using an n1-standard-4 machine and keep
# the result in `model`.
# NOTE(review): presumably this call blocks until training completes — confirm.
model <- job$run(
    dataset=dataset,
    model_display_name = "vertex-r-model",
    machine_type = "n1-standard-4"
)

In [18]:
# Inspect the display name of the model returned by the training job.
model$display_name

In [19]:
# Inspect the model's resource name.
model$resource_name

In [20]:
# Inspect the model's uri attribute.
# NOTE(review): presumably the Cloud Storage location of the model artifacts — confirm.
model$uri

In [21]:
# Create a Vertex AI endpoint in the configured project and region.
endpoint <- vertex$Endpoint$create(
    display_name = "California Housing Endpoint",
    project = project_id,
    location = location
)

In [22]:
# Deploy the trained model to the endpoint on an n1-standard-4 machine.
model$deploy(endpoint = endpoint, machine_type = "n1-standard-4")

<google.cloud.aiplatform.models.Endpoint> 
resource name: projects/888342260584/locations/us-central1/endpoints/5210356905611886592

## Cleanup

In [23]:
# Tear down the resources created above. Models are undeployed from the
# endpoint before the endpoint itself is deleted; the dataset, model and
# training job are then deleted in turn.
endpoint$undeploy_all()
endpoint$delete()
dataset$delete()
model$delete()
job$delete()

<google.cloud.aiplatform.models.Endpoint> 
resource name: projects/888342260584/locations/us-central1/endpoints/5210356905611886592