In [None]:
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Environment Setup

## Overview

In this tutorial, you learn to use `Vertex AI Pipelines`, `KubeFlow Components` and `Google Cloud Pipeline Components` to build a `custom` tabular regression model. The `Vertex Pipelines workshop` is a series of labs on how to build an end-to-end pipeline using Vertex Pipelines and Kubeflow Pipelines (kfp). In the pipeline we orchestrate data creation, data processing, model training and evaluation, and model deployment. We'll also see how to send payloads to the deployed endpoint and how to run batch prediction jobs.

In this workshop we'll use the **public dataset** [Auto MPG](https://archive.ics.uci.edu/ml/datasets/auto+mpg) for demonstration purposes. The data concerns city-cycle fuel consumption in miles per gallon, to be predicted in terms of 3 multivalued discrete and 5 continuous attributes. The objective will be to build a model to predict "MPG" (Miles per Gallon).

The Google Cloud Components are [documented here](https://google-cloud-pipeline-components.readthedocs.io/en/latest/google_cloud_pipeline_components.aiplatform.html#module-google_cloud_pipeline_components.aiplatform).

The KubeFlow Components are [documented here](https://www.kubeflow.org/docs/components/pipelines/v1/sdk-v2/python-function-components/).

## Notebook Objective

In this notebook, you will set up your environment to run the notebooks of this workshop.

This lab uses the following Google Cloud services and resources:

- `BigQuery`
- `Vertex AI Pipelines`
- `Google Cloud Pipeline Components`
- `Vertex AI Model`
- `Vertex AI Model Registry`
- `Vertex AI Metadata`
- `Vertex AI Endpoint`

The steps performed in this notebook include:

* [Setup your environment](#Setup-your-environment)
* [Download Public Dataset Locally](#Download-Public-Dataset-Locally)
* [Store Dataset in Google Cloud Storage](#Store-Dataset-in-Google-Cloud-Storage)
* [Load Dataset into BigQuery](#Load-Dataset-into-BigQuery)
* [Create config file](#Create-config-file)





## Setup your environment

### Install additional packages

In [None]:
import os

# The Vertex AI Workbench Notebook product has specific requirements
IS_WORKBENCH_NOTEBOOK = os.getenv("DL_ANACONDA_HOME")
IS_USER_MANAGED_WORKBENCH_NOTEBOOK = os.path.exists(
    "/opt/deeplearning/metadata/env_version"
)

# Vertex AI Notebook requires dependencies to be installed with '--user'
USER_FLAG = ""
if IS_WORKBENCH_NOTEBOOK:
    USER_FLAG = "--user"

!pip install --upgrade --no-warn-conflicts '{USER_FLAG}' -q \
    google-cloud-aiplatform \
    google-cloud-pipeline-components \
    facets-overview \
    ipywidgets \
    google-cloud-storage \
    tensorflow==2.8.0 \
    plotly==5.10.0 \
    itables==1.2.0 \
    plotly==5.10.0 \
    kfp==1.8.12 

In [None]:
# Restart the kernel automatically so the packages installed above are picked up
import os

# The restart is skipped whenever the IS_TESTING environment variable is set.
running_under_test = bool(os.getenv("IS_TESTING"))

if not running_under_test:
    import IPython

    IPython.Application.instance().kernel.do_shutdown(True)

### Configure Project Constants

In [None]:
import google.auth
import random
import string
from typing import Union

# Generate unique ID to help w/ unique naming of certain pieces.
# The ID is random on purpose, so re-running this cell yields a new bucket name.
ID = "".join(random.choices(string.ascii_lowercase + string.digits, k=3))

# google.auth.default() returns (credentials, project_id). The project can be
# None when the environment does not define one; fail here instead of silently
# building a bucket name that starts with "None".
_credentials, PROJECT_ID = google.auth.default()
if not PROJECT_ID:
    raise RuntimeError(
        "Could not determine a default Google Cloud project. Configure one "
        "(for example with `gcloud config set project <PROJECT_ID>`) and "
        "re-run this cell."
    )

REGION = "us-central1"
BUCKET_NAME = f"{PROJECT_ID}-{ID}-bucket"
print('PROJECT_ID', PROJECT_ID)
print('BUCKET_NAME', BUCKET_NAME)

--------

#### Create Bucket

In [None]:
from src.helper import create_bucket

# Create the workshop bucket from the constants configured above.
# The helper's two return values are presumably the bucket's name and URI,
# going by the variable names -- confirm against src/helper.py.
new_bucket_name, new_bucket_uri = create_bucket(
    project_id=PROJECT_ID,
    region=REGION,
    bucket_name=BUCKET_NAME,
)

--------

## Download Public Dataset Locally

In [None]:
import pandas as pd

In [None]:
# Location of the raw, space-separated Auto MPG file
url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data'

# The file has no header row, so the column names are supplied here
column_names = [
    'MPG', 'Cylinders', 'Displacement',
    'Horsepower', 'Weight', 'Acceleration', 'Model Year', 'Origin'
]

# Short lower-case names used for the rest of the workshop, in file order
short_names = dict(zip(
    column_names,
    ['mpg', 'cyl', 'dis', 'hp', 'weight', 'accel', 'year', 'origin'],
))

# Read, rename and drop incomplete rows in one chain:
#   * '?' marks a missing value in the source file
#   * everything after a tab is treated as a comment and ignored
#     (presumably the trailing car-name field -- confirm against the file)
raw_dataset = (
    pd.read_csv(
        url,
        names=column_names,
        na_values='?',
        comment='\t',
        sep=' ',
        skipinitialspace=True,
    )
    .rename(columns=short_names)
    .dropna()
)


In [None]:
# Preview the first rows to check the renamed columns and parsed values
raw_dataset.head()

#### Store dataset locally

In [None]:
import os

local_path = 'data/fuel_data.csv'

# to_csv does not create missing parent directories, so make sure the target
# directory exists before writing (a no-op when it is already there).
os.makedirs(os.path.dirname(local_path), exist_ok=True)

# Written without a header row or index column: the BigQuery load later in
# this notebook supplies an explicit schema and does not skip a leading row.
raw_dataset.to_csv(path_or_buf=local_path, header=False, index=False)

## Store Dataset in Google Cloud Storage

In [None]:
from src.helper import upload_file_to_gcs

# Upload the local CSV to the workshop bucket under the same relative path.
# The returned value is used further down as the source URI of the BigQuery load.
upload_args = {
    'project_id': PROJECT_ID,
    'target': BUCKET_NAME,
    'source': local_path,
    'blob_name': 'data/fuel_data.csv',
}
gcs_data_path = upload_file_to_gcs(**upload_args)

In [None]:
# Show the value returned by the upload helper
gcs_data_path

## Load Dataset into BigQuery

In [None]:
def create_bq_dataset(
    gcs_uri: str,
    project_id: str,
    dataset_name: str,
    table_name: str,
    location: str = "US",
) -> str:
    """
    Load a file from Cloud Storage into a table of a freshly created BigQuery dataset.

    The dataset is created in `location`. If a dataset with the same name
    already exists it is deleted together with its contents and created again,
    so every run starts from an empty dataset. Any other failure while creating
    the dataset is propagated and nothing is deleted.

    Args:
        gcs_uri: gcs uri (gs://...) of the file to load; every row is loaded
            as data against the fixed fuel-data schema below
        project_id: project used to create the BigQuery client
        dataset_name: name of the dataset to (re)create
        table_name: name of the table the file is loaded into
        location: location of the dataset, "US" by default
    Output:
        bq_dataset_uri:string, Table in BigQuery as bq://<project>.<dataset>.<table>
    Raises:
        Whatever the BigQuery client raises for failures other than
        "dataset already exists", and any error of the load job.
    """

    from google.cloud import bigquery
    from google.cloud.exceptions import Conflict

    # Create bigquery dataset
    bq_client = bigquery.Client(project=project_id)

    dataset_id = "{}.{}".format(bq_client.project, dataset_name)
    dataset = bigquery.Dataset(dataset_id)
    dataset.location = location

    try:
        dataset = bq_client.create_dataset(dataset, timeout=30)
        print("Created dataset {}.{}".format(bq_client.project, dataset.dataset_id))
    except Conflict:
        # Only an "already exists" conflict justifies the destructive reset.
        # Timeouts, permission errors and the like must not wipe existing data.
        bq_client.delete_dataset(dataset_id, delete_contents=True)
        dataset = bq_client.create_dataset(dataset, timeout=30)
        print("Recreated dataset {}.{}".format(bq_client.project, dataset.dataset_id))

    # Create table
    table_id = f"{dataset_id}.{table_name}"

    float64 = bigquery.enums.SqlTypeNames.FLOAT64
    integer = bigquery.enums.SqlTypeNames.INTEGER
    columns = [
        ("mpg", float64),
        ("cyl", integer),
        ("dis", float64),
        ("hp", float64),
        ("weight", float64),
        ("accel", float64),
        ("year", integer),
        ("origin", integer),
    ]

    job_config = bigquery.LoadJobConfig(
        schema=[bigquery.SchemaField(name, sql_type) for name, sql_type in columns],
        write_disposition="WRITE_TRUNCATE",
    )

    job = bq_client.load_table_from_uri(
        gcs_uri, table_id, job_config=job_config)

    # Block until the load job finishes; a failed job raises here
    job.result()

    bq_dataset_uri = f"bq://{table_id}"

    return bq_dataset_uri

In [None]:
# Load the uploaded CSV into the `main` table of the `fuel_dataset` dataset
bq_load_args = {
    'gcs_uri': gcs_data_path,
    'project_id': PROJECT_ID,
    'dataset_name': 'fuel_dataset',
    'table_name': 'main',
}
bq_dataset_uri = create_bq_dataset(**bq_load_args)

In [None]:
# Show the bq:// URI of the table that was just loaded
bq_dataset_uri

## Create config file

#### Create Config File

In [None]:
# Collect the values created in this notebook so they can be written to a config file
config = dict(
    PROJECT_ID=PROJECT_ID,
    REGION=REGION,
    ID=ID,
    BUCKET_NAME=BUCKET_NAME,
    GCS_DATA_URI=gcs_data_path,
    BQ_DATASET_URI=bq_dataset_uri,
)

In [None]:
# Persist the settings as an importable Python module.
# repr() is used instead of json.dumps() because the file is read back with
# `from src.config import config`: JSON spells None/True/False as
# null/true/false, which are not valid Python and would break that import.
with open("src/config.py", "w", encoding="utf-8") as config_file:
    config_file.write(f"config = {config!r}\n")

In [None]:
from src.config import config

In [None]:
# Display the configuration as imported back from src/config.py
config

In [None]:
# Spot-check a single entry of the imported configuration
config['PROJECT_ID']