In [None]:
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# 01 - Exploratory Data Analysis

## Overview
The `Vertex Pipelines workshop` is a series of labs on how to build an end-to-end pipeline using Vertex Pipelines and Kubeflow Pipelines (kfp). In the pipeline we orchestrate data creation, data processing, model training and evaluation, and model deployment. We'll also see how to send payloads to the deployed endpoint and how to run batch prediction jobs.

In this workshop we'll use the **public dataset** [Auto MPG](https://archive.ics.uci.edu/ml/datasets/auto+mpg) for demonstration purposes. The data concerns city-cycle fuel consumption in miles per gallon, to be predicted in terms of 3 multivalued discrete and 5 continuous attributes. The objective will be to build a model to predict "MPG" (Miles per Gallon).

**Attribute Information**:

1. `mpg`: continuous
2. `cylinders`: multi-valued discrete
3. `displacement`: continuous
4. `horsepower`: continuous
5. `weight`: continuous
6. `acceleration`: continuous
7. `model year`: multi-valued discrete
8. `origin`: multi-valued discrete
9. `car name`: string (unique for each instance)



## Notebook Objective

This tutorial uses the following Google Cloud services:

- `BigQuery`

Steps performed in this notebook.

1. [Load Configuration settings from the setup notebook](#Load-Configuration-settings-from-the-setup-notebook)
1. [Read data from BigQuery as DataFrame](#Read-data-from-BigQuery-as-DataFrame)
1. [Explore the data using pandas](#Explore-the-data-using-pandas)
1. [Apply data transformations](#Apply-data-transformations)
1. [Explore the datasets using Facets](#Explore-the-datasets-using-Facets)

## Load Configuration settings from the setup notebook

In [None]:
from src.config import config

In [None]:
# Read every workshop setting from `config` in a single unpacking assignment.
(
    PROJECT_ID,
    REGION,
    ID,
    BUCKET_NAME,
    GCS_DATA_URI,
    BQ_DATASET_URI,
) = (
    config[setting]
    for setting in (
        'PROJECT_ID',
        'REGION',
        'ID',
        'BUCKET_NAME',
        'GCS_DATA_URI',
        'BQ_DATASET_URI',
    )
)

In [None]:
# Show the dataset URI without its first five characters
# (presumably the "bq://" scheme prefix — confirm against the setup notebook).
BQ_DATASET_URI[5:]

### Import Libraries

In [None]:
import base64

import facets_overview
import pandas as pd
from google.cloud import bigquery
# Import from the public `IPython.display` module: importing `display` from
# `IPython.core.display` is deprecated since IPython 7.14.
from IPython.display import display, HTML

---------

## Read data from BigQuery as DataFrame

In [None]:
# Show the BigQuery dataset URI that was loaded from the config.
BQ_DATASET_URI

In [None]:
# Show the dataset URI without its first five characters
# (presumably the "bq://" scheme prefix — confirm against the setup notebook).
BQ_DATASET_URI[5:]

In [None]:
%%bigquery raw_dataset --project $PROJECT_ID
-- Load every column and row of the table into the `raw_dataset` DataFrame.
-- NOTE(review): the table name is hard-coded here rather than derived from
-- BQ_DATASET_URI; confirm it matches the dataset created by the setup notebook.
SELECT * FROM `vertex-ai-workshop-2022.fuel_dataset.main`

------

## Explore the data using pandas

In [None]:
# Preview the first rows of the raw table.
raw_dataset.head()

In [None]:
# Summary statistics of the raw table, using the pandas `describe` defaults.
raw_dataset.describe()

In [None]:
# Count the missing values in each column (`sum` adds up the rows of every column by default).
raw_dataset.isna().sum()

--------------

## Apply data transformations 

In [None]:
# Build the modelling dataset from the raw table without modifying `raw_dataset`,
# so this cell can be re-run safely and the exploration cells above keep showing
# the table exactly as it was loaded.
dataset = (
    raw_dataset
    # Replace the short column names of the table with descriptive ones.
    .rename(columns={
        'mpg': 'MPG',
        'cyl': 'Cylinders',
        'dis': 'Displacement',
        'hp': 'Horsepower',
        'weight': 'Weight',
        'accel': 'Acceleration',
        'year': 'Model Year',
        'origin': 'Origin',
    })
    # Drop every row that has at least one missing value.
    .dropna()
    # Turn the numeric origin code into a readable label.
    .assign(Origin=lambda frame: frame['Origin'].map({1: 'USA', 2: 'Europe', 3: 'Japan'}))
)

# One-hot encode the origin label: one column per label, named after the label
# itself (empty prefix and separator).
dataset = pd.get_dummies(dataset, columns=['Origin'], prefix='', prefix_sep='')

# Show the last rows so the result of the transformations is visible.
dataset.tail()



### Split Data into Train and Test Datasets

In [None]:
# Randomly draw 80% of the rows for training; the fixed seed makes the draw repeatable.
train_dataset = dataset.sample(
    frac=0.8,
    random_state=0,
)

# The rows that were not drawn for training form the test set.
test_dataset = dataset.drop(index=train_dataset.index)

------

## Explore the datasets using Facets

[Facets](https://pair-code.github.io/facets/) is an open-source visualization tool released by Google under the PAIR (People + AI Research) initiative. Facets contains two robust visualizations to aid in understanding and analyzing machine learning datasets. Get a sense of the shape of each feature of your dataset using Facets Overview, or explore individual observations using Facets Dive.

### Facets Dive

Facets Dive provides an interactive interface for exploring the relationship between data points across all of the different features of a dataset. Each individual item in the visualization represents a data point. Position items by "faceting" or bucketing them in multiple dimensions by their feature values. Success stories of Dive include the detection of classifier failure, identification of systematic errors, evaluating ground truth and potential new signals for ranking.


To use the Dive visualization, the data has to be transformed into JSON format.

[source](https://pair-code.github.io/facets/)

In [None]:
# Display the Dive visualization for the full transformed dataset.
# `display` and `HTML` come from the public `IPython.display` module; importing
# `display` from `IPython.core.display` is deprecated since IPython 7.14.
from IPython.display import display, HTML

# Serialize the dataset as a JSON array with one object per row.
jsonstr = dataset.to_json(orient='records')
HTML_TEMPLATE = """
        <script src="https://cdnjs.cloudflare.com/ajax/libs/webcomponentsjs/1.3.3/webcomponents-lite.js"></script>
        <link rel="import" href="https://raw.githubusercontent.com/PAIR-code/facets/1.0.0/facets-dist/facets-jupyter.html">
        <facets-dive id="elem" height="600"></facets-dive>
        <script>
          var data = {jsonstr};
          document.querySelector("#elem").data = data;
        </script>"""
# Substitute the JSON into the page template, then render the page inline.
html = HTML_TEMPLATE.format(jsonstr=jsonstr)
display(HTML(html))

In [None]:
# Write the current `html` string to a standalone file next to the notebook.
with open('output_facets_dive.html', mode='w') as dive_page:
    dive_page.write(html)

---------------------

### Facets Overview 

Facets Overview takes input feature data from any number of datasets, analyzes them feature by feature and visualizes the analysis.

Overview gives users a quick understanding of the distribution of values across the features of their dataset(s). Uncover several uncommon and common issues such as unexpected feature values, missing feature values for a large number of observations, training/serving skew and train/test/validation set skew.

Some of the information given is:

- Statistics
- Missing data
- Zero values
- Distribution of features
- Comparison between datasets (incl. distribution comparison)

[source](https://pair-code.github.io/facets/)

In [None]:
# Create the feature stats for the datasets and stringify it.
import base64
from facets_overview.generic_feature_statistics_generator import GenericFeatureStatisticsGenerator

In [None]:
# Describe each split as a named table for the statistics generator.
split_tables = [
    {'name': 'train', 'table': train_dataset},
    {'name': 'test', 'table': test_dataset},
]

# Build the feature-statistics proto from both DataFrames.
gfsg = GenericFeatureStatisticsGenerator()
proto = gfsg.ProtoFromDataFrames(split_tables)

# Serialize the proto and base64-encode it into a text string.
serialized_proto = proto.SerializeToString()
protostr = base64.b64encode(serialized_proto).decode("utf-8")

In [None]:
# Display the Facets Overview visualization for the train and test statistics.
# `display` and `HTML` come from the public `IPython.display` module; importing
# `display` from `IPython.core.display` is deprecated since IPython 7.14.
from IPython.display import display, HTML

HTML_TEMPLATE = """
        <script src="https://cdnjs.cloudflare.com/ajax/libs/webcomponentsjs/1.3.3/webcomponents-lite.js"></script>
        <link rel="import" href="https://raw.githubusercontent.com/PAIR-code/facets/1.0.0/facets-dist/facets-jupyter.html" >
        <facets-overview id="elem"></facets-overview>
        <script>
          document.querySelector("#elem").protoInput = "{protostr}";
        </script>"""
# Substitute the base64-encoded statistics into the page template, then render it inline.
html = HTML_TEMPLATE.format(protostr=protostr)
display(HTML(html))

In [None]:
# Write the current `html` string to a standalone file next to the notebook.
with open('output_facets_overview.html', mode='w') as overview_page:
    overview_page.write(html)

-----