# 01 - Extract, Inspect & Store Dataset

In [7]:
# Uncomment on first use to install into the active kernel's environment.
# %pip install facets-overview

In [3]:
# Uncomment on first use to install into the active kernel's environment.
# %pip install ipywidgets

In [4]:
import pandas as pd
import facets_overview
from IPython.core.display import display, HTML
import base64

## Download Workshop Data

In [5]:
# Location of the Auto MPG data file in the UCI machine-learning archive.
url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data'

# Column names are passed explicitly to read_csv via `names=`.
column_names = [
    'MPG', 'Cylinders', 'Displacement',
    'Horsepower', 'Weight', 'Acceleration', 'Model Year', 'Origin'
]

# Short aliases used while exploring the data in the cells below.
short_names = {
    'MPG': 'mpg',
    'Cylinders': 'cyl',
    'Displacement': 'dis',
    'Horsepower': 'hp',
    'Weight': 'weight',
    'Acceleration': 'accel',
    'Model Year': 'year',
    'Origin': 'origin',
}

# Load -> rename -> drop incomplete rows, as one chain:
#   * '?' is parsed as a missing value,
#   * anything after a tab character is treated as a comment and ignored,
#   * fields are separated by spaces, with runs of leading spaces skipped.
raw_dataset = (
    pd.read_csv(
        url,
        names=column_names,
        na_values='?',
        comment='\t',
        sep=' ',
        skipinitialspace=True,
    )
    .rename(columns=short_names)
    .dropna()
)

# Optional export of the cleaned data (currently disabled):
# gcs_path = 'data/fuel_data.csv'
# raw_dataset.to_csv(header=False, index=False, path_or_buf=gcs_path)

---------

##  Quickly Explore the dataset

In [6]:
# Peek at the first five rows to sanity-check the parsed columns.
raw_dataset.head(n=5)

Unnamed: 0,mpg,cyl,dis,hp,weight,accel,year,origin
0,18.0,8,307.0,130.0,3504.0,12.0,70,1
1,15.0,8,350.0,165.0,3693.0,11.5,70,1
2,18.0,8,318.0,150.0,3436.0,11.0,70,1
3,16.0,8,304.0,150.0,3433.0,12.0,70,1
4,17.0,8,302.0,140.0,3449.0,10.5,70,1


In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column.
raw_dataset.describe()

Unnamed: 0,mpg,cyl,dis,hp,weight,accel,year,origin
count,392.0,392.0,392.0,392.0,392.0,392.0,392.0,392.0
mean,23.445918,5.471939,194.41199,104.469388,2977.584184,15.541327,75.979592,1.576531
std,7.805007,1.705783,104.644004,38.49116,849.40256,2.758864,3.683737,0.805518
min,9.0,3.0,68.0,46.0,1613.0,8.0,70.0,1.0
25%,17.0,4.0,105.0,75.0,2225.25,13.775,73.0,1.0
50%,22.75,4.0,151.0,93.5,2803.5,15.5,76.0,1.0
75%,29.0,8.0,275.75,126.0,3614.75,17.025,79.0,2.0
max,46.6,8.0,455.0,230.0,5140.0,24.8,82.0,3.0


In [11]:
# Count the missing values remaining in each column.
raw_dataset.isna().sum()

mpg       0
cyl       0
dis       0
hp        0
weight    0
accel     0
year      0
origin    0
dtype: int64

--------------

## Do some data transformations & split into Train and Test Datasets

In [25]:
# Switch the raw frame back to its descriptive column names.
long_names = {
    'mpg': 'MPG',
    'cyl': 'Cylinders',
    'dis': 'Displacement',
    'hp': 'Horsepower',
    'weight': 'Weight',
    'accel': 'Acceleration',
    'year': 'Model Year',
    'origin': 'Origin',
}
raw_dataset.rename(columns=long_names, inplace=True)

# Get data in shape: work on a copy so the encoding below leaves
# raw_dataset untouched, and drop any rows with missing values.
dataset = raw_dataset.copy().dropna()

# Replace the numeric origin code with a label, then expand it into one
# indicator column per label (empty prefix => columns are named by the label).
origin_labels = {1: 'USA', 2: 'Europe', 3: 'Japan'}
dataset['Origin'] = dataset['Origin'].map(origin_labels)
dataset = pd.get_dummies(dataset, columns=['Origin'], prefix='', prefix_sep='')

# 80% of the rows, sampled with a fixed seed, form the training set;
# every row whose index was not sampled goes to the test set.
train_dataset = dataset.sample(frac=0.8, random_state=0)
test_dataset = dataset.drop(train_dataset.index)


## Explore Train and Test Datasets using Facets

Facets is an open-source visualization tool released by Google under the PAIR (People + AI Research) initiative. It helps us understand and analyze machine learning datasets. Facets consists of two visualizations, Facets Dive and Facets Overview, which help drill down into the data and provide great insights with little effort on the user's end.


### Facets Dive

This feature helps the user dive deep into individual features and observations of the data to get more information. It supports interactive exploration of large numbers of data points at once.

Facets Dive provides an easy-to-customize, intuitive interface for exploring the relationship between the data points across the different features. With Facets Dive, you control each data point's position, color, and visual representation based on its feature values. If the data points have images associated with them, the images can be used as visual representations.

To use the Dive visualization, the data has to be transformed into JSON format.

In [35]:
# Display the Dive visualization for the full (one-hot encoded) dataset.
# `display` is imported from the public IPython.display module; importing it
# from IPython.core.display is deprecated and fails on recent IPython releases.
from IPython.display import display, HTML

# orient='records' yields one JSON object per row; the array is assigned to
# the <facets-dive> element's `data` property by the script in the template.
jsonstr = dataset.to_json(orient='records')
HTML_TEMPLATE = """
        <script src="https://cdnjs.cloudflare.com/ajax/libs/webcomponentsjs/1.3.3/webcomponents-lite.js"></script>
        <link rel="import" href="https://raw.githubusercontent.com/PAIR-code/facets/1.0.0/facets-dist/facets-jupyter.html">
        <facets-dive id="elem" height="600"></facets-dive>
        <script>
          var data = {jsonstr};
          document.querySelector("#elem").data = data;
        </script>"""
html = HTML_TEMPLATE.format(jsonstr=jsonstr)
display(HTML(html))

In [36]:
# Persist the Dive markup held in `html` to a standalone HTML file.
with open('output_facets_dive.html', 'w') as dive_file:
    dive_file.write(html)

---------------------

### Facets Overview 

As the name suggests, this visualization gives an overview of the entire dataset and gives a sense of the shape of each feature of the data. Facets Overview summarizes statistics for each feature and compares the training and test datasets.

The Overview automatically gives a quick understanding of the distribution of values across the various features of the data. The distribution can also be compared across the training and testing datasets instantly. If an anomaly exists in the data, it stands out immediately.

Some of the information that can be easily accessed through this feature:

- Statistics such as the mean, median, and standard deviation
- Minimum and maximum values of a column
- Missing data
- The number of zero values in a column

Since the distributions can also be viewed for the test dataset, we can easily confirm whether the training and testing data follow the same distribution.

In [26]:
# Create the feature stats for the datasets and stringify it.
import base64
from facets_overview.generic_feature_statistics_generator import GenericFeatureStatisticsGenerator

In [27]:
# Feature statistics are computed for both splits in a single call so the
# resulting proto holds the 'train' and 'test' tables together.
stats_inputs = [
    {'name': 'train', 'table': train_dataset},
    {'name': 'test', 'table': test_dataset},
]
gfsg = GenericFeatureStatisticsGenerator()
proto = gfsg.ProtoFromDataFrames(stats_inputs)

# The serialized proto is base64-encoded into a UTF-8 string so it can be
# embedded as text.
serialized_proto = proto.SerializeToString()
protostr = base64.b64encode(serialized_proto).decode("utf-8")

In [28]:
# Display the facets overview visualization for this data
# `display` is imported from the public IPython.display module; importing it
# from IPython.core.display is deprecated and fails on recent IPython releases.
from IPython.display import display, HTML

# The base64 string in `protostr` is handed to the <facets-overview> element
# through its `protoInput` property.
HTML_TEMPLATE = """
        <script src="https://cdnjs.cloudflare.com/ajax/libs/webcomponentsjs/1.3.3/webcomponents-lite.js"></script>
        <link rel="import" href="https://raw.githubusercontent.com/PAIR-code/facets/1.0.0/facets-dist/facets-jupyter.html" >
        <facets-overview id="elem"></facets-overview>
        <script>
          document.querySelector("#elem").protoInput = "{protostr}";
        </script>"""
html = HTML_TEMPLATE.format(protostr=protostr)
# Render the visualization inline; without this call the cell only builds
# the markup string and shows nothing.
display(HTML(html))


In [29]:
# Persist the Overview markup held in `html` to a standalone HTML file.
with open('output_facets_overview.html', 'w') as overview_file:
    overview_file.write(html)

-----