# viz-2b-cars-outlier.ipynb

This notebook uses an index that was created by Elasticsearch's Machine Learning outlier detection on the cars dataset. The index, including the ML metadata, is used to create a scatterplot matrix that highlights outliers. Additionally, it includes a Vega-based slider to adjust the highlighting threshold.

In [1]:
import datetime
import json

import altair as alt
import eland as ed
import matplotlib.pyplot as plt
import numpy as np
from elasticsearch import Elasticsearch

from kibana_vega_util import saveVegaVis
# Turn off Altair's row-count limit for datasets handed to charts.
# NOTE(review): the chart in this notebook loads its data by URL, so the limit
# may not come into play here — confirm before relying on it.
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [2]:
# Elasticsearch index holding the cars documents together with the outlier
# detection results (the `ml.*` fields); used by every cell below.
index_name = 'cars_outlier'

In [3]:
# Open the index on the local Elasticsearch node as an eland DataFrame and
# preview the first rows to check that the `ml.*` columns are present.
ed_df = ed.DataFrame('localhost:9200', index_name)
ed_df.head()

Unnamed: 0,Acceleration,Cylinders,Displacement,Horsepower,Miles_per_Gallon,Name,Origin,Weight_in_lbs,Year,ml.feature_influence,ml.feature_influence.feature_name,ml.feature_influence.influence,ml.outlier_score,ml__incremental_id
0,12.0,8,307.0,130.0,18.0,chevrolet chevelle malibu,USA,3504,1970-01-01,,,,0.047825,0
1,11.5,8,350.0,165.0,15.0,buick skylark 320,USA,3693,1970-01-01,,,,0.036906,1
2,11.0,8,318.0,150.0,18.0,plymouth satellite,USA,3436,1970-01-01,,,,0.040322,2
3,12.0,8,304.0,150.0,16.0,amc rebel sst,USA,3433,1970-01-01,,,,0.036022,3
4,10.5,8,302.0,140.0,17.0,ford torino,USA,3449,1970-01-01,"[{'feature_name': 'Acceleration', 'influence':...",,,0.108697,4


In [4]:
# Point the chart at the index's _search endpoint instead of embedding rows;
# the individual records are read from `hits.hits` in the JSON response.
url = f'http://localhost:9200/{index_name}/_search?size=1000'
hits_format = alt.DataFormat(property='hits.hits', type='json')
url_data = alt.Data(url=url, format=hits_format)

fields = [
    'Acceleration',
    'Cylinders',
    'Displacement',
    'Horsepower',
    'Miles_per_Gallon',
    'Name',
    'Origin',
    'Weight_in_lbs',
    'Year',
    'ml.outlier_score',
]

# One calculated field per entry: the field name maps to an expression that
# reads the same name from under `datum._source`.
rename_dict = {field: 'datum._source.' + field for field in fields}

# Slider-bound selection that carries the outlier-score cutoff (starts at 0.8).
slider = alt.binding_range(
    min=0,
    max=1,
    step=0.01,
    name='Outlier score Threshold:',
)
selector = alt.selection_single(
    name="SelectorName",
    fields=['cutoff'],
    bind=slider,
    init={'cutoff': 0.8},
)

# A point counts as highlighted when its score reaches the slider's cutoff.
is_outlier = alt.datum["ml.outlier_score"] >= selector.cutoff


def _when_outlier(outlier_value, other_value):
    """Return a conditional encoding: one constant for outliers, another otherwise."""
    return alt.condition(
        is_outlier,
        alt.value(outlier_value),
        alt.value(other_value),
    )


# Single scatter panel; the x/y fields are filled in by the repeat below.
scatter_panel = (
    alt.Chart(url_data)
    .transform_calculate(**rename_dict)
    .mark_point()
    .encode(
        alt.X(alt.repeat("column"), type='quantitative'),
        alt.Y(alt.repeat("row"), type='quantitative'),
        color=_when_outlier('red', 'gray'),
        opacity=_when_outlier(0.75, 0.25),
        size=_when_outlier(28, 2),
        tooltip=[
            'Name:N',
            'ml.outlier_score:Q',
            'Horsepower:Q',
            'Acceleration:Q',
            'Miles_per_Gallon:Q',
        ],
    )
    .properties(width=150, height=150)
)

# 3x3 scatterplot matrix with pan/zoom and the threshold slider attached.
chart = (
    scatter_panel
    .repeat(
        row=['Horsepower', 'Acceleration', 'Miles_per_Gallon'],
        column=['Miles_per_Gallon', 'Acceleration', 'Horsepower'],
    )
    .interactive()
    .add_selection(selector)
)

chart

In [5]:
# `Elasticsearch` and `saveVegaVis` are imported in the notebook's top import
# cell so that all dependencies are declared in one place.

# Client for the local Elasticsearch node.
es = Elasticsearch([{'host': 'localhost', 'port': 9200}])

# Store the chart under the id 'def-vega-cars-outlier-1'; the call's return
# value is shown as the cell output.
# NOTE(review): resultSize=1000 presumably mirrors the `size=1000` used in the
# chart's search URL — confirm against kibana_vega_util.
saveVegaVis(es, index_name, 'def-vega-cars-outlier-1', chart, resultSize=1000)

{'_index': '.kibana_3',
 '_id': 'visualization:def-vega-cars-outlier-1',
 '_version': 2,
 'result': 'updated',
 '_shards': {'total': 1, 'successful': 1, 'failed': 0},
 '_seq_no': 407,
 '_primary_term': 2}