# Databricks <> Graphistry Tutorial: Notebooks & Dashboards on IoT data

This tutorial visualizes a set of sensors by clustering them based on latitude/longitude and overlaying summary statistics.

We show how to load the interactive plots in both Databricks notebook and dashboard modes. The general flow should work in other PySpark environments as well.

Steps:

* Install Graphistry
* Prepare IoT data
* Plot in a notebook
* Plot in a dashboard
* Plot as a shareable URL

## Install & connect

In [0]:
# Uncomment and run the first time, or
#  have a Databricks admin install the graphistry Python library:
#  https://docs.databricks.com/en/libraries/package-repositories.html#pypi-package

#%pip install graphistry
    

Note: you may need to restart the kernel using dbutils.library.restartPython() to use updated packages.
Note: you may need to restart the kernel using dbutils.library.restartPython() to use updated packages.


In [0]:
# Run this after the pip install so the Python process restarts and picks up
# the newly installed package.
dbutils.library.restartPython()

In [0]:
import graphistry  # if not yet available, install pygraphistry and/or restart the Python kernel using the cells above
# Show the installed version to confirm the import succeeded.
graphistry.__version__

'0.34.3'

In [0]:

# Authenticate against Graphistry Hub with a personal key (access token).
# Create a key for your user at https://hub.graphistry.com/account/tokens and,
# as a best practice, keep it in Databricks secrets rather than in the notebook.
graphistry.register(
    api=3,
    protocol='https',
    server='hub.graphistry.com',
    personal_key_id=dbutils.secrets.get(
        scope="my-secret-scope", key="graphistry-personal_key_id"),
    personal_key_secret=dbutils.secrets.get(
        scope="my-secret-scope", key="graphistry-personal_key_secret"),
)

# Username/password login is an alternative:
# graphistry.register(api=3, username='...', password='...', protocol='https', server='hub.graphistry.com')
# More configuration options: https://github.com/graphistry/pygraphistry#configure

## Prepare IoT data
Sample data provided by Databricks

We create tables for different plots:

* Raw table of device sensor reads
* Summarized table:
  - rounded latitude/longitude
  - summarize min/max/avg for battery_level, c02_level, humidity, timestamp

In [0]:
# Read the sample IoT device readings (JSON) from the Databricks datasets path.
devices = (
    spark.read
    .format('json')
    .load('/databricks-datasets/iot/iot_devices.json')
)

# Report the DataFrame type, then preview the first ten rows.
print('type: ', type(devices))
display(devices.take(10))

type:  <class 'pyspark.sql.connect.dataframe.DataFrame'>


battery_level,c02_level,cca2,cca3,cn,device_id,device_name,humidity,ip,latitude,lcd,longitude,scale,temp,timestamp
8,868,US,USA,United States,1,meter-gauge-1xbYRYcj,51,68.161.225.1,38.0,green,-97.0,Celsius,34,1458444054093
7,1473,NO,NOR,Norway,2,sensor-pad-2n2Pea,70,213.161.254.1,62.47,red,6.15,Celsius,11,1458444054119
2,1556,IT,ITA,Italy,3,device-mac-36TWSKiT,44,88.36.5.1,42.83,red,12.83,Celsius,19,1458444054120
6,1080,US,USA,United States,4,sensor-pad-4mzWkz,32,66.39.173.154,44.06,yellow,-121.32,Celsius,28,1458444054121
4,931,PH,PHL,Philippines,5,therm-stick-5gimpUrBB,62,203.82.41.9,14.58,green,120.97,Celsius,25,1458444054122
3,1210,US,USA,United States,6,sensor-pad-6al7RTAobR,51,204.116.105.67,35.93,yellow,-85.46,Celsius,27,1458444054122
3,1129,CN,CHN,China,7,meter-gauge-7GeDoanM,26,220.173.179.1,22.82,yellow,108.32,Celsius,18,1458444054123
0,1536,JP,JPN,Japan,8,sensor-pad-8xUD6pzsQI,35,210.173.177.1,35.69,red,139.69,Celsius,27,1458444054123
3,807,JP,JPN,Japan,9,device-mac-9GcjZ2pw,85,118.23.68.227,35.69,green,139.69,Celsius,13,1458444054124
7,1470,US,USA,United States,10,sensor-pad-10BsywSYUF,56,208.109.163.218,33.61,red,-111.89,Celsius,26,1458444054125


In [0]:
from pyspark.sql import functions as F
from pyspark.sql.functions import concat_ws, col, round

# Tag every reading with two coarse location keys of the form '<lat>_<lon>':
#   location_rounded1 -- latitude/longitude rounded to the nearest degree
#   location_rounded2 -- latitude/longitude rounded to the nearest ten degrees
# The F.-qualified functions are used on purpose: the bare `round` and `col`
# names imported by this cell shadow the Python builtin and are easy to
# confuse with loop variables.
devices_with_rounded_locations = (
    devices
    .withColumn(
        'location_rounded1',
        F.concat_ws(
            '_',
            F.round(F.col('latitude'), 0).cast('integer'),
            F.round(F.col('longitude'), 0).cast('integer')))
    .withColumn(
        'location_rounded2',
        F.concat_ws(
            '_',
            F.round(F.col('latitude'), -1).cast('integer'),
            F.round(F.col('longitude'), -1).cast('integer')))
)

# Numeric columns summarized with min/max/avg per device.
cols = ['battery_level', 'c02_level', 'humidity', 'timestamp']
# Descriptive columns carried through by taking the first value per device.
id_cols = ['cca2', 'cca3', 'cn', 'device_name', 'ip', 'location_rounded1', 'location_rounded2']

# One row per device_id. Each aggregate is aliased where it is created, so the
# result does not depend on Spark's auto-generated names such as 'min(x)' or
# 'first(x)', which are not stable across Spark versions.
# Resulting column order: device_id, id_cols, *_min, *_max, *_avg.
devices_summarized = (
    devices_with_rounded_locations
    .groupby('device_id')
    .agg(
        *[F.first(name).alias(name) for name in id_cols],
        *[F.min(name).alias(f'{name}_min') for name in cols],
        *[F.max(name).alias(f'{name}_max') for name in cols],
        *[F.avg(name).alias(f'{name}_avg') for name in cols],
    )
)

display(devices_summarized.take(10))

device_id,cca2,cca3,cn,device_name,ip,location_rounded1,location_rounded2,battery_level_min,c02_level_min,humidity_min,timestamp_min,battery_level_max,c02_level_max,humidity_max,timestamp_max,battery_level_avg,c02_level_avg,humidity_avg,timestamp_avg
26,JP,JPN,Japan,sensor-pad-26rAyCZQOQH9,210.158.147.11,36_140,40_140,9,1300,27,1458444054135,9,1300,27,1458444054135,9.0,1300.0,27.0,1458444054135.0
29,NL,NLD,Netherlands,meter-gauge-29lyNVxIS,83.98.224.49,52_5,50_0,6,1095,69,1458444054137,6,1095,69,1458444054137,6.0,1095.0,69.0,1458444054137.0
474,KR,KOR,Republic of Korea,sensor-pad-474nftAl1at,211.247.149.1,38_127,40_130,9,1546,91,1458444054336,9,1546,91,1458444054336,9.0,1546.0,91.0,1458444054336.0
964,US,USA,United States,sensor-pad-9644B6JD78,67.135.206.21,38_-97,40_-100,9,1557,41,1458444054438,9,1557,41,1458444054438,9.0,1557.0,41.0,1458444054438.0
1677,VN,VNM,Vietnam,device-mac-1677DrQfg,203.79.28.5,21_106,20_110,2,1276,51,1458444054562,2,1276,51,1458444054562,2.0,1276.0,51.0,1458444054562.0
1697,US,USA,,meter-gauge-1697vHqpe,130.117.14.234,47_8,50_10,5,1393,44,1458444054565,5,1393,44,1458444054565,5.0,1393.0,44.0,1458444054565.0
1806,US,USA,United States,sensor-pad-1806wi3AS,69.72.157.77,41_-74,40_-70,7,1136,63,1458444054585,7,1136,63,1458444054585,7.0,1136.0,63.0,1458444054585.0
1950,DE,DEU,Germany,sensor-pad-19507yukhBx,62.180.242.241,51_9,50_10,5,1279,69,1458444054611,5,1279,69,1458444054611,5.0,1279.0,69.0,1458444054611.0
2040,US,USA,United States,sensor-pad-2040kKvwnJ,68.87.129.153,38_-97,40_-100,0,1247,96,1458444054628,0,1247,96,1458444054628,0.0,1247.0,96.0,1458444054628.0
2214,UA,UKR,Ukraine,sensor-pad-2214ZnWAla,195.137.254.153,50_31,50_30,5,867,36,1458444054651,5,867,36,1458444054651,5.0,867.0,36.0,1458444054651.0


## Notebook plot

* Simple: Graph connections between `device_name` and `cca3` (country code)
* Advanced: Graph multiple connections, like `ip -> device_name` and `location_rounded1 -> ip`

In [0]:
# Simple graph: link each device_name to its cca3 country code, using a random
# sample (fraction=0.1) of the readings converted to pandas.
(
    graphistry
    .edges(devices.sample(fraction=0.1).toPandas(), 'device_name', 'cca3')
    .settings(url_params={'strongGravity': 'true'})
    .plot()
)

In [0]:
# Advanced graph: build a hypergraph over several entity columns from a random
# sample (fraction=0.1) of the location-tagged readings. opts['EDGES'] lists,
# for each source column, the columns it should be connected to.
hg = graphistry.hypergraph(
    devices_with_rounded_locations.sample(fraction=0.1).toPandas(),
    ['ip', 'device_name', 'location_rounded1', 'location_rounded2', 'cca3'],
    direct=True,
    opts={'EDGES': {
        'ip': ['device_name'],
        'location_rounded1': ['ip'],
        'location_rounded2': ['ip'],
        'cca3': ['location_rounded2'],
    }},
)

# Pull out the plottable graph and turn on strong gravity (this setting is great!).
g = hg['graph'].settings(url_params={'strongGravity': 'true'})

g.plot()

# links 80084
# events 20021
# attrib entities 41671


## Dashboard plot

* Make a `graphistry` object as usual...
* ... Then disable the splash screen and optionally set custom dimensions

The visualization will now load without needing to interact in the dashboard (`view` -> `+ New Dashboard`)

In [0]:
# Dashboard-friendly render: turn off the splash screen via the splashAfter
# URL parameter and pass custom CSS (border and dimensions) for the embedded plot.
(
    g
        .settings(url_params={'splashAfter': 'false'})  # extends existing setting
        .plot(override_html_style="""
            border: 1px #DDD dotted;
            width: 50em; height: 50em;
        """)
)

## Plot as a Shareable URL

In [0]:
# With render=False, plot() returns the visualization URL as a string,
# which can then be shared.
url = g.plot(render=False)
url

'https://hub.graphistry.com/graph/graph.html?dataset=6d335b56affe437a98c57a349aab9e04&type=arrow&viztoken=85c14656-1740-4b16-8da0-ae1ebf138508&usertag=ac3e16de-pygraphistry-0.34.3&splashAfter=1728595501&info=true&strongGravity=true'