# IIoT Data Analytics Notebook (Spark)
Demonstration of [AWS IoT Analytics](https://aws.amazon.com/iot-analytics/) Notebooks, using real-time sensor data.

In [1]:
import boto3
import pandas as pd
import pyspark.sql.functions as f
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType,  DoubleType, TimestampType, BooleanType

In [2]:
def parse(x):
    """Convert Unix epoch seconds into timezone-aware (UTC) datetimes.

    Args:
        x: Anything ``pd.to_datetime`` accepts (scalar, list, array, Series),
            interpreted as a number of seconds since the epoch.

    Returns:
        The result of ``pd.to_datetime`` for the given input, in UTC.
    """
    # `infer_datetime_format` is intentionally not passed: it only concerns
    # guessing the layout of date *strings* and is deprecated since pandas 2.0,
    # so with an explicit `unit` it contributed nothing but a warning.
    return pd.to_datetime(x, unit='s', utc=True)

In [3]:
# Obtain the Spark session for this notebook (an existing one is reused if
# present), registered under the application name 'iiot_demo'.
spark = (
    SparkSession.builder
    .appName('iiot_demo')
    .getOrCreate()
)

In [4]:
# Explicit Spark schema applied when the pandas sample is converted to a
# Spark DataFrame. `device` and `ts` are required; every other column is
# nullable (the default for StructType.add).
iot_schema = (
    StructType()
    .add('device', StringType(), nullable=False)
    .add('ts', TimestampType(), nullable=False)
    .add('humidity', DoubleType())
    .add('temp', DoubleType())
    .add('light', BooleanType())
    .add('motion', BooleanType())
    .add('lpg', DoubleType())
    .add('co', DoubleType())
    .add('smoke', DoubleType())
    .add('__dt', StringType())
)

In [5]:
%%time

client = boto3.client("iotanalytics")
dataset = "iot_data_dataset"
data_location = client.get_dataset_content(datasetName=dataset)["entries"][0]["dataURI"]

df = spark.createDataFrame(
    data=pd.read_csv(
        data_location,
        header=0,
        low_memory=False,
        date_parser=parse,
        parse_dates=[1],
        nrows=1000
    ), schema=iot_schema)

CPU times: user 742 ms, sys: 318 ms, total: 1.06 s
Wall time: 5.84 s


In [6]:
# Discard the `__dt` column, then sort the rows by ascending timestamp.
df = df.drop("__dt").orderBy(f.col("ts").asc())

In [7]:
# Preview the first five rows of the sorted DataFrame.
df.show(n=5)

+------------------+--------------------+----------------+------------------+-----+------+--------------------+--------------------+--------------------+
|            device|                  ts|        humidity|              temp|light|motion|                 lpg|                  co|               smoke|
+------------------+--------------------+----------------+------------------+-----+------+--------------------+--------------------+--------------------+
|iot-demo-device-02|2020-06-21 00:00:...|70.5999984741211|63.139998626708994| true| false|0.006747995268470235|0.004166214182479396|0.017849189636577008|
|iot-demo-device-03|2020-06-21 00:00:...|            46.5|  75.9199993133545| true| false|0.007438467747152723|0.004766802620617986|0.019806689967262585|
|iot-demo-device-01|2020-06-21 00:00:...|82.5999984741211| 62.06000137329102| true| false|0.006808996836430836|0.004218366700830792|0.018021599997150074|
|iot-demo-device-03|2020-06-21 00:00:...|            46.0|  75.9199993133545

In [8]:
# Summarise the loaded sample: row count, timestamp span, and rows per device.
# One aggregation produces count/min/max together, so Spark runs a single job
# instead of three separate actions over the same DataFrame.
summary = df.agg(
    f.count(f.lit(1)).alias("record_count"),
    f.min("ts").alias("ts_min"),
    f.max("ts").alias("ts_max")
).first()

print("Dataset Range")
print("-------------")
print("Record count: {:,}".format(summary["record_count"]))
print("Time range (min): {}".format(summary["ts_min"]))
print("Time range (max): {}".format(summary["ts_max"]))
print("Records:")
# DataFrame.show() prints the table itself and returns None, so it is called
# directly rather than being passed through an empty format string and print().
df.groupBy("device").count().show()

Dataset Range
-------------
Record count: 1,000
Time range (min): 2020-06-21 00:00:03.442186
Time range (max): 2020-07-12 00:50:28.387987
Records:
+------------------+-----+
|            device|count|
+------------------+-----+
|iot-demo-device-01|  301|
|iot-demo-device-02|  298|
|iot-demo-device-03|  401|
+------------------+-----+


