# Octank IIoT Data Analytics Notebook
Demonstration of [AWS IoT Analytics](https://aws.amazon.com/iot-analytics/) Notebooks, using real-time sensor data.

In [1]:
import boto3
import pandas as pd
import pyspark.sql.functions as f
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType,  DoubleType, TimestampType, BooleanType

In [31]:
def parse(x):
    """Convert Unix epoch seconds into timezone-aware (UTC) pandas datetimes.

    Passed as the ``date_parser`` callback when the dataset CSV is read.

    Args:
        x: Epoch value(s) in seconds; anything ``pd.to_datetime`` accepts
            together with ``unit='s'`` (scalar or array-like).

    Returns:
        The result of ``pd.to_datetime`` for ``x``, localized to UTC.
    """
    # `infer_datetime_format` was removed from this call: it has no effect
    # once `unit` is supplied and is deprecated in pandas 2.x, where passing
    # it only produces a warning.
    return pd.to_datetime(x, unit='s', utc=True)

In [32]:
# Obtain the Spark session for this notebook (an existing session is reused
# if there is one, otherwise a new one named 'iiot_demo' is created).
spark = (
    SparkSession.builder
    .appName('iiot_demo')
    .getOrCreate()
)

In [33]:
# Explicit schema applied when the pandas frame is handed to Spark.
# Each entry is StructField(name, dataType, nullable).
iot_schema = StructType([
    # Identifying columns: must always be present.
    StructField('device', StringType(), False),
    StructField('ts', TimestampType(), False),
    # Sensor readings: all nullable.
    StructField('humidity', DoubleType(), True),
    StructField('temp', DoubleType(), True),
    StructField('light', BooleanType(), True),
    StructField('motion', BooleanType(), True),
    StructField('lpg', DoubleType(), True),
    StructField('co', DoubleType(), True),
    StructField('smoke', DoubleType(), True),
    # Trailing string column carried by the CSV.
    StructField('__dt', StringType(), True),
])

In [38]:
%%time

# Ask AWS IoT Analytics for the dataset content and take the dataURI of the
# first entry in the response.
dataset = "iot_data_dataset"
client = boto3.client("iotanalytics")
content = client.get_dataset_content(datasetName=dataset)
data_location = content["entries"][0]["dataURI"]

# Read at most 100,000 rows with pandas; the column at index 1 is converted
# to datetimes through the `parse` callback.
pandas_df = pd.read_csv(
    data_location,
    header=0,
    nrows=100000,
    parse_dates=[1],
    date_parser=parse,
    low_memory=False,
)

# Convert the pandas frame into a Spark DataFrame using the explicit schema.
df = spark.createDataFrame(data=pandas_df, schema=iot_schema)

CPU times: user 4.3 s, sys: 163 ms, total: 4.47 s
Wall time: 5.02 s


In [39]:
# Remove the "__dt" column and sort the rows by timestamp, oldest first.
df = (
    df
    .drop("__dt")
    .orderBy(f.col("ts").asc())
)

In [40]:
# Preview the first five rows of the sorted DataFrame.
df.show(n=5)

+------------------+--------------------+-----------------+------------------+-----+------+--------------------+--------------------+--------------------+
|            device|                  ts|         humidity|              temp|light|motion|                 lpg|                  co|               smoke|
+------------------+--------------------+-----------------+------------------+-----+------+--------------------+--------------------+--------------------+
|iot-demo-device-01|2020-06-20 23:59:...| 82.5999984741211| 62.06000137329102| true| false|0.006710216201741271|0.004134005617772618|0.017742466864098354|
|iot-demo-device-02|2020-06-20 23:59:...|70.69999694824219|63.139998626708994| true| false|0.006747995268470235|0.004166214182479396|0.017849189636577008|
|iot-demo-device-03|2020-06-20 23:59:...| 45.9000015258789|  75.9199993133545| true| false|0.007472417062662312|0.004796903006589501|0.019903266607975173|
|iot-demo-device-03|2020-06-20 23:59:...|45.79999923706055|  75.919999

In [41]:
# Summarise the loaded data: row count, covered time span, records per device.
print("Dataset Range")
print("-------------")
print("Record count: {:,}".format(df.count()))

# Compute both time bounds in a single aggregation instead of running one
# Spark job for the minimum and another for the maximum.
ts_min, ts_max = df.agg(f.min("ts"), f.max("ts")).first()
print("Time range (min): {}".format(ts_min))
print("Time range (max): {}".format(ts_max))

print("Records:")
# DataFrame.show() prints the table itself and returns None, so it is called
# directly; wrapping it in print("".format(...)) only emitted an extra empty
# line. groupBy(...).count() already yields the columns `device` and `count`,
# so no further select/alias is needed.
df.groupBy("device").count().show()

Dataset Range
-------------
Record count: 100,000
Time range (min): 2020-06-20 23:59:32.175828
Time range (max): 2020-07-05 08:19:57.985332
Records:
+------------------+-----+
|            device|count|
+------------------+-----+
|iot-demo-device-01|29256|
|iot-demo-device-02|28543|
|iot-demo-device-03|42201|
+------------------+-----+


