# IIoT Data Analytics Notebook (Spark)
Demonstration of [AWS IoT Analytics](https://aws.amazon.com/iot-analytics/) Notebooks, using real-time sensor data.

In [None]:
import boto3
import pandas as pd
import pyspark.sql.functions as f
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType,  DoubleType, TimestampType, BooleanType

In [None]:
def parse(x):
    """Convert Unix epoch seconds to timezone-aware UTC datetimes.

    Passed to ``pd.read_csv`` as its ``date_parser`` callback in this notebook.

    Args:
        x: Scalar or array-like of epoch timestamps expressed in seconds.

    Returns:
        The result of ``pd.to_datetime`` for the given input kind (for
        example a ``Timestamp`` for a scalar), localized to UTC.
    """
    # `infer_datetime_format` is deliberately not passed: with `unit` set the
    # values are interpreted as epoch offsets, so no string format is inferred,
    # and the argument is deprecated (emits a warning) since pandas 2.0.
    return pd.to_datetime(x, unit='s', utc=True)

In [None]:
# Obtain the Spark session for this notebook under the application name 'iiot_demo'.
spark = (
    SparkSession.builder
    .appName('iiot_demo')
    .getOrCreate()
)

In [None]:
# Explicit Spark schema for the sensor data, listed as (column name, Spark type, nullable).
iot_schema = StructType([
    StructField(col_name, col_type, is_nullable)
    for col_name, col_type, is_nullable in (
        ('device', StringType(), False),
        ('ts', TimestampType(), False),
        ('humidity', DoubleType(), True),
        ('temp', DoubleType(), True),
        ('light', BooleanType(), True),
        ('motion', BooleanType(), True),
        ('lpg', DoubleType(), True),
        ('co', DoubleType(), True),
        ('smoke', DoubleType(), True),
        ('__dt', StringType(), True),
    )
])

In [None]:
%%time

# Look up the data URI of the first content entry of the IoT Analytics dataset.
dataset = "iot_data_dataset"
client = boto3.client("iotanalytics")
dataset_content = client.get_dataset_content(datasetName=dataset)
data_location = dataset_content["entries"][0]["dataURI"]

# Read at most 1,000 rows with pandas; column 1 is converted by parse().
# NOTE(review): `date_parser` is deprecated as of pandas 2.0 — confirm the
# pandas version this notebook runs on before upgrading.
read_options = dict(
    header=0,
    low_memory=False,
    date_parser=parse,
    parse_dates=[1],
    nrows=1000,
)
sensor_pdf = pd.read_csv(data_location, **read_options)

# Hand the pandas frame to Spark, enforcing the explicit schema.
df = spark.createDataFrame(data=sensor_pdf, schema=iot_schema)

In [None]:
# Discard the "__dt" column, then sort the rows by ascending timestamp.
df = (
    df
    .drop("__dt")
    .orderBy(f.asc("ts"))
)

In [None]:
# Preview the first five rows of the DataFrame.
df.show(n=5)

In [None]:
# Summarize the loaded data: record count, covered time span, and records per device.
print("Dataset Range")
print("-------------")
print("Record count: {:,}".format(df.count()))

# Compute both time bounds in one aggregation (a single Spark job instead of two).
ts_min, ts_max = df.agg(f.min("ts"), f.max("ts")).first()
print("Time range (min): {}".format(ts_min))
print("Time range (max): {}".format(ts_max))

print("Records:")
# show() prints the table itself and returns None, so it is called directly
# rather than being wrapped in print()/format(), which only added a blank line.
df.groupBy("device").count().show()