# IIoT Data Analytics Notebook (Pandas)
Demonstration of [AWS IoT Analytics](https://aws.amazon.com/iot-analytics/) Notebooks, using real-time sensor data.

In [None]:
import sys
import boto3
import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px
from matplotlib.dates import DateFormatter
from pandas.plotting import register_matplotlib_converters

In [None]:
# constants
# time zone used when converting parsed timestamps and for Matplotlib's date handling
MY_TIMEZONE = 'US/Eastern'
# strftime pattern given to the Matplotlib date formatter (two-digit year)
DATETIME_FORMAT = '%y-%m-%d %H:%M'

In [None]:
def parse(x):
    """Convert Unix epoch seconds to timestamps in the notebook's time zone.

    Args:
        x: Epoch value(s) in seconds, in any form ``pd.to_datetime``
            accepts together with ``unit='s'``.

    Returns:
        The parsed value(s), interpreted as UTC and then converted to
        ``MY_TIMEZONE``.
    """
    # infer_datetime_format is deprecated since pandas 2.0 and is not needed
    # when unit= is given, so it is no longer passed.
    parsed = pd.to_datetime(x, unit='s', utc=True)
    return parsed.tz_convert(MY_TIMEZONE)

In [None]:
%%time

# ask the IoT Analytics API where the dataset's content is stored
client = boto3.client('iotanalytics')
dataset = 'iot_data_dataset'
data_location = client.get_dataset_content(datasetName=dataset)['entries'][0]['dataURI']

# Read the CSV with 'ts' as the index, then convert the index with parse().
# read_csv's date_parser and infer_datetime_format arguments are deprecated
# since pandas 2.0, so the timestamp conversion happens after loading.
df = pd.read_csv(data_location,
                 header=0,
                 low_memory=False,
                 index_col=['ts'])
df.index = parse(df.index)

In [None]:
# clean up DataFrame
# discard the '__dt' column, then order the rows by ascending 'ts'
df = df.drop(columns='__dt').sort_values(by='ts', ascending=True)

In [None]:
# preview the first five rows (five is head()'s default)
df.head()

In [None]:
# Summary of the full DataFrame: non-null temp count, in-memory size,
# covered time span, and number of records per device.
print('Original DataFrame Range')
print('-------------')
print('Record count: {:,}'.format(df['temp'].count()))
print('DataFrame size (MB): {:,.2f}'.format(sys.getsizeof(df)/1024/1024))
print('-------------')
# Take the earliest/latest timestamps from the index itself; the previous
# df.index[1] reported the second row instead of the first.
print('Time range (min): {:%Y-%m-%d %H:%M:%S %Z}'.format(df.index.min()))
print('Time range (max): {:%Y-%m-%d %H:%M:%S %Z}'.format(df.index.max()))
print('-------------')
print('Records:\n{}'.format(df.groupby('device').size()))

In [None]:
# filter time range (different methods)
# fixed window: keep rows from range_start (inclusive) up to range_end (exclusive)
range_start = pd.Timestamp('2020-07-07T00', tz=MY_TIMEZONE)
range_end = pd.Timestamp('2020-07-10T00', tz=MY_TIMEZONE)
in_range = (df.index >= range_start) & (df.index < range_end)
df_filtered = df.loc[in_range]

#df_filtered = df.loc[df.index >= pd.Timestamp('today', tz=MY_TIMEZONE).floor('D') + pd.Timedelta(-1, unit='D')]

# df_filtered = df.loc[df.index >= pd.Timestamp('today', tz=MY_TIMEZONE) + pd.Timedelta(-1, unit='D')] # last 24 hours

#df_filtered = df.tail(10000) # fixed amount of records

In [None]:
# filter temp/humidity outliers (>1% & <99%)
# For each measurement, keep only rows strictly between that device's 1st and
# 99th percentiles. The cuts run one after another, so each percentile is
# computed on the rows that survived the previous cut.
for column in ('temp', 'humidity'):
    lower = df_filtered.groupby('device')[column].transform(lambda s: s.quantile(.01))
    df_filtered = df_filtered.loc[df_filtered[column] > lower]
    upper = df_filtered.groupby('device')[column].transform(lambda s: s.quantile(.99))
    df_filtered = df_filtered.loc[df_filtered[column] < upper]

In [None]:
# group by device
# (uncomment the next line to restrict the data to a single device first)
# df_filtered = df_filtered.loc[df_filtered['device'] == 'iot-demo-device-01']
groups = df_filtered.groupby(by='device')

In [None]:
# Summary of the filtered DataFrame: overall figures first, then the same
# figures broken down per device.
print('Filtered DataFrame Range')
print('-------------')
print('Record count: {:,}'.format(df_filtered['temp'].count()))
print('DataFrame size (MB): {:,.2f}'.format(sys.getsizeof(df_filtered)/1024/1024))
print('-------------')
# Take the earliest/latest timestamps from the index itself; the previous
# df_filtered.index[1] reported the second row instead of the first.
print('Time range (min): {:%Y-%m-%d %H:%M:%S %Z}'.format(df_filtered.index.min()))
print('Time range (max): {:%Y-%m-%d %H:%M:%S %Z}'.format(df_filtered.index.max()))
print('Temperature (min): {:.2f}'.format(df_filtered['temp'].min()))
print('Temperature (max): {:.2f}'.format(df_filtered['temp'].max()))
print('Humidity (min): {:.2f}{}'.format(df_filtered['humidity'].min(), '%'))
print('Humidity (max): {:.2f}{}'.format(df_filtered['humidity'].max(), '%'))
print('-------------')
# per-device breakdown
print('Record count:\n{}'.format(groups.size()))
print('Temperature (min):\n{}'.format(groups['temp'].min()))
print('Temperature (max):\n{}'.format(groups['temp'].max()))
# label now carries the trailing colon like the other per-device headings
print('Humidity (min):\n{}'.format(groups['humidity'].min()))
print('Humidity (max):\n{}'.format(groups['humidity'].max()))

In [None]:
# matplotlib datetime config
# register pandas' date converters with Matplotlib
register_matplotlib_converters()
# set Matplotlib's time zone before the formatter is created
plt.rcParams.update({'timezone': MY_TIMEZONE})
myFmt = DateFormatter(DATETIME_FORMAT)

### Scatter Plot using Matplotlib
* Using [Matplotlib: Visualization with Python](https://matplotlib.org/)
* X Axis = temperature
* Y Axis = humidity

In [None]:
# single axes; the figure handle is not needed here
_, ax = plt.subplots(figsize=(18, 9))

# markers only (no connecting line), semi-transparent
marker_style = {'marker': 'o', 'linestyle': '', 'alpha': .5, 'ms': 10}
for device, group in groups:
    ax.plot(group['temp'], group['humidity'], label=device, **marker_style)

ax.grid()
ax.margins(0.05)
ax.legend()
ax.set_title('Temperature vs. Humidity')
ax.set_xlabel('Temperature (˚F)')
ax.set_ylabel('Humidity (%)')
plt.show()

### Temperature Graph using Moving Average
* Smoothing data using the mean of a 1-minute rolling window
* 1 minute = 20 data points at 3-second intervals
* Reference: https://en.wikipedia.org/wiki/Moving_average

In [None]:
# Plot each device's temperature, smoothed with a 20-sample rolling mean.
fig, ax = plt.subplots(1, 1, figsize=(18, 9))
for device, group in groups:
    # Keep the smoothed series in a local variable: assigning it to
    # group.mean would overwrite the DataFrame.mean method on that object.
    temp_smoothed = group['temp'].rolling(window=20).mean()
    ax.plot(temp_smoothed,
            label=device)
fig.autofmt_xdate()
ax.xaxis.set_major_formatter(myFmt)
ax.grid()
ax.margins(0.05)
ax.legend()
plt.title('Temperature Comparison over Time')
plt.ylabel('Temperature (˚F)')
plt.xlabel('Time')
plt.show()

### Humidity Graph using Moving Average
* Smoothing data using the mean of a 1-minute rolling window (moving average)
* 1 minute = 20 data points at 3-second intervals
* Reference: https://en.wikipedia.org/wiki/Moving_average

In [None]:
# Plot each device's humidity, smoothed with a 20-sample rolling mean.
fig, ax = plt.subplots(1, 1, figsize=(18, 9))
for device, group in groups:
    # Keep the smoothed series in a local variable: assigning it to
    # group.mean would overwrite the DataFrame.mean method on that object.
    humidity_smoothed = group['humidity'].rolling(window=20).mean()
    ax.plot(humidity_smoothed,
            label=device)
fig.autofmt_xdate()
ax.xaxis.set_major_formatter(myFmt)
ax.grid()
ax.margins(0.05)
ax.legend()
plt.title('Humidity Comparison over Time')
plt.ylabel('Humidity (%)')
plt.xlabel('Time')
plt.show()

### Plotly vs. Matplotlib for Graphing
Graphs using [Plotly Python Open Source Graphing Library](https://plotly.com/python/)

In [None]:
# strip timezone info so Plotly won't convert to UTC
df_filtered.index = df_filtered.index.tz_localize(None)

# hover box: hide the device entry, show both readings with two decimals
hover_columns = {'device': False,
                 'temp': ':.2f',
                 'humidity': ':.2f'}
fig = px.scatter(df_filtered, x='temp', y='humidity',
                 color='device', hover_name='device',
                 hover_data=hover_columns)

layout_options = {'title': 'Temperature vs. Humidity',
                  'xaxis_title': 'Temperature (˚F)',
                  'yaxis_title': 'Humidity (%)',
                  'template': 'ggplot2'}
fig.update_layout(**layout_options)

fig.show()

In [None]:
# temperature over time, one line per device; x values are the index timestamps
timestamps = df_filtered.index.values
hover_columns = {'device': False,
                 'temp': ':.2f'}
fig = px.line(df_filtered, x=timestamps, y='temp',
              color='device', hover_name='device',
              hover_data=hover_columns)

layout_options = {'title': 'Temperature Comparison over Time',
                  'xaxis_title': 'Date/Time',
                  'yaxis_title': 'Temperature (˚F)',
                  'template': 'ggplot2'}
fig.update_layout(**layout_options)

fig.show()

In [None]:
# humidity over time, one line per device; x values are the index timestamps
timestamps = df_filtered.index.values
hover_columns = {'device': False,
                 'humidity': ':.2f'}
fig = px.line(df_filtered, x=timestamps, y='humidity',
              color='device', hover_name='device',
              hover_data=hover_columns)

layout_options = {'title': 'Humidity Comparison over Time',
                  'xaxis_title': 'Time',
                  'yaxis_title': 'Humidity (%)',
                  'template': 'ggplot2'}
fig.update_layout(**layout_options)

fig.show()

In [None]:
# 'light' column over time, coloured per device, in a short (250 px) figure
timestamps = df_filtered.index.values
fig = px.scatter(df_filtered, x=timestamps, y='light',
                 color='device', height=250)

layout_options = {'title': 'Light Detection Comparison over Time',
                  'xaxis_title': 'Time',
                  'yaxis_title': 'Detected?',
                  'template': 'ggplot2'}
fig.update_layout(**layout_options)

fig.show()