# Batch anomaly detection with the Anomaly Detector API

### Use this Jupyter notebook to start visualizing anomalies as a batch with the Anomaly Detector API in Python.

This notebook shows you how to send a batch anomaly detection request and visualize the anomalies found throughout the example data set. The graph created at the end of this notebook will display the following:
* Anomalies found throughout the data set, highlighted.
* The expected values versus the values contained in the data set.
* Anomaly detection boundaries.


# Azure Stream Analytics Query

If you are exporting telemetry from IoT Hub/Central to Azure Stream Analytics (ASA), this query is a useful starting point.

```sql
WITH Telemetry AS (
    SELECT
        deviceId as DeviceId,
        enrichments.deviceName as DeviceName,
        telemetry.latitude as Latitude,
        telemetry.longitude as Longitude,
        MAX(telemetry.temperature) AS Temperature,
        AVG(telemetry.humidity) AS Humidity,
        AVG(telemetry.pressure) AS Pressure,
        System.Timestamp() as Timestamp,
        Count(*) as Count
    FROM [weather-eh] TIMESTAMP BY enqueuedTime
    GROUP BY
        deviceId,
        enrichments.deviceName,
        telemetry.latitude,
        telemetry.longitude,
        TumblingWindow(minute,4)
)

SELECT DeviceId, DeviceName, LEFT(CAST(Timestamp as nvarchar(max)), 16) as timestamp, Temperature AS value INTO [anomaly-data] FROM Telemetry
```

Install the following Python libraries.

- pandas
- numpy
- bokeh
- ipywidgets
- azure.storage.blob
- matplotlib

In [None]:
# To start sending requests to the Anomaly Detector API, paste your Anomaly Detector resource access key below,
# and replace the endpoint variable with the endpoint for your region or your on-premises container endpoint.
# Endpoint examples:
# https://westus2.api.cognitive.microsoft.com/anomalydetector/v1.0/timeseries/entire/detect
# http://127.0.0.1:5000/anomalydetector/v1.0/timeseries/entire/detect

# Output from Azure Stream Analytics
# apikey = '[REPLACE_WITH_AD_SUBSCRIPTION_KEY]'
# endpoint = '[REPLACE_WITH_AD_ENDPOINT_URL]'
# blob_conn_str="[REPLACE_WITH_BLOB_CONNECTION_STRING]"

# IoT Central Blob Storage Logging
# apikey: sent as the 'Ocp-Apim-Subscription-Key' header by detect().
apikey = ""
# endpoint: URL that detect() posts the request to.
endpoint = ""
# device_name: only log records whose enrichments.deviceName equals this value
# are kept by add_entries_from_log().
device_name = ""
# blob_conn_str: connection string used to create the ContainerClient in
# read_iot_central_log().
blob_conn_str=""

# blob_filter: name prefix passed to list_blobs(name_starts_with=...) so that
# only blobs under this path are read.
blob_filter = "d89eef5e-6e74-43cf-aa04-2f36e81b91da/1/2022/07/"
# input_container: name of the blob container to read from.
input_container = "workspaces"


In [None]:
import requests
import json
import pandas as pd
import numpy as np
import warnings
import math
from azure.storage.blob import ContainerClient

warnings.filterwarnings('ignore')

# Import library to display results
import matplotlib.pyplot as plt
%matplotlib inline 

In [None]:
from bokeh.plotting import figure,output_notebook, show
from bokeh.palettes import Blues4
from bokeh.models import ColumnDataSource,Slider
import datetime
from bokeh.io import push_notebook
from dateutil import parser
from ipywidgets import interact, widgets, fixed
from IPython.display import clear_output 

output_notebook()

In [None]:
def detect(endpoint, apikey, request_data, timeout=60):
    """Send a batch detection request to the Anomaly Detector API.

    Args:
        endpoint: Full URL of the detection endpoint the request is posted to.
        apikey: Subscription key, sent in the 'Ocp-Apim-Subscription-Key' header.
        request_data: JSON-serializable request body.
        timeout: Seconds to wait for the service, passed straight to
            ``requests.post``. Without a timeout a stalled connection would
            block the notebook indefinitely.

    Returns:
        The response body decoded from UTF-8 JSON.

    Raises:
        Exception: If the service answers with any status other than 200;
            the message is the response text.
        requests.exceptions.Timeout: If the service does not answer within
            ``timeout`` seconds.
    """
    headers = {'Content-Type': 'application/json', 'Ocp-Apim-Subscription-Key': apikey}
    response = requests.post(
        endpoint,
        data=json.dumps(request_data),
        headers=headers,
        timeout=timeout,
    )
    if response.status_code != 200:
        # Print the status code before raising so it is visible in the cell output.
        print(response.status_code)
        raise Exception(response.text)
    return json.loads(response.content.decode("utf-8"))

`format_json_data` iterates through the blob data and adds each entry to a Python dictionary.
A dictionary is used because it de-duplicates any repeated timestamps. The dictionary is later converted to a
list of JSON objects.

In [None]:
def build_figure(sample_data, sensitivity):
    """Run batch anomaly detection on a series and plot the result with Bokeh.

    The plot shows the detection boundary (expected value plus/minus the
    returned margins), the observed values, the expected values, and a marker
    on every point that is flagged as an anomaly and lies outside the boundary.

    Args:
        sample_data: Request body for the detection service. Must contain a
            'series' list whose items have 'timestamp' and 'value' keys.
            Its 'sensitivity' key is set (in place) to ``sensitivity``.
        sensitivity: Sensitivity value sent with the request and shown in the
            plot title.

    Raises:
        Exception: Propagated from ``detect`` when the service call fails.
    """
    # The caller's dict is updated in place, so it reflects the last sensitivity used.
    sample_data['sensitivity'] = sensitivity

    result = detect(endpoint, apikey, sample_data)

    series = sample_data['series']
    response = pd.DataFrame(data={
        'expectedValues': result['expectedValues'],
        'isAnomaly': result['isAnomaly'],
        'isNegativeAnomaly': result['isNegativeAnomaly'],
        'isPositiveAnomaly': result['isPositiveAnomaly'],
        'upperMargins': result['upperMargins'],
        'lowerMargins': result['lowerMargins'],
        'timestamp': [parser.parse(x['timestamp']) for x in series],
        'value': [x['value'] for x in series],
    })
    values = response['value']
    label = response['timestamp']
    upperband = response['expectedValues'] + response['upperMargins']
    lowerband = response['expectedValues'] - response['lowerMargins']

    # Highlight only points that are flagged by the service AND fall strictly
    # outside the boundary drawn on the plot.
    outside_band = (values > upperband) | (values < lowerband)
    highlighted = response['isAnomaly'].astype(bool) & outside_band
    anomaly_labels = label[highlighted].tolist()
    anomalies = values[highlighted].tolist()

    p = figure(x_axis_type='datetime',
               title="Batch Anomaly Detection ({0} Sensitivity)".format(sensitivity),
               width=800, height=600)

    # The boundary polygon walks the lower band left-to-right and then the
    # upper band right-to-left so the patch closes on itself.
    band_x = np.append(label, label[::-1])
    band_y = np.append(lowerband, upperband[::-1])
    # `legend_label` replaces the deprecated `legend` glyph keyword.
    p.patch(band_x, band_y, color=Blues4[2], fill_alpha=0.5, line_width=1, legend_label='Boundary')
    p.line(label, values, legend_label='Value', color="#2222aa", line_width=1)
    p.line(label, response['expectedValues'], legend_label='ExpectedValue', line_width=1,
           line_dash="dotdash", line_color='olivedrab')

    anom_source = ColumnDataSource(dict(x=anomaly_labels, y=anomalies))
    p.circle('x', 'y', size=5, color='tomato', source=anom_source)

    p.legend.border_line_width = 1
    p.legend.background_fill_alpha = 0.1
    show(p, notebook_handle=True)

In [None]:
def read_data():
    """Read 'temperature.json' as text and pass it to ``format_json_data``.

    NOTE(review): ``format_json_data`` and ``data`` are looked up as
    module-level names when this runs; neither is defined in this function,
    so confirm both exist before calling.
    """
    with open('temperature.json', mode='r') as source:
        raw_text = source.read()
    format_json_data(raw_text, data)

In [None]:
def add_entries_from_log(telemetry_data):
    """Append temperature readings for ``device_name`` to the global ``data`` list.

    ``telemetry_data`` is text with one JSON object per line. For every record
    whose ``enrichments.deviceName`` equals the module-level ``device_name``
    and that carries both an ``enqueuedTime`` and a non-NaN
    ``telemetry.temperature``, a ``{'timestamp': ..., 'value': ...}`` dict is
    appended to the module-level ``data`` list.

    Blank lines (including empty or whitespace-only input) are ignored, and
    records without an ``enrichments`` or ``telemetry`` section, or without a
    temperature reading, are skipped instead of raising ``KeyError``.

    Args:
        telemetry_data: Newline-delimited JSON text.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON. All lines
            are parsed before anything is appended, so ``data`` is left
            untouched in that case.
    """
    # Split on '\n' only: str.splitlines() would also break on characters such
    # as U+2028 that may legally appear unescaped inside JSON strings.
    records = [
        json.loads(line)
        for line in telemetry_data.split('\n')
        if line.strip()
    ]
    for item in records:
        if (item.get("enrichments") or {}).get("deviceName") != device_name:
            continue
        temperature = (item.get("telemetry") or {}).get("temperature")
        enqueued_time = item.get("enqueuedTime")
        if temperature is None or enqueued_time is None:
            continue
        # json.loads accepts the NaN literal; such readings are unusable.
        if math.isnan(temperature):
            continue
        data.append({'timestamp': enqueued_time, 'value': temperature})

In [None]:
def read_iot_central_log(input_container):
    """Download matching blobs from a container and feed them to the log parser.

    A ``ContainerClient`` is created from the module-level ``blob_conn_str``
    for the container named ``input_container``. Every blob whose name starts
    with the module-level ``blob_filter`` is downloaded as text, and each
    non-empty download is handed to ``add_entries_from_log``.

    Args:
        input_container: Name of the blob container to read from.
    """
    container_client = ContainerClient.from_connection_string(
        conn_str=blob_conn_str, container_name=input_container)

    for entry in container_client.list_blobs(name_starts_with=blob_filter):
        # Fetch the raw blob content as text.
        downloader = container_client.get_blob_client(entry.name).download_blob()
        text = downloader.content_as_text()

        # Empty blobs carry no records to parse.
        if not text:
            continue
        add_entries_from_log(text)

## Visualizing anomalies throughout your data

The following cells call the Anomaly Detector API with two different example time series data sets, and different sensitivities for anomaly detection. Varying the sensitivity of the Anomaly Detector API can improve how well the response fits your data.

### Example 1: time series with an hourly sampling frequency


In [None]:
# DONT RUN AS WILL BE USING OFFLINE DATA FOR DEMO

# data = []

# # Get data from blob storage
# # process_raw_data()
# read_iot_central_log(input_container)

# # Clean data

# # convert list object to a pandas dataframe
# df = pd.DataFrame(data)

# # Convert datetime string to dataframe timestamp type
# # https://cumsum.wordpress.com/2022/02/26/pandas-typeerror-only-valid-with-datetimeindex-timedeltaindex-or-periodindex-but-got-an-instance-of-index/
# df.index = pd.to_datetime(df['timestamp'])

# # Resample into 5 minute buckets
# # https://towardsdatascience.com/using-the-pandas-resample-function-a231144194c4
# res = df.resample('5min').max()

# # remove existing timestamp column convert the datetime index to a timestamp column
# # https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.drop.html
# # https://stackoverflow.com/questions/44773714/convert-index-to-column-pandas-dataframe
# res = res.drop(columns=['timestamp']).reset_index()

# # Convert timestamp column of type timestamp to type string
# res['timestamp']=res['timestamp'].astype(str)

# # convert dataframe to a list of json objects
# ts_dict = res.to_dict('records')

# with open('iot_central_data_backup.json', 'w') as f:
#     json.dump(ts_dict, f)



In [None]:
# Load the series from the local backup file (useful if there is a
# network/service problem), then pretty-print it for inspection.

with open('iot_central_data_backup.json', 'r') as backup_file:
    ts_dict = json.load(backup_file)

print(json.dumps(ts_dict, indent=4, sort_keys=True))


In [None]:
# print(ts_dict)

# The detection request is only built and sent when at least 12 data points
# are available; otherwise a short explanation is printed.
if len(ts_dict) >= 12:

    # Create data for anomaly detection: the series plus a 'minutely'
    # granularity with a custom interval of 5.
    sample_data = {
        'series': ts_dict,
        'granularity': 'minutely',
        'customInterval': 5,
    }

    # sample_data['period'] = 24
    # 95 sensitivity
    build_figure(sample_data, 95)
else:
    print("Climate data items: {count}".format(count=len(ts_dict)))
    print("12 Climate data items required")
    print("Wait for more data to be generated then try again")

In [None]:
# 90 sensitivity: run detection and plotting again on the same request data.
# Requires sample_data to have been created by the cell above.
build_figure(sample_data,90)

In [None]:
# 85 sensitivity: run detection and plotting again on the same request data.
# Requires sample_data to have been created by an earlier cell.
build_figure(sample_data,85)