Anomaly Detection Visualization; 
Project: DaQuAI (DigiLab, LSBG); Date: 31/07/2023

@florist-notes

In [1491]:
import os
import glob 
import pandas as pd
from datetime import datetime

df_p = pd.read_csv('public_holiday_data.csv')
df_s = pd.read_csv('school_holiday_data.csv')
del df_p['_id']
del df_s['_id']

date_format = '%Y-%m-%d %H:%M:%S'
date_format2 = '%Y-%m-%d'

path_root = 'for_res/'

In [1492]:
dir_list = []
for it in os.scandir(path_root):
    if it.is_dir():
        print(it.path)
        dir_list.append(it.path)

for_res/MQ32.1
for_res/MQ29.4
for_res/MQ31.6
for_res/MQ28.6
for_res/MQ27.1
for_res/MQ32.2
for_res/MQ31.3
for_res/MQ29.1
for_res/MQ31.5
for_res/MQ29.2
for_res/MQ27.2
for_res/MQ31.4
for_res/MQ30.1
for_res/MQ30.2


All the sub directories contain sensor data of 1hr interval from different stations. Each MQ represents a station and has several zones within it with respective zone IDs. You can fetch the data from MongoDb but for local processing I have them as '.xlsx'.

In [1493]:
path = dir_list[2]

files = glob.glob(path + "/*.csv")
files2 = glob.glob(path + "/*.xlsx")

In [1494]:
files

['for_res/MQ31.6/MQ31_6_FORECAST_2022_ab.csv',
 'for_res/MQ31.6/MQ31_6_FORECAST_2023_ab.csv']

In [1495]:
files2

['for_res/MQ31.6/MQ31_6_FORECAST_2022.xlsx',
 'for_res/MQ31.6/MQ31_6_FORECAST_2023.xlsx']

In [1583]:
thisfile = files[1] #change here for picking right year
thisfile2 = files2[1] #change here for picking right year
year_now = 2023
d_namex = thisfile[15:-4]
df_data = pd.read_csv(thisfile)
df_data2 = pd.read_excel(thisfile2)
del df_data['Unnamed: 0']

d_namefinal = thisfile2[15:-5]

Here 'thisfile' refers to the csv file with anomaly, bounds and mean data. 'thisfile2' refers to the '.xlsx' file with count data from sensors with 1hr interval.

In [1584]:
d_namex

'MQ31_6_FORECAST_2023_ab'

Here is how 'thisfile' looks like:

In [1585]:
df_data.head()


Unnamed: 0,Z_ID,Z_name,Date,Anomaly,Anomaly_val,Min,Max,Lower_Bound,Mean,Upper_Bound
0,B_31.5_2_I,"t6_sun_winter2[""1002""]",2023-01-01 00:00:00,0,0.0,0.0,16.0,2.659524,3.2,3.740476
1,B_31.5_2_I,"t6_sun_winter2[""1002""]",2023-01-01 01:00:00,0,0.0,0.0,16.0,2.659524,3.2,3.740476
2,B_31.5_2_I,"t6_sun_winter2[""1002""]",2023-01-01 02:00:00,0,0.0,0.0,16.0,2.659524,3.2,3.740476
3,B_31.5_2_I,"t6_sun_winter2[""1002""]",2023-01-01 03:00:00,1,0.0,0.0,16.0,2.659524,3.2,3.740476
4,B_31.5_2_I,"t6_sun_winter2[""1002""]",2023-01-01 04:00:00,0,0.0,0.0,16.0,2.659524,3.2,3.740476


To find the name of the Zone IDs within a station, here : MQ 31.3

In [1586]:
ls_zoneid = [x for x in set(df_data['Z_ID'])]

In [1587]:
ls_zoneid

['B_31.5_1_G', 'B_31.5_2_I', 'B_31.6_2_G', 'B_31.6_1_I']

We see that there are 2 zone names for MQ Station: MQ31.3

In [1645]:
field = ls_zoneid[3]

We select the start and end date for processing:

In [1646]:
if year_now == 2023:
    date_ti = datetime.strptime('2023-01-01 00:00:00', date_format)
    date_to = datetime.strptime('2023-03-31 00:00:00', date_format)
else:
    date_ti = datetime.strptime('2022-01-01 00:00:00', date_format)
    date_to = datetime.strptime('2022-12-31 00:00:00', date_format)



# process and filter public holiday data

df_xo = pd.DataFrame(columns=['date', 'p_name'])


for fo in range(len(df_p["date"])):
    date_t = df_p.iloc[fo]["date"]
    date_t = datetime.strptime(date_t, date_format2)
    d_name = df_p.iloc[fo]["name"]
    new_dat = {'date':date_t, 'p_name':d_name}
    df_xo = pd.concat([df_xo, pd.DataFrame([new_dat])], ignore_index=True)
    
df_xo = df_xo[df_xo["date"]>date_ti]
df_xo = df_xo[df_xo["date"]<date_to]

In [1647]:
# process and filter school holiday data

df_xo2 = pd.DataFrame(columns=['date_s', 'date_e', 'duration', 'p_name'])

for fo in range(len(df_s["duration"])):
    date_t1 = df_s.iloc[fo]["endDate"]
    date_t1 = datetime.strptime(date_t1, date_format2)
    date_t2 = df_s.iloc[fo]["startDate"]
    date_t2 = datetime.strptime(date_t2, date_format2)
    d_name = df_s.iloc[fo]["name"]
    d_dur = df_s.iloc[fo]["duration"]
    new_dat = {'date_s':date_t2, 'date_e':date_t1, 'duration':d_dur, 'p_name':d_name}
    df_xo2 = pd.concat([df_xo2, pd.DataFrame([new_dat])], ignore_index=True)
    
df_xo2 = df_xo2[df_xo2["date_s"]>date_ti]
df_xo2 = df_xo2[df_xo2["date_s"]<date_to]

Variable 'df_xo' holds filtered public holiday data and 'df_xo2' holds filtered school holiday data.

## Visualizations via plotly as graph objects:

In [1648]:

import numpy as np
import plotly.graph_objs as go

df = df_data #anomaly and bounds data
df = df[df['Z_ID']==field] #select specific field
df_x = df_data2 #sensor count data '1hr' interval


In [1649]:
anomaly = df['Anomaly'] #anomaly time coordinate from csv
anomaly2 = df['Anomaly_val'] #anomaly count value from csv

In [1650]:
anomaly[anomaly==0] = np.nan #make all non anomalous time as 'NaN'
anomaly2[anomaly2==0] = np.nan #make all non anomalous count value as 'NaN'



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [1651]:
list1 = list(df_x[field]) #sensor count data of 1hr interval
list2 = list(df["Mean"]) #mean of each bound for each data in time interval
subtracted = [abs(element1 - element2) for (element1, element2) in zip(list1, list2)] #element wise abs difference

subtracted_mean = np.mean(subtracted)
subtracted_std = np.std(subtracted)
subtracted_max = np.max(subtracted)

In [1652]:
thres = (subtracted_mean+subtracted_std) #threshold criteroía

I find points which are far away from threshold for each time interval and store them in a list 'anomYlst'

In [1653]:
anomYlst = []
for itemp in subtracted:
    if itemp < thres:
        anomYlst.append(0)
    else:
        anomYlst.append(itemp)
       
    
indx_r = []
count = 0
for itemz in anomYlst: 
    if itemz != 0:
        indx_r.append(count) #append all non zero 
    count += 1  

In [1654]:
list_finale = []
for xit in range(len(df_x)): #sensor count data with 1 hr interval
    val = df_x.iloc[xit][field] 
    if xit in indx_r:
        list_finale.append(df_x.iloc[xit][field]) #index of each count value greater than threshold
    else:
        list_finale.append(float("nan"))
    
for items in indx_r:
    val = df_x.iloc[items][field]
    float("nan")

In [1655]:
list2 #mean of data

[3.3285714285714287,
 3.3285714285714287,
 3.3285714285714287,
 3.3285714285714287,
 3.3285714285714287,
 3.3285714285714287,
 26.125,
 26.125,
 26.125,
 26.125,
 48.16666666666666,
 48.16666666666666,
 48.16666666666666,
 46.766666666666666,
 46.766666666666666,
 46.766666666666666,
 33.6,
 33.6,
 33.6,
 18.7,
 18.7,
 18.7,
 18.7,
 3.3285714285714287,
 22.214285714285715,
 22.214285714285715,
 22.214285714285715,
 22.214285714285715,
 22.214285714285715,
 22.214285714285715,
 118.075,
 118.075,
 118.075,
 118.075,
 87.33333333333333,
 87.33333333333333,
 87.33333333333333,
 108.3,
 108.3,
 108.3,
 73.2,
 73.2,
 73.2,
 40.385714285714286,
 40.385714285714286,
 40.385714285714286,
 40.385714285714286,
 22.214285714285715,
 25.87301587301588,
 25.87301587301588,
 25.87301587301588,
 25.87301587301588,
 25.87301587301588,
 25.87301587301588,
 127.61111111111111,
 127.61111111111111,
 127.61111111111111,
 127.61111111111111,
 90.07407407407408,
 90.07407407407408,
 90.07407407407408,
 112.

In [1656]:
#create the bounds graph
bounds_graph_upper = np.abs(list2+thres)
bounds_graph_lower = list2-thres



In [1657]:
bounds_graph_lower_f = []
for xu in bounds_graph_lower:
    if xu < 0:
        xu = 0
    bounds_graph_lower_f.append(xu)

In [1658]:
len(bounds_graph_lower)

2160

### Plot:

In [1659]:
if year_now == 2023:
    yi = "2023-01-10"
    yo = "2023-01-24"
else:
    yi = "2022-01-10"
    yo = "2022-01-24"



In [1660]:
val_y = np.max(df['Upper_Bound'])
val_ph = [val_y] * len(df_xo) #height of public holiday in final plot

In [1661]:

fig = go.Figure(data=[go.Candlestick(x=df['Date'],
                open=df['Mean'],
                high=df['Upper_Bound'],
                low=df['Lower_Bound'],
                close=df['Mean'],
                increasing_line_color= 'green', decreasing_line_color= 'green', name = 'Upper and Lower Bound')])

fig.add_trace(go.Scatter(x=df_x['Datum'], y=df_x[field], line=dict(color='gray'), name = 'Data'))
fig.add_trace(go.Scatter(x=df_x['Datum'], y=list_finale, line=dict(color='red'), name = 'Anomaly 1 (model 1)'))

fig.add_trace(go.Scatter(x=df['Date'], y=anomaly2, mode='markers', line=dict(color='darkblue'), name = 'Anomaly 2 (model 2)'))

fig.add_trace(go.Scatter(x=df_xo['date'], y=val_ph, mode='markers', line=dict(color='yellow'), name = 'Public Holiday'))

for (start, end, value, y) in zip(df_xo2["date_s"], df_xo2["date_e"], df_xo2["duration"], df_xo2["p_name"]):
    name = f"{start} to {end}"
    fig.add_trace(go.Scatter(x=[start, end], y=[val_y+value, val_y+value], mode='lines', name = y))

fig.update_xaxes(range=[yi,yo])
title_str = 'Anomaly Detection for Station : ' + str(d_namefinal[0:-14] + ' and zone name : ' + str(field))
fig.update_layout(title=title_str)

fig.show()

In [1662]:

fig = go.Figure(data=[go.Candlestick(x=df['Date'],
                open=df['Mean'],
                high=df['Upper_Bound'],
                low=df['Lower_Bound'],
                close=df['Mean'],
                increasing_line_color= 'green', decreasing_line_color= 'green', name = 'Upper and Lower Bound')])

fig.add_trace(go.Scatter(x=df_x['Datum'], y=df_x[field], line=dict(color='gray'), name = 'Data'))
fig.add_trace(go.Scatter(x=df_x['Datum'], y=list_finale, line=dict(color='red'), name = 'Anomaly 1 (model 1)'))
fig.add_trace(go.Scatter(x=df_x['Datum'], y=bounds_graph_upper, line=dict(color='pink'), name = 'Upper Threshold'))
fig.add_trace(go.Scatter(x=df_x['Datum'], y=bounds_graph_lower_f, line=dict(color='yellow'), name = 'Lower Threshold'))


fig.add_trace(go.Scatter(x=df['Date'], y=anomaly2, mode='markers', line=dict(color='darkblue'), name = 'Anomaly 2 (model 2)'))

fig.add_trace(go.Scatter(x=df_xo['date'], y=val_ph, mode='markers', line=dict(color='yellow'), name = 'Public Holiday'))

for (start, end, value, y) in zip(df_xo2["date_s"], df_xo2["date_e"], df_xo2["duration"], df_xo2["p_name"]):
    name = f"{start} to {end}"
    fig.add_trace(go.Scatter(x=[start, end], y=[val_y+value, val_y+value], mode='lines', name = y))

fig.update_xaxes(range=[yi,yo])
title_str = 'Anomaly Detection for Station : ' + str(d_namefinal[0:-14] + ' and zone name : ' + str(field))
fig.update_layout(title=title_str)

fig.show()

In [1663]:
fig.write_html(path+'/'+d_namefinal+'_'+field+"_final.html")