In [282]:
import modules.data_preparation.v1.ETLData as dp
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import numpy as np

from menelaus.data_drift import KdqTreeStreaming
import warnings
warnings.filterwarnings('ignore')

from sktime import datasets
from sktime.forecasting import model_selection
from sktime.utils.plotting import plot_series
import plotly.graph_objects as go
import plotly.express as px

#from statsmodels.tsa.seasonal import seasonal_decompose
from scipy import optimize
from statsmodels.tsa.seasonal import seasonal_decompose
from dateutil.parser import parse

- get in this notebook, the data which we want to simulate (rendement/ difference rendement + timestamps)
- calculate stats of that data
- generate the data
- create a random timestamp where we introduce drift
- create a function which generates the data with input the data which we want to simulate
- call the function 1000 times (with different datasets as input)
- launch algorithm on data
- calculate metric of the algorithm
- interprete metric

# 1. Get Data To Simulate
- The data to simulate is the data on which drift algorithms will be applied to.
- Here we are simulating efficiency prediction error.

In [19]:
# global variables

# dataset paths
PREPROCESSED_DATASET_PATH = '../data/preprocessed_dataset/preprocessed_dataset.csv'
ERROR_TEST_3_MONTHS_PATH = '../data/efficiency_error_dataset/test_3_months.csv'

# number of samples to generate
NUM_SAMPLES = 1000

In [5]:
# read error file
error_file_dataset = (dp.ETLData(ERROR_TEST_3_MONTHS_PATH, sheet_name='data')
           .rename_column({'Unnamed: 0': ''})
           .to_timeseries(''))
error_file_dataset.get_data().head(3)

Unnamed: 0,pred_out,true_out,pred_eff,true_eff,eff_error,equipment_id
,,,,,,
2022-06-01,1.313046,0.992,7.667421,5.792701,1.87472,eq_it049959b_chiller_compression_03
2022-06-01,0.554161,0.619,6.830956,7.6302,-0.799244,eq_it049959b_chiller_compression_04
2022-06-01,0.213952,0.19,27.16849,24.126984,3.041506,eq_it049959b_chiller_compression_01


### Change Chiller Here!!!

In [11]:
error_file_dataset.get_data()[['equipment_id']].value_counts()

equipment_id                       
eq_it049959b_chiller_compression_04    2071
eq_it049959b_chiller_compression_02    1441
eq_it049959b_chiller_compression_03    1275
eq_it049959b_chiller_compression_05     873
eq_it049959b_chiller_compression_01     337
eq_it019820w_chiller_compression_04      42
eq_it019820w_chiller_compression_05      39
dtype: int64

In [13]:
EQUIPEMENT_ID = 'eq_it049959b_chiller_compression_03'

In [195]:
# timeseries to simulate
series_to_simulate = error_file_dataset.filter_by_column_value('equipment_id', lambda x: x==EQUIPEMENT_ID, save=False)['eff_error']
series_to_simulate


2022-06-01 00:00:00    1.874720
2022-06-01 01:00:00    1.273535
2022-06-01 02:00:00    1.824427
2022-06-01 03:00:00    1.854117
2022-06-01 04:00:00    1.610863
                         ...   
2022-09-13 08:00:00    0.309650
2022-09-13 09:00:00   -0.481371
2022-09-13 12:00:00   -0.814819
2022-09-13 13:00:00    0.334582
2022-09-13 14:00:00   -0.848251
Name: eff_error, Length: 1275, dtype: float64

In [196]:
fig = go.Figure()
fig.add_trace(go.Scatter(x=series_to_simulate.index, y=series_to_simulate.values, mode='markers', name='predictions error'))

fig.update_layout(title='Data to Simulate', xaxis_title='timestamp', yaxis_title='efficiency prediction error')
fig.show()

# 2. Get statistics of Data to Simulate

In [197]:
mean = np.mean(series_to_simulate.values)
print('mean: ' + str(mean))

mean: 0.767894541789666


In [198]:
std = np.std(series_to_simulate.values)
print('std: ' + str(std))

std: 0.8611183351032901


# 3. Generate Data

In [283]:
generated_data = pd.DataFrame()
generated_data['generated_eff_error'] = np.random.normal(mean,std,len(series_to_simulate.index))
generated_data.index = series_to_simulate.index
generated_data.head(3)

Unnamed: 0,generated_eff_error
,
2022-06-01 00:00:00,1.542509
2022-06-01 01:00:00,-0.018574
2022-06-01 02:00:00,0.712841


In [284]:
fig = go.Figure()
fig.add_trace(go.Scatter(x=generated_data.index, y=generated_data['generated_eff_error'], mode='markers', name='generated data'))

fig.update_layout(title='Generated Data (predicted)', xaxis_title='timestamp', yaxis_title='simulated efficiency prediction error')
fig.show()

# 4. Introduce Drift

In [285]:
# percentage drift for x hours
PERCENTAGE_DRIFT_PER_X_HOURS = 0.8 #0.001

# for drift per hour: X_HOURS=1
# for drift per day: X_HOURS=24
# for drift per week: X_HOURS=24*7
# for drift per month: X_HOURS=24*30
X_HOURS = 24

# drift direction
# for downwards drift: DOWNWARDS_DRIFT=True (default)
# for upwards drift: DOWNWARDS_DRIFT=False
# Note: if detecting drift on data representing efficiency --> detect downwards drift (i.e. efficiency decreasing over time)
# Note: if detecting drift on data representing error between predictied and true efficinecy --> detect upwards drift (i.e error increasing over time)
DOWNWARDS_DRIFT = False

In [286]:
# random timestamp to begin drift
random_timestamp_index = np.random.randint(len(list(generated_data.index)))
begin_drift_timestamp = list(generated_data.index)[random_timestamp_index]
print(begin_drift_timestamp)

2022-07-02 21:00:00


In [287]:
# only for visualisation purposes
generated_data_before = generated_data.copy()

In [289]:
# difference between generated data timestamps and timestamp at which to start drift
# time delta starts as from the time stamp from which to start drift
time_delta_in_hours = np.array(list((generated_data.index[random_timestamp_index:] - begin_drift_timestamp).seconds))/(60*60*X_HOURS)


# apply defined percentage drift at X_HOURS time interval
if not DOWNWARDS_DRIFT: # upwards drift
    generated_data.values[random_timestamp_index:] = generated_data.values[random_timestamp_index:] * (1 + PERCENTAGE_DRIFT_PER_X_HOURS * time_delta_in_hours).reshape(-1,1)
else: # downwards drift
    generated_data.values[random_timestamp_index:] = generated_data.values[random_timestamp_index:] * (1 - PERCENTAGE_DRIFT_PER_X_HOURS * time_delta_in_hours).reshape(-1,1)



In [290]:
fig = go.Figure()
fig.add_trace(go.Scatter(x=generated_data_before.index, y=generated_data_before['generated_eff_error'], mode='markers', name='data withou drift'))
fig.add_trace(go.Scatter(x=generated_data.index, y=generated_data['generated_eff_error'], mode='markers', name='data with drift'))

fig.update_layout(title='Generated Data (predicted) with drift', xaxis_title='timestamp', yaxis_title='simulated efficiency prediction error')
fig.show()

# Drift Detection Algorithm

In [291]:
# algorithm hyperparameters

WINDOW_SIZE = 24*15 #24*30
BOOTSTRAP_SAMPLES = 24*15 #24*30
COUNT_UBOUND = 50

In [292]:
def drift_detection(data, window_size=10, alpha=0.05, bootstrap_samples=100, count_ubound=50):
    
    np.random.seed(1)
    det = KdqTreeStreaming(window_size=window_size, alpha=alpha, bootstrap_samples=bootstrap_samples, count_ubound=count_ubound)
    
    status = pd.DataFrame(columns=["timestamp", "efficiency", "drift_detected"])

    plot_data = {}
    
    for i in range(len(data)):
        det.update(data.iloc[[i]])
        #status.loc[i] = [i, data.iloc[i, 0], data.iloc[i, 1], det.drift_state]
        status.loc[i] = [data.index[i], data.iloc[i, 0], det.drift_state]
        if det.drift_state is not None:
            # capture the visualization data
            plot_data[i] = det.to_plotly_dataframe()
            
    return plot_data, status

In [293]:
# detect drift
plot_data, status = drift_detection(generated_data[['generated_eff_error']], window_size=WINDOW_SIZE, bootstrap_samples=BOOTSTRAP_SAMPLES, count_ubound=COUNT_UBOUND)

drift_detected_timestamp = status.loc[status["drift_detected"] == "drift"]["timestamp"]


In [294]:
drift_detected_timestamp

737   2022-07-02 18:00:00
Name: timestamp, dtype: datetime64[ns]

In [295]:
fig = go.Figure()

# add data points
fig.add_trace(go.Scatter(x=status['timestamp'], y=generated_data['generated_eff_error'], mode='markers', name='efficiency error'))

# calculate drift bar limits
drift_bar_max = generated_data['generated_eff_error'].max()
drift_bar_min = generated_data['generated_eff_error'].min()

# add true drift timestamp
true_drift_df=pd.DataFrame()
true_drift_df=true_drift_df.append([[begin_drift_timestamp, drift_bar_min]])
true_drift_df=true_drift_df.append([[begin_drift_timestamp, drift_bar_max]])
fig.add_trace(go.Scatter(x=true_drift_df[0], y=true_drift_df[1], line=dict(color='green'), name='true drift'))


# add drift detected
for i in range(len(drift_detected_timestamp)):
    df=pd.DataFrame()
    df=df.append([[drift_detected_timestamp.iloc[i], drift_bar_min]])
    df=df.append([[drift_detected_timestamp.iloc[i], drift_bar_max]])
    fig.add_trace(go.Scatter(x=df[0], y=df[1], line=dict(color='red'), name='detected drift'))

fig.update_layout(title='DRIFT DETECTION ON EFFICIENCY ERROR', xaxis_title='timestamp', yaxis_title='efficiency error')
fig.show()
