<a href="https://colab.research.google.com/github/paullo0106/prophet_anomaly_detection/blob/master/prophet_anomaly_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Is Prophet able to predict the clustering results?

Check whether Juhyun's clustering results can be predicted using Prophet.

### Import the relevant libraries

In [1]:
import pandas as pd
import time
import re

import seaborn as sns
sns.set()
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from matplotlib.dates import DateFormatter

# Note: the interactive (widget) plots may not work in JupyterLab, only in the classic Jupyter Notebook (JavaScript conflict)
%matplotlib widget 

from datetime import datetime, timedelta
from pytz import timezone

In [2]:
import fbprophet
from fbprophet import Prophet
from fbprophet.diagnostics import cross_validation, performance_metrics
from fbprophet.plot import plot_cross_validation_metric

In [3]:
# Show which fbprophet release this notebook was executed with.
fbprophet.__version__

'0.6'

In [4]:
from sklearn.model_selection import ParameterGrid

In [5]:
# Import the helper functions from helper.py
from helper import prophet_fit, prophet_plot, get_outliers, execute_cross_validation_and_performance_loop

In [6]:
# Load the clustering export of device 1 (comma-separated) into a DataFrame.
cluster = pd.read_csv('../data/interim/Device1_0501_0612.csv', sep=',')

In [7]:
# With a negative n, head() returns every row except the last |n|, so this
# displays (truncated) almost the whole frame rather than a 10-row preview.
# NOTE(review): head(10) or tail(10) may have been intended — confirm.
cluster.head(-10)

Unnamed: 0,ts_date,sun_updown,time,day,co2_slope,temp_slope,light_slope,hum_slope,co2_slope_b,temp_slope_b,...,light_diff2,temperature_diff2,humidity_diff2,cluster,light,temperature,humidity,co2,night,label
0,2019-05-01 12:18:20.409,0,2020-04-17 12:18:20.409,2019-05-01 00:00:00.000,-0.181213,0.000000,-0.5,0.007630,0.764872,0.000890,...,-0.333333,-0.000890,0.023906,0,103.0,21.177591,30.942244,680.433960,,7
1,2019-05-01 12:18:40.460,0,2020-04-17 12:18:40.460,2019-05-01 00:00:00.000,-0.937378,-0.006676,0.0,-0.014496,0.596130,0.000890,...,-0.600000,0.000890,0.025839,0,103.0,21.177591,30.969711,679.306213,,7
2,2019-05-01 12:19:00.520,0,2020-04-17 12:19:00.520,2019-05-01 00:00:00.000,0.502411,0.000000,-0.5,-0.007630,0.510488,0.000890,...,-0.600000,0.002670,0.022176,0,103.0,21.164240,30.913252,678.559204,,7
3,2019-05-01 12:19:20.565,0,2020-04-17 12:19:20.565,2019-05-01 00:00:00.000,0.916199,0.013351,0.0,0.035096,0.494621,0.001335,...,-0.866667,0.002670,0.016581,0,102.0,21.177591,30.954451,680.311035,,7
4,2019-05-01 12:19:40.610,0,2020-04-17 12:19:40.610,2019-05-01 00:00:00.000,0.617432,0.006676,0.5,-0.006104,0.495894,0.002225,...,-0.800000,0.005341,0.012105,0,103.0,21.190942,30.983444,680.391602,,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
170071,2019-06-12 11:55:04.335,0,2020-04-17 11:55:04.335,2019-06-12 00:00:00.000,0.942474,0.000000,0.0,-0.123596,0.835724,-0.009524,...,1.133333,-0.055899,-0.136823,0,10.0,24.531526,37.093155,494.147034,,7
170072,2019-06-12 11:55:24.397,0,2020-04-17 11:55:24.397,2019-06-12 00:00:00.000,1.116196,-0.014690,0.0,-0.034332,0.804841,-0.008634,...,1.000000,-0.056077,-0.158490,0,10.0,24.515497,36.882584,495.615143,,7
170073,2019-06-12 11:55:44.451,0,2020-04-17 11:55:44.451,2019-06-12 00:00:00.000,1.103928,-0.013351,0.0,0.044250,0.773862,-0.007210,...,0.866667,-0.053407,-0.180464,0,10.0,24.502145,37.024490,496.379425,,7
170074,2019-06-12 11:56:04.504,0,2020-04-17 11:56:04.504,2019-06-12 00:00:00.000,1.016769,0.006676,0.0,-0.012970,0.744929,-0.005875,...,0.733333,-0.051628,-0.194603,0,10.0,24.488794,36.971085,497.822998,,7


In [8]:
# Inspect the column dtypes: ts_date, time and day are loaded as plain
# strings (object), so ts_date has to be parsed before any time arithmetic.
cluster.dtypes

ts_date               object
sun_updown             int64
time                  object
day                   object
co2_slope            float64
temp_slope           float64
light_slope          float64
hum_slope            float64
co2_slope_b          float64
temp_slope_b         float64
light_slope_b        float64
hum_slope_b          float64
co2_slope_b_30       float64
temp_slope_b_30      float64
light_slope_b_30     float64
hum_slope_b_30       float64
co2_slope_b_2hr      float64
temp_slope_b_2hr     float64
light_slope_b_2hr    float64
hum_slope_b_2hr      float64
co2_diff             float64
light_diff           float64
temperature_diff     float64
humidity_diff        float64
co2_diff2            float64
light_diff2          float64
temperature_diff2    float64
humidity_diff2       float64
cluster                int64
light                float64
temperature          float64
humidity             float64
co2                  float64
night                float64
label         

In [9]:
def label_formater(df_raw, sampling_period_st, sampling_period_num, predict_day=1):
    """
    Build a Prophet-style frame (columns ``ds`` and ``y``) from the raw
    clustering export.

    ``ts_date`` is parsed as UTC, converted to Swiss local time
    (Europe/Zurich) and exposed as ``ds``; ``label`` is exposed as ``y``.
    The input frame is left untouched, so the function can be called
    repeatedly on the same ``df_raw``.

    Args:
        - df_raw: pandas DataFrame holding at least the columns ``ts_date``
          (UTC timestamps) and ``label``.
        - sampling_period_st: String. Pandas offset alias of the sampling
          period (e.g. '5T'). Currently unused because resampling is disabled
          in this function; kept so existing calls keep working.
        - sampling_period_num: Float. Duration of the sampling period in
          hours. Example: 0.5 for a 30 min period.
        - predict_day: Number of days predicted. 1 by default.

    Returns:
        - df: DataFrame indexed by ``ds`` with the columns ``ds`` and ``y``;
          the first row of the data is dropped.
        - df_original: the same frame with every row kept.
        - predict_n: Int. Number of data points to predict.
        - today_index: Int. ``df.shape[0] - predict_n``.
        - lookback_n: Int. ``int(today_index * 0.99)``.

    Raises:
        ValueError: if fewer than two rows are supplied, i.e. nothing is left
        once the first row has been dropped.
    """
    # Parse as UTC, then express in Swiss local time. No explicit `format` is
    # passed: the timestamps carry fractional seconds, which a fixed
    # '%Y-%m-%d %H:%M:%S' pattern does not describe.
    local_time = pd.to_datetime(
        df_raw['ts_date'], utc=True).dt.tz_convert('Europe/Zurich')

    # Assemble a fresh two-column frame instead of renaming df_raw in place.
    df_original = df_raw[['label']].rename(columns={'label': 'y'})
    df_original.insert(0, 'ds', local_time)
    # `ds` is kept both as a column and as the index.
    df_original = df_original.set_index('ds', drop=False)

    # Resampling is disabled here; the first row is still dropped so the
    # output keeps the same shape as before.
    df = df_original.iloc[1:].copy()
    if df.empty:
        raise ValueError(
            'label_formater needs at least two rows, got {}'.format(
                df_original.shape[0]))

    # Shape report
    print('Full dataset: {:%Y-%m-%d} to the {:%Y-%m-%d}'
          .format(df['ds'].iloc[0], df['ds'].iloc[-1]))

    # Specify the time frames.
    predict_n = int(predict_day * 24 / sampling_period_num)  # in data points
    today_index = df.shape[0] - predict_n  # index
    lookback_n = int(today_index * 0.99)

    return df, df_original, predict_n, today_index, lookback_n

# Reload the raw export, then format it into the ds/y frame and window sizes.
cluster = pd.read_csv('../data/interim/Device1_0501_0612.csv', sep=',')
df, df_original, predict_n, today_index, lookback_n = label_formater(
    df_raw=cluster,
    sampling_period_st='5T',
    sampling_period_num=0.08,
    predict_day=1,
)
df.head()

Full dataset: 2019-05-01 to the 2019-06-12


Unnamed: 0_level_0,ds,y
ds,Unnamed: 1_level_1,Unnamed: 2_level_1
2019-05-01 14:18:40.460000+02:00,2019-05-01 14:18:40.460000+02:00,7
2019-05-01 14:19:00.520000+02:00,2019-05-01 14:19:00.520000+02:00,7
2019-05-01 14:19:20.565000+02:00,2019-05-01 14:19:20.565000+02:00,7
2019-05-01 14:19:40.610000+02:00,2019-05-01 14:19:40.610000+02:00,7
2019-05-01 14:20:00.660000+02:00,2019-05-01 14:20:00.660000+02:00,7


In [10]:
# Keep a copy of the complete formatted frame: the cells below rebind `df`
# to resampled sub-periods.
df_full = df.copy()

In [11]:
def plot_df(df, df_original, begin, end):
    """
    Plot the original and the resampled ``y`` series on one figure.

    Args:
        - df: DataFrame whose ``y`` column is drawn as "Resampled data".
        - df_original: DataFrame whose ``y`` column is drawn as "Original".
        - begin: String 'YYYY-MM-DD'. The vertical markers start on the day
          after this date.
        - end: String 'YYYY-MM-DD'. Day of the last vertical marker.

    A dashed vertical line is drawn at every Europe/Zurich midnight from the
    day after ``begin`` up to ``end``. Nothing is returned; the figure is shown.
    """
    fig, ax = plt.subplots(figsize=(8, 3))

    # Data with original frequency first, resampled points on top.
    df_original.y.plot(ax=ax, label="Original", color='gray', linewidth=1)
    df.y.plot(ax=ax, label="Resampled data", color='black', marker='o',
              linestyle='dashed', linewidth=0.5, markersize=2)

    ax.xaxis.set_major_formatter(DateFormatter("%d/%m %H:%M"))
    ax.set_xlabel('Time', fontsize=8)
    ax.set_ylabel('label', fontsize=8)
    ax.set_title('Clustering result of Device 1 from Juhyun', fontsize=14)
    ax.legend(loc='upper right')

    # Vertical lines: the extra day is added before localizing.
    zurich = timezone('Europe/Zurich')
    day_format = '%Y-%m-%d %H:%M'
    first_midnight = zurich.localize(
        datetime.strptime(begin + ' 0:00', day_format) + timedelta(days=1))
    last_midnight = zurich.localize(
        datetime.strptime(end + ' 0:00', day_format))

    for midnight in pd.date_range(first_midnight, last_midnight):
        ax.axvline(x=midnight, color='lightseagreen', linestyle='--')
    plt.show()

In [12]:
def plot_label(df_dev, parameter, begin, end, sampling_period_st,
               sampling_period_num, graph=None, predict_day=1,
               starting_time='21:00'):
    """
    Cut a time window out of the formatted label frame, resample it onto a
    regular grid and optionally plot it.

    Args:
        - df_dev: DataFrame indexed by timestamp (a DatetimeIndex is needed
          for resampling) with a timezone-aware ``ds`` column and a ``y``
          column, e.g. the frame returned by label_formater.
        - parameter: String. Only used as the y-axis label of the figure.
        - begin: String 'YYYY-MM-DD'. First day of the window.
        - end: String 'YYYY-MM-DD'. Last day of the window.
        - sampling_period_st: String. Pandas offset alias of the resampling
          period (e.g. '5T').
        - sampling_period_num: Float. The same period expressed in hours.
          Example: 0.5 for a 30 min period. It must agree with
          sampling_period_st, otherwise predict_n, today_index and
          lookback_n are wrong.
        - graph=None: Leave falsy (None) to show the figure; pass a truthy
          value to skip it.
        - predict_day=1: Number of days predicted.
        - starting_time='21:00': String 'HH:MM'. Local (Europe/Zurich) time
          of day at which the window starts on ``begin`` and stops on ``end``.

    Returns:
        - df: resampled DataFrame (an independent copy, so callers can add
          columns to it without pandas chained-assignment warnings).
        - predict_n: Int. Number of data points to predict.
        - today_index: Int. ``df.shape[0] - predict_n``.
        - lookback_n: Int. ``int(today_index * 0.99)``.

    Raises:
        ValueError: if no usable data falls inside the requested window.

    TODO: Derive sampling_period_num from sampling_period_st (or the other
          way round) so the two can never disagree.
    """
    name = 'Clustering result of Device 1 from Juhyun'
    tz_name = 'Europe/Zurich'
    stamp_format = '%Y-%m-%d %H:%M'

    # Window bounds: `begin` and `end`, both at `starting_time` Swiss time.
    begin_dt = pd.Timestamp(
        datetime.strptime(begin + ' ' + starting_time, stamp_format)
    ).tz_localize(tz_name)
    end_dt = pd.Timestamp(
        datetime.strptime(end + ' ' + starting_time, stamp_format)
    ).tz_localize(tz_name)

    in_window = (df_dev['ds'] >= begin_dt) & (df_dev['ds'] <= end_dt)
    df_window = df_dev[in_window]
    if df_window.empty:
        raise ValueError(
            'No data between {} and {}'.format(begin_dt, end_dt))

    # Forward-fill onto the regular grid (ffill is the current name of the
    # former Resampler.pad). The first grid row is dropped — presumably
    # because the first grid point can precede the first observation and then
    # has nothing to fill from.
    # NOTE(review): after forward-filling, the `ds` column holds the timestamp
    # of the last observation at or before each grid point, while the index
    # holds the grid itself — confirm which one downstream code should use.
    # TODO: Disable the forward fill to let Prophet deal with missing data
    df = df_window.resample(sampling_period_st).ffill().iloc[1:].copy()
    if df.empty:
        raise ValueError(
            'Not enough data between {} and {} to resample every {}'.format(
                begin_dt, end_dt, sampling_period_st))

    if not graph:
        _draw_label_figure(df, parameter, name, begin, end, tz_name)

    # Shape report
    print('Full dataset: {:%Y-%m-%d} to the {:%Y-%m-%d}. Analysed data the {:%Y-%m-%d} to the {:%Y-%m-%d}.'
          .format(df_dev['ds'].iloc[0], df_dev['ds'].iloc[-1],
                  df['ds'].iloc[0], df['ds'].iloc[-1]))

    # Specify the time frames.
    predict_n = int(predict_day * 24 / sampling_period_num)  # in data points
    today_index = df.shape[0] - predict_n  # index
    lookback_n = int(today_index * 0.99)

    return df, predict_n, today_index, lookback_n


def _draw_label_figure(df, parameter, name, begin, end, tz_name):
    """Plot the resampled ``y`` series with a dashed line at each local midnight."""
    fig, ax = plt.subplots(figsize=(8, 3))
    df.y.plot(ax=ax, label="Resampled data", color='black', marker='o',
              linestyle='dashed', linewidth=0.5, markersize=2)

    ax.xaxis.set_major_formatter(DateFormatter("%d/%m %H:%M"))
    ax.set_xlabel('Time', fontsize=8)
    ax.set_ylabel(parameter, fontsize=8)
    ax.set_title(name, fontsize=14)
    ax.legend(loc='upper right')

    # One vertical line per midnight, from the day after `begin` up to `end`.
    # The day is added before localizing so the marker stays on wall-clock
    # midnight when a DST change falls in between.
    first_midnight = pd.Timestamp(
        datetime.strptime(begin, '%Y-%m-%d') + timedelta(days=1)
    ).tz_localize(tz_name)
    last_midnight = pd.Timestamp(
        datetime.strptime(end, '%Y-%m-%d')).tz_localize(tz_name)
    for midnight in pd.date_range(first_midnight, last_midnight):
        ax.axvline(x=midnight, color='lightseagreen', linestyle='--')
    plt.show()

In [13]:
# Resample and plot the 2019-05-01 to 2019-05-07 window (graph=None shows the figure).
df, predict_n, today_index, lookback_n = plot_label(
    df_dev=df_full,
    parameter='y',
    begin='2019-05-01',
    end='2019-05-07',
    sampling_period_st='5T',
    sampling_period_num=0.08,
    graph=None,
    predict_day=1,
)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Full dataset: 2019-05-01 to the 2019-06-12. Analysed data the 2019-05-01 to the 2019-05-07.


In [14]:
# Resample and plot the 2019-05-07 to 2019-06-01 window (graph=None shows the figure).
df, predict_n, today_index, lookback_n = plot_label(
    df_dev=df_full,
    parameter='y',
    begin='2019-05-07',
    end='2019-06-01',
    sampling_period_st='5T',
    sampling_period_num=0.08,
    graph=None,
    predict_day=1,
)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Full dataset: 2019-05-01 to the 2019-06-12. Analysed data the 2019-05-07 to the 2019-06-01.


In [15]:
# Sampling period of the analysis: 5-minute bins, also expressed in hours for
# the point-count arithmetic. 5 / 60 is used instead of the rounded 0.08 so
# that one predicted day is 288 points (24 h) rather than 300 (25 h).
sampling_period_st = '5T'
sampling_period_num = 5 / 60

df, predict_n, today_index, lookback_n = plot_label(df_full, 'y', '2019-05-07', '2019-06-01', sampling_period_st, sampling_period_num, graph=None, predict_day=1)
# config the model
model = Prophet(interval_width=0.6, # width of the uncertainty interval, used here as the anomaly threshold
                yearly_seasonality=False, weekly_seasonality=False, daily_seasonality=False,
                changepoint_prior_scale=0.01) # Trend flexibility: higher --> more flexible (toward overfit), lower --> stiffer trend
# Built-in seasonalities are disabled above and replaced by custom ones
# (periods in days, prior_scale left at its default).
model.add_seasonality(name='weekly', period=7, fourier_order=15)
model.add_seasonality(name='daily', period=1, fourier_order=15)
# model.add_seasonality(name='half_day', period=0.5, fourier_order=10)

# Fit the model, flag outliers, and visualize
assert today_index > lookback_n, 'Not enough data for prediction (today_index <= lookback_n)'
fig, forecast, model = prophet_fit(df, model, today_index, sampling_period_st, sampling_period_num, lookback_days=lookback_n, predict_days=predict_n)
outliers, df_pred = get_outliers(df, forecast, today_index, predict_days=predict_n)
prophet_plot(df, fig, today_index, predict_days=predict_n, outliers=outliers)
plt.show()
# Optional cross-validation of the model (disabled):
# param_grid = {'model' : [model],
#               'initial' : ['20 days'], # If not provided, 3 * horizon is used. Same units as horizon
#               'period'  : ['0.5 days'], # Integer amount of time between cutoff dates. If not provided, 0.5 * horizon is used.
#               'horizon' : ['1 days']} # A forecast is made for every observed point between cutoff and cutoff + horizon}
# execute_cross_validation_and_performance_loop(list(ParameterGrid(param_grid)), metric = 'mape')

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Full dataset: 2019-05-01 to the 2019-06-12. Analysed data the 2019-05-07 to the 2019-06-01.
o Trained on the data from the 2019-05-08 to the 2019-05-31 (22 days).
o Predict from the 2019-05-31 to the 2019-06-01 (1 days).


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …