In [1]:
# Switch the kernel's working directory to the project checkout.
# NOTE(review): presumably needed so the `src.*` imports and the relative
# `requirements.txt` path used later resolve; the absolute path is
# environment-specific, confirm before running elsewhere.
%cd /notebooks/anomaly-detector

/notebooks/anomaly-detector


In [2]:
import importlib
import os
import subprocess
import sys

import numpy as np

# Import the project modules; if a dependency is missing, install the
# requirements once and retry the very same imports.
try:
    from src.utils.set_seed import set_seed
    from src.utils import utils
    from src.utils import params
    from src.utils import thresholds as th
    from src.models import usad
    from src.models import usad_utils
    from src.data import columns
    from src.data import preprocessing
    from src.visualization import plotter
except ModuleNotFoundError:
    print("installing requirements..")
    # Run pip through the kernel's own interpreter so the packages land in the
    # environment this notebook is actually using. Best-effort like the
    # previous os.system call: a failed install surfaces in the retry below.
    subprocess.run(
        [sys.executable, "-m", "pip", "install", "-r", "requirements.txt"],
        check=False,
    )
    # Make freshly installed packages visible to the running interpreter.
    importlib.invalidate_caches()
    # Must mirror the first attempt exactly: importing `set_seed` from
    # `src.utils` would bind the submodule, not the callable used below.
    from src.utils.set_seed import set_seed
    from src.utils import utils
    from src.utils import params
    from src.utils import thresholds as th
    from src.models import usad
    from src.models import usad_utils
    from src.data import columns
    from src.data import preprocessing
    from src.visualization import plotter

# Fix the random seed before any stochastic work happens in the notebook.
set_seed()

seed set to [42]


In [3]:
# Fetch all parameter groups for the multivariate run in a single call and
# unpack them into notebook-level constants.
(
    PREPROCESSING_PARAMS,
    TRAINING_PARAMS,
    INTERVALS_PARAMS,
    TH_ALGORITHM,
    PLOT_PARAMS,
    PATH_PARAMS,
) = params.get_params_jupyter(MULTI=True)

PREPROCESSING_PARAMS: {'downsamplig_rate': 5, 'window_size': 12, 'normalization': 'min-max', 'metrics': 'columns_6', 'single_file': False}
TRAINING_PARAMS: {'batch_size': 32, 'epochs': 70, 'hidden_size': 10, 'alpha': 0.5, 'beta': 0.5}
INTERVALS_PARAMS: {'train_start': 36945, 'train_end': 67047, 'test_start': 67047, 'test_end': 78503, 'th_start': 0, 'th_end': 0}
TH_ALGORITHM: iqr
PATH_PARAMS: {'DF_PATH': '/notebooks/anomaly-detector/data/raw/V_GV_SYSMETRIC_INCTANCE_2.csv', 'DF_TRAIN_PATH': '/notebooks/anomaly-detector/data/processed/filtered/clean_df.csv', 'DF_TEST_PATH': '/notebooks/anomaly-detector/data/processed/filtered/dirty_df.csv'}
PLOT_PARAMS: {'history_static': True, 'history_html': False, 'db_time_static': True, 'db_time_html': False, 'labels_static': True, 'labels_html': False, 'thresholds_static': True, 'thresholds_html': True}


In [4]:
class Multivariate_ad():
    """Multivariate anomaly-detection pipeline built around the project's USAD model.

    The class glues together the project helpers (``preprocessing``, ``usad``,
    ``usad_utils``, ``th``, ``utils`` and ``plotter``): it loads and windows the
    data, trains the model, turns anomaly scores into labels and optionally
    plots the results.

    Attributes populated by ``fit_predict``:
        decision_scores_: scores from ``usad_utils.get_prediction_score`` on the test loader.
        labels_: labels after ``th.adjust_labels``.
        anomalies_intervals_: value returned by ``utils.generate_anomalies_intervals``.

    ``threshold`` is initialised to ``None`` and is not assigned by
    ``fit_predict``; note that the threshold is recomputed for every test score.
    """

    def __init__(self, PREPROCESSING_PARAMS, TRAINING_PARAMS, INTERVALS_PARAMS, 
                         TH_ALGORITHM, PLOT_PARAMS, PATH_PARAMS):
        """Store the parameter groups and cache the individual settings used later.

        Args:
            PREPROCESSING_PARAMS: dict; keys read here are 'single_file',
                'normalization' and 'metrics' ('window_size' is read in ``fit_predict``).
            TRAINING_PARAMS: dict with 'batch_size', 'epochs', 'hidden_size',
                'alpha' and 'beta'.
            INTERVALS_PARAMS: dict forwarded to the preprocessing helpers;
                'test_start' / 'test_end' are used to slice the test rows.
            TH_ALGORITHM: identifier forwarded to ``th.get_th_and_labels``.
            PLOT_PARAMS: dict of ``*_static`` / ``*_html`` flags forwarded to the plotter.
            PATH_PARAMS: dict with 'DF_PATH', 'DF_TRAIN_PATH' and 'DF_TEST_PATH'.
        """
        self.PREPROCESSING_PARAMS = PREPROCESSING_PARAMS
        self.TRAINING_PARAMS = TRAINING_PARAMS
        self.INTERVALS_PARAMS = INTERVALS_PARAMS
        self.TH_ALGORITHM = TH_ALGORITHM
        self.PLOT_PARAMS = PLOT_PARAMS

        self.df_path = PATH_PARAMS['DF_PATH']
        self.df_train_path = PATH_PARAMS['DF_TRAIN_PATH']
        self.df_test_path = PATH_PARAMS['DF_TEST_PATH']
        self.batch_size = TRAINING_PARAMS['batch_size']
        self.epochs = TRAINING_PARAMS['epochs']
        self.hidden_size = TRAINING_PARAMS['hidden_size']
        self.single_file = PREPROCESSING_PARAMS['single_file']
        self.scaler = PREPROCESSING_PARAMS['normalization']
        self.columns_name = PREPROCESSING_PARAMS['metrics']
        self.alpha = TRAINING_PARAMS['alpha']
        self.beta = TRAINING_PARAMS['beta']

        # Dataframes are loaded lazily by get_data().
        self.df = None
        self.train_df = None
        self.test_df = None

        self.device = utils.get_default_device()

        # Results exposed after fit_predict().
        self.labels_ = None
        self.decision_scores_ = None
        self.anomalies_intervals_ = None
        self.threshold = None

    def get_data(self, prep=True):
        """Load the dataframe(s) and run ``preprocessing.data_preprocessing`` on them.

        Args:
            prep: when true the configured normalization is passed as ``scaler``;
                otherwise ``scaler=None`` is passed.

        Returns:
            Whatever ``preprocessing.data_preprocessing`` returns; ``fit_predict``
            unpacks it as ``(df_train, df_test, windows_train, windows_test,
            train_timestamps, test_timestamps)``.

        Side effects:
            Sets ``self.df`` (single-file mode) or ``self.train_df`` and
            ``self.test_df`` (two-source mode).
        """
        scaler = self.scaler if prep else None

        if self.single_file is True:  # no auto data selection
            self.df = preprocessing.get_df(self.df_path, columns_name=self.columns_name)
            return preprocessing.data_preprocessing(
                self.PREPROCESSING_PARAMS, self.df,
                INTERVALS_PARAMS=self.INTERVALS_PARAMS,
                scaler=scaler, multi=True, single_file=self.single_file
            )

        # auto data selection: training rows come from DF_TRAIN_PATH, while the
        # test rows are sliced out of the raw file at DF_PATH (DF_TEST_PATH is
        # stored on the instance but not read here).
        self.train_df = preprocessing.get_df(self.df_train_path, columns_name=self.columns_name)
        raw_df = preprocessing.get_df(self.df_path, columns_name=self.columns_name)
        self.test_df = raw_df[self.INTERVALS_PARAMS['test_start']:self.INTERVALS_PARAMS['test_end']]
        return preprocessing.data_preprocessing(
            self.PREPROCESSING_PARAMS, df=None,
            INTERVALS_PARAMS=self.INTERVALS_PARAMS,
            scaler=scaler, multi=True, single_file=self.single_file,
            df_train=self.train_df, df_test=self.test_df
        )

    def get_db_time(self):
        """Return the value produced by ``preprocessing.get_db_time`` for the test data.

        Relies on ``get_data`` having been called first, because it reads
        ``self.df`` (single-file mode) or ``self.test_df`` (two-source mode).
        """
        if self.single_file is True:
            _, db_time = preprocessing.get_db_time(self.df, self.PREPROCESSING_PARAMS,
                                                   INTERVALS_PARAMS=self.INTERVALS_PARAMS, multi=True)
        else:
            # NOTE(review): this call is not unpacked, unlike the multi=True
            # call above; presumably get_db_time returns a single value when
            # multi=False. Confirm against the helper.
            db_time = preprocessing.get_db_time(self.test_df, self.PREPROCESSING_PARAMS,
                                                INTERVALS_PARAMS=None, multi=False)

        return db_time

    # TODO: add fit and predict
    def fit_predict(self, plot=True):
        """Train the model, label the test scores and optionally plot the results.

        Args:
            plot: when ``True`` the plotter helpers are called, using the flags
                in ``self.PLOT_PARAMS``.

        Side effects:
            Saves and reloads the model through ``usad_utils``, saves the anomaly
            intervals through ``utils`` and updates ``decision_scores_``,
            ``labels_`` and ``anomalies_intervals_``.
        """
        df_train, df_test, windows_train, windows_test, train_timestamps, test_timestamps = self.get_data()
        w_size = windows_train.shape[1] * windows_train.shape[2]
        z_size = windows_train.shape[1] * self.hidden_size

        db_time = self.get_db_time()
        # Second pass without a scaler: only the test dataframe is kept and it
        # is used solely by plot_thresholds below.
        _, original_test_data, _, _, _, _ = self.get_data(prep=False)
        train_loader, val_loader, test_loader = preprocessing.get_dataloaders(
            windows_train, windows_test,
            self.batch_size, w_size, z_size
        )
        model = usad.UsadModel(w_size, z_size)
        model = utils.to_device(model, self.device)
        history = usad_utils.training(self.epochs, model, self.device, train_loader, val_loader)
        usad_utils.save_model(model)
        model = usad_utils.load_checkpoint(model)

        ################# THRESHOLD TUNING #######################
        print('TUNING THRESHOLD ...')
        results = usad_utils.test_model(model, self.device, val_loader, alpha=self.alpha, beta=self.beta)
        y_pred_tuning = usad_utils.get_prediction_score(results)
        threshold, _ = th.get_th_and_labels(self.TH_ALGORITHM, y_pred_tuning)
        print(f'\tTHRESHOLD: {threshold}')
        print('... THRESHOLD TUNING COMPLETE')
        ##########################################################

        ################# COMPUTE LABELS #########################
        results = usad_utils.test_model(model, self.device, test_loader, alpha=self.alpha, beta=self.beta)
        y_pred = usad_utils.get_prediction_score(results)

        # Online labelling: each test score is appended to the score history
        # (which starts with the validation scores), the threshold is
        # re-estimated on the whole history, and only the label of the newest
        # score is kept.
        online_labels = []
        for score in y_pred:
            y_pred_tuning = np.append(y_pred_tuning, score)
            threshold, _ = th.get_th_and_labels(self.TH_ALGORITHM, y_pred_tuning)
            history_labels = th.get_labels_from_th(threshold, y_pred_tuning)
            online_labels.append(history_labels[-1])

        labels = np.array(online_labels)
        ##########################################################

        ################# ADJUST LABELS ##########################
        labels = th.adjust_labels(labels, self.PREPROCESSING_PARAMS['window_size'])
        ##########################################################

        anomalies_intervals_df = utils.generate_anomalies_intervals(labels, test_timestamps)
        utils.save_anomalies_intervals(anomalies_intervals_df)

        # update public variables
        self.decision_scores_ = y_pred
        self.labels_ = labels
        self.anomalies_intervals_ = anomalies_intervals_df

        if plot is True:
            # Use the settings given to this instance rather than whatever
            # PLOT_PARAMS happens to exist in the notebook's global namespace.
            plot_params = self.PLOT_PARAMS
            plotter.plot_history(history, save_static=plot_params['history_static'],
                                 save_html=plot_params['history_html'])
            plotter.plot_res_db_time(y_pred, db_time, timestamps=test_timestamps,
                                     save_static=plot_params['db_time_static'],
                                     save_html=plot_params['db_time_html'])
            plotter.plot_labels(y_pred, labels, timestamps=test_timestamps,
                                save_static=plot_params['labels_static'],
                                save_html=plot_params['labels_html'])
            plotter.plot_thresholds(original_test_data, labels, timestamps=test_timestamps,
                                    save_static=plot_params['thresholds_static'],
                                    save_html=plot_params['thresholds_html'])

In [5]:
# Build the detector from the parameter groups loaded earlier in the notebook;
# keyword arguments make the mapping to the constructor explicit.
m = Multivariate_ad(
    PREPROCESSING_PARAMS=PREPROCESSING_PARAMS,
    TRAINING_PARAMS=TRAINING_PARAMS,
    INTERVALS_PARAMS=INTERVALS_PARAMS,
    TH_ALGORITHM=TH_ALGORITHM,
    PLOT_PARAMS=PLOT_PARAMS,
    PATH_PARAMS=PATH_PARAMS,
)

In [6]:
# Run the full pipeline (training, labelling, saving) with plotting enabled,
# which is the method's default.
m.fit_predict(plot=True)

dataframe shape: (30018, 19)
dataframe shape: (78503, 19)
normalizing data using MinMax Scaler
normalizing data using MinMax Scaler
dataframe shape: (30018, 19)
dataframe shape: (78503, 19)
Epoch [0], val_loss1: 0.0247, val_loss2: 0.0244
Epoch [1], val_loss1: 0.0163, val_loss2: 0.0001
Epoch [2], val_loss1: 0.0160, val_loss2: -0.0067
Epoch [3], val_loss1: 0.0197, val_loss2: -0.0129
Epoch [4], val_loss1: 0.0158, val_loss2: -0.0106
Epoch [5], val_loss1: 0.0157, val_loss2: -0.0114
Epoch [6], val_loss1: 0.0157, val_loss2: -0.0123
Epoch [7], val_loss1: 0.0159, val_loss2: -0.0132
Epoch [8], val_loss1: 0.0170, val_loss2: -0.0148
Epoch [9], val_loss1: 0.0181, val_loss2: -0.0162
Epoch [10], val_loss1: 0.0175, val_loss2: -0.0156
Epoch [11], val_loss1: 0.0185, val_loss2: -0.0170
Epoch [12], val_loss1: 0.0190, val_loss2: -0.0176
Epoch [13], val_loss1: 0.0176, val_loss2: -0.0161
Epoch [14], val_loss1: 0.0176, val_loss2: -0.0164
Epoch [15], val_loss1: 0.0183, val_loss2: -0.0172
Epoch [16], val_loss1: