In [1]:
# Switch to the project root: the `src.*` imports, `requirements.txt` and the
# relative `./data/...` default path used later are all resolved from here.
%cd /notebooks/anomaly-detector

/notebooks/anomaly-detector


In [2]:
import importlib
import os
import subprocess
import sys

# Import the project modules and the PyOD detectors used by this notebook.
# The import list is written exactly once and retried after installing the
# requirements, so the normal path and the fallback path always bind the same
# names (the detectors needed below are LOF, IForest, ECOD and KNN).
for _attempt in range(2):
    try:
        from src.utils.set_seed import set_seed
        from src.utils import utils
        from src.utils import params
        from src.utils import thresholds as th
        from src.data import columns
        from src.data import preprocessing
        from src.visualization import plotter

        from pyod.models.lof import LOF
        from pyod.models.iforest import IForest
        from pyod.models.ecod import ECOD
        from pyod.models.knn import KNN
        break
    except ModuleNotFoundError:
        if _attempt:
            # Still missing after installing the requirements: surface the error.
            raise
        print("installing requirements..")
        # Run pip through the kernel's own interpreter so the packages land in
        # the environment this notebook is actually executing in.
        subprocess.run([sys.executable, '-m', 'pip', 'install', '-r', 'requirements.txt'])
        # Make freshly installed packages visible to the running import system.
        importlib.invalidate_caches()

set_seed()

seed set to [42]


In [3]:
# Load the univariate (MULTI=False) configuration; the helper returns the
# seven parameter groups in this fixed order.
(
    ALGORITHM,
    PREPROCESSING_PARAMS,
    TRAINING_PARAMS,
    INTERVALS_PARAMS,
    TH_ALGORITHM,
    PLOT_PARAMS,
    PATH_PARAMS,
) = params.get_params_jupyter(MULTI=False)

ALGORITHM: knn
PREPROCESSING_PARAMS: {'downsamplig_rate': 5, 'normalization': None, 'metric': ['Database Time Per Sec']}
TRAINING_PARAMS: {'contamination': 0.001}
INTERVALS_PARAMS: {'start': 0, 'end': 67047}
TH_ALGORITHM: yj
PATH_PARAMS: {'DF_PATH': '/notebooks/anomaly-detector/data/raw/V_GV_SYSMETRIC_INCTANCE_2.csv', 'DF_TRAIN_PATH': '/notebooks/anomaly-detector/data/processed/filtered/clean_df.csv', 'DF_TEST_PATH': '/notebooks/anomaly-detector/data/processed/filtered/dirty_df.csv'}
PLOT_PARAMS: {'db_time_static': True, 'db_time_html': False, 'labels_static': True, 'labels_html': False, 'thresholds_static': True, 'thresholds_html': True}


In [4]:
class Univariate_ad():
    """Univariate anomaly detection on a single metric using one PyOD model.

    Parameters
    ----------
    ALGORITHM : str
        One of 'lof', 'iforest', 'ecod', 'knn'. Any other value prints a
        warning at fit time and falls back to KNN.
    PREPROCESSING_PARAMS : dict
        Forwarded as-is to the ``preprocessing`` helpers. The keys read here
        are 'normalization' (passed on as ``scaler``) and 'metric' (list of
        column names; exactly one is expected in the univariate case).
    TRAINING_PARAMS : dict
        'contamination' is forwarded to the detector constructor.
    INTERVALS_PARAMS : dict
        Forwarded to the ``preprocessing`` helpers; 'start' and 'end' are also
        used to slice the raw dataframe in ``remove_anomalies``.
    TH_ALGORITHM : str or None
        When None the detector's own ``threshold_`` / ``labels_`` are used,
        otherwise the value is passed to ``th.get_th_and_labels``.
    PLOT_PARAMS : dict
        Boolean save flags read at plot time: 'db_time_static', 'db_time_html',
        'labels_static', 'labels_html', 'thresholds_static', 'thresholds_html'.
    PATH_PARAMS : dict
        Only 'DF_PATH' (path of the raw dataframe) is read.

    Attributes populated by ``fit_predict``
    ---------------------------------------
    df, decision_scores_, threshold_, labels_, anomalies_intervals_
    """

    def __init__(self, ALGORITHM, PREPROCESSING_PARAMS, TRAINING_PARAMS, 
                     INTERVALS_PARAMS, TH_ALGORITHM, PLOT_PARAMS, PATH_PARAMS):
        self.ALGORITHM = ALGORITHM
        self.PREPROCESSING_PARAMS = PREPROCESSING_PARAMS
        self.TRAINING_PARAMS = TRAINING_PARAMS
        self.INTERVALS_PARAMS = INTERVALS_PARAMS
        self.TH_ALGORITHM = TH_ALGORITHM
        self.PLOT_PARAMS = PLOT_PARAMS
        self.df_path = PATH_PARAMS['DF_PATH']
        self.ALGORITHMS_LIST = ['lof', 'iforest', 'ecod', 'knn']
        self.contamination = TRAINING_PARAMS['contamination']
        self.scaler = PREPROCESSING_PARAMS['normalization']
        self.metric = PREPROCESSING_PARAMS['metric']
        if len(self.metric) != 1:
            # Reported but not enforced: construction continues as before.
            print('ERROR: use only one metric in the univariate case')

        # Results, filled in by fit_predict().
        self.df = None
        self.labels_ = None
        self.decision_scores_ = None
        self.threshold_ = None
        self.anomalies_intervals_ = None

    def _build_detector(self):
        """Instantiate the detector selected by ``self.ALGORITHM``.

        The contamination is read from this instance's TRAINING_PARAMS rather
        than from a notebook-level global, so the object behaves according to
        the configuration it was constructed with.
        """
        if self.ALGORITHM not in self.ALGORITHMS_LIST:
            print('Error: specified algorithm not supported')
            print('using default algorithm (KNN)')

        contamination = self.TRAINING_PARAMS['contamination']
        # An if/elif chain (rather than a name->class dict) so that only the
        # selected detector class has to be importable.
        if self.ALGORITHM == 'lof':
            return LOF(contamination=contamination)
        if self.ALGORITHM == 'iforest':
            return IForest(contamination=contamination)
        if self.ALGORITHM == 'ecod':
            return ECOD(contamination=contamination)
        return KNN(contamination=contamination)

    def fit_predict(self, plot=True):
        """Load and preprocess the metric, fit the detector and label anomalies.

        Stores the raw dataframe, decision scores, threshold, labels and the
        anomaly intervals on the instance, prints the number of detected
        anomalies and saves the intervals under the name
        'anomalies-intervals-univariate'.

        Parameters
        ----------
        plot : bool, default True
            When truthy, draw the three plots using the save flags in
            ``self.PLOT_PARAMS``.
        """
        self.df = preprocessing.get_df(self.df_path, columns_name=self.metric)
        db_time = preprocessing.get_db_time(self.df, self.PREPROCESSING_PARAMS, INTERVALS_PARAMS=self.INTERVALS_PARAMS, multi=False)
        df, timestamps = preprocessing.data_preprocessing(
            self.PREPROCESSING_PARAMS, self.df,
            INTERVALS_PARAMS=self.INTERVALS_PARAMS,
            scaler=self.scaler,
            multi=False
        )

        clf = self._build_detector()
        clf.fit(df, y=None)
        y_pred = clf.decision_scores_
        self.decision_scores_ = y_pred
        if self.TH_ALGORITHM is None:
            # No custom thresholding requested: keep the detector's own result.
            self.threshold_, self.labels_ = clf.threshold_, clf.labels_
        else:
            self.threshold_, self.labels_ = th.get_th_and_labels(self.TH_ALGORITHM, y_pred)

        print(f'detected {sum(self.labels_)} anomalies')

        anomalies_intervals_df = utils.generate_anomalies_intervals(self.labels_, timestamps)
        self.anomalies_intervals_ = anomalies_intervals_df
        utils.save_anomalies_intervals(anomalies_intervals_df, filename='anomalies-intervals-univariate')

        if plot:
            # All save flags come from the instance's PLOT_PARAMS, not from a
            # notebook-level global of the same name.
            plot_params = self.PLOT_PARAMS
            plotter.plot_res_db_time(y_pred, db_time, timestamps=timestamps,
                                     save_static=plot_params['db_time_static'],
                                     save_html=plot_params['db_time_html'])
            plotter.plot_labels(y_pred, self.labels_, timestamps=timestamps,
                                save_static=plot_params['labels_static'],
                                save_html=plot_params['labels_html'])
            plotter.plot_thresholds(df, self.labels_, timestamps=timestamps,
                                    save_static=plot_params['thresholds_static'],
                                    save_html=plot_params['thresholds_html'])

    def remove_anomalies(self, path='./data/processed/filtered/', clean_name='clean_df.csv', dirty_name='dirty_df.csv',
                                plot=False):
        """Split the raw dataframe into clean and dirty parts and save both.

        Runs ``fit_predict(plot=plot)`` first if no anomaly intervals have been
        computed yet. The full raw dataframe (all columns) is reloaded, sliced
        to the configured [start:end] interval and handed to
        ``utils.remove_week_with_anomalies`` together with the intervals.

        Parameters
        ----------
        path : str
            Directory passed to ``preprocessing.save_df``.
        clean_name, dirty_name : str
            File names for the two resulting dataframes.
        plot : bool, default False
            Only used when ``fit_predict`` has to be run from here.
        """
        if self.anomalies_intervals_ is None:
            self.fit_predict(plot=plot)

        start, end = self.INTERVALS_PARAMS['start'], self.INTERVALS_PARAMS['end']
        df = preprocessing.get_df(self.df_path, columns_name=None)[start:end]

        clean_df, dirty_df = utils.remove_week_with_anomalies(df, self.anomalies_intervals_)
        print(f'clean_df shape: {clean_df.shape}')
        print(f'dirty_df shape: {dirty_df.shape}')

        preprocessing.save_df(clean_df, path=path, name=clean_name)
        preprocessing.save_df(dirty_df, path=path, name=dirty_name)

In [5]:
# Build the detector from the configuration loaded above; keyword arguments
# make the mapping between each parameter group and its slot explicit.
u = Univariate_ad(
    ALGORITHM=ALGORITHM,
    PREPROCESSING_PARAMS=PREPROCESSING_PARAMS,
    TRAINING_PARAMS=TRAINING_PARAMS,
    INTERVALS_PARAMS=INTERVALS_PARAMS,
    TH_ALGORITHM=TH_ALGORITHM,
    PLOT_PARAMS=PLOT_PARAMS,
    PATH_PARAMS=PATH_PARAMS,
)

In [6]:
# Fit the detector, print and save the anomaly intervals, and draw the plots.
u.fit_predict(plot=True)

Using custom columns
dataframe shape: (78503, 1)
detected 27 anomalies
              start                 end          lenght
2022-01-20 21:30:00 2022-01-20 21:35:00 0 days 00:05:00
2022-01-23 11:00:00 2022-01-23 11:05:00 0 days 00:05:00
2022-01-24 10:00:00 2022-01-24 10:05:00 0 days 00:05:00
2022-01-24 12:00:00 2022-01-24 12:10:00 0 days 00:10:00
2022-02-07 08:55:00 2022-02-07 09:40:00 0 days 00:45:00
2022-02-07 09:45:00 2022-02-07 09:55:00 0 days 00:10:00
2022-02-07 10:00:00 2022-02-07 10:25:00 0 days 00:25:00
2022-02-07 10:30:00 2022-02-07 10:45:00 0 days 00:15:00
2022-02-07 11:20:00 2022-02-07 11:25:00 0 days 00:05:00
2022-02-28 10:05:00 2022-02-28 10:10:00 0 days 00:05:00
2022-03-02 01:15:00 2022-03-02 01:20:00 0 days 00:05:00


In [7]:
# Split the raw dataframe into clean/dirty parts and save them. `plot` only
# matters when no intervals exist yet and a fit has to be triggered from here.
u.remove_anomalies(plot=False)

dataframe shape: (78503, 155)
clean_df shape: (30018, 156)
dirty_df shape: (37029, 156)


In [8]:
# preprocessing.get_df("/notebooks/anomaly-detector/data/processed/filtered/clean_df.csv")

In [9]:
# Re-run the preprocessing on the dataframe loaded by the detector to obtain
# the processed series and its timestamps at notebook level.
df, timestamps = preprocessing.data_preprocessing(
    PREPROCESSING_PARAMS,
    u.df,
    INTERVALS_PARAMS=u.INTERVALS_PARAMS,
    scaler=u.scaler,
    multi=False,
)

In [10]:
# Sanity check: number of timestamps returned by the preprocessing step.
len(timestamps)

13467

In [11]:
# Plot the preprocessed metric itself (instead of the decision scores) together
# with the anomaly labels, saving both the static and the HTML version.
plotter.plot_labels(
    df['Database Time Per Sec'],
    u.labels_,
    timestamps=timestamps,
    save_static=True,
    save_html=True,
)