# The simplest baseline

More than two standard deviations away from the mean value => anomaly

In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go

from preprocessing import load_data

In [2]:
train, val, test = load_data()

Load hackathon_kpis_anonymised.csv
(729881, 16) (156403, 16) (156404, 16)


In [3]:
tiny = train[:10]

In [4]:
numeric_cols = list(train.columns)
numeric_cols.remove('cell_name')
numeric_cols.remove('timestamp')
numeric_cols

['ho_failure_rate',
 'num_voice_attempts',
 'voice_drop_rate',
 'num_data_attempts',
 'voice_setup_failure_rate',
 'voice_tot_failure_rate',
 'avail_period_duration',
 'bandwidth',
 'throughput_rate',
 'data_setup_failure_rate',
 'data_drop_rate',
 'data_tot_failure_rate',
 'unavail_total_rate',
 'unavail_unplan_rate']

In [5]:
means = train[numeric_cols].mean()
stds = train[numeric_cols].std()

tolerance = 3

threshold_upper = means + tolerance*stds
threshold_lower = means - tolerance*stds

anomalies = {}

for col_name in numeric_cols:
    anomalies[col_name] = (train[col_name] >= threshold_upper[col_name]) | (train[col_name] <= threshold_lower[col_name])

anomalies = pd.DataFrame(anomalies)

anomalies.describe()

ho_failure_rate             0.340535
num_voice_attempts          0.011930
voice_drop_rate             0.000715
num_data_attempts           0.020059
voice_setup_failure_rate    0.000956
voice_tot_failure_rate      0.001033
avail_period_duration       0.998692
bandwidth                   0.343226
throughput_rate             0.000182
data_setup_failure_rate     0.002620
data_drop_rate              0.000326
data_tot_failure_rate       0.000653
unavail_total_rate          0.358941
unavail_unplan_rate         0.000018
dtype: float64

In [39]:
anomalies.sum()

ho_failure_rate              3565
num_voice_attempts          13580
voice_drop_rate              2423
num_data_attempts           11390
voice_setup_failure_rate     1619
voice_tot_failure_rate       3165
avail_period_duration        1925
bandwidth                       0
throughput_rate               321
data_setup_failure_rate      3308
data_drop_rate               2936
data_tot_failure_rate        3567
unavail_total_rate          53708
unavail_unplan_rate          1811
dtype: int64

In [43]:
from std_detector import StdDetector

model = StdDetector(numeric_cols, 3)
model.train(train)
anomalies = model.detect(train)

In [44]:
train.iloc[np.where(anomalies == True)[0]]

Unnamed: 0,cell_name,timestamp,ho_failure_rate,num_voice_attempts,voice_drop_rate,num_data_attempts,voice_setup_failure_rate,voice_tot_failure_rate,avail_period_duration,bandwidth,throughput_rate,data_setup_failure_rate,data_drop_rate,data_tot_failure_rate,unavail_total_rate,unavail_unplan_rate
850004,25_11W,2019-11-04 00:00:00+00:00,,,,,,,1.0,1.00000,,,,,0.666728,0.0
116626,28_11Y,2019-11-04 00:00:00+00:00,,0.000000,,0.000000,,,1.0,0.00000,,,,,0.666728,
927893,25_31R,2019-11-04 00:00:00+00:00,,,,,,,1.0,0.49975,,,,,0.666728,0.0
125148,12_21Y,2019-11-04 00:00:00+00:00,,0.146432,0.000000,0.055375,0.0,0.000000,1.0,0.00000,,0.001195,0.000269,0.000418,0.333364,0.0
776160,01_22W,2019-11-04 00:00:00+00:00,,,,,,,1.0,1.00000,,,,,0.666728,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
561525,00_31Y,2020-08-20 10:00:00+00:00,,0.019462,0.000000,0.137810,0.0,0.000000,1.0,0.00000,,0.003554,0.001081,0.001525,0.333549,0.0
948279,00_21W,2020-08-20 10:00:00+00:00,0.335495,0.104727,0.000000,0.147844,0.0,0.000000,1.0,1.00000,0.000208,0.000358,0.000067,0.000112,0.333364,0.0
948279,00_21W,2020-08-20 10:00:00+00:00,0.335495,0.104727,0.000000,0.147844,0.0,0.000000,1.0,1.00000,0.000208,0.000358,0.000067,0.000112,0.333364,0.0
344399,21_21Z,2020-08-20 10:00:00+00:00,0.335021,0.101946,0.000000,0.078966,0.0,0.000000,1.0,0.49975,0.000154,0.000168,0.000042,0.000063,0.333364,0.0


In [42]:
def plot_anomaly(df: pd.DataFrame, anomaly_idx: pd.DataFrame, plottable_cols):
    """
    df: The full data frame
    anomaly_idx: a boolean data frame indicating where anomalies are located
    """
    groups = df.groupby('cell_name').groups
    for cell_name, cell_idx in groups.items():
        fig = go.Figure()
        fig.update_layout(title=cell_name)
        cell_df = df.loc[cell_idx]
        cell_anomalies = anomaly_idx.loc[cell_idx]
        for col in plottable_cols:
            fig.add_trace(go.Scatter(x=cell_df['timestamp'], y=cell_df[col], mode='lines', name=col))
            anom = cell_df.loc[cell_anomalies[col]][['timestamp', col]]
            fig.add_trace(go.Scatter(x=anom['timestamp'], y=anom[col], mode='markers', name=col + ' ANOMALY'))
            #print(anom)
            #break
        fig.show()
        break

plot_anomaly(train, anomalies, numeric_cols)