# The simplest baseline

More than two standard deviations away from the mean value => anomaly

In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go

from preprocessing import load_data

In [2]:
train, val, test = load_data()

Load hackathon_kpis_anonymised.csv
(729881, 16) (156403, 16) (156404, 16)


In [3]:
tiny = train[:10]

In [4]:
numeric_cols = list(train.columns)
numeric_cols.remove('cell_name')
numeric_cols.remove('timestamp')
numeric_cols

['ho_failure_rate',
 'num_voice_attempts',
 'voice_drop_rate',
 'num_data_attempts',
 'voice_setup_failure_rate',
 'voice_tot_failure_rate',
 'avail_period_duration',
 'bandwidth',
 'throughput_rate',
 'data_setup_failure_rate',
 'data_drop_rate',
 'data_tot_failure_rate',
 'unavail_total_rate',
 'unavail_unplan_rate']

In [5]:
means = train[numeric_cols].mean()
means

ho_failure_rate             0.340535
num_voice_attempts          0.011930
voice_drop_rate             0.000715
num_data_attempts           0.020059
voice_setup_failure_rate    0.000956
voice_tot_failure_rate      0.001033
avail_period_duration       0.998692
bandwidth                   0.343226
throughput_rate             0.000182
data_setup_failure_rate     0.002620
data_drop_rate              0.000326
data_tot_failure_rate       0.000653
unavail_total_rate          0.358941
unavail_unplan_rate         0.000018
dtype: float64

In [6]:
stds = train[numeric_cols].std()
stds

ho_failure_rate             0.045578
num_voice_attempts          0.027265
voice_drop_rate             0.010110
num_data_attempts           0.036745
voice_setup_failure_rate    0.018527
voice_tot_failure_rate      0.011868
avail_period_duration       0.026645
bandwidth                   0.345007
throughput_rate             0.002212
data_setup_failure_rate     0.018017
data_drop_rate              0.001877
data_tot_failure_rate       0.003174
unavail_total_rate          0.087674
unavail_unplan_rate         0.001257
dtype: float64

In [7]:
tolerance = 2

In [8]:
threshold_upper = means + tolerance*stds
threshold_lower = means - tolerance*stds
threshold_upper, threshold_lower

(ho_failure_rate             0.431690
 num_voice_attempts          0.066459
 voice_drop_rate             0.020935
 num_data_attempts           0.093550
 voice_setup_failure_rate    0.038010
 voice_tot_failure_rate      0.024769
 avail_period_duration       1.051983
 bandwidth                   1.033239
 throughput_rate             0.004607
 data_setup_failure_rate     0.038653
 data_drop_rate              0.004080
 data_tot_failure_rate       0.007001
 unavail_total_rate          0.534288
 unavail_unplan_rate         0.002532
 dtype: float64,
 ho_failure_rate             0.249379
 num_voice_attempts         -0.042600
 voice_drop_rate            -0.019506
 num_data_attempts          -0.053431
 voice_setup_failure_rate   -0.036098
 voice_tot_failure_rate     -0.022702
 avail_period_duration       0.945402
 bandwidth                  -0.346788
 throughput_rate            -0.004243
 data_setup_failure_rate    -0.033413
 data_drop_rate             -0.003428
 data_tot_failure_rate      -0.00

In [9]:
anomalies = {}

for col_name in numeric_cols:
    anomalies[col_name] = (train[col_name] >= threshold_upper[col_name]) | (train[col_name] <= threshold_lower[col_name])

anomalies = pd.DataFrame(anomalies)

anomalies.describe()

Unnamed: 0,ho_failure_rate,num_voice_attempts,voice_drop_rate,num_data_attempts,voice_setup_failure_rate,voice_tot_failure_rate,avail_period_duration,bandwidth,throughput_rate,data_setup_failure_rate,data_drop_rate,data_tot_failure_rate,unavail_total_rate,unavail_unplan_rate
count,729881,729881,729881,729881,729881,729881,729881,729881,729881,729881,729881,729881,729881,729881
unique,2,2,2,2,2,2,2,1,2,2,2,2,2,2
top,False,False,False,False,False,False,False,False,False,False,False,False,False,False
freq,725177,705663,726590,707030,727644,725582,727949,729881,729406,724604,724180,723441,675985,728042


In [10]:
anomalies.sum()

ho_failure_rate              4704
num_voice_attempts          24218
voice_drop_rate              3291
num_data_attempts           22851
voice_setup_failure_rate     2237
voice_tot_failure_rate       4299
avail_period_duration        1932
bandwidth                       0
throughput_rate               475
data_setup_failure_rate      5277
data_drop_rate               5701
data_tot_failure_rate        6440
unavail_total_rate          53896
unavail_unplan_rate          1839
dtype: int64

In [11]:
train.iloc[np.where(anomalies == True)[0]]

Unnamed: 0,cell_name,timestamp,ho_failure_rate,num_voice_attempts,voice_drop_rate,num_data_attempts,voice_setup_failure_rate,voice_tot_failure_rate,avail_period_duration,bandwidth,throughput_rate,data_setup_failure_rate,data_drop_rate,data_tot_failure_rate,unavail_total_rate,unavail_unplan_rate
850004,25_11W,2019-11-04 00:00:00+00:00,,,,,,,1.0,1.00000,,,,,0.666728,0.0
116626,28_11Y,2019-11-04 00:00:00+00:00,,0.000000,,0.000000,,,1.0,0.00000,,,,,0.666728,
927893,25_31R,2019-11-04 00:00:00+00:00,,,,,,,1.0,0.49975,,,,,0.666728,0.0
125148,12_21Y,2019-11-04 00:00:00+00:00,,0.146432,0.000000,0.055375,0.000000,0.000000,1.0,0.00000,,0.001195,0.000269,0.000418,0.333364,0.0
776160,01_22W,2019-11-04 00:00:00+00:00,,,,,,,1.0,1.00000,,,,,0.666728,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
361355,17_21Z,2020-08-20 10:00:00+00:00,0.333333,0.024096,0.000000,0.041038,0.038462,0.012821,1.0,0.49975,0.000241,0.000000,0.000000,0.000000,0.333364,0.0
909716,00_11Y,2020-08-20 10:00:00+00:00,,0.057461,0.005376,0.506507,0.000000,0.005376,1.0,0.00000,,0.001437,0.001300,0.001480,0.333549,0.0
474918,13_21Q,2020-08-20 10:00:00+00:00,0.333333,0.081557,0.000000,0.009300,0.000000,0.000000,1.0,0.09955,0.000135,0.002847,0.000178,0.000534,0.333364,0.0
908225,25_11W,2020-08-20 10:00:00+00:00,0.333333,0.088044,0.000000,0.068574,0.000000,0.000000,1.0,1.00000,0.000157,0.000193,0.000097,0.000121,0.333364,0.0


In [20]:
def plot_anomaly(df: pd.DataFrame, anomaly_idx: pd.DataFrame, plottable_cols):
    """
    df: The full data frame
    anomaly_idx: a boolean data frame indicating where anomalies are located
    """
    groups = df.groupby('cell_name').groups
    for cell_name, cell_idx in groups.items():
        fig = go.Figure()
        fig.update_layout(title=cell_name)
        cell_df = df.loc[cell_idx]
        cell_anomalies = anomaly_idx.loc[cell_idx]
        for col in plottable_cols:
            fig.add_trace(go.Scatter(x=cell_df['timestamp'], y=cell_df[col], mode='lines', name=col))
            anom = cell_df.loc[cell_anomalies[col]][['timestamp', col]]
            fig.add_trace(go.Scatter(anom, x='timestamp', y='col'))
            print(anom)
        fig.show()
        break
    
plot_anomaly(train, anomalies, numeric_cols)

ValueError: The first argument to the plotly.graph_objs.Scatter 
constructor must be a dict or 
an instance of :class:`plotly.graph_objs.Scatter`