In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell runs and edits to local .py files take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

# The simplest baseline

More than three standard deviations away from the column's mean value => anomaly

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from preprocessing import load_data

In [None]:
# Load the three data splits through the project's preprocessing helper.
# NOTE(review): the cells below use `train` as a pandas DataFrame with
# 'cell_name' and 'timestamp' columns; val/test presumably share that
# layout — confirm in preprocessing.load_data.
train, val, test = load_data()

In [None]:
# Small slice of the training split (first ten rows) for quick experiments.
# NOTE(review): `tiny` is not referenced by any other visible cell — confirm
# it is still needed.
tiny = train[:10]

In [None]:
# Start from every column of the training frame and strip the identifier and
# time columns; whatever remains is what the later cells feed into
# mean()/std(), so it is assumed to be numeric.
numeric_cols = train.columns.tolist()
for non_feature in ('cell_name', 'timestamp'):
    # list.remove raises ValueError if the column is missing, which surfaces
    # an unexpected schema immediately.
    numeric_cols.remove(non_feature)
numeric_cols

In [None]:
# Per-column statistics estimated on the training split.
means = train[numeric_cols].mean()
stds = train[numeric_cols].std()

# How many standard deviations a value may lie from its column mean before it
# is flagged.
tolerance = 3

threshold_upper = means + tolerance*stds
threshold_lower = means - tolerance*stds

# Vectorised band check: comparing the frame against a threshold Series with
# axis='columns' aligns each threshold with the column of the same name, so no
# per-column loop or intermediate dict is needed.
#
# The comparisons are strict ("more than `tolerance` std-devs away"). With
# inclusive comparisons a zero-variance column (std == 0, hence both
# thresholds equal to the mean) would have every single row flagged.
values = train[numeric_cols]
anomalies = (
    values.gt(threshold_upper, axis='columns')
    | values.lt(threshold_lower, axis='columns')
)

anomalies.describe()

In [None]:
# Number of flagged values per column (each True counts as 1 in the sum).
anomalies.sum()

In [None]:
# Run the detection through the project's StdDetector class.
# NOTE(review): the second constructor argument (3) presumably is the
# standard-deviation tolerance, matching `tolerance` in the manual cell —
# confirm in std_detector.
from std_detector import StdDetector

model = StdDetector(numeric_cols, 3)
model.train(train)
# Rebinds `anomalies`: from here on the name refers to the detector's output,
# not to the frame computed by hand in the earlier cell.
anomalies = model.detect(train)

In [None]:
# Positional indices of the rows holding at least one flagged value.
# np.where returns one (row, column) hit per flagged cell, so a row that is
# anomalous in several columns would otherwise be listed once per column;
# np.unique collapses those repeats (and is a no-op for 1-D detector output).
anomalous_rows = np.unique(np.where(anomalies == True)[0])
train.iloc[anomalous_rows]

In [None]:
def plot_anomalies(df: pd.DataFrame, anomaly_idx: pd.DataFrame, plottable_cols, title, show=True, y_range=(0, 1)):
    """
    Plot each requested column over time and mark its anomalous points in red.

    One subplot row is created per entry of `plottable_cols`; every row holds
    the full series as a line plus a marker trace for the flagged points.

    df: The data frame; needs a 'timestamp' column and every column named in
        `plottable_cols`
    anomaly_idx: a boolean data frame indicating where anomalies are located;
        it is used as a row mask on `df`, one boolean column per plotted column
    plottable_cols: names of the columns to plot (also used as subplot titles)
    title: figure title
    show: when true, display the figure before returning it
    y_range: (low, high) applied to the y axis of every subplot; the default
        (0, 1) keeps the previous fixed range, None leaves plotly's autorange

    Returns the plotly figure.
    """
    fig = make_subplots(
        rows=len(plottable_cols),
        cols=1,
        subplot_titles=plottable_cols,
    )
    # Subplot rows are 1-based.
    for row, col in enumerate(plottable_cols, start=1):
        # Full series as a line.
        fig.add_trace(go.Scatter(
            x=df['timestamp'],
            y=df[col],
            mode='lines',
            showlegend=False,
            line={'color': '#0099C6'},
        ), row=row, col=1)

        # Flagged points only, drawn on top of the line. The colour belongs in
        # `marker` because this trace has no line to style.
        flagged = df.loc[anomaly_idx[col], ['timestamp', col]]
        fig.add_trace(go.Scatter(
            x=flagged['timestamp'],
            y=flagged[col],
            mode='markers',
            showlegend=False,
            marker={'color': 'red'},
        ), row=row, col=1)

    fig.update_layout(
        title=title,
        height=200 * len(plottable_cols),
    )
    # update_xaxes / update_yaxes apply to the axes of every subplot, so all
    # rows share the same time window and value range without building
    # 'yaxisN' layout keys by hand.
    fig.update_xaxes(range=(df['timestamp'].min(), df['timestamp'].max()))
    if y_range is not None:
        fig.update_yaxes(range=y_range)
    if show:
        fig.show()
    return fig

def plot_anomalies_by_cell(df: pd.DataFrame, anomaly_idx: pd.DataFrame, plottable_cols):
    """
    Draw one anomaly figure per distinct value of the 'cell_name' column.

    df: The full data frame (must contain a 'cell_name' column)
    anomaly_idx: a boolean data frame indicating where anomalies are located;
        it is sliced with the same index labels as df
    plottable_cols: column names handed through to plot_anomalies
    """
    # Mapping of cell name -> index labels of the rows belonging to that cell.
    labels_by_cell = df.groupby('cell_name').groups
    for cell, row_labels in labels_by_cell.items():
        # Slice the data and the anomaly mask with the same labels so they
        # stay aligned; the cell name doubles as the figure title.
        plot_anomalies(
            df.loc[row_labels],
            anomaly_idx.loc[row_labels],
            plottable_cols,
            cell,
        )

#plot_anomalies_by_cell(train, anomalies, numeric_cols)

In [None]:
# NOTE(review): this import rebinds `plot_anomalies`, replacing the function
# defined in the notebook cell above. `plot_anomalies_by_cell` is still the
# notebook-local definition and resolves `plot_anomalies` at call time, so it
# will call the version from the `plot` module. Confirm that this mix is
# intended and that plot.plot_anomalies accepts the same arguments.
from plot import plot_anomalies

# Render one figure per cell, using the detector output stored in `anomalies`.
plot_anomalies_by_cell(train, anomalies, numeric_cols)