In [1]:
from random import normalvariate, randint
from itertools import count
from datetime import datetime
from itertools import groupby

from scipy.stats import normaltest
from itertools import filterfalse
from itertools import islice

In [5]:
def read_data(filename):
    """Lazily read ``timestamp,value`` rows from a comma-separated text file.

    Each non-blank line must hold exactly two comma-separated integers: a
    POSIX timestamp and a value. Blank or whitespace-only lines (for example
    a trailing empty line at the end of the file) are skipped instead of
    aborting the whole read.

    Args:
        filename: Path of the file to read.

    Yields:
        ``(datetime, int)`` tuples, one per data line, in file order. The
        datetime is produced by ``datetime.fromtimestamp`` and is therefore
        a naive local-time value.

    Raises:
        ValueError: If a non-blank line does not consist of exactly two
            integer fields.
    """
    with open(filename) as fd:
        for line in fd:
            stripped = line.strip()
            if not stripped:
                # Nothing to parse; int('') would otherwise raise ValueError.
                continue
            timestamp, value = map(int, stripped.split(','))
            yield datetime.fromtimestamp(timestamp), value



def read_fake_data(filename):
    """Generate an endless stream of synthetic ``(datetime, value)`` rows.

    One row is produced for each integer timestamp 0, 1, 2, ... Most values
    are drawn from a standard normal distribution; with probability
    1 / (number of seconds in a week) per row the value is the constant 100,
    i.e. an anomalous data point roughly once per simulated week.

    Args:
        filename: Accepted for signature parity with ``read_data``; it is
            not used.

    Yields:
        ``(datetime, value)`` tuples where the datetime comes from
        ``datetime.fromtimestamp(timestamp)``.
    """
    seconds_per_week = 7 * 60 * 60 * 24
    for timestamp in count():
        # Exactly one outcome of the draw (the value 1) marks an anomaly.
        is_anomaly = randint(0, seconds_per_week - 1) == 1
        value = 100 if is_anomaly else normalvariate(0, 1)
        yield datetime.fromtimestamp(timestamp), value

def groupby_day(iterable):
    """Split a stream of rows into lists of consecutive same-day rows.

    Rows are grouped with ``itertools.groupby``, so only *consecutive* rows
    sharing a calendar date end up together; the input is expected to be
    ordered by time. The grouping key is the full (year, month, day) of
    ``row[0]`` rather than the day-of-month alone, so adjacent rows that
    fall on the same day number of different months or years are kept in
    separate groups.

    Args:
        iterable: Iterable of rows whose first element exposes ``year``,
            ``month`` and ``day`` attributes (e.g. a ``datetime``).

    Yields:
        A list of rows for each run of consecutive rows on the same date.
    """
    def calendar_day(row):
        stamp = row[0]
        return stamp.year, stamp.month, stamp.day

    for _, day_rows in groupby(iterable, calendar_day):
        # Materialise the group: groupby invalidates it once we advance.
        yield list(day_rows)

def groupby_window(data, window_size=3600):
    """Yield every full sliding window of ``window_size`` consecutive items.

    Windows advance one item at a time. The last window (the one ending on
    the final item of ``data``) is included, and ``data`` may be any
    iterable: it is converted to a single iterator up front so that a
    re-iterable input such as a list is not restarted after the first
    window has been filled.

    Args:
        data: Iterable of items.
        window_size: Number of items per window; must be at least 1.

    Yields:
        Tuples of exactly ``window_size`` consecutive items. Nothing is
        yielded when ``data`` holds fewer than ``window_size`` items.

    Raises:
        ValueError: If ``window_size`` is smaller than 1 (raised when the
            generator is first advanced).
    """
    if window_size < 1:
        raise ValueError("window_size must be at least 1")

    # One shared iterator: islice and the loop below must consume the same
    # stream, otherwise a list input would be replayed from its start.
    stream = iter(data)
    window = tuple(islice(stream, window_size))
    if len(window) < window_size:
        return

    yield window
    for item in stream:
        # Slide by one; yielding after the shift emits the final window too.
        window = window[1:] + (item,)
        yield window

def is_normal(data, threshold=1e-3):
    """Report whether the values in ``data`` pass a normality test.

    The second element of every row is collected and passed to
    ``scipy.stats.normaltest``; the group is rejected when the resulting
    p-value falls below ``threshold``.

    Args:
        data: Non-empty iterable of two-element rows; only the second
            element of each row is tested.
        threshold: p-value below which the group is considered not normal.

    Returns:
        ``False`` if the p-value is strictly below ``threshold``,
        otherwise ``True``.
    """
    # Transpose rows into columns; the first column is not needed.
    _first_column, samples = zip(*data)
    _statistic, p_value = normaltest(samples)
    # Written as a negated "<" so that a NaN p-value still yields True.
    return not (p_value < threshold)

def filter_anomalous_groups(data):
    """Keep only the groups that do not pass the ``is_normal`` check.

    Args:
        data: Iterable of groups, each in the form ``is_normal`` accepts.

    Yields:
        Each group for which ``is_normal`` returns a falsy result, in the
        order the groups arrive.
    """
    for group in data:
        if not is_normal(group):
            yield group


def filter_anomalous_data(data):
    """Group ``data`` with ``groupby_day`` and yield the anomalous groups.

    Args:
        data: Iterable of rows as accepted by ``groupby_day``.

    Yields:
        The groups that ``filter_anomalous_groups`` lets through, i.e. the
        ones that do not pass the normality check.
    """
    # Alternative grouping left by the author: groupby_window(data) can be
    # used in place of groupby_day(data) to test sliding windows instead.
    yield from filter_anomalous_groups(groupby_day(data))

In [6]:
# Build the lazy pipeline; every stage is a generator/iterator, so no rows
# are produced until something iterates over `first_10_anomalies`.
# read_fake_data does not use its filename argument, so 'abc' is a placeholder.
data = read_fake_data(filename='abc')

# Stream of the groups that fail the normality test.
anomaly_generator = filter_anomalous_data(data)

# The fake data source never ends, so cap the stream at ten anomalous groups.
# NOTE: islice is single-use; re-run this cell before re-consuming it.
first_10_anomalies = islice(anomaly_generator, 10)

In [8]:
# Report the time span covered by each anomalous group. Iterating here is
# what actually drives the lazy pipeline built in the previous cell.
for data_anomaly in first_10_anomalies:
    # The first element of the first and last rows bounds the group in time.
    first_row, last_row = data_anomaly[0], data_anomaly[-1]
    start_date = first_row[0]
    end_date = last_row[0]
    print(f"Anomaly from {start_date} - {end_date}")