# Find anomalies in a single time series using `matrixprofile`

More on matrixprofile: https://towardsdatascience.com/introduction-to-matrix-profiles-5568f3375d90

In [1]:
%matplotlib widget
%reload_ext autoreload
%autoreload 2

import os
import re
import json
from typing import *
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import matrixprofile as mp
from matrixprofile import *
from tqdm import tqdm

In [2]:
# Remember the directory the notebook was started from so later cells can
# return to it. The lookup in globals() keeps an already-recorded value, so
# re-running this cell after the working directory has been changed does not
# overwrite it with the data directory.
NOTEBOOK_DIR = globals().get('NOTEBOOK_DIR', os.getcwd())

In [3]:
# Work from the KDD-Cup data directory. The path is anchored to NOTEBOOK_DIR
# instead of the current working directory, so re-running this cell does not
# try to resolve '../data-sets/...' relative to the data directory itself.
os.chdir(os.path.join(NOTEBOOK_DIR, '../data-sets/KDD-Cup/data/'))

In [4]:
def cal_matrix_profile(filename: str, type: str = 'discords', window_size: int = 100) -> dict:
    """Compute the matrix profile of one series file and run motif/discord discovery.

    The file is read as a single-column CSV. If that column cannot be
    converted to float, the first row is treated as one line of
    whitespace-separated numbers and split into individual values.

    Parameters
    ----------
    filename : str
        Path of the text file holding the series.
    type : str
        ``'motifs'`` runs ``mp.discover.motifs``; any other value (the default
        is ``'discords'``) runs ``mp.discover.discords``.
    window_size : int
        Subsequence length passed to ``mp.compute``. Defaults to 100, the
        value that used to be hard-coded here.

    Returns
    -------
    dict
        The profile returned by the selected ``mp.discover`` function.
    """
    df = pd.read_csv(filename, names=['values'])
    try:
        values = df['values'].astype(float).to_numpy()
    except ValueError:
        # The whole series sits on the first line; split it on any run of
        # whitespace (str.split() with no argument drops empty tokens).
        values = np.asarray(df.loc[0, 'values'].split(), dtype=float)

    profile = mp.compute(values, window_size)
    if type == 'motifs':
        # NOTE(review): k is set to the window size, as in the original code;
        # confirm that this is the intended number of motifs to search for.
        return mp.discover.motifs(profile, k=window_size)
    return mp.discover.discords(profile)

In [12]:
def cal_matrix_profile_list(
    filenames: List[str],
    merge_tolerance: int = 10,
) -> Dict[str, Union[Dict[str, Union[List[int], Optional[int]]], Exception]]:
    """Run discord discovery on every file and collect the results per file.

    The integer embedded in each file name (``NNN_UCR_Anomaly_<pos>.txt``) is
    used as a lower bound: discords located before ``<pos>`` are dropped from
    ``filtered_discords``.

    Parameters
    ----------
    filenames : list of str
        Files to process; each is passed unchanged to ``cal_matrix_profile``.
    merge_tolerance : int
        The filtered discords are merged into a single position (their
        truncated average) only when the span between the first and the last
        one is smaller than this value. Defaults to 10.

    Returns
    -------
    dict
        Maps each file name to either
        ``{'discords': [...], 'filtered_discords': [...], 'merged_discord': int | None}``
        or, if anything went wrong for that file, the exception that was raised.
    """
    # Compiled once instead of once per file.
    name_pattern = re.compile(r'^\d{3}_UCR_Anomaly_(?P<pos>\d+)\.txt$')

    discords_dict = dict()
    for filename in tqdm(filenames):
        try:
            # Parse the name first: a malformed name should fail before the
            # slow matrix-profile computation rather than after it.
            match = name_pattern.search(os.path.basename(filename))
            if match is None:
                raise ValueError(f'Unexpected file name: {filename!r}')
            threshold = int(match.group('pos'))

            profile = cal_matrix_profile(filename)
            discords = sorted(profile['discords'].tolist())
            filtered_discords = [i for i in discords if i >= threshold]

            if filtered_discords and filtered_discords[-1] - filtered_discords[0] < merge_tolerance:
                merged_discord = int(np.average(filtered_discords))
            else:
                merged_discord = None

            discords_dict[filename] = {
                'discords': discords,
                'filtered_discords': filtered_discords,
                'merged_discord': merged_discord,
            }
        except Exception as e:
            # Best effort: keep going, but record and report what failed.
            print(f'Error at {filename}: {e!r}')
            discords_dict[filename] = e
    return discords_dict

# Process every '.txt' series in the current working directory, in name order.
# endswith() is used so that only real .txt files are picked up, not any name
# that merely contains the letters 'txt'.
# This is the slow step: the recorded run took about two hours for 250 files.
filenames = sorted(name for name in os.listdir() if name.endswith('.txt'))
discords = cal_matrix_profile_list(filenames)

100%|██████████| 250/250 [1:58:58<00:00, 28.55s/it]


In [13]:
# Export only the files that were processed without error. Failed files are
# stored as Exception objects, which json.dump cannot serialise, so they are
# filtered out here.
# Note that some items contain multiple anomaly points.

os.chdir(NOTEBOOK_DIR)

exportable = {
    name: result
    for name, result in discords.items()
    if not isinstance(result, Exception)
}
with open("matrixprofile.json", "w") as outfile:
    json.dump(exportable, outfile)


In [11]:
# Number of entries in the results dict: one per file name that was passed in,
# whether it produced a result or was stored as an exception.
len(discords)

241

In [75]:
# Inspect the discord positions found for one file. Each successful entry of
# `discords` is a dict whose 'discords' item is a list of ints, so it is
# indexed by key (the entry itself has no .tolist()).
d = sorted(discords['187_UCR_Anomaly_30000.txt']['discords'])

In [76]:
# Merge the discords into one position when they all lie within 10 samples of
# each other; otherwise (or when there are none) there is no merged position.
if d and d[-1] - d[0] < 10:
    merged_discord = int(sum(d) / len(d))
else:
    merged_discord = None

In [78]:
# Sanity check: np.average returns a float even for integer input.
np.average([1,2,3])

2.0

# debug

In [54]:
# Debug: load a single series by hand, with the same one-value-per-line /
# single-line fallback that cal_matrix_profile uses.
# The path is anchored to NOTEBOOK_DIR because the working directory has been
# switched back to the notebook directory by the time this cell runs.
debug_path = os.path.join(NOTEBOOK_DIR, '../data-sets/KDD-Cup/data/', '206_UCR_Anomaly_25130.txt')

df = pd.read_csv(debug_path, names=['value'])
try:
    df['value'] = df['value'].astype(float)
except ValueError:
    # Whole series on the first line: split it on whitespace.
    df = pd.DataFrame(df.loc[0, 'value'].split(), columns=['value'])
    df['value'] = df['value'].astype(float)

df

Unnamed: 0,value
0,7716.0
1,7573.0
2,7409.0
3,7298.0
4,7139.0
...,...
41995,8287.0
41996,8202.0
41997,8075.0
41998,7880.0
