In [1]:
import pandas as pd
import numpy as np
import json, csv, datetime
from vadetisweb.utils.anomaly_detection_utils import next_later_dt

In [2]:
# ---- Input ----------------------------------------------------------------
# Root directory of the Yahoo labeled time-series dataset.
yahoo_path = '/home/adrian/Dokumente/real_data/ydata-labeled-time-series-anomalies-v1_0'

# Sub-directory (with leading and trailing slash) of each benchmark.
a1_path = '/A1Benchmark/'
a2_path = '/A2Benchmark/'
a3_path = '/A3Benchmark/'
a4_path = '/A4Benchmark/'

# File numbers: A1 uses 1..67, A2-A4 use 1..100.
a1_elements = list(range(1, 68))
a234_elements = list(range(1, 101))

# One {'name', 'file_path'} record per series; only the file-name pattern
# differs between the benchmarks.
a1files = [
    dict(name='TS{}'.format(x), file_path='{}{}real_{}.csv'.format(yahoo_path, a1_path, x))
    for x in a1_elements
]
a2files = [
    dict(name='TS{}'.format(x), file_path='{}{}synthetic_{}.csv'.format(yahoo_path, a2_path, x))
    for x in a234_elements
]
a3files = [
    dict(name='TS{}'.format(x), file_path='{}{}A3Benchmark-TS{}.csv'.format(yahoo_path, a3_path, x))
    for x in a234_elements
]
a4files = [
    dict(name='TS{}'.format(x), file_path='{}{}A4Benchmark-TS{}.csv'.format(yahoo_path, a4_path, x))
    for x in a234_elements
]

# ---- Output ---------------------------------------------------------------
output_path = '/home/adrian/Dokumente/real_data/yahoo_out2'

# Train/test targets are placed under the A1 benchmark output folder.
test_file_name = '{}{}test.csv'.format(output_path, a1_path)
train_file_name = '{}{}train.csv'.format(output_path, a1_path)

In [3]:
def load_data_frame(ts_name, file_path):
    """Read one series CSV and reshape it to the long export layout.

    The columns 'timestamp' and 'is_anomaly' (when present) are renamed to
    'time' and 'class'. A constant 'ts_name' column is placed first and a
    constant 'unit' column (always "Value") is placed at position 2.

    Args:
        ts_name: label written into every row of the 'ts_name' column.
        file_path: path of the CSV file to read.

    Returns:
        The reshaped DataFrame.
    """
    frame = pd.read_csv(file_path).rename(
        columns={'timestamp': 'time', 'is_anomaly' : 'class'}
    )
    frame.insert(0, 'ts_name', ts_name)
    # Position 2 is counted after 'ts_name' has been inserted at the front.
    frame.insert(2, 'unit', "Value")
    return frame

In [4]:
def load_data_frame_values_only(ts_name, file_path, drop_columns):
    """Read one series CSV and keep only what is left after dropping columns.

    Args:
        ts_name: new name for the 'value' column.
        file_path: path of the CSV file to read.
        drop_columns: column names to remove; pandas raises KeyError if one
            of them is not in the file.

    Returns:
        DataFrame with ``drop_columns`` removed and 'value' renamed to
        ``ts_name``.
    """
    return (
        pd.read_csv(file_path)
        .drop(columns=drop_columns)
        .rename(columns={'value': ts_name})
    )

In [5]:
def time_index_to_dt(df):
    """Overwrite the 'time' column of ``df`` with consecutive datetimes.

    Starting from 2020-04-01 00:00:00, one datetime per row is produced by
    repeatedly calling ``next_later_dt(previous, '1H')``. The first row gets
    the result of the first call, not the start value itself. The previous
    contents of 'time' are discarded.

    NOTE(review): ``next_later_dt`` is imported from vadetisweb; it presumably
    returns the datetime one '1H' step later - confirm against that helper.

    Args:
        df: DataFrame to update; it is modified in place.

    Returns:
        The same ``df`` object, with 'time' replaced.
    """
    current = datetime.datetime.strptime('2020-04-01 00:00:00', '%Y-%m-%d %H:%M:%S')

    stamps = []
    for _ in range(len(df)):
        current = next_later_dt(current, '1H')
        stamps.append(current)

    # Assign the whole column at once, by position. Writing cell by cell with
    # .loc[idx, 'time'] overwrote every row sharing an index label and forced
    # datetimes into the dtype of the existing column.
    df['time'] = stamps
    return df

In [6]:
def init_file(outputfile):
    """Create (or truncate) ``outputfile`` and write the CSV header row.

    The header is ``ts_name;time;unit;value;class``, semicolon-delimited.

    Args:
        outputfile: path of the file to create; existing content is lost.
    """
    header = ['ts_name', 'time', 'unit', 'value', 'class']
    # newline='' leaves line endings to the csv module; without it, platforms
    # that translate '\n' emit '\r\r\n' and readers see blank rows.
    with open(outputfile, 'w', newline='') as file_output:
        csv.writer(file_output, delimiter=';').writerow(header)
        
def append_to_file(df, outputfile):
    """Append the rows of ``df`` to ``outputfile`` as semicolon-delimited CSV.

    Only the first five columns are written, selected by position. The value
    in the second column must provide ``isoformat()`` (e.g. a datetime or a
    pandas Timestamp); it is written in ISO 8601 form. No header is written.

    Args:
        df: DataFrame with at least five columns.
        outputfile: path of the file to append to; created if missing.
    """
    # newline='' leaves line endings to the csv module (avoids '\r\r\n').
    with open(outputfile, 'a', newline='') as file_output:
        writer = csv.writer(file_output, delimiter=';')
        # Plain tuples give real positional access. Integer keys on the
        # label-indexed Series from iterrows() rely on a deprecated fallback.
        for record in df.itertuples(index=False, name=None):
            writer.writerow(
                [record[0], record[1].isoformat(), record[2], record[3], record[4]]
            )

In [7]:
# Series labels per benchmark set: 'TS1' .. 'TS<n>'.

# S1 set (67 series)
names_set_a1 = ['TS{}'.format(i) for i in range(1, 68)]

# S2 set (100 series)
names_set_a2 = ['TS{}'.format(i) for i in range(1, 101)]

# S3 set (100 series)
names_set_a3 = ['TS{}'.format(i) for i in range(1, 101)]

# S4 set (100 series)
names_set_a4 = ['TS{}'.format(i) for i in range(1, 101)]

In [8]:
def generate_correlation(files, path, drop_columns, max_rows=1001):
    """Compute the pairwise correlation matrix of several series and save it.

    Every entry of ``files`` is loaded with ``load_data_frame_values_only``;
    the resulting frames are placed side by side, cut to the first
    ``max_rows`` rows, correlated with ``DataFrame.corr()`` (default
    settings) and rounded to two decimals. The matrix is written to
    ``path + 'corr.csv'`` with index and header.

    Args:
        files: iterable of dicts with the keys 'name' and 'file_path'.
        path: string prefix of the output file; 'corr.csv' is appended as-is,
            so a directory prefix must end with a separator.
        drop_columns: column names handed to the loader to be dropped.
        max_rows: number of leading rows used for the correlation. The
            default 1001 reproduces the original ``[0:1001]`` slice
            (NOTE(review): confirm whether 1000 was intended). ``None`` uses
            all rows.

    Returns:
        The rounded correlation DataFrame.

    Raises:
        ValueError: if ``files`` is empty.
    """
    frames = [
        load_data_frame_values_only(entry['name'], entry['file_path'], drop_columns)
        for entry in files
    ]
    if not frames:
        raise ValueError('generate_correlation requires at least one input file')

    # Concatenate once; concatenating inside the loop re-copies the growing
    # frame for every additional file.
    df_concat = pd.concat(frames, axis=1).iloc[:max_rows]

    df_corr = df_concat.corr().round(2)
    df_corr.to_csv(path + 'corr.csv', index=True, header=True)

    return df_corr

In [9]:
# Columns removed before correlating, so that only 'value' remains.
# A1/A2 files and A3/A4 files use different column layouts.
_a12_drop_columns = ['timestamp', 'is_anomaly']
_a34_drop_columns = [
    'timestamps', 'anomaly', 'changepoint', 'trend', 'noise',
    'seasonality1', 'seasonality2', 'seasonality3',
]

# One correlation matrix per benchmark, each written to its own output folder.
df_corr_a1 = generate_correlation(a1files, output_path + a1_path, _a12_drop_columns)
df_corr_a2 = generate_correlation(a2files, output_path + a2_path, _a12_drop_columns)
df_corr_a3 = generate_correlation(a3files, output_path + a3_path, _a34_drop_columns)
df_corr_a4 = generate_correlation(a4files, output_path + a4_path, _a34_drop_columns)