In [1]:
import pandas as pd
import numpy as np
import json, csv, datetime

In [2]:
# input
# NOTE(review): absolute, machine-specific locations -- adjust before running elsewhere.
yahoo_path = '/home/adrian/Dokumente/real_data/ydata-labeled-time-series-anomalies-v1_0'

# Sub-directory of each of the four benchmarks, relative to a root directory.
a1_path, a2_path, a3_path, a4_path = ['/A' + str(n) + 'Benchmark/' for n in range(1, 5)]

# File numbers used per benchmark: 1..67 for A1, 1..100 for A2, A3 and A4.
a1_elements = list(range(1, 68))
a234_elements = list(range(1, 101))


def _benchmark_files(sub_path, file_prefix, numbers):
    """Return one {'name', 'file_path'} record per file number of a benchmark."""
    return [
        {
            'name': 'TS' + str(number),
            'file_path': yahoo_path + sub_path + file_prefix + str(number) + '.csv',
        }
        for number in numbers
    ]


a1files = _benchmark_files(a1_path, 'real_', a1_elements)
a2files = _benchmark_files(a2_path, 'synthetic_', a234_elements)
a3files = _benchmark_files(a3_path, 'A3Benchmark-TS', a234_elements)
a4files = _benchmark_files(a4_path, 'A4Benchmark-TS', a234_elements)

# output
output_path = '/home/adrian/Dokumente/real_data/yahoo_out_eval'

# Default output files inside the A2 output directory.
test_file_name = output_path + a2_path + 'test.csv'
train_file_name = output_path + a2_path + 'train.csv'

In [3]:
def load_data_frame(ts_name, file_path):
    """Read one CSV file and arrange it in the column layout used by this notebook.

    Parameters
    ----------
    ts_name : value stored in every row of the new leading ``ts_name`` column.
    file_path : location of the CSV file, passed straight to ``pd.read_csv``.

    Returns
    -------
    DataFrame with ``ts_name`` inserted as the first column, a constant
    ``unit`` column ("Value") inserted at position 2, and the columns
    ``timestamp`` / ``is_anomaly`` renamed to ``time`` / ``class``.
    Columns with other names keep their names.
    """
    column_aliases = {'timestamp': 'time', 'is_anomaly' : 'class'}

    raw = pd.read_csv(file_path)
    # Both insertions are positional; the second position already counts the
    # freshly added ``ts_name`` column.
    raw.insert(loc=0, column='ts_name', value=ts_name)
    raw.insert(loc=2, column='unit', value="Value")
    return raw.rename(columns=column_aliases)

In [4]:
def time_index_to_dt(df):
    """Convert the ``time`` column from epoch seconds to datetimes.

    Each value of ``df['time']`` is passed to ``datetime.datetime.fromtimestamp``,
    i.e. it is interpreted as seconds since the epoch and turned into a naive
    datetime in the local time zone of the machine running the notebook.

    The whole column is replaced in a single assignment instead of writing
    datetimes cell by cell into an integer column, which was slow and relied
    on pandas silently upcasting the column to ``object``.

    Parameters
    ----------
    df : DataFrame with a ``time`` column of numeric epoch-second values.

    Returns
    -------
    The same DataFrame object (modified in place, as before), whose ``time``
    column now has a datetime64 dtype.
    """
    # fromtimestamp (not pd.to_datetime(unit='s')) on purpose: it keeps the
    # local-time interpretation the original row-wise loop produced.
    converted = [datetime.datetime.fromtimestamp(seconds) for seconds in df['time']]
    # Building the Series on df.index keeps the assignment aligned for any index.
    df['time'] = pd.to_datetime(pd.Series(converted, index=df.index))
    return df
        

In [5]:
def init_file(outputfile):
    """Create (or truncate) ``outputfile`` and write the CSV header row.

    The header is ``ts_name;time;unit;value;class`` (semicolon-delimited).

    Parameters
    ----------
    outputfile : path of the file to (re)initialise; its directory must exist.
    """
    header = ['ts_name', 'time', 'unit', 'value', 'class']
    # newline='' lets the csv writer control line endings itself; without it
    # Windows text mode turns the writer's '\r\n' into '\r\r\n'.
    with open(outputfile, 'w', newline='') as file_output:
        csv.writer(file_output, delimiter=';').writerow(header)
        
def append_to_file(df, outputfile):
    """Append the first five columns of ``df`` to ``outputfile`` as ';'-separated rows.

    The columns are taken by position and written in that order; the second
    one must hold objects with an ``isoformat()`` method (datetimes), which is
    what gets written for it.

    Columns are read positionally via ``iloc``/``itertuples`` rather than with
    ``row[0]``-style integer keys on a label-indexed Series, which pandas has
    deprecated in favour of treating such keys as labels.

    Parameters
    ----------
    df : DataFrame with at least five columns (ts_name, time, unit, value, class
        in the layout used by this notebook).
    outputfile : path of the CSV file to append to.
    """
    first_five = df.iloc[:, :5]
    # newline='' lets the csv writer control line endings itself (see csv docs).
    with open(outputfile, 'a', newline='') as file_output:
        writer = csv.writer(file_output, delimiter=';')
        writer.writerows(
            [ts_name, time.isoformat(), unit, value, label]
            for ts_name, time, unit, value, label
            in first_five.itertuples(index=False, name=None)
        )

In [18]:
# S1 set - Corr > 0.95
# Alternative selections that were tried before (kept for reference):
#   [100, 28, 37, 42, 44, 49, 51, 56, 58, 63, 67, 70, 72, 76, 79, 84, 86, 88, 91, 93, 98]
#   [24, 10, 17, 31, 38, 45, 66, 73, 80, 87]
_s1_series_numbers = (24, 66, 3, 10, 45, 17, 87, 73, 31, 38)

# Series names in the same 'TS<number>' form used by the file lists.
names_set_a2_s1 = ['TS%d' % number for number in _s1_series_numbers]
file_suffix_s1 = "a2"

In [19]:
def create_set(path, files, file_set_names, file_suffix, split_size=710):
    """Write the selected time series into one test CSV and one train CSV.

    Two files, ``test_<file_suffix>.csv`` and ``train_<file_suffix>.csv``, are
    (re)created under ``output_path + path``. Every entry of ``files`` whose
    name is in ``file_set_names`` is loaded, its ``time`` column converted to
    datetimes, and then split by row position: rows ``[0, split_size)`` are
    appended to the test file and rows ``[split_size, 2 * split_size)`` to the
    train file. Rows beyond ``2 * split_size`` are not written. A short
    summary per series is printed.

    NOTE(review): the first window goes to the *test* file and the second to
    the *train* file -- confirm this ordering is intended.

    Parameters
    ----------
    path : sub-directory (with leading and trailing slash) appended to the
        module-level ``output_path``.
    files : iterable of dicts with the keys ``'name'`` and ``'file_path'``.
    file_set_names : container of series names to include.
    file_suffix : suffix used in the two output file names.
    split_size : number of rows per window; the default 710 reproduces the
        previously hard-coded 0:710 / 710:1420 split.
    """
    test_file_name = output_path + path + 'test_' + file_suffix + '.csv'
    train_file_name = output_path + path + 'train_' + file_suffix + '.csv'
    init_file(test_file_name)
    init_file(train_file_name)

    def count_anomalies(frame):
        # Rows labelled 1 in the 'class' column.
        return frame.loc[frame['class'] == 1].shape[0]

    for entry in files:
        ts_name = entry['name']
        if ts_name not in file_set_names:
            continue

        df = time_index_to_dt(load_data_frame(ts_name, entry['file_path']))

        # min/max are taken over the rows labelled 0 only.
        normal_values = df.loc[df['class'] == 0, 'value']
        print(ts_name + "\t anomalies "
              + str(count_anomalies(df))
              + '\t values ' + str(df.shape[0])
              + '\t min ' + str(normal_values.min())
              + '\t max ' + str(normal_values.max()))

        df_test = df.iloc[0:split_size]
        print('test' + "\t anomalies " + str(count_anomalies(df_test)) + '\t values ' + str(df_test.shape[0]))

        df_train = df.iloc[split_size:2 * split_size]
        print('train' + "\t anomalies " + str(count_anomalies(df_train)) + '\t values ' + str(df_train.shape[0]))
        print('\n')

        append_to_file(df_test, test_file_name)
        append_to_file(df_train, train_file_name)

In [20]:
# Build the test/train CSV pair for the selected A2 series.
create_set(
    path=a2_path,
    files=a2files,
    file_set_names=names_set_a2_s1,
    file_suffix=file_suffix_s1,
)

TS3	 anomalies 1	 values 1421	 min -1469.17024959319	 max 1476.17862753322
test	 anomalies 0	 values 710
train	 anomalies 1	 values 710


TS10	 anomalies 4	 values 1421	 min -1549.60051249137	 max 1781.47914710953
test	 anomalies 2	 values 710
train	 anomalies 2	 values 710


TS17	 anomalies 9	 values 1421	 min -1619.89508548574	 max 2212.82357199317
test	 anomalies 6	 values 710
train	 anomalies 3	 values 710


TS24	 anomalies 1	 values 1421	 min -1841.0297711985002	 max 2729.6232025764803
test	 anomalies 0	 values 710
train	 anomalies 1	 values 710


TS31	 anomalies 4	 values 1421	 min -1747.1208457847101	 max 3696.4468132008396
test	 anomalies 2	 values 710
train	 anomalies 2	 values 710


TS38	 anomalies 9	 values 1421	 min -1787.15788146886	 max 4738.29344409956
test	 anomalies 6	 values 710
train	 anomalies 3	 values 710


TS45	 anomalies 1	 values 1421	 min -2044.18178008182	 max 4791.57054077542
test	 anomalies 0	 values 710
train	 anomalies 1	 values 710


TS66	 anomalies 1	 v