In [1]:
import pandas as pd
import numpy as np
import json, csv, datetime

In [2]:
#input

yahoo_path = '/home/adrian/Dokumente/real_data/ydata-labeled-time-series-anomalies-v1_0'

a1_path = '/A1Benchmark/'
a2_path = '/A2Benchmark/'
a3_path = '/A3Benchmark/'
a4_path = '/A4Benchmark/'

a1_elements = np.arange(1, 68).tolist()
a234_elements = np.arange(1, 101).tolist()

a1files = [{ 'name' : 'TS' + str(x), 'file_path' : yahoo_path + a1_path + 'real_' + str(x) + '.csv' } for x in a1_elements]
a2files = [{ 'name' : 'TS' + str(x), 'file_path' : yahoo_path + a2_path + 'synthetic_' + str(x) + '.csv' } for x in a234_elements]
a3files = [{ 'name' : 'TS' + str(x), 'file_path' : yahoo_path + a3_path + 'A3Benchmark-TS' + str(x) + '.csv' } for x in a234_elements]
a4files = [{ 'name' : 'TS' + str(x), 'file_path' : yahoo_path + a4_path + 'A4Benchmark-TS' + str(x) + '.csv' } for x in a234_elements]

#output
output_path = '/home/adrian/Dokumente/real_data/yahoo_out2'

#test_file_name = output_path + a2_path + 'test.csv'
#train_file_name = output_path + a2_path + 'train.csv'

In [3]:
def load_data_frame(ts_name, file_path):
    df = pd.read_csv(file_path)
    df.insert(0, 'ts_name', ts_name)
    df.insert(2, 'unit', "Value")
    df = df.rename(columns={'timestamps': 'time', 'anomaly' : 'class'})
    df = df.drop(columns=['changepoint', 'trend', 'noise', 'seasonality1', 'seasonality2', 'seasonality3'])
    return df

In [4]:
def time_index_to_dt(df):
    for idx, row in df.iterrows():
        dt = datetime.datetime.fromtimestamp(row['time'])
        df.loc[idx, 'time'] = dt
    return df
        

In [5]:
def init_file(outputfile):
    with open(outputfile, 'w') as file_output:
        writer = csv.writer(file_output, delimiter=';')
        header = ['ts_name', 'time', 'unit', 'value', 'class']
        writer.writerow(header)
        
def append_to_file(df, outputfile):
    with open(outputfile, 'a') as file_output:
        writer = csv.writer(file_output, delimiter=';')
        for index, row in df.iterrows():
            row = [row[0], row[1].isoformat(), row[2], row[3], row[4]]
            writer.writerow(row)

In [15]:
# S1 set
names_set_a3_s1 = ['TS' + str(x) for x in [22, 3, 5, 11, 13, 16, 17, 25]]
file_suffix_s1 = "s1"
# S2 set
names_set_a3_s2 = ['TS' + str(x) for x in [22, 3, 5, 11, 13, 16, 17]]
file_suffix_s2 = "s2"
# S3 set
names_set_a3_s3 = ['TS' + str(x) for x in [22, 3, 5, 11, 13, 16]]
file_suffix_s3 = "s3"
# S4 set
names_set_a3_s4 = ['TS' + str(x) for x in [22, 3, 5, 11, 13]]
file_suffix_s4 = "s4"
# S5 set
names_set_a3_s5 = ['TS' + str(x) for x in [22, 3, 5, 11]]
file_suffix_s5 = "s5"
# S6 set
names_set_a3_s6 = ['TS' + str(x) for x in [22, 3, 5]]
file_suffix_s6 = "s6"

In [16]:
def create_set(path, files, file_set_names, file_suffix):
    test_file_name = output_path + path + 'test_' + file_suffix + '.csv'
    train_file_name = output_path + path + 'train_' + file_suffix + '.csv'
    init_file(test_file_name)
    init_file(train_file_name)
    
    for entry in files:
        ts_name = entry['name']
        if ts_name in file_set_names:
            file_path = entry['file_path']
            df = load_data_frame(ts_name, file_path)
            df = time_index_to_dt(df)
            print(ts_name + "\t anomalies " 
                  + str(df.loc[df['class']==1].shape[0]) 
                  + '\t values ' + str(df.shape[0])
                  + '\t min ' + str(df.loc[df['class']==0, 'value'].min()) 
                  + '\t max ' + str(df.loc[df['class']==0, 'value'].max()))
            df_test = df.iloc[0:1000]
            print('test' + "\t anomalies " + str(df_test.loc[df_test['class']==1].shape[0]) + '\t values ' + str(df_test.shape[0]))

            df_train = df.iloc[1000:1681]
            print('train' + "\t anomalies " + str(df_train.loc[df_train['class']==1].shape[0]) + '\t values ' + str(df_train.shape[0]))
            print('\n')
            append_to_file(df_test, test_file_name)
            append_to_file(df_train, train_file_name)

In [14]:
create_set(a3_path, a3files, names_set_a3_s1, file_suffix_s1)

TS3	 anomalies 6	 values 1680	 min -455.124029606913	 max 3793.35711668
test	 anomalies 3	 values 1000
train	 anomalies 3	 values 680


TS5	 anomalies 2	 values 1680	 min -692.5193251525819	 max 4006.7266584886997
test	 anomalies 1	 values 1000
train	 anomalies 1	 values 680


TS11	 anomalies 7	 values 1680	 min -693.754935483698	 max 4003.2504520817297
test	 anomalies 5	 values 1000
train	 anomalies 2	 values 680


TS13	 anomalies 9	 values 1680	 min -1630.7865546076	 max 3384.70905885213
test	 anomalies 7	 values 1000
train	 anomalies 2	 values 680


TS16	 anomalies 9	 values 1680	 min -1701.08492942196	 max 6579.30692418012
test	 anomalies 4	 values 1000
train	 anomalies 5	 values 680


TS17	 anomalies 1	 values 1680	 min -739.206063334008	 max 5749.1714457047
test	 anomalies 1	 values 1000
train	 anomalies 0	 values 680


TS22	 anomalies 4	 values 1680	 min -1839.3630921341198	 max 7006.206125087901
test	 anomalies 2	 values 1000
train	 anomalies 2	 values 680


TS25	 anomalies 2	 

In [17]:
create_set(a3_path, a3files, names_set_a3_s2, file_suffix_s2)

TS3	 anomalies 6	 values 1680	 min -455.124029606913	 max 3793.35711668
test	 anomalies 3	 values 1000
train	 anomalies 3	 values 680


TS5	 anomalies 2	 values 1680	 min -692.5193251525819	 max 4006.7266584886997
test	 anomalies 1	 values 1000
train	 anomalies 1	 values 680


TS11	 anomalies 7	 values 1680	 min -693.754935483698	 max 4003.2504520817297
test	 anomalies 5	 values 1000
train	 anomalies 2	 values 680


TS13	 anomalies 9	 values 1680	 min -1630.7865546076	 max 3384.70905885213
test	 anomalies 7	 values 1000
train	 anomalies 2	 values 680


TS16	 anomalies 9	 values 1680	 min -1701.08492942196	 max 6579.30692418012
test	 anomalies 4	 values 1000
train	 anomalies 5	 values 680


TS17	 anomalies 1	 values 1680	 min -739.206063334008	 max 5749.1714457047
test	 anomalies 1	 values 1000
train	 anomalies 0	 values 680


TS22	 anomalies 4	 values 1680	 min -1839.3630921341198	 max 7006.206125087901
test	 anomalies 2	 values 1000
train	 anomalies 2	 values 680




In [18]:
create_set(a3_path, a3files, names_set_a3_s3, file_suffix_s3)

TS3	 anomalies 6	 values 1680	 min -455.124029606913	 max 3793.35711668
test	 anomalies 3	 values 1000
train	 anomalies 3	 values 680


TS5	 anomalies 2	 values 1680	 min -692.5193251525819	 max 4006.7266584886997
test	 anomalies 1	 values 1000
train	 anomalies 1	 values 680


TS11	 anomalies 7	 values 1680	 min -693.754935483698	 max 4003.2504520817297
test	 anomalies 5	 values 1000
train	 anomalies 2	 values 680


TS13	 anomalies 9	 values 1680	 min -1630.7865546076	 max 3384.70905885213
test	 anomalies 7	 values 1000
train	 anomalies 2	 values 680


TS16	 anomalies 9	 values 1680	 min -1701.08492942196	 max 6579.30692418012
test	 anomalies 4	 values 1000
train	 anomalies 5	 values 680


TS22	 anomalies 4	 values 1680	 min -1839.3630921341198	 max 7006.206125087901
test	 anomalies 2	 values 1000
train	 anomalies 2	 values 680




In [19]:
create_set(a3_path, a3files, names_set_a3_s4, file_suffix_s4)

TS3	 anomalies 6	 values 1680	 min -455.124029606913	 max 3793.35711668
test	 anomalies 3	 values 1000
train	 anomalies 3	 values 680


TS5	 anomalies 2	 values 1680	 min -692.5193251525819	 max 4006.7266584886997
test	 anomalies 1	 values 1000
train	 anomalies 1	 values 680


TS11	 anomalies 7	 values 1680	 min -693.754935483698	 max 4003.2504520817297
test	 anomalies 5	 values 1000
train	 anomalies 2	 values 680


TS13	 anomalies 9	 values 1680	 min -1630.7865546076	 max 3384.70905885213
test	 anomalies 7	 values 1000
train	 anomalies 2	 values 680


TS22	 anomalies 4	 values 1680	 min -1839.3630921341198	 max 7006.206125087901
test	 anomalies 2	 values 1000
train	 anomalies 2	 values 680




In [20]:
create_set(a3_path, a3files, names_set_a3_s5, file_suffix_s5)

TS3	 anomalies 6	 values 1680	 min -455.124029606913	 max 3793.35711668
test	 anomalies 3	 values 1000
train	 anomalies 3	 values 680


TS5	 anomalies 2	 values 1680	 min -692.5193251525819	 max 4006.7266584886997
test	 anomalies 1	 values 1000
train	 anomalies 1	 values 680


TS11	 anomalies 7	 values 1680	 min -693.754935483698	 max 4003.2504520817297
test	 anomalies 5	 values 1000
train	 anomalies 2	 values 680


TS22	 anomalies 4	 values 1680	 min -1839.3630921341198	 max 7006.206125087901
test	 anomalies 2	 values 1000
train	 anomalies 2	 values 680




In [21]:
create_set(a3_path, a3files, names_set_a3_s6, file_suffix_s6)

TS3	 anomalies 6	 values 1680	 min -455.124029606913	 max 3793.35711668
test	 anomalies 3	 values 1000
train	 anomalies 3	 values 680


TS5	 anomalies 2	 values 1680	 min -692.5193251525819	 max 4006.7266584886997
test	 anomalies 1	 values 1000
train	 anomalies 1	 values 680


TS22	 anomalies 4	 values 1680	 min -1839.3630921341198	 max 7006.206125087901
test	 anomalies 2	 values 1000
train	 anomalies 2	 values 680




In [23]:
# S1
test_file_name_s1 = output_path + a3_path + 'test_s1.csv'
train_file_name_s1 = output_path + a3_path + 'train_s1.csv'
init_file(test_file_name_s1)
init_file(train_file_name_s1)

In [24]:
for entry in a3files:
    ts_name = entry['name']
    if ts_name in names_set_a3_s1:
        file_path = entry['file_path']
        df = load_data_frame(ts_name, file_path)
        df = time_index_to_dt(df)
        print(ts_name + "\t anomalies " 
              + str(df.loc[df['class']==1].shape[0]) 
              + '\t values ' + str(df.shape[0])
              + '\t min ' + str(df.loc[df['class']==0, 'value'].min()) 
              + '\t max ' + str(df.loc[df['class']==0, 'value'].max()))
        df_test = df.iloc[0:1000]
        print('test' + "\t anomalies " + str(df_test.loc[df_test['class']==1].shape[0]) + '\t values ' + str(df_test.shape[0]))

        df_train = df.iloc[1001:1681]
        print('train' + "\t anomalies " + str(df_train.loc[df_train['class']==1].shape[0]) + '\t values ' + str(df_train.shape[0]))
        print('\n')
        append_to_file(df_test, test_file_name_s1)
        append_to_file(df_train, train_file_name_s1)

TS3	 anomalies 6	 values 1680	 min -455.124029606913	 max 3793.35711668
test	 anomalies 3	 values 1000
train	 anomalies 1	 values 400


TS5	 anomalies 2	 values 1680	 min -692.5193251525819	 max 4006.7266584886997
test	 anomalies 1	 values 1000
train	 anomalies 0	 values 400


TS11	 anomalies 7	 values 1680	 min -693.754935483698	 max 4003.2504520817297
test	 anomalies 5	 values 1000
train	 anomalies 2	 values 400


TS13	 anomalies 9	 values 1680	 min -1630.7865546076	 max 3384.70905885213
test	 anomalies 7	 values 1000
train	 anomalies 2	 values 400


TS16	 anomalies 9	 values 1680	 min -1701.08492942196	 max 6579.30692418012
test	 anomalies 4	 values 1000
train	 anomalies 4	 values 400


TS17	 anomalies 1	 values 1680	 min -739.206063334008	 max 5749.1714457047
test	 anomalies 1	 values 1000
train	 anomalies 0	 values 400


TS22	 anomalies 4	 values 1680	 min -1839.3630921341198	 max 7006.206125087901
test	 anomalies 2	 values 1000
train	 anomalies 2	 values 400


TS25	 anomalies 2	 