In [1]:
import pandas as pd
import numpy as np
import json, csv, datetime

In [2]:
# Input: root folder of the Yahoo labeled time-series anomaly data and the
# sub-folder of each benchmark inside it.
yahoo_path = '/home/adrian/Dokumente/real_data/ydata-labeled-time-series-anomalies-v1_0'

a1_path = '/A1Benchmark/'
a2_path = '/A2Benchmark/'
a3_path = '/A3Benchmark/'
a4_path = '/A4Benchmark/'

# Series numbers per benchmark. A1 is numbered 1..67. A2-A4 are numbered
# 1..100, so the exclusive upper bound has to be 101 -- a bound of 100 silently
# dropped the last series of each of those benchmarks.
a1_elements = list(range(1, 68))
a234_elements = list(range(1, 101))

# One {'name', 'file_path'} record per series; 'name' is 'TS<number>'.
a1files = [{'name': 'TS' + str(x), 'file_path': yahoo_path + a1_path + 'real_' + str(x) + '.csv'} for x in a1_elements]
a2files = [{'name': 'TS' + str(x), 'file_path': yahoo_path + a2_path + 'synthetic_' + str(x) + '.csv'} for x in a234_elements]
a3files = [{'name': 'TS' + str(x), 'file_path': yahoo_path + a3_path + 'A3Benchmark-TS' + str(x) + '.csv'} for x in a234_elements]
a4files = [{'name': 'TS' + str(x), 'file_path': yahoo_path + a4_path + 'A4Benchmark-TS' + str(x) + '.csv'} for x in a234_elements]

# Output: the combined test/train CSVs are written below this folder.
output_path = '/home/adrian/Dokumente/real_data/yahoo_out'

test_file_name = output_path + a2_path + 'test.csv'
train_file_name = output_path + a2_path + 'train.csv'

In [3]:
def load_data_frame(ts_name, file_path):
    """Read one series CSV and reshape it into the export column layout.

    A constant ``ts_name`` column is placed first and a constant ``unit``
    column (always ``"Value"``) is placed at position 2. The columns
    ``timestamp`` and ``is_anomaly`` are renamed to ``time`` and ``class``.

    Parameters
    ----------
    ts_name : value stored in every row of the ``ts_name`` column.
    file_path : path of the CSV file handed to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame with the added and renamed columns.
    """
    raw = pd.read_csv(file_path)
    raw.insert(loc=0, column='ts_name', value=ts_name)
    raw.insert(loc=2, column='unit', value="Value")
    column_aliases = {'timestamp': 'time', 'is_anomaly': 'class'}
    return raw.rename(columns=column_aliases)

In [4]:
def time_index_to_dt(df):
    """Replace the epoch values in ``df['time']`` with datetimes.

    Each value is converted with ``datetime.datetime.fromtimestamp``, i.e. it
    is interpreted as seconds since the epoch and rendered as a naive datetime
    in the local timezone of the machine running the notebook.

    The whole column is converted and assigned in one step. Writing datetimes
    cell by cell into the numeric column is slow and is an incompatible-dtype
    assignment that newer pandas versions warn about.

    Parameters
    ----------
    df : pandas.DataFrame with a ``time`` column of epoch seconds.

    Returns
    -------
    The same DataFrame object; it is modified in place.
    """
    # tolist() hands plain Python numbers to fromtimestamp.
    converted = [datetime.datetime.fromtimestamp(seconds) for seconds in df['time'].tolist()]
    df['time'] = converted
    return df
        

In [5]:
def init_file(outputfile):
    """Create (or truncate) ``outputfile`` and write the CSV header row.

    The header is ``ts_name;time;unit;value;class``, using ``;`` as delimiter.

    Parameters
    ----------
    outputfile : path of the file to (re)create.
    """
    header = ['ts_name', 'time', 'unit', 'value', 'class']
    # newline='' lets the csv writer control line endings itself; without it
    # platforms that translate '\n' would produce '\r\r\n'.
    with open(outputfile, 'w', newline='') as file_output:
        csv.writer(file_output, delimiter=';').writerow(header)
        
def append_to_file(df, outputfile):
    """Append the rows of ``df`` to ``outputfile`` as ``;``-separated lines.

    Only the first five columns are written, taken by position. The value in
    the second column must provide ``isoformat()`` (e.g. a datetime) and is
    written in ISO 8601 form; the other four values are written as they are.

    Parameters
    ----------
    df : pandas.DataFrame with at least five columns.
    outputfile : path of the file to append to.
    """
    # newline='' lets the csv writer control line endings itself.
    with open(outputfile, 'a', newline='') as file_output:
        writer = csv.writer(file_output, delimiter=';')
        # itertuples yields plain positional tuples, so no integer lookups on
        # a label-indexed Series are needed (deprecated in pandas).
        first_five = df.iloc[:, :5]
        for ts_name, time, unit, value, label in first_five.itertuples(index=False, name=None):
            writer.writerow([ts_name, time.isoformat(), unit, value, label])

In [20]:
# S1 set: series TS1 .. TS9
names_set_a2_s1 = ['TS' + str(number) for number in range(1, 10)]

# S2 set: series TS11 .. TS19
names_set_a2_s2 = ['TS' + str(number) for number in range(11, 20)]

In [13]:
# S1: build the test/train output paths, then start both files over with
# only the header row (test file first, then train file).
test_file_name_s1, train_file_name_s1 = (
    output_path + a2_path + leaf for leaf in ('test_s1.csv', 'train_s1.csv')
)
for fresh_s1_file in (test_file_name_s1, train_file_name_s1):
    init_file(fresh_s1_file)

In [14]:
# Split every series of the S1 selection into a test and a train window,
# print a short summary and append both windows to the S1 output files.
for entry in a2files:
    ts_name = entry['name']
    if ts_name not in names_set_a2_s1:
        continue  # not part of the S1 selection

    file_path = entry['file_path']
    df = time_index_to_dt(load_data_frame(ts_name, file_path))

    # Summary of the whole series; min/max only look at rows with class == 0.
    anomalous_rows = df.loc[df['class']==1]
    regular_values = df.loc[df['class']==0, 'value']
    print(ts_name + "\t anomalies "
          + str(anomalous_rows.shape[0])
          + '\t values ' + str(df.shape[0])
          + '\t min ' + str(regular_values.min())
          + '\t max ' + str(regular_values.max()))

    # NOTE(review): positions 0-699 go to test and 701-1400 to train, so row
    # 700 and every row from 1401 on end up in neither file -- confirm intended.
    df_test = df.iloc[0:700]
    df_train = df.iloc[701:1401]
    for split_label, split_frame in (('test', df_test), ('train', df_train)):
        print(split_label + "\t anomalies " + str(split_frame.loc[split_frame['class']==1].shape[0]) + '\t values ' + str(split_frame.shape[0]))
    print('\n')

    append_to_file(df_test, test_file_name_s1)
    append_to_file(df_train, train_file_name_s1)

TS1	 anomalies 4	 values 1421	 min -1421.03900780435	 max 1416.91254278074
test	 anomalies 2	 values 700
train	 anomalies 2	 values 700


TS2	 anomalies 9	 values 1421	 min -1337.39554558125	 max 2230.21351209765
test	 anomalies 6	 values 700
train	 anomalies 3	 values 700


TS3	 anomalies 1	 values 1421	 min -1469.17024959319	 max 1476.17862753322
test	 anomalies 0	 values 700
train	 anomalies 1	 values 700


TS4	 anomalies 4	 values 1421	 min -1349.7201913828599	 max 2569.31864920635
test	 anomalies 2	 values 700
train	 anomalies 2	 values 700


TS5	 anomalies 9	 values 1421	 min -1480.24277746569	 max 1548.9621898117498
test	 anomalies 6	 values 700
train	 anomalies 3	 values 700


TS6	 anomalies 1	 values 1421	 min -1369.95553721034	 max 2796.0554553537395
test	 anomalies 0	 values 700
train	 anomalies 1	 values 700


TS7	 anomalies 4	 values 1421	 min -759.7765474868539	 max 6070.118294224821
test	 anomalies 2	 values 700
train	 anomalies 2	 values 700


TS8	 anomalies 9	 values 1

In [21]:
# S2
test_file_name_s2 = output_path + a2_path + 'test_s2.csv'
train_file_name_s2 = output_path + a2_path + 'train_s2.csv'
init_file(test_file_name_s2)
init_file(train_file_name_s2)

In [22]:
# Same split as for S1, applied to the series named in names_set_a2_s2.
for entry in a2files:
    ts_name = entry['name']
    if ts_name not in names_set_a2_s2:
        continue  # not part of the S2 selection

    file_path = entry['file_path']
    df = load_data_frame(ts_name, file_path)
    df = time_index_to_dt(df)

    # Whole-series summary; min/max are taken over rows with class == 0 only.
    is_anomaly = df['class']==1
    values_without_anomalies = df.loc[df['class']==0, 'value']
    summary_parts = [
        ts_name, "\t anomalies ", str(len(df.loc[is_anomaly])),
        '\t values ', str(len(df)),
        '\t min ', str(values_without_anomalies.min()),
        '\t max ', str(values_without_anomalies.max()),
    ]
    print(''.join(summary_parts))

    # Test window: positions 0-699.
    df_test = df.iloc[0:700]
    test_anomalies = len(df_test.loc[df_test['class']==1])
    print('test' + "\t anomalies " + str(test_anomalies) + '\t values ' + str(len(df_test)))

    # Train window: positions 701-1400 (position 700 is in neither window).
    df_train = df.iloc[701:1401]
    train_anomalies = len(df_train.loc[df_train['class']==1])
    print('train' + "\t anomalies " + str(train_anomalies) + '\t values ' + str(len(df_train)))
    print('\n')

    append_to_file(df_test, test_file_name_s2)
    append_to_file(df_train, train_file_name_s2)

TS11	 anomalies 9	 values 1421	 min -1288.4912053444198	 max 4492.0546085613205
test	 anomalies 6	 values 700
train	 anomalies 3	 values 700


TS12	 anomalies 1	 values 1421	 min -1716.1728402295098	 max 1756.67495057559
test	 anomalies 0	 values 700
train	 anomalies 1	 values 700


TS13	 anomalies 4	 values 1421	 min -1435.8133483540398	 max 4298.9986399934005
test	 anomalies 2	 values 700
train	 anomalies 2	 values 700


TS14	 anomalies 9	 values 1421	 min -165.031044984485	 max 10640.2416388357
test	 anomalies 6	 values 700
train	 anomalies 3	 values 700


TS15	 anomalies 1	 values 1421	 min -1732.7463531757	 max 1954.1237572643104
test	 anomalies 0	 values 700
train	 anomalies 1	 values 700


TS16	 anomalies 4	 values 1421	 min -969.1594552207409	 max 7619.48340953066
test	 anomalies 2	 values 700
train	 anomalies 2	 values 700


TS17	 anomalies 9	 values 1421	 min -1619.89508548574	 max 2212.82357199317
test	 anomalies 6	 values 700
train	 anomalies 3	 values 700


TS18	 anomalies

In [23]:
# Output files holding the test/train windows of every A2 series.
test_file_all = output_path + a2_path + 'test_all.csv'
train_file_all = output_path + a2_path + 'train_all.csv'

# Start both files over with the header row. The train file must be
# initialised as well: initialising the test file twice would leave
# train_all.csv without a header and without truncation.
init_file(test_file_all)
init_file(train_file_all)

In [28]:
# Split every series listed in a2files and append the windows to the "all"
# files. Nothing is printed in this pass.
for entry in a2files:
    ts_name, file_path = entry['name'], entry['file_path']
    df = time_index_to_dt(load_data_frame(ts_name, file_path))

    # Test window: positions 0-699; train window: positions 701-1400.
    df_test = df.iloc[0:700]
    df_train = df.iloc[701:1401]

    append_to_file(df_test, test_file_all)
    append_to_file(df_train, train_file_all)

In [24]:
# Start test.csv and train.csv over with only the header row.
for header_only_file in (test_file_name, train_file_name):
    init_file(header_only_file)

In [12]:
# Print the split statistics for every series listed in a2files. The
# append_to_file calls are commented out, so this cell writes no rows.
for entry in a2files:
    ts_name = entry['name']
    file_path = entry['file_path']
    df = time_index_to_dt(load_data_frame(ts_name, file_path))

    # Whole-series summary; min/max are taken over rows with class == 0 only.
    regular_values = df.loc[df['class']==0, 'value']
    anomaly_count = df.loc[df['class']==1].shape[0]
    print(ts_name + "\t anomalies "
          + str(anomaly_count)
          + '\t values ' + str(df.shape[0])
          + '\t min ' + str(regular_values.min())
          + '\t max ' + str(regular_values.max()))

    # Test window: positions 0-699; train window: positions 701-1400.
    df_test = df.iloc[0:700]
    df_train = df.iloc[701:1401]
    for window_label, window in (('test', df_test), ('train', df_train)):
        print(window_label + "\t anomalies " + str(window.loc[window['class']==1].shape[0]) + '\t values ' + str(window.shape[0]))
    print('\n')
    #append_to_file(df_test, test_file_name)
    #append_to_file(df_train, train_file_name)

TS1	 anomalies 4	 values 1421	 min -1421.03900780435	 max 1416.91254278074
test	 anomalies 2	 values 700
train	 anomalies 2	 values 700


TS2	 anomalies 9	 values 1421	 min -1337.39554558125	 max 2230.21351209765
test	 anomalies 6	 values 700
train	 anomalies 3	 values 700


TS3	 anomalies 1	 values 1421	 min -1469.17024959319	 max 1476.17862753322
test	 anomalies 0	 values 700
train	 anomalies 1	 values 700


TS4	 anomalies 4	 values 1421	 min -1349.7201913828599	 max 2569.31864920635
test	 anomalies 2	 values 700
train	 anomalies 2	 values 700


TS5	 anomalies 9	 values 1421	 min -1480.24277746569	 max 1548.9621898117498
test	 anomalies 6	 values 700
train	 anomalies 3	 values 700


TS6	 anomalies 1	 values 1421	 min -1369.95553721034	 max 2796.0554553537395
test	 anomalies 0	 values 700
train	 anomalies 1	 values 700


TS7	 anomalies 4	 values 1421	 min -759.7765474868539	 max 6070.118294224821
test	 anomalies 2	 values 700
train	 anomalies 2	 values 700


TS8	 anomalies 9	 values 1

TS60	 anomalies 1	 values 1421	 min -124.77756082596599	 max 19066.8324396306
test	 anomalies 0	 values 700
train	 anomalies 1	 values 700


TS61	 anomalies 4	 values 1421	 min -1391.0175760576	 max 1531.03792380424
test	 anomalies 2	 values 700
train	 anomalies 2	 values 700


TS62	 anomalies 9	 values 1421	 min -626.858212895174	 max 15454.4728011991
test	 anomalies 6	 values 700
train	 anomalies 3	 values 700


TS63	 anomalies 1	 values 1421	 min 11.616456793740099	 max 44139.4975079728
test	 anomalies 0	 values 700
train	 anomalies 1	 values 700


TS64	 anomalies 4	 values 1421	 min -1396.5859673743803	 max 1921.6263868414999
test	 anomalies 2	 values 700
train	 anomalies 2	 values 700


TS65	 anomalies 9	 values 1421	 min 67.47999768712229	 max 27436.5094346389
test	 anomalies 6	 values 700
train	 anomalies 3	 values 700


TS66	 anomalies 1	 values 1421	 min -1462.6141903562198	 max 1917.2919262614998
test	 anomalies 0	 values 700
train	 anomalies 1	 values 700


TS67	 anomalies 4

In [54]:
# Show rows at positions 10001-15800 of `df`.
# NOTE(review): the captured output below shows ts_name 'UPS' and row labels
# above 15000, while the loops above leave `df` holding a 1421-row series --
# this cell appears to rely on a `df` from a cell that is not shown; confirm.
df.iloc[10001:15801]

Unnamed: 0,ts_name,time,unit,value,class
10001,UPS,2015-04-02 15:07:53,Value,2,0
10002,UPS,2015-04-02 15:12:53,Value,1,0
10003,UPS,2015-04-02 15:17:53,Value,5,0
10004,UPS,2015-04-02 15:22:53,Value,1,0
10005,UPS,2015-04-02 15:27:53,Value,2,0
...,...,...,...,...,...
15796,UPS,2015-04-22 18:02:53,Value,1,0
15797,UPS,2015-04-22 18:07:53,Value,2,0
15798,UPS,2015-04-22 18:12:53,Value,1,0
15799,UPS,2015-04-22 18:17:53,Value,5,0


In [47]:
# Display `test_data_frame`.
# NOTE(review): `test_data_frame` is not assigned in any cell visible here, so
# this cell fails on a fresh kernel unless it is defined elsewhere -- confirm.
test_data_frame

Unnamed: 0,ts_name,time,unit,value,class
0,Test,2015-02-26 21:42:53,Value,2,0
1,Test,2015-02-26 21:47:53,Value,2,0
2,Test,2015-02-26 21:52:53,Value,4,0
3,Test,2015-02-26 21:57:53,Value,3,0
4,Test,2015-02-26 22:02:53,Value,5,0
...,...,...,...,...,...
15861,Test,2015-04-22 23:27:53,Value,3,0
15862,Test,2015-04-22 23:32:53,Value,2,0
15863,Test,2015-04-22 23:37:53,Value,2,0
15864,Test,2015-04-22 23:42:53,Value,3,0


In [23]:
# Show only the rows of `test_data_frame` whose 'class' column equals 1.
# NOTE(review): `test_data_frame` is not assigned in any cell visible here --
# confirm where it comes from.
test_data_frame.loc[test_data_frame['class']==1]

Unnamed: 0,ts_name,time,unit,value,class
1993,TEST,2015-03-05 19:47:53,Value,716,1
3735,TEST,2015-03-11 20:57:53,Value,1673,1
9795,TEST,2015-04-01 21:57:53,Value,282,1
11606,TEST,2015-04-08 04:52:53,Value,211,1
