In [3]:
import numpy as np
import pandas as pd

In [4]:
### PARAMETERS
##################

# Location of the raw telemetry capture (CSV) to load.
dataset_path = '/home/andres/CAiN_repos/telemetry/1/baseline_no_anomaly_500Gbps.csv'

# Aggregation plan: each list below names the columns that share one aggregator
# when several rows carry the same timestamp.

# Columns combined with 'sum'.
sum_cols = [
    'acl-in-rpf-packets',
    'active-routes-count',
    'backup-routes-count',
    'bytes-received',
    'bytes-sent',
    'carrier-transitions',
    'checksum-error-packets',
    'crc-errors',
    'deleted-routes-count',
    'df-unreachable-packets',
    'discard-packets',
    'encapsulation-failure-packets',
    'fragmenation-consumed-packets',
    'fragmenation-failure-packets',
    'global__established-neighbors-count-total',
    'global__neighbors-count-total',
    'global__nexthop-count',
    'global__restart-count',
    'gre-error-drop',
    'gre-lookup-failed-drop',
    'incomplete-adjacency-packets',
    'input-drops',
    'input-errors',
    'input-ignored-packets',
    'input-queue-drops',
    'lisp-decap-error-drops',
    'lisp-encap-error-drops',
    'lisp-punt-drops',
    'load-interval',
    'multi-label-drops',
    'no-route-packets',
    'null-packets',
    'output-buffer-failures',
    'output-drops',
    'output-errors',
    'output-queue-drops',
    'packets-received',
    'packets-sent',
    'paths-count',
    'performance-statistics__global__configuration-items-processed',
    'performance-statistics__vrf__inbound-update-messages',
    'punt-unreachable-packets',
    'rp-destination-drop-packets',
    'rpf-check-failure-packets',
    'total-number-of-drop-packets',
    'unresolved-prefix-packets',
    'unsupported-feature-packets',
    'vrf__neighbors-count',
    'vrf__network-count',
    'vrf__path-count',
    'vrf__update-messages-received',
]

# Columns where the first value in each group is kept ('first').
first_cols = [
    'name',
    'time',
    'EncodingPath',
    'Producer',
    'af-name',
    'as',
    'instance-name',
    'interface-name',
    'mpls-disabled-interface',
    'node-name',
    'performance-statistics__global__ipv4rib-server__is-rib-connection-up',
    'performance-statistics__global__ipv4rib-server__rib-connection-up-count',
    'route-table-name',
    'routes-counts',
    'saf-name',
    'vrf-name',
]

# Columns that are averaged within each group.
avg_cols = [
    'bandwidth',
    'free-application-memory',
    'free-physical-memory',
    'input-data-rate',
    'input-load',
    'input-packet-rate',
    'output-data-rate',
    'output-load',
    'output-packet-rate',
    'protocol-route-memory',
    'ram-memory',
    'reliability',
    'system-ram-memory',
    'total-cpu-fifteen-minute',
    'total-cpu-five-minute',
    'total-cpu-one-minute',
]

# Columns where the group maximum is kept ('max').
max_cols = [
    'peak-input-data-rate',
    'peak-input-packet-rate',
    'peak-output-data-rate',
    'peak-output-packet-rate',
]

# Column names that should not be treated as features.
non_features = [
    'EncodingPath',
    'af-name',
    'instance-name',
    'interface-name',
    'node-name',
    'saf-name',
    'vrf-name',
]

In [5]:
# Build the column -> aggregator mapping used when rows sharing a timestamp are merged.
# Every aggregator is given as a pandas string alias. The averaged columns previously
# passed the callable np.mean, which recent pandas versions deprecate in groupby
# aggregation (FutureWarning); the 'mean' alias selects the same groupby mean.
sum_list = [(col, 'sum') for col in sum_cols]
first_list = [(col, 'first') for col in first_cols]
avg_list = [(col, 'mean') for col in avg_cols]
max_list = [(col, 'max') for col in max_cols]

# Later pairs would override earlier ones if a column appeared in more than one list.
aggregator_funcs = dict(sum_list + first_list + avg_list + max_list)

In [6]:
# Explicit dtypes for the text-valued columns, so pandas does not emit a warning
# while parsing them.
column_dtypes = {
    'af-name': str,
    'instance-name': str,
    'interface-name': str,
    'node-name': str,
    'performance-statistics__global__ipv4rib-server__is-rib-connection-up': str,
    'saf-name': str,
    'route-table-name': str,
    'vrf-name': str,
    # Timestamps are in nanoseconds (see the time-shift step). The builtin `int`
    # maps to a platform-dependent width that can be 32 bits, so the width is
    # pinned to 64 bits explicitly.
    'time': 'int64',
}

# Load every column for now; pass `usecols=` here to restrict the read to a subset.
telemetry = pd.read_csv(dataset_path, dtype=column_dtypes)

In [7]:
# Re-base timestamps so the trace starts at 0, then convert nanoseconds to milliseconds.
# Series.min() is vectorized, whereas the builtin min() walks the column element by
# element in Python.
# NOTE: 'time' is overwritten in place, so re-running this cell divides by 1e6 again;
# reload `telemetry` before re-executing.
telemetry['time'] = (telemetry['time'] - telemetry['time'].min()) / 1000000

In [8]:
# Collapse rows that share the same (shifted) timestamp into a single row,
# combining each column with the function assigned to it in aggregator_funcs.
# The grouping key ends up as the index of agg_telemetry.
agg_telemetry = (
    telemetry
    .groupby(telemetry['time'])
    .agg(aggregator_funcs)
)

In [9]:
# Preview the first five aggregated rows.
agg_telemetry.head(n=5)

Unnamed: 0_level_0,acl-in-rpf-packets,active-routes-count,backup-routes-count,bytes-received,bytes-sent,carrier-transitions,checksum-error-packets,crc-errors,deleted-routes-count,df-unreachable-packets,...,ram-memory,reliability,system-ram-memory,total-cpu-fifteen-minute,total-cpu-five-minute,total-cpu-one-minute,peak-input-data-rate,peak-input-packet-rate,peak-output-data-rate,peak-output-packet-rate
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,255.0,,,,,0.0,0.0,0.0,0.0
2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,255.0,,,,,0.0,0.0,0.0,0.0
5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,255.0,,,,,0.0,0.0,0.0,0.0
7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,255.0,,,,,0.0,0.0,0.0,0.0
9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,255.0,,,,,0.0,0.0,0.0,0.0


In [10]:
# Preview the last five aggregated rows.
agg_telemetry.tail(n=5)

Unnamed: 0_level_0,acl-in-rpf-packets,active-routes-count,backup-routes-count,bytes-received,bytes-sent,carrier-transitions,checksum-error-packets,crc-errors,deleted-routes-count,df-unreachable-packets,...,ram-memory,reliability,system-ram-memory,total-cpu-fifteen-minute,total-cpu-five-minute,total-cpu-one-minute,peak-input-data-rate,peak-input-packet-rate,peak-output-data-rate,peak-output-packet-rate
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3599798.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,19473100000.0,,19473100000.0,,,,,,,
3599802.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,19473100000.0,,19473100000.0,,,,,,,
3599889.0,0.0,182.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
3599966.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,23.0,23.0,23.0,,,,
3599974.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,8589935000.0,,8589935000.0,,,,,,,


In [11]:
# Write the numeric feature columns (sum, mean and max aggregates) as a headerless,
# index-less, tab-separated matrix. Missing values are written as 0.
# The former `[0:]` slice selected every row and only produced an extra copy, so it
# is dropped; `na_rep` is passed as a string, as the to_csv API documents.
agg_telemetry.to_csv(
    'embeddings1.txt',
    sep='\t',
    columns=sum_cols + avg_cols + max_cols,
    header=False,
    index=False,
    na_rep='0',
)

In [12]:
# Write the per-row labels ('time' and 'name') as a tab-separated file with a header
# row and no index column. Missing values are written as 0.
# The former `[0:]` slice selected every row and only produced an extra copy, so it
# is dropped; `na_rep` is passed as a string, as the to_csv API documents.
agg_telemetry.to_csv(
    'labels1.txt',
    sep='\t',
    columns=['time', 'name'],
    header=True,
    index=False,
    na_rep='0',
)