In [74]:
import os
import sys
import math
from datetime import datetime, timedelta

import tsaug
from tsaug import TimeWarp, Crop, Quantize, Drift, Reverse
from tsaug.visualization import plot

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import matplotlib.dates as mdates
import plotly.graph_objects as go

In [75]:
# Start date needs for final augmented data.
HISTORICAL_DATE = '2022-01-01'

In [76]:
DATE_FIELDS_CONFIG = {
    "metrics": "avgValue",
    "cost": "costInUsd",
}

In [77]:
# UTILS FUNCTIONS
date_format = '%Y-%m-%d'
def calculate_number_of_augment_need(start_date, end_date, days_range):
    res = math.ceil(((end_date - start_date)/days_range))
    return res

def date_parser(x):
	return datetime.strptime(x, date_format)

def add_noise(Y, X, scale=0.1):
    Y_aug_noise, X_aug_noise = tsaug.AddNoise(scale=scale).augment(Y, X)
    return Y_aug_noise, X_aug_noise

In [78]:
# Read real metrics data from VNG EP account
cost_with_instances_info_file = './data/gcp_cost_with_instance_info.csv'
cpu_utilize_file = './data/raw_gcp_cpu_utilization.json'
network_in_file = './data/raw_gcp_network_in.json'
network_out_file = './data/raw_gcp_network_out.json'
df_cost_instances_info = pd.read_csv(cost_with_instances_info_file, sep=',', header=0, parse_dates=[2],
        date_parser=date_parser,  dtype={'instanceId': str})
df_cpu = pd.read_json(cpu_utilize_file, dtype={'instanceId': str})
df_network_in = pd.read_json(network_in_file, dtype={'instanceId': str})
df_network_out = pd.read_json(network_out_file, dtype={'instanceId': str})

In [79]:
"""
@param df_raw: data raw read from file
@param instanceId: instance ID/ asset ID
@param historical_date: starting date needs for data
@param metric_type: new columne name for new dataset, example: CPUUtilization
@return: df augmented to historical_date
"""
def augment_metrics(df_raw, asset_id, historical_date, metric_type, value_field):
    asset_id_field = "instanceId"
    datetime_field = "datetime"
    
    # Get data belonging to an asset
    df_raw_i = df_raw[df_raw[asset_id_field] == asset_id].copy()
    
    # days_range: number of days in real data
    start_date = df_raw_i.min(axis=0)[datetime_field].date()
    end_date = df_raw_i.max(axis=0)[datetime_field].date()
    days_range = end_date - start_date + timedelta(days=1)
    
    # Build X-axis, values from 0 to length of X. X with date values is causing errors when augmenting.
    X = np.arange(len(df_raw_i))
    
    # Build X-axis with date values, use for final dataset, not to augment.
    X_date = df_raw_i[datetime_field].map(lambda x: int(x.timestamp()*1000)).tolist()
    
    # Build Y axis
    Y = df_raw_i[value_field].tolist()
    Y = np.array(Y)
    
    # Augment Y and X. Results are augmented Y; X remains the same.
    Y_aug_noise, X_aug_noise = add_noise(Y, X)
    
    # Calculate number of loops until date reach historical date
    loops_count = calculate_number_of_augment_need(historical_date.date(), start_date,days_range)
    
    # Map dates to timestamp format
    X_date = df_raw_i[datetime_field].map(lambda x: int(x.timestamp()*1000)).tolist()
    
    # Extend date (X-axis) to historical date
    X_date_extended1 = pd.date_range(start_date -(days_range) * loops_count,start_date,freq='d').map(lambda x: int(x.timestamp()*1000))
    
    # Remove first record of extended X
    X_date_extended = np.hstack([X_date_extended1, X_date])[1:]
    
    extended_Y = []
    cur_Y = Y
    
    # Augment Y until X reach historical date
    for i in range(0, loops_count):
        cur_Y, X = add_noise(cur_Y, X)
        extended_Y.append(cur_Y)
        
    # Append augmented Y to original Y
    Y_extended = np.hstack([Y] + extended_Y)
    
    # Build new DF, ,ap back X from timestime to datetime format
    new_df = pd.DataFrame({'datetime': list(map(lambda x: datetime.fromtimestamp(x/1000.0).strftime('%Y-%m-%d'),X_date_extended))})
    
    # Add Y column and sort the new dataframe by datetime
    new_df[metric_type] = pd.Series(Y_extended)
    new_df = new_df.sort_values('datetime')
    return new_df

In [90]:
historical_date = datetime.strptime(HISTORICAL_DATE, '%Y-%m-%d')
instanceIds = df_cpu['instanceId'].unique().tolist()
df_final = None
for instanceId in instanceIds:
    try:
        df_cpu_i = augment_metrics(df_cpu, instanceId, historical_date, 'cpuUtilization', DATE_FIELDS_CONFIG['metrics'])
        df_network_i = augment_metrics(df_network_in, instanceId, historical_date, 'networkIn', DATE_FIELDS_CONFIG['metrics'])
        df_network_out_i = augment_metrics(df_network_out, instanceId, historical_date, 'networkOut', DATE_FIELDS_CONFIG['metrics'])
        df_vcpus_i = augment_metrics(df_cost_instances_info, instanceId, historical_date, 'vCPUs', 'vCPUs')
        df_memory_i = augment_metrics(df_cost_instances_info, instanceId, historical_date, 'memory_gb', 'memory_gb')
        df_bandwidth_i = augment_metrics(df_cost_instances_info, instanceId, historical_date, 'bandwidth_gbps', 'bandwidth_gbps')
        df_cost_i = augment_metrics(df_cost_instances_info, instanceId, historical_date, 'costInUsd', 'costInUsd')
        
        df_new = pd.merge(df_cpu_i, df_network_i, on=['datetime'], how='outer')
        df_new = pd.merge(df_new, df_network_out_i, on=['datetime'], how='outer')
        df_new = pd.merge(df_new, df_vcpus_i, on=['datetime'], how='outer')
        df_new = pd.merge(df_new, df_memory_i, on=['datetime'], how='outer')
        df_new = pd.merge(df_new, df_bandwidth_i, on=['datetime'], how='outer')
        df_new = pd.merge(df_new, df_cost_i, on=['datetime'], how='outer')
        df_new['instanceId'] = instanceId
        if df_final is not None:
            df_final = pd.concat([df_final, df_new])
        else:
            df_final = df_new
    except Exception as e:
        print(e)
        print(instanceId)

zero-size array to reduction operation maximum which has no identity
3922265808357194945
zero-size array to reduction operation maximum which has no identity
2265529010543344118
zero-size array to reduction operation maximum which has no identity
4036654423970744204
zero-size array to reduction operation maximum which has no identity
6835791843203149124
zero-size array to reduction operation maximum which has no identity
5218666880235370347
zero-size array to reduction operation maximum which has no identity
5353556143556258917


In [93]:
df_final = df_final.drop(df_final[df_final.datetime < HISTORICAL_DATE].index)

In [94]:
df_final

Unnamed: 0,datetime,cpuUtilization,networkIn,networkOut,vCPUs,memory_gb,bandwidth_gbps,costInUsd,instanceId
60,2022-02-20,0.008582,17451.044863,67.058404,1.0,1.7,1.0,0.773680,2084983531904533635
61,2022-02-21,0.009292,-6202.601734,69.420038,1.0,1.7,1.0,0.839641,2084983531904533635
62,2022-02-22,0.008949,21430.947361,67.890760,1.0,1.7,1.0,0.940766,2084983531904533635
63,2022-02-23,0.008145,4833.443798,107.485340,1.0,1.7,1.0,0.588254,2084983531904533635
64,2022-02-24,0.008616,-10456.035972,74.375590,1.0,1.7,1.0,0.770617,2084983531904533635
...,...,...,...,...,...,...,...,...,...
285,2022-07-30,0.015261,-52830.844564,92.267675,1.0,1.7,1.0,0.231569,5681595381851713673
286,2022-07-31,0.025634,-9269.332498,100.647637,1.0,1.7,1.0,0.502435,5681595381851713673
287,2022-08-01,0.018768,-147036.122436,12.548980,1.0,1.7,1.0,0.663850,5681595381851713673
288,2022-08-02,0.009884,27404.555652,108.825503,1.0,1.7,1.0,1.182839,5681595381851713673


In [95]:
df_final.to_csv('./data/gcp_metrics_augmented.csv', sep=',')