In [204]:
import os
import sys
import math
from datetime import datetime, timedelta

import tsaug
from tsaug import TimeWarp, Crop, Quantize, Drift, Reverse
from tsaug.visualization import plot

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import matplotlib.dates as mdates
import plotly.graph_objects as go

In [205]:
# Start date needs for final augmented data.
HISTORICAL_DATE = '2022-01-01'

In [206]:
# Example more tsaug options
OTHER_TSAUG_OPTIONS = {
    #REFERENCE: https://tsaug.readthedocs.io/en/stable/references.html#tsaug.AddNoise
    "kind": "additive",
}

In [207]:
# UTILS FUNCTIONS
date_format = '%Y-%m-%d'
def calculate_number_of_augment_need(start_date, end_date, days_range):
    res = math.ceil(((end_date - start_date)/days_range))
    return res

def date_parser(x):
	return datetime.strptime(x, date_format)

def add_noise(Y, X, scale=0.1, repeats=1, **other_options):
    return tsaug.AddNoise(scale=scale, 
                          repeats=repeats,
                          **other_options
                         ).augment(Y, X)

def pre_process(df_raw):
    # Find missing dates
    empty = pd.date_range(df_final.datetime.min(), df_final.datetime.max()).difference(df_final.datetime)

In [208]:
# Read real metrics data from VNG EP account
cost_with_instances_info_file = './data/gcp_cost_with_instance_info.csv'
cpu_utilize_file = './data/raw_gcp_cpu_utilization.json'
network_in_file = './data/raw_gcp_network_in.json'
network_out_file = './data/raw_gcp_network_out.json'
df_cost_instances_info = pd.read_csv(cost_with_instances_info_file, sep=',', header=0, parse_dates=[2],
        date_parser=date_parser,  dtype={'instanceId': str})
df_cpu = pd.read_json(cpu_utilize_file, dtype={'instanceId': str})
df_network_in = pd.read_json(network_in_file, dtype={'instanceId': str})
df_network_out = pd.read_json(network_out_file, dtype={'instanceId': str})

In [209]:
"""
@param df_raw: data raw read from file
@param instanceId: instance ID/ asset ID
@param historical_date: starting date needs for data
@param metric_type: new columne name for new dataset, example: CPUUtilization
@return: df augmented to historical_date
"""
def augment_metrics(df_raw, asset_id, historical_date, metric_type, value_field, scale=0.01):
    asset_id_field = "instanceId"
    datetime_field = "datetime"
    
    # Get data belonging to an asset
    df_raw_i = df_raw[df_raw[asset_id_field] == asset_id].copy()
    if df_raw_i.empty:
        raise Exception(f"No data for metrics {metric_type} of instance {asset_id}. Ignore this instance.")
    
    # days_range: number of days in real data
    start_date = df_raw_i.datetime.min().date()
    end_date = df_raw_i.datetime.min().date()
    days_range = end_date - start_date + timedelta(days=1)
    
    # Build X-axis, values from 0 to length of X. X with date values is causing errors when augmenting.
    X = np.arange(len(df_raw_i))
    
    # Build X-axis with date values, use for final dataset, not to augment.
    X_date = df_raw_i[datetime_field].map(lambda x: int(x.timestamp()*1000)).tolist()
    
    # Build Y axis
    Y = df_raw_i[value_field].tolist()
    Y = np.array(Y)
    
    # Calculate number of loops until date reach historical date
    repeats = calculate_number_of_augment_need(historical_date.date(), start_date,days_range)
    
    # Map dates to timestamp format
    X_date = df_raw_i[datetime_field].map(lambda x: int(x.timestamp()*1000)).tolist()
    
    # Extend date (X-axis) to historical date
    X_date_extended1 = pd.date_range(start_date - days_range * repeats,start_date,freq='d').map(lambda x: int(x.timestamp()*1000))
    
    # Remove first record of extended X
    X_date_extended = np.hstack([X_date_extended1, X_date])[1:]
    
    Y_extended = []
    
    Y_aug, X_aug = add_noise(Y, X, scale, repeats, **OTHER_TSAUG_OPTIONS)
    
    # Augment Y until X reach historical date
    for i in Y_aug:
        Y_extended.append(i)
        
    # Append augmented Y to original Y
    Y_extended = np.hstack([Y] + Y_extended)
    
    # Build new DF, ,ap back X from timestime to datetime format
    new_df = pd.DataFrame({'datetime': list(map(lambda x: datetime.fromtimestamp(x/1000.0).strftime('%Y-%m-%d'),X_date_extended))})
    
    # Add Y column and sort the new dataframe by datetime
    new_df[metric_type] = pd.Series(Y_extended)
    new_df = new_df.sort_values('datetime')
    return new_df

In [210]:
historical_date = datetime.strptime(HISTORICAL_DATE, '%Y-%m-%d')
instanceIds = df_cpu['instanceId'].unique().tolist()
df_final = None
for instanceId in instanceIds:
    try:
        df_cpu_i = augment_metrics(df_cpu, instanceId, historical_date, 'cpuUtilization', "avgValue")
        # Needs smaller scale for network, because it may produce negative values
        df_network_i = augment_metrics(df_network_in, instanceId, historical_date, 'networkIn', "avgValue", scale=0.006)
        df_network_out_i = augment_metrics(df_network_out, instanceId, historical_date, 'networkOut', "avgValue")
        df_vcpus_i = augment_metrics(df_cost_instances_info, instanceId, historical_date, 'vCPUs', 'vCPUs', scale=0)
        df_memory_i = augment_metrics(df_cost_instances_info, instanceId, historical_date, 'memory_gb', 'memory_gb', scale=0)
        df_bandwidth_i = augment_metrics(df_cost_instances_info, instanceId, historical_date, 'bandwidth_gbps', 'bandwidth_gbps', scale=0)
        df_cost_i = augment_metrics(df_cost_instances_info, instanceId, historical_date, 'costInUsd', 'costInUsd')
        
        df_new = pd.merge(df_cpu_i, df_network_i, on=['datetime'], how='outer')
        df_new = pd.merge(df_new, df_network_out_i, on=['datetime'], how='outer')
        df_new = pd.merge(df_new, df_vcpus_i, on=['datetime'], how='outer')
        df_new = pd.merge(df_new, df_memory_i, on=['datetime'], how='outer')
        df_new = pd.merge(df_new, df_bandwidth_i, on=['datetime'], how='outer')
        df_new = pd.merge(df_new, df_cost_i, on=['datetime'], how='outer')
        df_new['instanceId'] = instanceId
        if df_final is not None:
            df_final = pd.concat([df_final, df_new])
        else:
            df_final = df_new
    except Exception as e:
        print(e)
        pass

No data for metrics vCPUs of instance 3922265808357194945. Ignore this instance.
No data for metrics vCPUs of instance 2265529010543344118. Ignore this instance.
No data for metrics vCPUs of instance 4036654423970744204. Ignore this instance.
No data for metrics vCPUs of instance 6835791843203149124. Ignore this instance.
No data for metrics networkIn of instance 5218666880235370347. Ignore this instance.
No data for metrics vCPUs of instance 5353556143556258917. Ignore this instance.


In [211]:
df_final = df_final.drop(df_final[df_final.datetime < HISTORICAL_DATE].index)\
                    .sort_values('datetime')\
                    .fillna(0)
display(df_final)

Unnamed: 0,datetime,cpuUtilization,networkIn,networkOut,vCPUs,memory_gb,bandwidth_gbps,costInUsd,instanceId
0,2022-01-02,0.010492,8.798382e+03,60.344538,1.0,1.7,1.0,0.211182,2084983531904533635
0,2022-01-02,0.029246,1.350986e+04,89.967667,1.0,0.6,1.0,0.428642,2626184973528935008
0,2022-01-02,0.028106,1.031189e+04,118.875289,1.0,0.6,1.0,0.349357,7878069179189498993
0,2022-01-02,0.011845,1.541477e+04,209.912240,1.0,1.7,1.0,0.900609,896514558657869575
0,2022-01-02,0.090497,6.794989e+06,831.914550,1.0,1.7,1.0,0.881905,3987000818135333307
...,...,...,...,...,...,...,...,...,...
339,2022-11-19,0.000000,0.000000e+00,49.188867,0.0,0.0,0.0,0.000000,1236188674412042392
339,2022-11-19,0.000000,0.000000e+00,70.650361,0.0,0.0,0.0,0.000000,7884692980835346248
343,2022-11-19,0.000000,0.000000e+00,98.968359,0.0,0.0,0.0,0.000000,8144675344174665202
340,2022-11-19,0.000000,0.000000e+00,128.579854,0.0,0.0,0.0,0.000000,2092202371188836866


In [212]:
# Check data abnormalities
empty = pd.date_range(df_final.datetime.min(), df_final.datetime.max()).difference(df_final.datetime)
print("MISSING DATE:", len(empty))
print("NULL COLUMNS:")
print(df_final.isna().sum())

MISSING DATE: 0
NULL COLUMNS:
datetime          0
cpuUtilization    0
networkIn         0
networkOut        0
vCPUs             0
memory_gb         0
bandwidth_gbps    0
costInUsd         0
instanceId        0
dtype: int64


In [213]:
# WRITE TO FINAL FILE
df_final.to_csv('./data/gcp_cost_metrics_augmented_final.csv', sep=',')

In [214]:
# ---- EXPERIMENT

In [219]:
sample_instance = '8144675344174665202'
df_final_i = df_final[df_final['instanceId'] == sample_instance].copy()

fig = go.Figure()
fig.add_trace(go.Scatter(x=df_final_i.datetime, y=df_final_i.costInUsd,
                    mode='lines',
                    name='costInUsd'))
# fig.add_trace(go.Scatter(x=df_final_i.datetime, y=df_final_i.cpuUtilization,
#                     mode='lines',
#                     name='cpuUtilization'))
fig.update_layout(title=f'Sample data for instance {sample_instance}')
fig.show()

In [220]:
df_final_i

Unnamed: 0,datetime,cpuUtilization,networkIn,networkOut,vCPUs,memory_gb,bandwidth_gbps,costInUsd,instanceId
0,2022-01-02,0.038884,60569.117083,92.387991,1.0,0.6,1.0,0.337985,8144675344174665202
1,2022-01-03,0.029275,15015.289583,99.502778,1.0,0.6,1.0,0.310175,8144675344174665202
2,2022-01-04,0.025629,31372.985417,148.977083,1.0,0.6,1.0,0.410504,8144675344174665202
3,2022-01-05,0.017950,14180.461111,201.702778,1.0,0.6,1.0,0.371703,8144675344174665202
4,2022-01-06,0.016799,25287.998611,117.104167,1.0,0.6,1.0,0.347432,8144675344174665202
...,...,...,...,...,...,...,...,...,...
339,2022-11-15,0.017797,24170.724459,137.008416,0.0,0.0,0.0,0.000000,8144675344174665202
340,2022-11-16,0.000000,45816.089695,117.225102,0.0,0.0,0.0,0.000000,8144675344174665202
341,2022-11-17,0.000000,0.000000,203.655682,0.0,0.0,0.0,0.000000,8144675344174665202
342,2022-11-18,0.000000,0.000000,145.969112,0.0,0.0,0.0,0.000000,8144675344174665202
