In [3]:
import math
from datetime import datetime, timedelta

import tsaug
from tsaug import TimeWarp, Crop, Quantize, Drift, Reverse
from tsaug.visualization import plot

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import matplotlib.dates as mdates
import plotly.graph_objects as go

In [14]:
date_format = '%Y-%m-%d'


def date_parser(x):
    """Turn a ``YYYY-MM-DD`` string into a ``datetime`` using ``date_format``."""
    parsed = datetime.strptime(x, date_format)
    return parsed

file = './data/gcp_cost.csv'

# Read the cost export: the column at position 2 is parsed with date_parser and
# gcpId is kept as text. Then keep only the columns used downstream, order the
# rows by date and discard rows with any missing value.
df = (
    pd.read_csv(
        file,
        sep=',',
        header=0,
        parse_dates=[2],
        date_parser=date_parser,
        dtype={'gcpId': str},
    )[['date', 'costInUsd', 'gcpId', 'instanceType', 'disk']]
    .sort_values('date')
    .dropna()
)

In [15]:
# Show the cleaned cost frame (selected columns, sorted by date, rows with missing values removed).
df

Unnamed: 0,date,costInUsd,gcpId,instanceType,disk
3650,2022-03-12,0.001052,4292318004420660415,f1-micro,PERSISTENT
716,2022-03-12,0.094728,6757684040402322142,f1-micro,PERSISTENT
4124,2022-03-12,0.000697,4930064688747760894,f1-micro,PERSISTENT
2731,2022-03-12,0.001206,8246387926456453502,f1-micro,PERSISTENT
2732,2022-03-12,0.108769,3106432454018283980,f1-micro,PERSISTENT
...,...,...,...,...,...
5467,2022-11-04,0.066417,5930986063805270825,f1-micro,PERSISTENT
1569,2022-11-04,0.179333,3987000818135333307,g1-small,PERSISTENT
4581,2022-11-04,0.175617,6769269070446335364,g1-small,PERSISTENT
37,2022-11-04,0.073031,1236188674412042392,f1-micro,PERSISTENT


In [16]:

def calculate_number_of_augment_need(start_date, end_date, days_range):
    """Return how many blocks of length ``days_range`` cover ``start_date``..``end_date``.

    The span ``end_date - start_date`` is divided by ``days_range`` and rounded
    up to the next whole number.
    """
    span = end_date - start_date
    return math.ceil(span / days_range)


def add_noise(Y, X, scale=0.1):
    """Run tsaug's ``AddNoise`` augmenter on ``Y`` with ``X`` as the companion array.

    ``scale`` is forwarded to ``tsaug.AddNoise``. Returns the two arrays
    produced by ``augment``, in the same ``(Y, X)`` order as the arguments.
    """
    noise_augmenter = tsaug.AddNoise(scale=scale)
    noisy_Y, companion_X = noise_augmenter.augment(Y, X)
    return noisy_Y, companion_X

def add_drift(Y, X, max_drift=0.5, n_drift_points=5):
    """Run tsaug's ``Drift`` augmenter on ``Y`` with ``X`` as the companion array.

    ``max_drift`` and ``n_drift_points`` are forwarded to ``tsaug.Drift``.
    Returns the two arrays produced by ``augment``, in ``(Y, X)`` order.
    """
    drift_augmenter = tsaug.Drift(max_drift=max_drift, n_drift_points=n_drift_points)
    drifted_Y, companion_X = drift_augmenter.augment(Y, X)
    return drifted_Y, companion_X

In [17]:
def augment_metrics(df_metrics, instanceId, historical_date, metricType='costInUsd'):
    """Back-fill one instance's metric history with noisy copies of its real data.

    The real observations of ``instanceId`` keep their own dates. Each synthetic
    period is a noisy copy of the previous one (so noise compounds the further
    back it goes) and is shifted back by one more multiple of the real data's
    day span, until ``historical_date`` is covered.

    Parameters
    ----------
    df_metrics : pandas.DataFrame
        Must contain the columns ``'gcpId'``, ``'date'`` and ``metricType``.
    instanceId : str
        Value of ``gcpId`` selecting the instance to augment.
    historical_date : datetime.datetime
        Earliest day the returned history has to reach. The result may start
        earlier because whole periods are generated.
    metricType : str, default ``'costInUsd'``
        Name of the numeric column to augment.

    Returns
    -------
    pandas.DataFrame
        Columns ``'date'`` (``'YYYY-MM-DD'`` strings) and ``metricType``, sorted
        by date, with ``len(real rows) * (periods + 1)`` rows.

    Raises
    ------
    ValueError
        If ``df_metrics`` has no row for ``instanceId``.
    """
    df_metrics_i = df_metrics[df_metrics['gcpId'] == instanceId].sort_values('date', kind='mergesort')
    if df_metrics_i.empty:
        raise ValueError(f'no rows found for gcpId {instanceId!r}')

    # Stay on calendar days throughout. Converting to epoch milliseconds and
    # back mixes pandas' UTC interpretation of naive timestamps with
    # datetime.fromtimestamp's local-time interpretation, which moves dates by
    # a day in time zones west of UTC.
    real_dates = pd.DatetimeIndex(pd.to_datetime(df_metrics_i['date'])).normalize()

    # days_range: number of days of real data (both end days included)
    start_date = real_dates.min().date()
    end_date = real_dates.max().date()
    days_range = end_date - start_date + timedelta(days=1)

    X = np.arange(len(df_metrics_i))
    Y = np.array(df_metrics_i[metricType].tolist())

    # A historical_date on or after the first real day needs no back-fill.
    NUM_OF_PERIODS = max(
        0,
        calculate_number_of_augment_need(historical_date.date(), start_date, days_range),
    )

    # Real values stay on the real dates.
    frames = [pd.DataFrame({'date': real_dates, metricType: Y})]

    cur_Y = Y
    for period in range(1, NUM_OF_PERIODS + 1):
        cur_Y, X = add_noise(cur_Y, X)
        # Shifting the real dates (rather than building a contiguous daily
        # range) keeps dates and values the same length even when the real
        # data has gaps, and never re-emits start_date.
        frames.append(pd.DataFrame({'date': real_dates - period * days_range, metricType: cur_Y}))

    new_df = pd.concat(frames, ignore_index=True)
    new_df = new_df.sort_values('date', kind='mergesort').reset_index(drop=True)
    new_df['date'] = new_df['date'].dt.strftime('%Y-%m-%d')
    return new_df

In [18]:
# Distinct gcpId values from the cost frame, in order of first appearance.
instanceIds = df['gcpId'].drop_duplicates().tolist()

In [19]:
# Day passed to augment_metrics as the point the back-filled history must reach.
historical_date = datetime.strptime(
    '2022-01-01',
    '%Y-%m-%d',
)

In [27]:
file_augmented_metrics = './data/gcp_metrics_augmented.csv'

# Load the previously augmented utilisation metrics, then keep only the
# columns needed for the merge with the cost data.
df_augmented_metrics = pd.read_csv(file_augmented_metrics, sep=',', header=0)
df_augmented_metrics = df_augmented_metrics[
    ['datetime', 'cpuUtilization', 'networkIn', 'networkOut', 'instanceId']
]
display(df_augmented_metrics)

Unnamed: 0,datetime,cpuUtilization,networkIn,networkOut,instanceId
0,2021-12-22,0.010492,,,2084983531904533635
1,2021-12-23,0.009063,,,2084983531904533635
2,2021-12-24,0.009381,8798.382353,60.344538,2084983531904533635
3,2021-12-25,0.009093,6866.004167,54.988194,2084983531904533635
4,2021-12-26,0.009061,7228.202778,59.928472,2084983531904533635
...,...,...,...,...,...
14436,2022-11-16,,155835.372860,70.227377,5681595381851713673
14437,2021-11-01,,,111.854503,5681595381851713673
14438,2022-11-17,,,73.480377,5681595381851713673
14439,2022-11-18,,,67.406196,5681595381851713673


In [None]:
# Build one combined frame: for every instance, back-filled cost joined with
# that instance's utilisation metrics on the calendar day.
#
# NOTE(review): this assumes `instanceId` in the metrics file and `gcpId` in the
# cost file identify the same instances (the merged rows are labelled with the
# gcpId below) - confirm against the data sources.
# The cost ids are strings while the metrics ids are parsed from CSV as
# numbers, so they are compared as text. If the metrics column contains missing
# ids it will be float-typed and this comparison needs revisiting.
metrics_instance_ids = df_augmented_metrics['instanceId'].astype(str)

augmented_frames = []
for instanceId in instanceIds:
    try:
        df_cost_i = augment_metrics(df, instanceId, historical_date, 'costInUsd').rename(columns={"date": "datetime"})
        # Join only this instance's metrics. Merging against the full metrics
        # frame on `datetime` alone would attach every other instance's rows
        # and then relabel them with this instance's id.
        df_metrics_i = df_augmented_metrics[metrics_instance_ids == instanceId].drop(columns=['instanceId'])
        df_new = pd.merge(df_cost_i, df_metrics_i, on=['datetime'], how='outer')
        df_new['instanceId'] = instanceId
        augmented_frames.append(df_new)
    except Exception as e:
        # Best effort: report the failing instance and continue with the rest.
        print(e)
        print(instanceId)

# Concatenate once instead of DataFrame.append inside the loop (quadratic, and
# removed in pandas 2.0). df_final stays None when nothing was produced.
df_final = pd.concat(augmented_frames, ignore_index=True) if augmented_frames else None

In [None]:
# Inspect the combined cost + metrics frame built by the loop above.
display(df_final)

In [None]:
# Persist the combined frame as a comma-separated file. The index is not
# disabled here, so pandas also writes it as the first (unnamed) column.
df_final.to_csv('./data/gcp_metrics_cost_augmented.csv', sep=',')

In [None]:
# Show the combined frame again after it has been written to disk.
display(df_final)