In [227]:
import math
from datetime import datetime, timedelta

import tsaug
from tsaug import TimeWarp, Crop, Quantize, Drift, Reverse
from tsaug.visualization import plot

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import matplotlib.dates as mdates
import plotly.graph_objects as go

In [228]:
date_format = '%Y-%m-%d 00:00:00+00:00'
def date_parser(x):
	return datetime.strptime(x, date_format)

In [253]:
cpu_utilize_file = './data/raw_gcp_cpu_utilization.json'
network_in_file = './data/raw_gcp_network_in.json'
df_cpu = pd.read_json(cpu_utilize_file, dtype={'instanceId': object})
df_network_in = pd.read_json(network_in_file, dtype={'instanceId': object})

In [257]:
### AUGMENTATION HERE
def add_noise(Y, X, scale=0.1):
    Y_aug_noise, X_aug_noise = tsaug.AddNoise(scale=scale).augment(Y, X)
    return Y_aug_noise, X_aug_noise

def calculate_number_of_augment_need(start_date, end_date, days_range):
    res = math.ceil(((end_date - start_date)/days_range))
    return res

def augment_metrics(df_metrics, instanceId, historical_date, metricType='cpuUtilization'):
    df_metrics_i = df_metrics[df_metrics['instanceId'] == instanceId].copy()
    
    # days_range: number of days of real data
    start_date = df_metrics_i.min(axis=0)['datetime'].date()
    end_date = df_metrics_i.max(axis=0)['datetime'].date()
    days_range = end_date - start_date + timedelta(days=1)
    
    X = np.arange(len(df_metrics_i))
    X_date = df_metrics_i['datetime'].map(lambda x: int(x.timestamp()*1000)).tolist()
    
    Y = df_metrics_i['avgValue'].tolist()
    Y = np.array(Y)
    Y_aug_noise, X_aug_noise = add_noise(Y, X)
    

    NUM_OF_PERIODS = calculate_number_of_augment_need(historical_date.date(), start_date,days_range)
    X_date = df_metrics_i['datetime'].map(lambda x: int(x.timestamp()*1000)).tolist()
    X_date_extended1 = pd.date_range(start_date -(days_range) * NUM_OF_PERIODS,start_date,freq='d').map(lambda x: int(x.timestamp()*1000))
    
    X_date_extended = np.hstack([X_date_extended1, X_date])[1:]
    
    extended_Y = []
    cur_Y = Y
    for i in range(0, NUM_OF_PERIODS):
        cur_Y, X = add_noise(cur_Y, X)
        extended_Y.append(cur_Y)
    Y_extended = np.hstack([Y] + extended_Y)
    new_df = pd.DataFrame({'datetime': list(map(lambda x: datetime.fromtimestamp(x/1000.0).strftime('%Y-%m-%d'),X_date_extended))})

    new_df[metricType] = pd.Series(Y_extended)
    new_df = new_df.sort_values('datetime')
    return new_df

In [258]:
instanceIds = df_cpu['instanceId'].unique().tolist()

In [262]:
historical_date = datetime.strptime('2022-01-01', '%Y-%m-%d')

df_final = None

for instanceId in instanceIds:
    try:
        df_cpu_i = augment_metrics(df_cpu, instanceId, historical_date, 'cpuUtilization')
        df_network_i = augment_metrics(df_network_in, instanceId, historical_date, 'networkIn')
        df_new = pd.merge(df_cpu_i, df_network_i, on=['datetime'], how='outer')
    #     df_new = df_cpu_i
        df_new['instanceId'] = instanceId
        if df_final is not None:
            df_final = df_final.append(df_new)
        else:
            df_final = df_new
    except:
        print(instanceId)

5218666880235370347


In [264]:
print(len(df_final))
df_final

14047


Unnamed: 0,datetime,cpuUtilization,networkIn,instanceId
0,2021-12-22,0.010492,,2084983531904533635
1,2021-12-23,0.009063,,2084983531904533635
2,2021-12-24,0.009381,8798.382353,2084983531904533635
3,2021-12-25,0.009093,6866.004167,2084983531904533635
4,2021-12-26,0.009061,7228.202778,2084983531904533635
...,...,...,...,...
377,2022-09-06,,-96609.598384,5681595381851713673
378,2022-09-07,,90133.473567,5681595381851713673
379,2022-09-16,,,5681595381851713673
380,2022-09-17,,,5681595381851713673


In [265]:
df_final.to_csv('./data/gcp_metrics_augmented.csv', sep=',')