In [1]:
import os
import sys
import math
from datetime import datetime, timedelta

import tsaug
from tsaug import TimeWarp, Crop, Quantize, Drift, Reverse
from tsaug.visualization import plot

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import matplotlib.dates as mdates
import plotly.graph_objects as go

In [73]:
# Start date needs for final augmented data.
HISTORICAL_DATE = '2020-01-01'

In [74]:
date_format = '%Y-%m-%d'
def date_parser(x):
	return datetime.strptime(x, date_format)

def calculate_number_of_augment_need(start_date, end_date, days_range):
    res = math.ceil(((end_date - start_date)/days_range))
    return res

In [75]:
file = './data/gcp_cost_latest.csv'
df_raw = pd.read_csv(file, sep=",", header=0, parse_dates=[1], date_parser=date_parser)

In [76]:
df_raw.sort_values('date')

Unnamed: 0,cost,date
253,3.302464,2022-02-24
252,4.037611,2022-02-25
251,2.806195,2022-02-26
250,5.156335,2022-02-27
249,3.702743,2022-02-28
...,...,...
4,18.215597,2022-10-31
3,17.167843,2022-11-01
2,17.983306,2022-11-02
1,15.156536,2022-11-03


In [77]:
start_date = df_raw.date.min().date()
end_date = df_raw.date.max().date()
days_range = end_date - start_date + timedelta(days=1)
print(start_date, end_date, days_range)

2022-02-24 2022-11-04 254 days, 0:00:00


In [78]:
X = np.arange(len(df_raw))
X_date = df_raw['date'].map(lambda x: int(x.timestamp()*1000)).tolist()

In [79]:
Y = df_raw['cost'].tolist()
Y = np.array(Y)

In [80]:
repeats = calculate_number_of_augment_need(datetime.strptime(HISTORICAL_DATE, '%Y-%m-%d').date(), start_date,days_range)

In [81]:
X_date_extended1 = pd.date_range(start_date - days_range * repeats,start_date,freq='d').map(lambda x: int(x.timestamp()*1000))
X_date_extended = np.hstack([X_date_extended1, X_date])[1:]

In [82]:
Y_aug, X_aug = tsaug.AddNoise(scale=0.01, 
                          repeats=repeats
                         ).augment(Y, X)

In [83]:
Y_extended = []
for i in Y_aug:
        Y_extended.append(i)
        
Y_extended = np.hstack([Y] + Y_extended)

In [84]:
new_df = pd.DataFrame({'datetime': list(map(lambda x: datetime.fromtimestamp(x/1000.0).strftime('%Y-%m-%d'),X_date_extended))})
new_df['cost'] = pd.Series(Y_extended)
new_df = new_df.sort_values('datetime')

In [85]:
new_df

Unnamed: 0,datetime,cost
0,2019-05-16,4.003497
1,2019-05-17,15.156536
2,2019-05-18,17.983306
3,2019-05-19,17.167843
4,2019-05-20,18.215597
...,...,...
1020,2022-10-31,19.771047
1019,2022-11-01,17.109812
1018,2022-11-02,17.800210
1017,2022-11-03,15.287568


In [86]:
fig = go.Figure()
fig.add_trace(go.Scatter(x=new_df.datetime, y=new_df.cost,
                    mode='lines',
                    name='cost'))
# fig.add_trace(go.Scatter(x=df_raw.date, y=df_raw.cost,
#                     mode='lines',
#                     name='cost'))
fig.update_layout(title=f'Sample data')
fig.show()

In [87]:
new_df[['datetime', 'cost']].to_csv('./data/gcp_cost_date_2_years_augmented.csv', sep=',', index=False)

In [88]:
new_df

Unnamed: 0,datetime,cost
0,2019-05-16,4.003497
1,2019-05-17,15.156536
2,2019-05-18,17.983306
3,2019-05-19,17.167843
4,2019-05-20,18.215597
...,...,...
1020,2022-10-31,19.771047
1019,2022-11-01,17.109812
1018,2022-11-02,17.800210
1017,2022-11-03,15.287568
