In [7]:
import os

import pandas as pd
import argparse
import matplotlib.pyplot as plt

In [8]:
# Folder path for the original CPU data (presumably raw per-source exports — not read in the cells shown here).
dataset_folder = "../data/original_data/cpu"
# Folder path for combined data; the combine.csv loaded further down lives under this same path.
save_folder = "../data/combine_data"

In [9]:
def read_data(filename, group_data = None):
    """Load long-format host metrics from a CSV into a regular wide time series.

    The CSV must contain the columns 'timestamp', 'hostname' and 'avg'.
    The result has one row per time step and one column per hostname.

    Args:
        filename: Path (or any source accepted by ``pd.read_csv``) of the CSV.
        group_data: Optional pandas frequency string (e.g. ``"h"``). When
            given, 'avg' is averaged per hostname over windows of that size.
            When ``None`` the data is laid onto a 5-minute grid as-is.

    Returns:
        DataFrame indexed by 'timestamp' on a regular grid spanning the
        earliest to the latest timestamp, with gaps filled by linear
        interpolation. Interpolation uses the pandas default direction, so
        values missing before a host's first observation stay NaN.
    """
    raw_df = pd.read_csv(filename)
    raw_df['timestamp'] = pd.to_datetime(raw_df['timestamp'])

    if group_data is not None:
        # Aggregate only 'avg': averaging every column raises on non-numeric
        # extras in recent pandas, and nothing else is used downstream.
        raw_df = (
            raw_df.set_index('timestamp')
            .groupby('hostname')['avg']
            .resample(group_data)
            .mean()
            .reset_index()
        )

    # Long -> wide: timestamps as the index, one column per hostname.
    # pivot_table averages duplicate (timestamp, hostname) pairs.
    wide_df = raw_df.pivot_table(index='timestamp', columns='hostname', values='avg')
    wide_df.columns.name = None

    # '5min' replaces the deprecated '5T' alias and is accepted by old and new pandas.
    frequency = '5min' if group_data is None else group_data

    # Reindex onto a gap-free grid; timestamps that do not fall on the grid
    # (anchored at the earliest timestamp) are dropped by the reindex.
    idx = pd.date_range(start=wide_df.index.min(), end=wide_df.index.max(), freq=frequency)
    full_time_series_df = wide_df.reindex(idx)
    full_time_series_df.index.name = 'timestamp'

    # Fill rows introduced by the reindex (and any other interior gaps).
    return full_time_series_df.interpolate(method='linear')

def train_test_split(df, ratio):
    """Split ``df`` by row order into a leading train part and a trailing test part.

    No shuffling is performed, so for a time-indexed frame the test part is
    the most recent data.

    Args:
        df: DataFrame to split.
        ratio: Fraction of rows assigned to the train part; the row count is
            ``round(len(df) * ratio)``.

    Returns:
        Tuple ``(train, test)`` that together cover every row of ``df`` once.
    """
    train_length = round(len(df) * ratio)

    # Use positional .iloc for both halves. A plain df[n:] slice is label-based
    # on float indexes in older pandas, which would break the partition.
    train = df.iloc[:train_length]
    test = df.iloc[train_length:]

    return train, test

# Load the combined CSV as hourly per-host means, then keep the first 80% of
# rows for training and the remainder for testing.
# "h" is the current pandas hourly alias; the uppercase "H" is deprecated.
df = read_data("../data/combine_data/combine.csv", group_data = "h")
train, test = train_test_split(df, 0.8)

In [10]:
# Display the training split as the cell output (the rendered table below is truncated).
train

Unnamed: 0_level_0,amphora-00106840-3bfa-43d0-acdb-5fa78683d8ca,amphora-002a6401-7521-485b-a4fb-ede70de13870,amphora-0067676d-0ad2-4358-aa8b-cc9fbf926250,amphora-007a71c6-7f2c-42d7-ab40-3379953f1235,amphora-00853e9d-5b93-4ca2-86c1-ef4a1d9fe87e
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2023-08-09 00:00:00+00:00,4.0,9.333333,3.000000,3.0,
2023-08-09 01:00:00+00:00,4.0,9.333333,2.750000,3.0,
2023-08-09 02:00:00+00:00,4.0,9.875000,2.333333,3.0,
2023-08-09 03:00:00+00:00,4.0,10.958333,3.000000,3.0,
2023-08-09 04:00:00+00:00,4.0,10.791667,2.666667,3.0,
...,...,...,...,...,...
2023-08-14 17:00:00+00:00,4.0,2.000000,2.000000,3.0,2.0
2023-08-14 18:00:00+00:00,4.0,2.000000,2.583333,3.0,2.0
2023-08-14 19:00:00+00:00,4.0,2.000000,2.722222,3.0,2.0
2023-08-14 20:00:00+00:00,4.0,2.333333,3.000000,3.0,2.0
