In [13]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [37]:
import pandas as pd
import os
from src.extract import (
    extract_resource_consumption_from_dataset_2022_M,
    extract_resource_consumption_from_dataset_2022_Y,
    get_metadata_about_resource_consumption
)
from src.constants import RAW_POLCOM_DATA_BASE_PATH, PROCESSED_POLCOM_DATA_2022_M_PATH, PROCESSED_POLCOM_DATA_2022_Y_PATH
from src.utils import concat_dataframes_horizontally

In [38]:
def extract_consumption_data_from_dataset_2022_Y(type) -> None:
    vmware_to_local = {
        "DM": "VM01",
        "PM": "VM02",
        "R02": "VM03",
        "R03": "VM04",
        "R04": "VM05",
        "S": "VM06",
        "V02": "VM07",
        "V03": "VM08",
    }

    local_to_vmware = {v: k for k, v in vmware_to_local.items()}

    VIRTUAL_MACHINES: list[str] = [
        "VM01", "VM02", "VM03", "VM04", "VM05", "VM06", "VM07", "VM08"
    ]

    for virtual_machine_id in VIRTUAL_MACHINES:
        vmware_server_id: str = local_to_vmware[virtual_machine_id]

        metadata = get_metadata_about_resource_consumption(
            RAW_POLCOM_DATA_BASE_PATH + vmware_server_id
        )

        if type == 'Y':
            extract_resource_consumption_from_dataset_2022_func = extract_resource_consumption_from_dataset_2022_Y
            destination = PROCESSED_POLCOM_DATA_2022_Y_PATH
        else:
            extract_resource_consumption_from_dataset_2022_func = extract_resource_consumption_from_dataset_2022_M
            destination = PROCESSED_POLCOM_DATA_2022_M_PATH
        destination += f"{virtual_machine_id}.parquet"
        os.makedirs(os.path.dirname(destination), exist_ok=True)
        
        dfs: list[pd.DataFrame] = extract_resource_consumption_from_dataset_2022_func(vmware_server_id, metadata)
        print('Extracted data of length ', len(dfs))
        print(dfs)
        print('Concatenating data...')

        df_merged: pd.DataFrame = concat_dataframes_horizontally(dfs)
        
        print(f"Saving {vmware_server_id} combined consumption data to {destination}")
        df_merged.to_parquet(destination)

extract_consumption_data_from_dataset_2022_Y('Y')
extract_consumption_data_from_dataset_2022_Y('M')

Extracted data of length  6
[            CPU_USAGE_MHZ  CPU_USAGE_PERCENT
DATE                                        
2022-03-14            6.0                0.0
2022-03-14            7.0                0.0
2022-03-14            7.0                0.0
2022-03-14            7.0                0.0
2022-03-14            7.0                0.0
...                   ...                ...
2022-04-13            7.0                0.0
2022-04-13            7.0                0.0
2022-04-13            7.0                0.0
2022-04-13            7.0                0.0
2022-04-13            0.0                0.0

[360 rows x 2 columns],             MEMORY_USAGE_KB  MEMORY_USAGE_PERCENT
DATE                                             
2022-03-14        8274103.0                   0.0
2022-03-14        8274179.0                   0.0
2022-03-14        8274185.0                   0.0
2022-03-14        8274078.0                   0.0
2022-03-14        8274152.0                   0.0
...        