## Setup

In [71]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Imports

In [77]:
import pandas as pd
import os
from src.extract import (
    extract_resource_consumption_from_dataset_2022_M,
    extract_resource_consumption_from_dataset_2022_Y,
    get_metadata_about_resource_consumption
)

from src.constants import RAW_DATA_UNZIP_PATH, RAW_POLCOM_DATA_ZIP_PATH, RAW_POLCOM_DATA_BASE_PATH, RAW_POLCOM_DATA_2022_PATH, PROCESSED_POLCOM_DATA_2022_M_PATH, PROCESSED_POLCOM_DATA_2022_Y_PATH
from src.utils import concat_dataframes_horizontally

## Extracting data from the zip file

In [73]:
os.chdir(r'/home/dodeu/AGH/PD1/prototyping/src')

In [79]:
if os.path.exists(RAW_POLCOM_DATA_BASE_PATH):
    import shutil
    shutil.rmtree(RAW_POLCOM_DATA_BASE_PATH)
    print(f"Removed existing directory: {RAW_POLCOM_DATA_BASE_PATH}")

In [80]:
os.makedirs(RAW_DATA_UNZIP_PATH, exist_ok=True)
import zipfile
with zipfile.ZipFile(RAW_POLCOM_DATA_ZIP_PATH, 'r') as zip_ref:
    zip_ref.extractall(RAW_DATA_UNZIP_PATH)
    print(f"Unzipped {RAW_POLCOM_DATA_ZIP_PATH} into {RAW_DATA_UNZIP_PATH}")

Unzipped ../data/Dane - Polcom.zip into ../data/raw/


## Extracting data from the unzipped folder

In [82]:
def extract_consumption_data_from_dataset_2022_Y(type) -> None:
    vmware_to_local = {
        "DM": "VM01",
        "PM": "VM02",
        "R02": "VM03",
        "R03": "VM04",
        "R04": "VM05",
        "S": "VM06",
        "V02": "VM07",
        "V03": "VM08",
    }

    local_to_vmware = {v: k for k, v in vmware_to_local.items()}

    VIRTUAL_MACHINES = list(vmware_to_local.values())
    
    if type == 'Y':
        extract_resource_consumption_from_dataset_2022_func = extract_resource_consumption_from_dataset_2022_Y
        base_destination = PROCESSED_POLCOM_DATA_2022_Y_PATH
    else:
        extract_resource_consumption_from_dataset_2022_func = extract_resource_consumption_from_dataset_2022_M
        base_destination = PROCESSED_POLCOM_DATA_2022_M_PATH

    for virtual_machine_id in VIRTUAL_MACHINES:
        if virtual_machine_id != "VM05":
            continue
        vmware_server_id: str = local_to_vmware[virtual_machine_id]

        metadata = get_metadata_about_resource_consumption(
            RAW_POLCOM_DATA_2022_PATH + vmware_server_id, type
        )
        print(f"metadata: {metadata}")
        
        destination = f"{base_destination}{virtual_machine_id}.parquet"
        print("Final destination: ", destination)
        os.makedirs(os.path.dirname(destination), exist_ok=True)
        
        dfs: list[pd.DataFrame] = extract_resource_consumption_from_dataset_2022_func(vmware_server_id, metadata)
        print('Extracted data of length ', len(dfs))
        print(dfs)
        print('Concatenating data...')

        df_merged: pd.DataFrame = concat_dataframes_horizontally(dfs)
        
        print(f"Saving {vmware_server_id} combined consumption data to {destination}")
        df_merged.to_parquet(destination)

extract_consumption_data_from_dataset_2022_Y('Y')
extract_consumption_data_from_dataset_2022_Y('M')

metadata: {'cpu': '../data/raw/Dane - Polcom/2022/AGH2022/R04/R04_cpu_1Y.csv', 'memory': '../data/raw/Dane - Polcom/2022/AGH2022/R04/R04_memory_1Y.csv', 'disk': [{'1': '../data/raw/Dane - Polcom/2022/AGH2022/R04/R04_node1_disk_1Y.csv'}, {'2': '../data/raw/Dane - Polcom/2022/AGH2022/R04/R04_node2_disk_1Y.csv'}, {'3': '../data/raw/Dane - Polcom/2022/AGH2022/R04/R04_node3_disk_1Y.csv'}], 'network': [{'1': '../data/raw/Dane - Polcom/2022/AGH2022/R04/R04_node1_network_1Y.csv'}, {'2': '../data/raw/Dane - Polcom/2022/AGH2022/R04/R04_node2_network_1Y.csv'}, {'3': '../data/raw/Dane - Polcom/2022/AGH2022/R04/R04_node3_network_1Y.csv'}]}
Final destination:  ../data/processed/Dane - Polcom/2022/Y/VM05.parquet
Calculating RAM usage percent based on (ACTIVE/GRANTED) * 100 for R04
Extracted data of length  8
[            CPU_USAGE_MHZ  CPU_USAGE_PERCENT
DATE                                        
2021-04-13        14596.0               14.0
2021-04-14        14345.0               14.0
2021-04-15    