## Setup

In [2]:
%load_ext autoreload
%autoreload 2

## Imports

In [1]:
import pandas as pd
import os
from src.extract import (
    extract_resource_consumption_from_dataset_2022_M,
    extract_resource_consumption_from_dataset_2022_Y,
    get_metadata_about_resource_consumption
)

from src.constants import RAW_POLCOM_DATA_UNZIP_PATH, RAW_POLCOM_DATA_ZIP_PATH, RAW_POLCOM_DATA_BASE_PATH, RAW_POLCOM_DATA_2022_PATH, PROCESSED_POLCOM_DATA_2022_M_PATH, PROCESSED_POLCOM_DATA_2022_Y_PATH
from src.utils import concat_dataframes_horizontally

## Extracting data from the zip file

In [4]:
RAW_POLCOM_DATA_BASE_PATH

'../data/raw/Dane - Polcom/'

In [11]:
os.chdir(r'/home/dodeu/AGH/PD1/prototyping/src')

In [16]:
if os.path.exists(RAW_POLCOM_DATA_BASE_PATH):
    import shutil
    shutil.rmtree(RAW_POLCOM_DATA_BASE_PATH)
    print(f"Removed existing directory: {RAW_POLCOM_DATA_BASE_PATH}")

In [17]:
os.makedirs(RAW_POLCOM_DATA_UNZIP_PATH, exist_ok=True)
import zipfile
with zipfile.ZipFile(RAW_POLCOM_DATA_ZIP_PATH, 'r') as zip_ref:
    zip_ref.extractall(RAW_POLCOM_DATA_UNZIP_PATH)
    print(f"Unzipped {RAW_POLCOM_DATA_ZIP_PATH} into {RAW_POLCOM_DATA_UNZIP_PATH}")

Unzipped /home/dodeu/AGH/PD1/prototyping/data/Dane - Polcom.zip into /home/dodeu/AGH/PD1/prototyping/data/raw


## Extracting data from the unzipped folder

In [19]:
def extract_consumption_data_from_dataset_2022_Y(type) -> None:
    vmware_to_local = {
        "DM": "VM01",
        "PM": "VM02",
        "R02": "VM03",
        "R03": "VM04",
        "R04": "VM05",
        "S": "VM06",
        "V02": "VM07",
        "V03": "VM08",
    }

    local_to_vmware = {v: k for k, v in vmware_to_local.items()}

    VIRTUAL_MACHINES = list(vmware_to_local.values())
    
    if type == 'Y':
        extract_resource_consumption_from_dataset_2022_func = extract_resource_consumption_from_dataset_2022_Y
        base_destination = PROCESSED_POLCOM_DATA_2022_Y_PATH
    else:
        extract_resource_consumption_from_dataset_2022_func = extract_resource_consumption_from_dataset_2022_M
        base_destination = PROCESSED_POLCOM_DATA_2022_M_PATH

    print(f"base_destination: {base_destination}")
    for virtual_machine_id in VIRTUAL_MACHINES:
        print(f"Processing {virtual_machine_id}...")
        #if virtual_machine_id == "VM05":
        #    print(f"Skipping {virtual_machine_id}...")
        #    continue
        vmware_server_id: str = local_to_vmware[virtual_machine_id]
        print(f"metadadata path: {str(RAW_POLCOM_DATA_2022_PATH / vmware_server_id)}")
        metadata = get_metadata_about_resource_consumption(
            str(RAW_POLCOM_DATA_2022_PATH / vmware_server_id), type
        )
        
        print(f"metadata: {metadata}")
        destination = base_destination / f"{virtual_machine_id}.parquet"
        print("Final destination: ", destination)
        os.makedirs(base_destination, exist_ok=True)
        
        dfs: list[pd.DataFrame] = extract_resource_consumption_from_dataset_2022_func(vmware_server_id, metadata)
        print('Extracted data of length ', len(dfs))
        print(dfs)
        print('Concatenating data...')

        df_merged: pd.DataFrame = concat_dataframes_horizontally(dfs)
        
        print(f"Saving {vmware_server_id} combined consumption data to {destination}")
        df_merged.to_parquet(destination)

extract_consumption_data_from_dataset_2022_Y('Y')
extract_consumption_data_from_dataset_2022_Y('M')

base_destination: /home/dodeu/AGH/PD1/prototyping/data/processed/Dane - Polcom/2022/Y
Processing VM01...
metadadata path: /home/dodeu/AGH/PD1/prototyping/data/raw/Dane - Polcom/2022/AGH2022/DM
metadata: {'cpu': '/home/dodeu/AGH/PD1/prototyping/data/raw/Dane - Polcom/2022/AGH2022/DM/DM_cpu_1Y.csv', 'memory': '/home/dodeu/AGH/PD1/prototyping/data/raw/Dane - Polcom/2022/AGH2022/DM/DM_memory_1Y.csv', 'disk': [{'1': '/home/dodeu/AGH/PD1/prototyping/data/raw/Dane - Polcom/2022/AGH2022/DM/DM_node1_disk_1Y.csv'}], 'network': [{'1': '/home/dodeu/AGH/PD1/prototyping/data/raw/Dane - Polcom/2022/AGH2022/DM/DM_node1_network_1Y.csv'}]}
Final destination:  /home/dodeu/AGH/PD1/prototyping/data/processed/Dane - Polcom/2022/Y/VM01.parquet
Extracted data of length  4
[            CPU_USAGE_MHZ  CPU_USAGE_PERCENT
DATE                                        
2021-04-14           61.0                5.0
2021-04-15           61.0                5.0
2021-04-16           48.0                6.0
2021-04-17     