In [1]:
import os
from utils import utils

"""
00 - Configuration
"""

dataset_zip_path = "../../data_warehouse/warehouse_7c/snapshots/"
output_folder = "../../data_warehouse/minimized_warehouse_7c/"

zip_files_list = utils.list_zip_files(dataset_zip_path)

print("List of zip files:")
for zip_file in zip_files_list:
    print(zip_file)


List of zip files:
1738915261_(1.5000).zip
1738948257_(1.10000).zip
1738881900_(1.1000).zip


In [6]:
"""
01 - Process qos metrics from .zip files:

- Combine all .csv-files in the given .zip to a DataFrame
- Save the dataframe to output_path
"""


import zipfile
import time
import pandas as pd

failed_zips = []
successful_zips = []

def process_zip(path, output_folder, name="qos_metrics", starts_with=""):
    # Open as zip
    with zipfile.ZipFile(path, 'r') as zip_ref:
    
        # Get a list of all files in the zip
        items = zip_ref.namelist()
    
        # Get a list of all .csv-files in the zip
        csv_files = [x for x in items if x.endswith('.csv')]
        print(csv_files)
        csv_files = [x for x in csv_files if starts_with in x]  # Separate worker_n.csv from master_n.csv
        print(csv_files)
        csv_files.sort(reverse=True)  # NOTE: Sorting does not matter, but may be useful for debugging
        if len(csv_files) == 0:
            print(f"No .csv-files found in {path}")
            failed_zips.append(path)
            return
        
        count = 0
        start_time = time.time()
    
        # Iterate over all csv-files
        dataframes = []
        for path in csv_files:
            count += 1
            print(f"Progress {count}/{len(csv_files):6}, ({count/len(csv_files)*100:5.3} %) (time_spent: {time.time() - start_time:.3} s  - avg: {(time.time() - start_time) / count} s)")
            with zip_ref.open(path) as csv_file:
                x = pd.read_csv(csv_file)
                dataframes.append(x)

        if len(dataframes) == 0:
            print(f"No dataframes found in {path} for {starts_with}")
            return pd.DataFrame()

        # Combine data to a single DataFrame
        df = pd.concat(dataframes)
        os.makedirs(output_folder, exist_ok=True)
        output_path = os.path.join(output_folder, f"{name}.feather")
        df.sort_index(inplace=True)
        if "timestamp" not in df.columns:
            # HPA-files already have timestamp, qos-files do not
            df.reset_index(drop=False, inplace=True, names=["timestamp"])
        df.to_feather(output_path)
        print(f"Saved to {output_path}")
        successful_zips.append(output_path)
        return df

dfs = []
titles = []
for application_type in ["master", "worker", "hpa"]:
    if dataset_zip_path.endswith(".zip"):
        # Process single zip
        print(f"Processing {dataset_zip_path}")
        df = process_zip(dataset_zip_path, output_folder, name=f"{application_type}_qos", starts_with=application_type)
        dfs.append(df)
        zip_name = dataset_zip_path.split('/')[-1].replace(".zip", "")
        titles.append(zip_name)
    else:
        # Process all zips in path
        for zip_name_full in utils.list_zip_files(dataset_zip_path):
            print(zip_name_full)
            zip_name = zip_name_full.replace(".zip", "")
            full_output_path = f"{output_folder}/{zip_name}"
            df = process_zip(dataset_zip_path + zip_name_full, full_output_path, name=f"{application_type}_qos", starts_with=application_type)
            dfs.append(df)
            titles.append(zip_name)

print(f"Failed zips:")
for zip_name in failed_zips:
    print(zip_name)

print("")
print(f"Successful zips:")
for zip_name in successful_zips:
    print(zip_name)

1738915261_(1.5000).zip
['qos_outputs_0/worker_33.csv', 'qos_outputs_0/master_1151.csv', 'qos_outputs_0/master_61.csv', 'qos_outputs_0/worker_1439.csv', 'qos_outputs_0/worker_2074.csv', 'qos_outputs_0/master_72.csv', 'qos_outputs_0/worker_588.csv', 'qos_outputs_0/worker_128.csv', 'qos_outputs_0/master_1974.csv', 'qos_outputs_0/master_345.csv', 'qos_outputs_0/master_1250.csv', 'qos_outputs_0/master_1007.csv', 'qos_outputs_0/worker_947.csv', 'qos_outputs_0/master_382.csv', 'qos_outputs_0/master_1485.csv', 'qos_outputs_0/worker_1162.csv', 'qos_outputs_0/worker_1330.csv', 'qos_outputs_0/worker_1168.csv', 'qos_outputs_0/master_97.csv', 'qos_outputs_0/master_75.csv', 'qos_outputs_0/worker_468.csv', 'qos_outputs_0/worker_433.csv', 'qos_outputs_0/worker_765.csv', 'qos_outputs_0/worker_60.csv', 'qos_outputs_0/worker_1028.csv', 'qos_outputs_0/master_137.csv', 'qos_outputs_0/master_971.csv', 'qos_outputs_0/worker_1907.csv', 'qos_outputs_0/master_1378.csv', 'qos_outputs_0/master_1337.csv', 'qos_ou

In [3]:
"""
Quick plot to partially visualize the data.
"""
# import seaborn as sns
# import matplotlib.pyplot as plt
# for i, df in enumerate(dfs):
#     sns.scatterplot(y="queue", x="start_time", hue="source", data=df)
#     plt.title(titles[i])
#     plt.show()





    # sns.scatterplot(y="queue", x="Unnamed: 0", data=df)
    # plt.savefig("yolo_queue.png")
    # df.plot.scatter(y=df.queue, x=df.index)

'\nQuick plot to partially visualize the data.\n'

In [4]:
# import seaborn as sns
# import matplotlib.pyplot as plt
# for i, df in enumerate(dfs):
#     df2 = df.copy()
#     df2["q"] = df2["end_time"] - df2["start_time"]
#     sns.scatterplot(y="q", x="start_time", hue="source", data=df2)
#     plt.ylim(0,200)
#     plt.title(titles[i])
#     plt.show()





    # sns.scatterplot(y="queue", x="Unnamed: 0", data=df)
    # plt.savefig("yolo_queue.png")
    # df.plot.scatter(y=df.queue, x=df.index)

In [5]:
# import seaborn as sns
# import matplotlib.pyplot as plt
# for i, df in enumerate(dfs):
#     df2 = df.copy()
#     df2["q"] = df2["end_time"] - df2["start_time"]
#     sns.ecdfplot(y="start_time", hue="source", data=df2)
#     plt.title(titles[i])
#     plt.show()
#     sns.ecdfplot(y="end_time", hue="source", data=df2)
#     plt.title(titles[i])
#     plt.show()
#     sns.ecdfplot(y="q", hue="source", data=df2)
