In [None]:
import os.path
import time
import zipfile
import pandas as pd
import ujson as json
import os

"""
00: Configuration and imports

NOTE: If using DataSpell, or similar IDE, you may need to increase the maximum allowed memory usage.
- Larger datasets will cause the IDE to completely freeze with the default limit of 4GB ram.
"""

# Every .zip/.7z archive found (recursively) under input_path is listed below.
# Swap in one of the commented-out paths to work on a different dataset.
# input_path = "../../data/raw_datasets/8.8_ajot"
# input_path = "/home/anton/Downloads/ov-ajo"
# input_path = "../../data/raw_datasets/ov_vs_pytorch"
# output_path = "../../data/processed/ov_vs_pytorch/prom"
input_path = "../../data/raw_datasets/19.8._linear_run"
# Root folder under which the generated feather files are written
output_path = "../../data/processed/linear/prom"


# Print all zips that will be processed
def list_zip_files(directory_path):
    """Return every archive found below ``directory_path``.

    The directory tree is walked recursively and each file whose name ends
    in ``.zip`` or ``.7z`` (compared case-insensitively) is collected.

    Args:
        directory_path: Root folder to search.

    Returns:
        A list of archive paths, each expressed relative to ``directory_path``.
    """
    archive_suffixes = (".zip", ".7z")
    return [
        os.path.relpath(os.path.join(folder, entry), directory_path)
        for folder, _subdirs, entries in os.walk(directory_path)
        for entry in entries
        if entry.lower().endswith(archive_suffixes)
    ]
# Preview which archives were discovered under input_path (paths are relative to it)
zip_files_list = list_zip_files(input_path)

print("List of zip files:")
for zip_file in zip_files_list:
    print(zip_file)

In [17]:
"""
01: Helper functions
"""

import json
import zipfile
import pandas as pd
import time
from tqdm import tqdm  # Import the tqdm module for progress bar
import utils.prometheus_processing as prom_util

import py7zr

# NOTE: py7zr is imported above, but nothing in this cell opens .7z archives yet.

def get_slices(zip_file, size_limit_mb):
    """Group the JSON members of a zip archive into size-bounded slices.

    Members are ordered from largest to smallest (by uncompressed size) and
    packed greedily: a new slice is started whenever adding the next file
    would push the running total above the limit. A single file that is
    larger than the limit gets a slice of its own.

    Args:
        zip_file: Path to the archive. Only ``.zip`` (any letter case) is
            supported.
        size_limit_mb: Maximum total uncompressed size of one slice, in MB.

    Returns:
        A list of slices, each a non-empty list of member names. The list is
        empty when the archive type is unsupported or the archive contains no
        ``.json`` members.
    """
    if not zip_file.lower().endswith(".zip"):
        # Unsupported archive type: report it and hand back nothing to process
        # instead of failing later on an undefined variable.
        print(f"Cannot parse {zip_file}")
        return []

    with zipfile.ZipFile(zip_file, 'r') as zip_ref:
        json_members = [
            (name, zip_ref.getinfo(name).file_size)
            for name in zip_ref.namelist()
            if name.endswith('.json')
        ]
    json_members.sort(key=lambda member: member[1], reverse=True)

    slice_limit = size_limit_mb * 1024 * 1024  # MB -> bytes
    slices = []
    current_slice = []
    current_size = 0

    for file_name, file_size in json_members:
        # Only close a slice that actually holds something; this avoids
        # emitting an empty slice when one file alone exceeds the limit.
        if current_slice and current_size + file_size > slice_limit:
            slices.append(current_slice)
            current_slice = []
            current_size = 0
        current_slice.append(file_name)
        current_size += file_size

    if current_slice:
        slices.append(current_slice)

    return slices

def parse_slice(zip_file, slice):
    """Parse the given JSON members of a zip archive into one DataFrame.

    Each member is opened from the archive and passed to ``parse_metric``,
    which accumulates its series into a shared ``{header: {key: value}}``
    mapping. That mapping is then turned into a DataFrame with one column
    per header.

    Args:
        zip_file: Path to the zip archive.
        slice: Iterable of member names inside the archive to parse. (The
            name shadows the builtin but is kept because callers pass it by
            keyword.)

    Returns:
        A DataFrame built from the accumulated mapping. Columns that can be
        converted to a numeric dtype are converted; the rest are left as-is.
    """
    values_container = {}
    with zipfile.ZipFile(zip_file, 'r') as zip_ref:
        for index, path in enumerate(slice):
            size_in_megabytes = zip_ref.getinfo(path).file_size / (1024 * 1024)
            print(f"\t{index}: {size_in_megabytes} MB, {path}")
            with zip_ref.open(path) as json_file:
                parse_metric(json_file, path, values_container)

    def _to_numeric_if_possible(column):
        # Explicit replacement for the deprecated pd.to_numeric(errors='ignore'):
        # keep the column untouched when it cannot be parsed as numbers.
        try:
            return pd.to_numeric(column)
        except (ValueError, TypeError):
            return column

    # Converting to numeric dtypes where possible greatly reduces the frame size
    values_df = pd.DataFrame(values_container).apply(_to_numeric_if_possible)
    return values_df


def parse_metric(data, path, values_container):
    """Merge the series of one JSON metric file into ``values_container``.

    The file is expected to hold a ``data.result`` list whose items carry a
    ``metric`` mapping (the labels) and a ``values`` list of pairs. Each
    series is stored under the JSON-encoded label mapping; pairs from files
    that share the same labels are merged into the same entry.

    Any error (malformed JSON, missing keys, unexpected structure) is
    reported on stdout and the file is skipped; nothing is raised.

    Args:
        data: Open file-like object containing the JSON document.
        path: Name of the file, used only in error messages.
        values_container: Dict mapping header -> {pair key: pair value},
            updated in place.
    """
    try:
        # Loading happens inside the try block so that a malformed file is
        # reported by the ValueError handler instead of aborting the run.
        json_data = json.load(data)

        # LOOP THROUGH EACH SUB-METRIC
        for item in json_data['data']['result']:
            # The JSON-encoded label mapping serves as the (hashable) column header
            header = json.dumps(item['metric'])
            values = dict(item['values'])

            # ADD HEADER KEY TO VALUES DICT
            values_container.setdefault(header, {}).update(values)

    except KeyError as e:
        print(f"KeyError occurred while parsing JSON file '{path}': {e}")
    except ValueError as e:
        print(f"ValueError occurred while parsing JSON file '{path}': {e}")
    except Exception as e:
        print(f"An unexpected error occurred while parsing JSON file '{path}': {e}")


def main(input_path, zip_relative_path, output_path2):
    """Convert one archive of JSON metric dumps into per-instance feather files.

    Steps:
      1. Split the archive's JSON members into slices (``get_slices``).
      2. Parse each slice into a DataFrame (``parse_slice``) and cache it as
         ``<out>/intermediate/<i>.feather``. An existing cache file is loaded
         instead of re-parsing, so delete the folder to force a fresh run.
      3. Join all slices column-wise, drop duplicated column names and save
         the result as ``<out>/intermediate/full.feather``.
      4. Split the joined frame per instance (``prom_util.sub_df_by_instance``),
         shorten the column headers and save ``<out>/<instance>.feather``.

    ``<out>`` is ``output_path2`` joined with the archive's relative path
    without its file extension.

    Args:
        input_path: Folder that contains the archive.
        zip_relative_path: Archive path relative to ``input_path``.
        output_path2: Root folder for all generated files.
    """
    print(f"Processing {zip_relative_path}")
    # Strip only the trailing extension; str.replace would also remove ".zip"
    # occurring elsewhere in the path.
    zip_name = os.path.splitext(zip_relative_path)[0]
    full_output_path = f"{output_path2}/{zip_name}"
    intermediate_folder_path = f"{full_output_path}/intermediate"
    processed_folder_path = f"{full_output_path}/"
    archive_path = f"{input_path}/{zip_relative_path}"
    max_slice_size_mb = 200

    slices = get_slices(archive_path, max_slice_size_mb)
    if not slices:
        # Nothing to parse; pd.concat would fail on an empty list
        print(f"No JSON files to process in {zip_relative_path}")
        return

    os.makedirs(intermediate_folder_path, exist_ok=True)
    dfs = []
    for i, slice_files in enumerate(slices):
        slice_cache_path = intermediate_folder_path + f"/{i}.feather"
        if os.path.exists(slice_cache_path):
            values = pd.read_feather(slice_cache_path)
            print(f"Got intermediate file from {slice_cache_path}")
        else:
            values = parse_slice(zip_file=archive_path, slice=slice_files)
            # Store the index as a regular "timestamp" column before writing
            # (reset to default index in case of old pandas/pyarrow version)
            values.reset_index(drop=False, inplace=True, names=["timestamp"])
            values.to_feather(slice_cache_path)
            print(f"Saved intermediate {slice_cache_path}")
        dfs.append(values.set_index("timestamp"))

    df = pd.concat(dfs, axis=1)
    # TODO: Does removing duplicates remove information? Happens probably at zip-file slice boundaries
    df = df.loc[:, ~df.columns.duplicated()]
    full_feather_path = f"{intermediate_folder_path}/full.feather"
    df = df.reset_index(drop=False, names=["timestamp"])  # Same index-as-column handling as above
    df.to_feather(full_feather_path)
    df = df.set_index("timestamp")
    print(f"Saved full df to {full_feather_path}")

    # Split df by instance
    sub_dfs = prom_util.sub_df_by_instance(df)
    os.makedirs(processed_folder_path, exist_ok=True)

    # Minimize headers and save each instance as separate file
    for instance, sub_df in sub_dfs.items():
        df_minimized = sub_df.copy()

        # Group headers by metric name; every column header is a JSON-encoded label mapping
        grouped_by_name = {}
        for col in df_minimized.columns:
            header_dict = json.loads(col)
            grouped_by_name.setdefault(header_dict["__name__"], {})[col] = header_dict

        # Minimize headers
        for feature_name, headers in grouped_by_name.items():
            try:
                descriptive_keys = prom_util.get_descriptive_keys(headers)
            except Exception:
                # Best effort: leave this metric's headers untouched and move on
                print(f"Non-matching keys: {feature_name}")
                continue
            prom_util.remove_unnecessary_keys(df_minimized, headers, descriptive_keys)

        # Save df, sorted by timestamp, with the timestamp as a regular column
        instance_path = processed_folder_path + f"/{instance}.feather"
        df_minimized = df_minimized.sort_index().reset_index(drop=False, names=["timestamp"])
        df_minimized.to_feather(instance_path)
        print("Saved instanced df as", instance_path)

"""
02: Process and save dataframes
"""

# Discover every archive under input_path (paths are relative to input_path)
zips = list_zip_files(input_path)
print(zips)

# Process the archives one at a time; output_path is passed as the output root
for zip_name_full in zips:
    print(zip_name_full)
    main(input_path, zip_name_full, output_path)


Saved full df to ../../data/processed/linear/prom/linear/intermediate/full.feather
Non-matching keys: apiserver_request_sli_duration_seconds_bucket
['__name__', 'endpoint', 'instance', 'job', 'namespace', 'service']
Non-matching keys: apiserver_request_duration_seconds_bucket
Non-matching keys: apiserver_response_sizes_bucket
Non-matching keys: kubernetes_feature_enabled
Non-matching keys: apiserver_watch_events_sizes_bucket
['__name__', 'endpoint', 'host', 'instance', 'job', 'namespace', 'service']
Non-matching keys: apiserver_request_duration_seconds_sum
Non-matching keys: apiserver_request_total
['__name__', 'endpoint', 'host', 'instance', 'job', 'namespace', 'service']
['__name__', 'endpoint', 'host', 'instance', 'job', 'namespace', 'service']
Non-matching keys: apiserver_request_duration_seconds_count
['__name__', 'endpoint', 'instance', 'job', 'namespace', 'service']
['__name__', 'endpoint', 'instance', 'job', 'namespace', 'service']
['__name__', 'endpoint', 'instance', 'job', 'n

In [18]:
"""
03: Print some statistics from the resulting dataframes

- Mostly for quick sanity checking of the results
"""
import os
import pandas as pd
import pyarrow.feather as feather

def count_feather_files(fpath):
    """Print size, column count and row count of every feather file under ``fpath``.

    The folder is searched recursively for names ending in ``.feather``. Each
    file is loaded with pandas to obtain its shape. A file that raises
    ``pd.errors.EmptyDataError`` is reported with zeros; any other failure is
    reported with -1 in every numeric field. Lines are printed largest file
    first, with ``fpath`` removed from the displayed path.

    Args:
        fpath: Root folder to search.
    """
    discovered = [
        os.path.join(folder, entry)
        for folder, _subdirs, entries in os.walk(fpath)
        for entry in entries
        if entry.endswith(".feather")
    ]

    summaries = []
    for candidate in discovered:
        try:
            frame = pd.read_feather(candidate)
            size_bytes = os.path.getsize(candidate)
            summary = (candidate, size_bytes, len(frame.columns), len(frame))
        except pd.errors.EmptyDataError:
            summary = (candidate, 0, 0, 0)
        except Exception:
            summary = (candidate, -1, -1, -1)
        summaries.append(summary)

    # Largest files first
    ordered = sorted(summaries, key=lambda entry: entry[1], reverse=True)

    for location, size_bytes, n_cols, n_rows in ordered:
        shown_path = location.replace(fpath, "")
        print(f"Size: {size_bytes / 10**6} mb\tCols: {n_cols}\tRows: {n_rows}\tFile: {shown_path}")

# Summarize every feather file found (recursively) under output_path
count_feather_files(output_path)

Size: 515.92625 mb	Cols: 195788	Rows: 148	File: /linear/intermediate/full.feather
Size: 201.285322 mb	Cols: 75708	Rows: 148	File: /linear/intermediate/3.feather
Size: 199.36185 mb	Cols: 70604	Rows: 148	File: /linear/intermediate/2.feather
Size: 60.571162 mb	Cols: 29333	Rows: 148	File: /linear/130.233.193.117:6443.feather
Size: 54.512426 mb	Cols: 35791	Rows: 148	File: /linear/worker1.feather
Size: 42.712674 mb	Cols: 28200	Rows: 148	File: /linear/worker2.feather
Size: 40.057674 mb	Cols: 26398	Rows: 148	File: /linear/worker3.feather
Size: 36.29949 mb	Cols: 23873	Rows: 148	File: /linear/worker5.feather
Size: 34.102106 mb	Cols: 22591	Rows: 148	File: /linear/worker4.feather
Size: 31.769514 mb	Cols: 12127	Rows: 148	File: /linear/intermediate/4.feather
Size: 27.601098 mb	Cols: 12017	Rows: 148	File: /linear/intermediate/1.feather
Size: 25.972522 mb	Cols: 12325	Rows: 148	File: /linear/intermediate/5.feather
Size: 17.169754 mb	Cols: 6645	Rows: 148	File: /linear/intermediate/0.feather
Size: 12.778