In [15]:
import json
import pandas as pd

def parse_metric(data, path, values_container):
    json_data = json.load(data)
    # print(path)

    # LOOP THROUGH EACH SUB-METRIC
    try:
        for item in json_data['data']['result']:
            header = json.dumps(item['metric']) # Use a tuple of the metric dictionary's items
            values = dict(item['values'])

            # ADD HEADER KEY TO VALUES DICT
            if header not in values_container:
                values_container[header] = {}
            values_container[header].update(values)

    except KeyError as e:
        print(f"KeyError occurred while parsing JSON file '{path}': {e}")
    except ValueError as e:
        print(f"ValueError occurred while parsing JSON file '{path}': {e}")
    except Exception as e:
        print(f"An unexpected error occurred while parsing JSON file '{path}': {e}")

"""
Issues:
- merging is very slow with multiple files in folder
- The first timestamp of every consecutive file overlaps

Solution:
- Fully process all files of a single folder before merging dataframes
"""

small_p = r"small/kepler_container_cpu_cycles_total/1735842642-1735842914.json"
big_p = r"big\apiserver_request_sli_duration_seconds_bucket\1738709783-1738710783.json"
values_container  = {}
with open(big_p) as f:
    parse_metric(f, p, values_container)
df = pd.DataFrame.from_dict(values_container)




In [12]:
import os


def find_top_folders_by_size(path, top_n=10):
    # List all folders and calculate their sizes
    folders = [
        (folder, sum(os.path.getsize(os.path.join(root, file)) for root, _, files in os.walk(folder) for file in files))
        for folder in [os.path.join(path, d) for d in os.listdir(path)]
        if os.path.isdir(folder)
    ]

    # Sort folders by size in descending order
    top_folders = sorted(folders, key=lambda x: x[1], reverse=True)[:top_n]

    # Convert bytes to megabytes and print results
    for index, (folder, size) in enumerate(top_folders, start=1):
        size_in_mb = size / (1024 * 1024)  # Convert bytes to MB
        print(f"{index}. Folder: {folder}, Size: {size_in_mb:.2f} MB")

    return [(folder, size / (1024 * 1024)) for folder, size in top_folders]


# Provide the path and call the function
path_to_scan = r"big"
find_top_folders_by_size(path_to_scan)


1. Folder: big\apiserver_request_sli_duration_seconds_bucket, Size: 4380.75 MB
2. Folder: big\etcd_request_duration_seconds_bucket, Size: 4311.61 MB
3. Folder: big\apiserver_request_duration_seconds_bucket, Size: 3541.98 MB
4. Folder: big\apiserver_response_sizes_bucket, Size: 1129.72 MB
5. Folder: big\qos_outputs_0, Size: 1082.37 MB
6. Folder: big\kubelet_runtime_operations_duration_seconds_bucket, Size: 1049.07 MB
7. Folder: big\kubernetes_feature_enabled, Size: 675.04 MB
8. Folder: big\container_memory_failures_total, Size: 664.37 MB
9. Folder: big\storage_operation_duration_seconds_bucket, Size: 654.09 MB
10. Folder: big\kepler_process_package_joules_total, Size: 534.51 MB


[('big\\apiserver_request_sli_duration_seconds_bucket', 4380.7509479522705),
 ('big\\etcd_request_duration_seconds_bucket', 4311.6130027771),
 ('big\\apiserver_request_duration_seconds_bucket', 3541.981864929199),
 ('big\\apiserver_response_sizes_bucket', 1129.7169761657715),
 ('big\\qos_outputs_0', 1082.3690786361694),
 ('big\\kubelet_runtime_operations_duration_seconds_bucket',
  1049.0686979293823),
 ('big\\kubernetes_feature_enabled', 675.0434646606445),
 ('big\\container_memory_failures_total', 664.3732261657715),
 ('big\\storage_operation_duration_seconds_bucket', 654.093339920044),
 ('big\\kepler_process_package_joules_total', 534.5144147872925)]

In [14]:
import os


def find_top_files_by_size(path, top_n=10):
    # List all files with their sizes
    files = [
        (os.path.join(root, file), os.path.getsize(os.path.join(root, file)))
        for root, _, files in os.walk(path)
        for file in files
    ]

    # Sort files by size in descending order
    top_files = sorted(files, key=lambda x: x[1], reverse=True)[:top_n]

    # Convert bytes to MB and print results
    for index, (file, size) in enumerate(top_files, start=1):
        size_in_mb = size / (1024 * 1024)  # Convert bytes to MB
        print(f"{index}. File: {file}, Size: {size_in_mb:.2f} MB")

    return [(file, size / (1024 * 1024)) for file, size in top_files]


# Provide the path and call the function
path_to_scan = r"big"
find_top_files_by_size(path_to_scan)


1. File: big\apiserver_request_sli_duration_seconds_bucket\1738709783-1738710783.json, Size: 148.32 MB
2. File: big\apiserver_request_sli_duration_seconds_bucket\1738710783-1738711783.json, Size: 148.32 MB
3. File: big\apiserver_request_sli_duration_seconds_bucket\1738711783-1738712783.json, Size: 148.32 MB
4. File: big\apiserver_request_sli_duration_seconds_bucket\1738708783-1738709783.json, Size: 148.32 MB
5. File: big\apiserver_request_sli_duration_seconds_bucket\1738696783-1738697783.json, Size: 148.32 MB
6. File: big\apiserver_request_sli_duration_seconds_bucket\1738697783-1738698783.json, Size: 148.32 MB
7. File: big\apiserver_request_sli_duration_seconds_bucket\1738698783-1738699783.json, Size: 148.32 MB
8. File: big\apiserver_request_sli_duration_seconds_bucket\1738699783-1738700783.json, Size: 148.32 MB
9. File: big\apiserver_request_sli_duration_seconds_bucket\1738700783-1738701783.json, Size: 148.32 MB
10. File: big\apiserver_request_sli_duration_seconds_bucket\1738701783-17

[('big\\apiserver_request_sli_duration_seconds_bucket\\1738709783-1738710783.json',
  148.32113647460938),
 ('big\\apiserver_request_sli_duration_seconds_bucket\\1738710783-1738711783.json',
  148.32113647460938),
 ('big\\apiserver_request_sli_duration_seconds_bucket\\1738711783-1738712783.json',
  148.32113647460938),
 ('big\\apiserver_request_sli_duration_seconds_bucket\\1738708783-1738709783.json',
  148.32025527954102),
 ('big\\apiserver_request_sli_duration_seconds_bucket\\1738696783-1738697783.json',
  148.31691932678223),
 ('big\\apiserver_request_sli_duration_seconds_bucket\\1738697783-1738698783.json',
  148.31691932678223),
 ('big\\apiserver_request_sli_duration_seconds_bucket\\1738698783-1738699783.json',
  148.31691932678223),
 ('big\\apiserver_request_sli_duration_seconds_bucket\\1738699783-1738700783.json',
  148.31691932678223),
 ('big\\apiserver_request_sli_duration_seconds_bucket\\1738700783-1738701783.json',
  148.31691932678223),
 ('big\\apiserver_request_sli_duratio

In [18]:
import os

file_paths = [os.path.join(root, file) for root, _, files in os.walk(r"big\apiserver_request_sli_duration_seconds_bucket") for file in files]

# Print file paths along with their sizes
for file_path in file_paths:
    size_in_mb = os.path.getsize(file_path) / (1024 * 1024)  # Convert bytes to MB
    print(f"File: {file_path}, Size: {size_in_mb:.2f} MB")

File: big\apiserver_request_sli_duration_seconds_bucket\1738683783-1738684783.json, Size: 148.29 MB
File: big\apiserver_request_sli_duration_seconds_bucket\1738684783-1738685783.json, Size: 148.30 MB
File: big\apiserver_request_sli_duration_seconds_bucket\1738685783-1738686783.json, Size: 148.30 MB
File: big\apiserver_request_sli_duration_seconds_bucket\1738686783-1738687783.json, Size: 148.30 MB
File: big\apiserver_request_sli_duration_seconds_bucket\1738687783-1738688783.json, Size: 148.31 MB
File: big\apiserver_request_sli_duration_seconds_bucket\1738688783-1738689783.json, Size: 148.31 MB
File: big\apiserver_request_sli_duration_seconds_bucket\1738689783-1738690783.json, Size: 148.31 MB
File: big\apiserver_request_sli_duration_seconds_bucket\1738690783-1738691783.json, Size: 148.31 MB
File: big\apiserver_request_sli_duration_seconds_bucket\1738691783-1738692783.json, Size: 148.31 MB
File: big\apiserver_request_sli_duration_seconds_bucket\1738692783-1738693783.json, Size: 148.31 MB


In [23]:
values_container  = {}
for i, file_path in enumerate(file_paths):
    print(f"Processing file {i+1}/{len(file_paths)}")
    with open(file_path) as f:
        parse_metric(f, p, values_container)

Processing file 1/30
Processing file 2/30
Processing file 3/30
Processing file 4/30
Processing file 5/30
Processing file 6/30
Processing file 7/30
Processing file 8/30
Processing file 9/30
Processing file 10/30
Processing file 11/30
Processing file 12/30
Processing file 13/30
Processing file 14/30
Processing file 15/30
Processing file 16/30
Processing file 17/30
Processing file 18/30
Processing file 19/30
Processing file 20/30
Processing file 21/30
Processing file 22/30
Processing file 23/30
Processing file 24/30
Processing file 25/30
Processing file 26/30
Processing file 27/30
Processing file 28/30
Processing file 29/30
Processing file 30/30


In [24]:
print(f"Size of values_container in megabytes: {len(values_container) * 1024 * 1024 / (1024 * 1024):.2f} MB ")

Size of values_container in megabytes: 6798.00 MB 


In [25]:
df = pd.DataFrame.from_dict(values_container)

In [26]:
print(f"Size of df in megabytes: {df.memory_usage().sum() / (1024 * 1024):.2f} MB ")

Size of df in megabytes: 306.31 MB 


In [27]:
df

Unnamed: 0,"{""__name__"": ""apiserver_request_sli_duration_seconds_bucket"", ""component"": ""apiserver"", ""container"": ""prometheus-adapter"", ""endpoint"": ""https"", ""group"": ""metrics.k8s.io"", ""instance"": ""10.213.61.2:6443"", ""job"": ""prometheus-adapter"", ""le"": ""+Inf"", ""namespace"": ""monitoring"", ""pod"": ""prometheus-adapter-6f6995c6d9-lm9kb"", ""resource"": ""nodes"", ""scope"": ""cluster"", ""service"": ""prometheus-adapter"", ""verb"": ""LIST"", ""version"": ""v1beta1""}","{""__name__"": ""apiserver_request_sli_duration_seconds_bucket"", ""component"": ""apiserver"", ""container"": ""prometheus-adapter"", ""endpoint"": ""https"", ""group"": ""metrics.k8s.io"", ""instance"": ""10.213.61.2:6443"", ""job"": ""prometheus-adapter"", ""le"": ""+Inf"", ""namespace"": ""monitoring"", ""pod"": ""prometheus-adapter-6f6995c6d9-lm9kb"", ""resource"": ""pods"", ""scope"": ""cluster"", ""service"": ""prometheus-adapter"", ""verb"": ""LIST"", ""version"": ""v1beta1""}","{""__name__"": ""apiserver_request_sli_duration_seconds_bucket"", ""component"": ""apiserver"", ""container"": ""prometheus-adapter"", ""endpoint"": ""https"", ""group"": ""metrics.k8s.io"", ""instance"": ""10.213.61.2:6443"", ""job"": ""prometheus-adapter"", ""le"": ""+Inf"", ""namespace"": ""monitoring"", ""pod"": ""prometheus-adapter-6f6995c6d9-lm9kb"", ""resource"": ""pods"", ""scope"": ""namespace"", ""service"": ""prometheus-adapter"", ""verb"": ""LIST"", ""version"": ""v1beta1""}","{""__name__"": ""apiserver_request_sli_duration_seconds_bucket"", ""component"": ""apiserver"", ""container"": ""prometheus-adapter"", ""endpoint"": ""https"", ""group"": ""metrics.k8s.io"", ""instance"": ""10.213.61.2:6443"", ""job"": ""prometheus-adapter"", ""le"": ""0.05"", ""namespace"": ""monitoring"", ""pod"": ""prometheus-adapter-6f6995c6d9-lm9kb"", ""resource"": ""nodes"", ""scope"": ""cluster"", ""service"": ""prometheus-adapter"", ""verb"": ""LIST"", ""version"": ""v1beta1""}","{""__name__"": ""apiserver_request_sli_duration_seconds_bucket"", ""component"": ""apiserver"", ""container"": ""prometheus-adapter"", ""endpoint"": ""https"", ""group"": ""metrics.k8s.io"", ""instance"": ""10.213.61.2:6443"", ""job"": ""prometheus-adapter"", ""le"": ""0.05"", ""namespace"": ""monitoring"", ""pod"": ""prometheus-adapter-6f6995c6d9-lm9kb"", ""resource"": ""pods"", ""scope"": ""cluster"", ""service"": ""prometheus-adapter"", ""verb"": ""LIST"", ""version"": ""v1beta1""}","{""__name__"": ""apiserver_request_sli_duration_seconds_bucket"", ""component"": ""apiserver"", ""container"": ""prometheus-adapter"", ""endpoint"": ""https"", ""group"": ""metrics.k8s.io"", ""instance"": ""10.213.61.2:6443"", ""job"": ""prometheus-adapter"", ""le"": ""0.05"", ""namespace"": ""monitoring"", ""pod"": ""prometheus-adapter-6f6995c6d9-lm9kb"", ""resource"": ""pods"", ""scope"": ""namespace"", ""service"": ""prometheus-adapter"", ""verb"": ""LIST"", ""version"": ""v1beta1""}","{""__name__"": ""apiserver_request_sli_duration_seconds_bucket"", ""component"": ""apiserver"", ""container"": ""prometheus-adapter"", ""endpoint"": ""https"", ""group"": ""metrics.k8s.io"", ""instance"": ""10.213.61.2:6443"", ""job"": ""prometheus-adapter"", ""le"": ""0.1"", ""namespace"": ""monitoring"", ""pod"": ""prometheus-adapter-6f6995c6d9-lm9kb"", ""resource"": ""nodes"", ""scope"": ""cluster"", ""service"": ""prometheus-adapter"", ""verb"": ""LIST"", ""version"": ""v1beta1""}","{""__name__"": ""apiserver_request_sli_duration_seconds_bucket"", ""component"": ""apiserver"", ""container"": ""prometheus-adapter"", ""endpoint"": ""https"", ""group"": ""metrics.k8s.io"", ""instance"": ""10.213.61.2:6443"", ""job"": ""prometheus-adapter"", ""le"": ""0.1"", ""namespace"": ""monitoring"", ""pod"": ""prometheus-adapter-6f6995c6d9-lm9kb"", ""resource"": ""pods"", ""scope"": ""cluster"", ""service"": ""prometheus-adapter"", ""verb"": ""LIST"", ""version"": ""v1beta1""}","{""__name__"": ""apiserver_request_sli_duration_seconds_bucket"", ""component"": ""apiserver"", ""container"": ""prometheus-adapter"", ""endpoint"": ""https"", ""group"": ""metrics.k8s.io"", ""instance"": ""10.213.61.2:6443"", ""job"": ""prometheus-adapter"", ""le"": ""0.1"", ""namespace"": ""monitoring"", ""pod"": ""prometheus-adapter-6f6995c6d9-lm9kb"", ""resource"": ""pods"", ""scope"": ""namespace"", ""service"": ""prometheus-adapter"", ""verb"": ""LIST"", ""version"": ""v1beta1""}","{""__name__"": ""apiserver_request_sli_duration_seconds_bucket"", ""component"": ""apiserver"", ""container"": ""prometheus-adapter"", ""endpoint"": ""https"", ""group"": ""metrics.k8s.io"", ""instance"": ""10.213.61.2:6443"", ""job"": ""prometheus-adapter"", ""le"": ""0.2"", ""namespace"": ""monitoring"", ""pod"": ""prometheus-adapter-6f6995c6d9-lm9kb"", ""resource"": ""nodes"", ""scope"": ""cluster"", ""service"": ""prometheus-adapter"", ""verb"": ""LIST"", ""version"": ""v1beta1""}",...,"{""__name__"": ""apiserver_request_sli_duration_seconds_bucket"", ""endpoint"": ""https"", ""instance"": ""130.233.193.117:6443"", ""job"": ""apiserver"", ""le"": ""60"", ""namespace"": ""default"", ""service"": ""kubernetes"", ""subresource"": ""/healthz"", ""verb"": ""GET""}","{""__name__"": ""apiserver_request_sli_duration_seconds_bucket"", ""endpoint"": ""https"", ""instance"": ""130.233.193.117:6443"", ""job"": ""apiserver"", ""le"": ""60"", ""namespace"": ""default"", ""service"": ""kubernetes"", ""subresource"": ""/livez"", ""verb"": ""GET""}","{""__name__"": ""apiserver_request_sli_duration_seconds_bucket"", ""endpoint"": ""https"", ""instance"": ""130.233.193.117:6443"", ""job"": ""apiserver"", ""le"": ""60"", ""namespace"": ""default"", ""service"": ""kubernetes"", ""subresource"": ""/readyz"", ""verb"": ""GET""}","{""__name__"": ""apiserver_request_sli_duration_seconds_bucket"", ""endpoint"": ""https"", ""instance"": ""130.233.193.117:6443"", ""job"": ""apiserver"", ""le"": ""60"", ""namespace"": ""default"", ""service"": ""kubernetes"", ""subresource"": ""openapi/v3"", ""verb"": ""GET""}","{""__name__"": ""apiserver_request_sli_duration_seconds_bucket"", ""endpoint"": ""https"", ""instance"": ""130.233.193.117:6443"", ""job"": ""apiserver"", ""le"": ""60"", ""namespace"": ""default"", ""service"": ""kubernetes"", ""subresource"": ""openapi/v3/"", ""verb"": ""GET""}","{""__name__"": ""apiserver_request_sli_duration_seconds_bucket"", ""endpoint"": ""https"", ""instance"": ""130.233.193.117:6443"", ""job"": ""apiserver"", ""le"": ""8"", ""namespace"": ""default"", ""service"": ""kubernetes"", ""subresource"": ""/healthz"", ""verb"": ""GET""}","{""__name__"": ""apiserver_request_sli_duration_seconds_bucket"", ""endpoint"": ""https"", ""instance"": ""130.233.193.117:6443"", ""job"": ""apiserver"", ""le"": ""8"", ""namespace"": ""default"", ""service"": ""kubernetes"", ""subresource"": ""/livez"", ""verb"": ""GET""}","{""__name__"": ""apiserver_request_sli_duration_seconds_bucket"", ""endpoint"": ""https"", ""instance"": ""130.233.193.117:6443"", ""job"": ""apiserver"", ""le"": ""8"", ""namespace"": ""default"", ""service"": ""kubernetes"", ""subresource"": ""/readyz"", ""verb"": ""GET""}","{""__name__"": ""apiserver_request_sli_duration_seconds_bucket"", ""endpoint"": ""https"", ""instance"": ""130.233.193.117:6443"", ""job"": ""apiserver"", ""le"": ""8"", ""namespace"": ""default"", ""service"": ""kubernetes"", ""subresource"": ""openapi/v3"", ""verb"": ""GET""}","{""__name__"": ""apiserver_request_sli_duration_seconds_bucket"", ""endpoint"": ""https"", ""instance"": ""130.233.193.117:6443"", ""job"": ""apiserver"", ""le"": ""8"", ""namespace"": ""default"", ""service"": ""kubernetes"", ""subresource"": ""openapi/v3/"", ""verb"": ""GET""}"
1738683783,1,2,1,1,2,1,1,2,1,1,...,1002845,1003589,10035896,1091,9,1002845,1003589,10035896,1091,9
1738683788,1,2,1,1,2,1,1,2,1,1,...,1002845,1003589,10035896,1091,9,1002845,1003589,10035896,1091,9
1738683793,1,2,1,1,2,1,1,2,1,1,...,1002848,1003592,10035926,1092,9,1002848,1003592,10035926,1092,9
1738683798,1,2,1,1,2,1,1,2,1,1,...,1002848,1003592,10035926,1092,9,1002848,1003592,10035926,1092,9
1738683803,1,2,1,1,2,1,1,2,1,1,...,1002848,1003592,10035926,1092,9,1002848,1003592,10035926,1092,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1738713283,1,2,1,1,2,1,1,2,1,1,...,1005795,1006541,10065416,1092,9,1005795,1006541,10065416,1092,9
1738713288,1,2,1,1,2,1,1,2,1,1,...,1005795,1006541,10065416,1092,9,1005795,1006541,10065416,1092,9
1738713293,1,2,1,1,2,1,1,2,1,1,...,1005795,1006541,10065416,1092,9,1005795,1006541,10065416,1092,9
1738713298,1,2,1,1,2,1,1,2,1,1,...,1005795,1006541,10065416,1092,9,1005795,1006541,10065416,1092,9
