In [None]:
# import json
import ujson as json
import os
import pprint
import time

import numpy as np
import pandas as pd

# Input archive (opened as a zip of .json files in Step 1) and the directory
# the per-worker output files are written to in Step 3.
dataset_root = "../../data/raw_datasets/short-test-2/prometheus.zip"
output_path = "../../data/processed/short-test-2/"

# Each name specified here will result in a separate .feather file
# In addition to workers, our cluster produces data attributed to 'master' and some random IPs
# (series whose "instance" label is not one of these names are skipped in Step 2).
worker_names = ["worker1", "worker2", "worker3", "worker4", "worker5"]

        


In [None]:
"""
Step 1: Read zip

Read all files in zip to a dictionary. Each metric results in a separate dictionary entry.

These DataFrames then have to be (later on) combined together, while accounting for overlapping columns.
"""

# Open as zip
import zipfile
import time

# names_container:      metric name -> list of series keys belonging to that metric
# values_container:     series key  -> sample values, accumulated over all files
# timestamps_container: series key  -> sample timestamps, accumulated over all files
# A "series key" is the JSON-encoded label dict (item['metric']) of one series.
names_container = {}
values_container = {}
timestamps_container = {}
with zipfile.ZipFile(dataset_root, 'r') as zip_ref:

    # Get a list of all files inside the zip
    items = zip_ref.namelist()

    # Filter a list of all json files inside the zip
    json_files = [x for x in items if x.endswith('.json')]

    i = len(json_files) # Change this to a small number to only read a limited amount of data (for quick debugging)
    # i = 100
    count = 0
    start_time = time.time()

    # Iterate over all json files
    for path in json_files:
        count += 1
        print(f"Progress {count}/{len(json_files):6}, ({count/len(json_files)*100:5.3} %) (time_spent: {time.time() - start_time:.3} s  - avg: {(time.time() - start_time) / count} s)")
        with zip_ref.open(path) as json_file:
            json_data = json.load(json_file)

            # Iterate over each submetric
            for item in json_data['data']['result']:
                samples = item['values']
                if not samples:
                    # An empty sample list cannot be unpacked into
                    # (timestamps, values); there is nothing to add.
                    continue

                # Get name of the metric, metric values and metric timestamps
                header = item['metric']
                header_str = json.dumps(header)
                name = header["__name__"]
                stamps, vals = zip(*samples)

                # First time this series is seen: create its buffers and
                # register it under its metric name exactly once. Registering
                # it once per file would list the same series repeatedly and
                # later produce identical duplicate columns.
                if header_str not in values_container:
                    values_container[header_str] = []
                    timestamps_container[header_str] = []
                    names_container.setdefault(name, []).append(header_str)

                # Add metric to dict. The sample values (second element of
                # each pair) are stored; extending with dict(samples) would
                # iterate the dict keys, i.e. store the timestamps instead.
                values_container[header_str].extend(vals)
                timestamps_container[header_str].extend(stamps)

        i -= 1
        if i <= 0:
            # Only announce the limit when it actually cut the run short.
            if count < len(json_files):
                print("Manually specified limit reached. Stopping here.")
            break
        

In [None]:
"""
Step 2: Sort and filter metrics

Sort the metrics inside the dictionary 

- Filter out redundant features (e.g., if the "name" column is "worker1" for all rows of a metric, then that column is redundant)
- Sort metrics by worker (e.g., create a new dict for each worker, where each dict contains data only from that worker)  
"""

def get_unique_keys(headers) -> list:
    """Return the label keys whose values differ between the given series.

    Args:
        headers: JSON-encoded label dicts, one string per series.

    Returns:
        Alphabetically sorted list of keys that take more than one distinct
        value across ``headers``. An empty list is returned when ``headers``
        is empty, and also (after printing a warning) when the series do not
        all carry exactly the same set of keys.
    """
    label_dicts: list[dict] = [json.loads(x) for x in headers]
    if not label_dicts:
        return []

    # Verify that all features have the same keys. Each key set is compared
    # with the first one; comparing only sizes against the union would miss
    # the case where the first series happens to carry every key.
    reference_keys = set(label_dicts[0])
    if any(set(labels) != reference_keys for labels in label_dicts[1:]):
        print("WARNING: Features have different keys!")
        return []

    # Get all keys that have differing values between each feature.
    # Sorted iteration keeps the result (and any name derived from it)
    # independent of set ordering.
    different_keys = []
    for key in sorted(reference_keys):
        if len({labels[key] for labels in label_dicts}) > 1:
            different_keys.append(key)
    return different_keys

def divide_to_workers(headers: list, workers=None):
    """Group series keys by the worker named in their "instance" label.

    Args:
        headers: JSON-encoded label dicts, one string per series.
        workers: Worker names to group by. Defaults to the notebook-level
            ``worker_names`` list when omitted.

    Returns:
        Dict mapping every requested worker name to the list of headers whose
        "instance" label equals that name (possibly empty). Headers without an
        "instance" label, or with an instance that was not requested, are
        left out.
    """
    if workers is None:
        workers = worker_names
    headers_by_worker = {x: [] for x in workers}
    for header in headers:
        json_data = json.loads(header)
        if "instance" not in json_data:
            continue
        instance = json_data["instance"]
        if instance not in workers:
            continue
        headers_by_worker[instance].append(header)
    return headers_by_worker

def get_unique_column_name(header, unique_keys):
    """Build a column name of the form ``<metric>_<key>_<value>_<key>_<value>...``.

    Args:
        header: JSON-encoded label dict of one series; must contain "__name__"
            and every key listed in ``unique_keys``.
        unique_keys: Label keys to append, in the given order.

    Returns:
        The metric name followed by one ``_<key>_<value>`` part per key.
    """
    labels = json.loads(header)
    suffix = "".join(f"_{key}_{labels[key]}" for key in unique_keys)
    return labels["__name__"] + suffix

def get_unique_column_name_2(header, unique_keys):
    """Old-style column name: ``<metric>_<value>_<value>...`` (label keys omitted).

    Args:
        header: JSON-encoded label dict of one series; must contain "__name__"
            and every key listed in ``unique_keys``.
        unique_keys: Label keys whose values are appended, in the given order.

    Returns:
        The metric name followed by one ``_<value>`` part per key.
    """
    labels = json.loads(header)
    suffix = "".join(f"_{labels[key]}" for key in unique_keys)
    return labels["__name__"] + suffix

# worker name -> list of single-column DataFrames (one per series), each
# indexed by that series' timestamps.
df_by_worker = {}
count = 0
start_time = time.time()

for name, headers in names_container.items():
    count += 1
    print(f"Progress {count}/{len(names_container)}, ({count/len(names_container)*100} %) (time_spent: {time.time() - start_time} s)")
    headers_by_worker = divide_to_workers(headers)

    # Timestamp set of the previously handled series; the range is printed
    # whenever a series covers a different set of timestamps.
    prev = set()
    for worker, worker_headers in headers_by_worker.items():
        if not worker_headers:
            continue
        unique_keys = get_unique_keys(worker_headers)
        if len(unique_keys) > 1:
            continue  # TODO: This discards a lot of data

        for series_key in worker_headers:
            unique_name = get_unique_column_name_2(series_key, unique_keys)
            series_stamps = timestamps_container[series_key]
            sub_df = pd.DataFrame(
                {unique_name: values_container[series_key]},
                index=series_stamps,
                dtype=np.float32,
            )

            new = set(series_stamps)
            if new != prev:
                print(f"{min(new)} {max(new)}")
            prev = new

            df_by_worker.setdefault(worker, []).append(sub_df)

In [None]:
"""
Step 3: Combine data and save as separate DataFrames

Combine metrics for each worker into a DataFrame. Each DataFrame contains data relevant to only one worker.

- Creates multiple DataFrames
- Saves those DataFrames as feather-files (that is supported by pandas)
- Try to filter out duplicated rows, if they exist
"""

# Number of per-series frames joined by a single pd.concat call.
batch_len = 1500
# worker name -> combined DataFrame (one column per series)
dfs = {}
os.makedirs(output_path, exist_ok=True)
for worker in worker_names:
    df = pd.DataFrame()
    if worker not in df_by_worker:
        print(f"Error: {worker} not found in df_by_worker")
        continue
    subdfs = df_by_worker[worker]
    # Keep only the first row for every repeated timestamp within a series.
    subdfs_nodup_index = [x[~x.index.duplicated(keep='first')] for x in subdfs]
    # Batches start at index 0; starting at 1 silently dropped the first
    # series of every worker.
    for i in range(0, len(subdfs_nodup_index), batch_len):
        print(f"{worker}: {i}: {i/len(subdfs_nodup_index)}")
        end = min(i+batch_len, len(subdfs_nodup_index))
        x = pd.concat(subdfs_nodup_index[i:end], axis=1)
        x = x.loc[:,~x.columns.duplicated()]
        # x = x.dropna(axis=0)
        df = pd.concat([df, x], axis=1)
    df = df.loc[:,~df.columns.duplicated()]  # TODO: Where are these duplicate columns coming from?
    dfs[worker] = df
    # NOTE(review): DataFrame.to_feather is documented as requiring a default
    # index, while df is indexed by sample timestamps - confirm this works
    # with the installed pandas version.
    df.to_feather(os.path.join(output_path, worker + ".feather"))
    print(f"Saved {worker}.feather")


## Too many cols, col names and values are duplicated
# len(set(w1.columns.to_list()))

In [None]:
"""
---------------------------------------------------------------------------------------------------------
---------------------------------------------------------------------------------------------------------
Stuff below includes some unsorted testing and experiments -- not directly related to processing the datasets
---------------------------------------------------------------------------------------------------------
---------------------------------------------------------------------------------------------------------
"""

In [None]:
# Sanity-check the combined frame of the first configured worker: number of
# column / row labels versus the number of distinct labels.
w1 = dfs[worker_names[0]]
print(len(w1.columns))
print(len(set(w1.columns)))
print(len(w1.index))
print(len(set(w1.index)))
w1.describe()
# w1.dropna(axis=0)

In [None]:
# Same counts after dropping repeated column labels (first occurrence kept).
print(len(w1.loc[:,~w1.columns.duplicated()].columns))
print(len(set(w1.loc[:,~w1.columns.duplicated()].columns)))
print(len(w1.loc[:,~w1.columns.duplicated()].index))
print(len(set(w1.loc[:,~w1.columns.duplicated()].index)))
# Display only: rows containing any NaN are removed in the shown copy, w1 is not modified.
w1.loc[:,~w1.columns.duplicated()].dropna(axis=0)


In [None]:
# Experiment: stack the first 100 worker1 sub-frames row-wise (axis=0).
w1_subdfs = df_by_worker["worker1"]
print(len(w1_subdfs))
# for i in range(100):
w1 = pd.concat(w1_subdfs[0:100], axis=0)


In [None]:
# Row-wise concatenation (axis=0) of the first 100 sub-frames, displayed below.
w1 = pd.concat(w1_subdfs[0:100], axis=0)
w1

## Too many rows, all other cols are always nan

In [None]:
# Column-wise concatenation (axis=1) of the first 100 sub-frames, displayed below.
w1 = pd.concat(w1_subdfs[0:100], axis=1)
w1

## Too many cols, col names and values are duplicated

In [None]:
# Experiment: left-join sub-frames 1..99 onto the first one, one at a time;
# the suffixes are applied to column names present on both sides.
w1 = w1_subdfs[0]
for i in range(1, 100):
    w1 = w1.join(w1_subdfs[i], how='left', lsuffix='_left', rsuffix='_right')
w1
# Crashes because it causes duplicate columns

In [None]:
# Experiment: DataFrame.merge with default arguments, applied to sub-frames 1..99 in turn.
w1 = w1_subdfs[0]
for i in range(1, 100):
    w1 = w1.merge(w1_subdfs[i])
w1
## MergeError: No common columns to perform merge on.

In [None]:
# Experiment: overlay the following sub-frames (up to index 99) onto the first one.
# DataFrame.update modifies its caller in place, so work on a copy; otherwise
# the frame stored in w1_subdfs[0] (and thus in df_by_worker) would be changed.
w1 = w1_subdfs[0].copy()
for i in range(1, min(100, len(w1_subdfs))):
    w1.update(w1_subdfs[i])
w1
## update() only fills labels that already exist in w1 (left-style alignment),
## so columns of the other sub-frames are never added.

In [None]:
# Experiment: distribute the worker1 sub-frames round-robin over `splits`
# groups and concatenate each group row-wise.
splits = 100
w1 = []
for split in range(splits):
    # Open-ended slice: a stop of -1 would always leave out the last sub-frame.
    chunk = df_by_worker["worker1"][split::splits]
    if not chunk:
        # Fewer sub-frames than splits: every remaining group is empty as
        # well, and pd.concat raises on an empty list.
        break
    w1.append(pd.concat(chunk))
    print(split)


In [None]:
# Summary statistics of the first worker1 sub-frame.
df_by_worker["worker1"][0].describe()

In [None]:
# Drop repeated index labels inside every worker1 sub-frame (first kept),
# then concatenate all of them column-wise into `b`.
no_dup = [x[~x.index.duplicated(keep='first')] for x in df_by_worker["worker1"]]
b = pd.concat(no_dup, axis=1)
print(b.index)
print(len(set(b.index)))

In [None]:
# Total number of cells in b (columns x rows).
len(b.columns) * len(b.index)

In [None]:
# Total number of NaN cells in b (summed over all columns).
nan_counts = b.isna().sum().sum()
print(nan_counts)

In [None]:
# Keep the first occurrence of every column label, then remove columns that
# hold exactly one distinct non-null value.
b = b.loc[:, ~b.columns.duplicated()].copy()
static_columns = [column for column in b.columns if b[column].nunique() == 1]
#pprint(f"The following columns have static values: {static_columns}")
# NOTE: inplace=True modifies b itself.
b.drop(columns=static_columns, inplace=True)

# Find all monotonically increasing columns.
# - These values might be accumulative and needs additional processing to be useful. (delta over time)
# - Some of these values as simple timers (thus useless)
# - For example, joules are shown as a cumulative sum since the start of the experiment.
#  -- To get the energy consumption at given time, we need to compute the delta between consecutive rows
monotonic_columns = [column for column in b.columns if b[column].is_monotonic_increasing]

In [None]:
# Display the list of monotonically increasing columns.
monotonic_columns

In [None]:
# NOTE(review): `column` is only bound inside list comprehensions in the
# visible cells, and comprehension variables do not leak in Python 3 - confirm
# where this name is supposed to come from, otherwise this raises NameError.
b[column]
c_df = b.copy()
# Displays a copy with duplicate rows removed; c_df itself is unchanged.
c_df.drop_duplicates()

In [None]:
# Keep the first occurrence of every column label in c_df.
c_df = c_df.loc[:, ~c_df.columns.duplicated()].copy()
# NOTE(review): `column` is not assigned by any visible top-level statement - confirm its origin.
c_df[column].describe()

In [None]:
# Inspect the index of the second de-duplicated sub-frame and count its distinct labels.
no_dup[1]
print(no_dup[1].index)
print(len(set(no_dup[1].index)))

In [None]:
# Summary statistics of b.
b.describe()

In [None]:
from collections import Counter



# Count how often each index label occurs in a worker1 sub-frame. The loop
# stops after its first iteration (break), so only sub-frame 0 is examined
# and i is 0 afterwards.
for i in range(len(df_by_worker["worker1"])):
    d = df_by_worker["worker1"][i]
    # print(f"{i}: {len(d.index)} == {len(set(d.index))}")
    c = Counter(d.index)
    print(c)
    break
df_by_worker["worker1"][i]
    

In [None]:
# Experiment: join (default arguments) of the first two worker1 sub-frames.
a = df_by_worker["worker1"][0].join(df_by_worker["worker1"][1])
print(a.index)
print(len(set(a.index)))

In [None]:
# Experiment: merge (default arguments) of the first two worker1 sub-frames.
a = df_by_worker["worker1"][0].merge(df_by_worker["worker1"][1])
a.index

In [None]:
import pandas as pd

# Example dataframes: df1 is indexed by [1, 2], df2 by [1, 3], so only label 1 is shared.
df1 = pd.DataFrame({'A': [10, 11], 'B': [20, 21]}, index=[1, 2])
df2 = pd.DataFrame({ 'C': [30, 31]}, index=[1, 3])

# Concatenate along columns
combined_df = pd.concat([df1, df2], axis=1)
print(combined_df)

# combined_df = pd.merge(df1, df2)
# print(combined_df)

# Same two frames combined with DataFrame.join (default arguments) for comparison.
combined_df = df1.join(df2)
print(combined_df)

In [None]:
# Row-wise concatenation (default axis) of the first 1000 worker1 sub-frames.
# w1 is reset to an empty list and not used further in this cell.
w1 = []
a = pd.concat(df_by_worker["worker1"][0:1000])
# ww1 = pd.concat(w1)

In [None]:
# Number of distinct index labels in a.
print(len(set(a.index)))

In [None]:
# Total number of rows in a (repeated labels included).
print(len(a.index))

In [None]:
# Number of sub-frames collected for worker1.
len(df_by_worker["worker1"])

In [None]:

os.makedirs(output_path, exist_ok=True)

# Save the combined per-worker frames. These live in `dfs`; `df_by_worker`
# maps each worker to a *list* of single-column frames, and a list has no
# to_feather() method.
for name, df in dfs.items():
    df.to_feather(os.path.join(output_path, name + ".feather"))
    print(f"Saved {name}")

In [None]:
# Walk over every series and print its timestamp range (and distinct count)
# whenever the set of timestamps differs from that of the previous series;
# `full` accumulates the union over all series.
prev = set()
full = set()
for key, val in timestamps_container.items():
    new = set(val)
    if prev != new:
        print(f"{min(new)}-{max(new)} ({len(new)})")
    prev = new
    full = full.union(new)
print("fin")
# Overall range and number of distinct timestamps across all series.
print(f"{min(full)}-{max(full)} ({len(full)})")

In [None]:
# Experiment: combine the worker1 sub-frames column-wise in batches of
# `batch_len`, printing the index range whenever a batch covers a different
# set of index labels than the previous one.
batch_len = 100
prev = set()
w1 = pd.DataFrame()
w1_subdfs = df_by_worker["worker1"]
w1_subdfs_nodup = [x[~x.index.duplicated(keep='first')] for x in w1_subdfs]
# Batches start at index 0; starting at 1 left out the first sub-frame.
for i in range(0, len(w1_subdfs_nodup), batch_len):
    print(f"{i}: {i/len(w1_subdfs_nodup)}")
    end = min(i+batch_len, len(w1_subdfs_nodup))
    x = pd.concat(w1_subdfs_nodup[i:end], axis=1)
    x = x.loc[:,~x.columns.duplicated()]
    new = set(x.index)
    if prev != new:
        print(f"{min(new)}-{max(new)}")
    prev = new
    # x = x.dropna(axis=0)
    w1 = pd.concat([w1,x], axis=1)
w1
# w1.drop_duplicates()
# w1[~w1.columns.duplicated(keep='first')]
# w1[~w1.index.duplicated(keep='first')]


## Too many cols, col names and values are duplicated
# len(set(w1.columns.to_list()))

In [None]:

# Print the index range of every worker1 sub-frame whose set of index labels
# differs from the previous sub-frame's. The loop reads the original
# sub-frames (w1_subdfs); w1 and w1_subdfs_nodup are assigned but not used here.
prev = set()
w1 = pd.DataFrame()
w1_subdfs = df_by_worker["worker1"]
w1_subdfs_nodup = [x[~x.index.duplicated(keep='first')] for x in w1_subdfs]
for sub in w1_subdfs:
    # print(f"{i}: {i/len(w1_subdfs_nodup)}")

    new = set(sub.index)
    if prev != new:
        print(f"{min(new)}-{max(new)}")
    prev = new
    # x = x.dropna(axis=0)

# w1.drop_duplicates()
# w1[~w1.columns.duplicated(keep='first')]
# w1[~w1.index.duplicated(keep='first')]


## Too many cols, col names and values are duplicated
# len(set(w1.columns.to_list()))