In [6]:
import common_utils
import os
import pandas as pd
import difflib

# Example usage
root_folder = '../../../data_warehouse/minimized_warehouse_4'
filename = 'worker1.feather'
subfolders = common_utils.find_subfolders_with_file(root_folder, filename)
print(subfolders)
prom_data_paths = {os.path.basename(x): x for x in subfolders}
yolo_data_paths = {key: os.path.join(val, "yolo_qos.feather") for key, val in prom_data_paths.items()}

['../../../data_warehouse/minimized_warehouse_4/1735854367_(5.5000)', '../../../data_warehouse/minimized_warehouse_4/1735863576_(1.5000)', '../../../data_warehouse/minimized_warehouse_4/1735851224_(7.5000)', '../../../data_warehouse/minimized_warehouse_4/1735849172_(8.1000)', '../../../data_warehouse/minimized_warehouse_4/1735844045_(30.10000)', '../../../data_warehouse/minimized_warehouse_4/1735860195_(2.5000)', '../../../data_warehouse/minimized_warehouse_4/1735853373_(6.10000)', '../../../data_warehouse/minimized_warehouse_4/1735842931_(30.1000)', '../../../data_warehouse/minimized_warehouse_4/1735845629_(20.10000)', '../../../data_warehouse/minimized_warehouse_4/1735859344_(2.1000)', '../../../data_warehouse/minimized_warehouse_4/1735852779_(6.5000)', '../../../data_warehouse/minimized_warehouse_4/1735852257_(6.1000)', '../../../data_warehouse/minimized_warehouse_4/1735848689_(9.10000)', '../../../data_warehouse/minimized_warehouse_4/1735853852_(5.1000)', '../../../data_warehouse/m

In [7]:
# Clean dataframe and calculate power
def get_total_joules(dataframe):
    cleaned_df = dataframe
    
    """ Sort by timestamp to make sure it makes sense to compute difference between first and last values """
    cleaned_df.sort_values(by="timestamp", inplace=True)
    
    """ Get all relevant columns for power calculation """
    target_word = 'kepler node package joules total dynamic'
    closest_matches = difflib.get_close_matches(target_word, cleaned_df.columns, n=2, cutoff=0.05)
    
    """ Compute joules per match """
    joules_per_match = []
    for match in closest_matches:
        joules = cleaned_df[match].max() - cleaned_df[match].min()
        joules_per_match.append(joules)
    
    """ Compute total joules """
    total_joules = sum(joules_per_match)
    return total_joules

total_joules_per_model = {}
for key in prom_data_paths.keys():
    paths = []
    """ Get all workers """
    for work_num in range(1, 6):
        temp_path = os.path.join(prom_data_paths[key], f"worker{work_num}.feather")
        paths.append(temp_path)

    """ Get joules per image for each worker """
    joules_per_worker = [get_total_joules(common_utils.get_cleaned_df(x)) for x in paths]
    joules_total = sum(joules_per_worker)
    num_images = 1000 #common_utils.get_number_of_images(key)  # TODO: Get from somewhere
    joules_per_image = joules_total / num_images

    """ Add result to dict for current model and resolution """
    model_info = common_utils.path_to_workers_and_pcl_size(key)
    if model_info.resolution not in total_joules_per_model:
        total_joules_per_model[model_info.resolution] = {}
    total_joules_per_model[model_info.resolution][model_info.num_vehicles] = joules_per_image

max_joules = {}
for resolution in sorted(total_joules_per_model.keys()):
    joules = pd.DataFrame.from_dict(total_joules_per_model[resolution], orient='index', columns=['Joules'])
    joules.columns = [f'{resolution}']
    max_joules[resolution] = joules



In [8]:
# Grouped bars
import plotly.express as px

# Define width based on resolution
# resolution_to_width = {160: 0.2, 320: 0.4, 640: 0.6, 1280: 0.8}
max_joules_df = pd.concat(max_joules.values(), axis=1)
max_joules_df_sorted = max_joules_df
# max_joules_df_sorted = common_utils.sort_by_model_size_then_version(max_joules_df)

fig = px.bar(max_joules_df_sorted, barmode='group', title='Joules per PCL', labels={'value': 'Max Power (Watts)', 'index': 'Model'})
fig.update_layout(xaxis_title='Num_workers', yaxis_title='Joules', legend_title_text='Resolution')
fig.show()

fig = px.bar(max_joules_df_sorted, barmode='group', title='Joules per PCL (Log Scale)', labels={'value': 'Max Power (Watts)', 'index': 'Model'})
fig.update_layout(xaxis_title='Num_workers', yaxis_title='Joules', yaxis_type='log', legend_title_text='Resolution')
fig.show()

In [9]:
""" Stacked bars, adjusted to the correct heights for each resolution """

# Calculate differences in joules for each resolution to represent the stacked bar chart
diff_joules_per_model = {}
resolutions = sorted(total_joules_per_model.keys())

for resolution in resolutions:
    diff_joules_per_model[resolution] = total_joules_per_model[resolution].copy()
    if resolution > resolutions[0]:
        prev_resolution = resolutions[resolutions.index(resolution) - 1]
        for model in diff_joules_per_model[resolution].keys():
            if model in total_joules_per_model[prev_resolution]:
                diff_joules_per_model[resolution][model] -= total_joules_per_model[prev_resolution][model]

# Create DataFrame for the stacked bar graph
diff_max_joules = {resolution: pd.DataFrame.from_dict(diff_joules_per_model[resolution], orient='index', columns=[f'{resolution}']) for resolution in resolutions}

stacked_diff_max_joules_df = pd.concat(diff_max_joules.values(), axis=1)

# Create stacked bar graph
fig = px.bar(stacked_diff_max_joules_df, barmode='relative', title='Joules per image (Adjusted for Resolution Differences)', labels={'value': 'Joules', 'index': 'Model'})
fig.update_layout(xaxis_title='Model', yaxis_title='Joules', legend_title_text='Resolution')
fig.show()

In [10]:


max_joules_df_sorted = common_utils.sort_by_model_size_then_version(max_joules_df)

fig = px.bar(max_joules_df_sorted, title='Joules per image (relative stacked)', labels={'value': 'Max Power (Watts)', 'index': 'Model'})
fig.update_layout(xaxis_title='Model', yaxis_title='Joules', legend_title_text='Resolution')
fig.show()
fig = px.bar(max_joules_df_sorted, title='Joules per image (relative stacked)', labels={'value': 'Max Power (Watts)', 'index': 'Model'})
fig.update_layout(xaxis_title='Model', yaxis_title='Joules', yaxis_type='log', legend_title_text='Resolution')
fig.show()

AttributeError: 'int' object has no attribute 'replace'

In [None]:

# Define width based on resolution
# resolution_to_width = {160: 0.2, 320: 0.4, 640: 0.6, 1280: 0.8}
diff_joules_df_sorted = common_utils.sort_by_model_size_then_version(stacked_diff_max_joules_df)

fig = px.bar(
    diff_joules_df_sorted, 
    title='Joules per image (overlapping stacked)', 
    labels={'value': 'Max Power (Watts)', 'index': 'Model'},
    # width=[resolution_to_width.get(int(col), 0.5) for col in diff_joules_df_sorted.columns]
)
fig.update_layout(xaxis_title='Model', yaxis_title='Joules', legend_title_text='Resolution')
fig.show()
fig = px.bar(
    diff_joules_df_sorted, 
    title='Joules per image (overlapping stacked)', 
    labels={'value': 'Max Power (Watts)', 'index': 'Model'},
    # width=[resolution_to_width.get(int(col), 0.5) for col in diff_joules_df_sorted.columns]
)
fig.update_layout(xaxis_title='Model', yaxis_title='Joules', yaxis_type='log', legend_title_text='Resolution')
fig.show()

fig = px.bar(max_joules_df_sorted, barmode='group', title='Joules per image (Log Scale)', labels={'value': 'Max Power (Watts)', 'index': 'Model'})
fig.update_layout(xaxis_title='Model', yaxis_title='Joules', yaxis_type='log', legend_title_text='Resolution')
fig.show()

In [None]:
diff_joules_df_sorted

In [None]:
diff_joules_df_sorted.to_csv("data_diff_joules.csv")