In [44]:
import os

from pandas.core.interchange.dataframe_protocol import DataFrame


def find_subfolders_with_file(root_folder, filename):
    """Collect every directory under ``root_folder`` that directly holds ``filename``.

    The tree is traversed with ``os.walk``; a directory is reported when one of
    its own files (not a file of a nested directory) is named exactly
    ``filename``.

    Args:
        root_folder: Directory at which the walk starts.
        filename: Exact file name to look for.

    Returns:
        list[str]: Paths of the matching directories, in ``os.walk`` order.
    """
    return [
        current_dir
        for current_dir, _subdirs, file_names in os.walk(root_folder)
        if filename in file_names
    ]

# Discover every folder below the data root that contains a `worker1.feather`
# file; `subfolders` is the list of those folder paths.
root_folder = '../../../data/minimized'
filename = 'worker1.feather'
subfolders = find_subfolders_with_file(root_folder, filename)
# Optional filter (disabled): keep only the yolov9 / yolov10 folders.
# subfolders = [x for x in subfolders if "yolov9" in x or "yolov10" in x]
print(subfolders)

['../../../data/minimized/1730280141_yolov9e_1280', '../../../data/minimized/1730215709_yolov9c_160', '../../../data/minimized/1730211503_yolov10l_160', '../../../data/minimized/1730251802_yolov8n_640', '../../../data/minimized/1730230004_yolov9t_320', '../../../data/minimized/1730257386_yolo11s_1280', '../../../data/minimized/1730214868_yolov9m_160', '../../../data/minimized/1730222435_yolo11s_320', '../../../data/minimized/1730263696_yolo11x_1280', '../../../data/minimized/1730206458_yolo11m_160', '../../../data/minimized/1730247262_yolov9t_640', '../../../data/minimized/1730272260_yolov9t_1280', '../../../data/minimized/1730221594_yolo11n_320', '../../../data/minimized/1730235892_yolov8m_320', '../../../data/minimized/1730253484_yolov8m_640', '../../../data/minimized/1730227481_yolov10m_320', '../../../data/minimized/1730249802_yolov9c_640', '../../../data/minimized/1730268739_yolov10l_1280', '../../../data/minimized/1730212344_yolov10x_160', '../../../data/minimized/1730265410_yolo

In [45]:
import os
import pandas as pd
from functools import lru_cache

@lru_cache(maxsize=None)
def _read_feather_shared(file_path):
    """Read ``file_path`` once and keep the resulting frame for the kernel's lifetime."""
    return pd.read_feather(file_path)


def read_feather_cached(file_path):
    """Return the feather file at ``file_path`` as a DataFrame, reading it from disk only once.

    The parsed frame is memoised per path, but every call receives its own
    copy. Handing out the cached object itself would let a caller that adds
    columns or changes the index in place silently alter what every later
    caller (and every re-run of a cell) gets back.

    Args:
        file_path: Path of the feather file; it is the memoisation key, so it
            must be hashable (e.g. a ``str``).

    Returns:
        pandas.DataFrame: An independent copy of the cached frame.
    """
    return _read_feather_shared(file_path).copy()


# Keep the lru_cache management helpers reachable under the public name.
read_feather_cached.cache_clear = _read_feather_shared.cache_clear
read_feather_cached.cache_info = _read_feather_shared.cache_info

In [46]:
from utils.header_cleaner import *
import difflib
import os

# Compare power draw and latency across the benchmark runs.
# NOTE: this cell appears to have started as a PyTorch vs. OpenVINO YOLOv8n comparison;
# it now iterates over every folder listed in `subfolders`.

def clean_and_calculate_power(dataframe):
    """Clean a metrics frame and derive a power column from its joule counters.

    Steps: run ``clean_df`` on the input, order the rows by ``timestamp``, sum
    the (up to) six columns whose names are closest to
    ``'kepler node joules total dynamic'`` into ``total_joules``, and divide the
    row-to-row change of that sum by the sampling interval to obtain
    ``power_consumed``.

    Args:
        dataframe: Metrics frame; the result of ``clean_df`` on it must contain
            a ``timestamp`` column.

    Returns:
        A timestamp-sorted frame with ``total_joules`` and ``power_consumed``
        added. The first ``power_consumed`` value is NaN (there is no previous
        sample to difference against).
    """
    # sort_values returns a new frame, so whatever clean_df handed back (possibly
    # the caller's own object) is neither reordered nor extended in place.
    cleaned_df = clean_df(dataframe).sort_values(by="timestamp")
    # Target word matching.
    # NOTE(review): with cutoff=0.05 this is effectively "the six nearest column
    # names"; it may pick up columns other than the dynamic-mode counters --
    # confirm that summing all of them is intended.
    target_word = 'kepler node joules total dynamic'
    closest_matches = difflib.get_close_matches(target_word, cleaned_df.columns, n=6, cutoff=0.05)
    print(closest_matches)
    cleaned_df['total_joules'] = cleaned_df[closest_matches].sum(axis=1)
    ts = cleaned_df["timestamp"]
    # The interval must come from the first two rows *after sorting*. ts[0]/ts[1]
    # would be label lookups and return the rows that were first before the sort.
    # Assumes a constant sampling interval -- TODO confirm.
    interval = ts.iloc[1] - ts.iloc[0] if len(ts) > 1 else float("nan")
    cleaned_df['power_consumed'] = cleaned_df['total_joules'].diff() / interval
    return cleaned_df


# Map every discovered folder name to its path, and to the path of its YOLO QoS export.
prom_data_paths = {os.path.basename(x): x for x in subfolders}
yolo_data_paths = {key: os.path.join(val, "yolo_qos.feather") for key, val in prom_data_paths.items()}

total_power = {}
response_time = {}

# Power: per folder, average `power_consumed` row-wise over the worker files.
for key, run_folder in prom_data_paths.items():
    # worker1..worker5 are hard-coded; every folder is expected to hold all five files.
    paths = [os.path.join(run_folder, f"worker{work_num}.feather") for work_num in range(1, 6)]
    for temp_path in paths:
        print(temp_path)
    concatenated_power = pd.concat(
        [clean_and_calculate_power(read_feather_cached(x))['power_consumed'] for x in paths],
        axis=1,
    )
    total_power[key] = concatenated_power.mean(axis=1)

# Latency: per folder, mean end-to-end response time in 5-second buckets.
for key in prom_data_paths.keys():
    # `assign` / `set_index` build a new frame, so the frame held by the feather
    # cache is never modified and this cell can be re-run safely.
    yolo_df = read_feather_cached(yolo_data_paths[key]).assign(
        total_inference_time=lambda d: d['inf'] + d['post'] + d['pre'],
        end_to_end_response_time=lambda d: d['total_inference_time'] + d['queue'],
        # A datetime index is required by `resample` below.
        start=lambda d: pd.to_datetime(d['start_time'], unit='ms'),
    ).set_index('start')
    # '5s' is the current spelling of the seconds alias ('5S' is deprecated).
    response_time[key] = (
        yolo_df['end_to_end_response_time']
        .resample('5s')
        .mean()
        .reset_index(drop=True)
        .rename(key)
    )

../../../data/minimized/1730280141_yolov9e_1280/worker1.feather
../../../data/minimized/1730280141_yolov9e_1280/worker2.feather
../../../data/minimized/1730280141_yolov9e_1280/worker3.feather
../../../data/minimized/1730280141_yolov9e_1280/worker4.feather
../../../data/minimized/1730280141_yolov9e_1280/worker5.feather
Loaded 657 rows and 774 columns
Removing 0 static columns (774 remaining)
Unable to read timestamp as json
['kepler_node_core_joules_total_mode_dynamic', 'kepler_node_uncore_joules_total_mode_dynamic', 'kepler_node_package_joules_total_mode_dynamic', 'kepler_node_core_joules_total_mode_idle', 'kepler_node_uncore_joules_total_mode_idle', 'kepler_node_package_joules_total_mode_idle']
Loaded 657 rows and 720 columns
Removing 0 static columns (720 remaining)
Unable to read timestamp as json
['kepler_node_core_joules_total_mode_dynamic', 'kepler_node_uncore_joules_total_mode_dynamic', 'kepler_node_package_joules_total_mode_dynamic', 'kepler_node_core_joules_total_mode_idle', '

In [47]:
def calculate_rate(dataframe, target_word):
    """Turn the counter column best matching ``target_word`` into a per-interval rate.

    Rows are ordered by ``timestamp``; the column whose name is the closest
    fuzzy match to ``target_word`` is differenced row-to-row and divided by the
    sampling interval. The input frame is left untouched.

    Args:
        dataframe: Frame with a ``timestamp`` column and the counter columns.
        target_word: Text used to pick the counter column by name similarity.

    Returns:
        pandas.Series: Series named ``'rate'``, in timestamp order (original
        index labels preserved). Its first value is NaN.

    Raises:
        IndexError: If no column name is similar enough to ``target_word``.
    """
    # sort_values already returns a new frame, so no separate copy is needed.
    ordered = dataframe.sort_values(by="timestamp")
    closest_matches = difflib.get_close_matches(target_word, ordered.columns, n=6, cutoff=0.05)
    match = closest_matches[0]
    print(match)
    ts = ordered["timestamp"]
    # Use positions, not labels: after sorting, labels 0 and 1 are not
    # necessarily the first two rows. Assumes a constant sampling interval --
    # TODO confirm.
    interval = ts.iloc[1] - ts.iloc[0] if len(ts) > 1 else float("nan")
    return (ordered[match].diff() / interval).rename('rate')

# Cache of computed rate series, keyed by "<folder name><keyword>".
# It is created only when the kernel does not already hold one, so re-running
# this cell keeps previously computed rates.
if 'rate_df_cache' not in globals():
    rate_df_cache = {}

def precompute_rates(keyword):
    """Populate ``rate_df_cache`` with the ``keyword`` rate of every known folder.

    For each entry of ``prom_data_paths`` the cache key is the folder name
    followed by ``keyword``. Entries that are already cached are left alone;
    for the others, ``intermediate/full.feather`` is loaded from the folder and
    passed to ``calculate_rate``.

    Args:
        keyword: Text handed to ``calculate_rate`` to select the counter column.
    """
    for run_name, run_folder in prom_data_paths.items():
        cache_key = run_name + keyword
        if cache_key not in rate_df_cache:
            full_frame = read_feather_cached(run_folder + "/" + "intermediate/full.feather")
            rate_df_cache[cache_key] = calculate_rate(full_frame, keyword)

precompute_rates("kafka_server_brokertopicmetrics_bytesin_total yolo_output")

In [48]:
import plotly.express as px

def plot_rate_by_model(keyword, title):
    """Show one line chart per model, with one trace per resolution.

    Folder names in ``prom_data_paths`` are split as
    ``<prefix>_<model>_<resolution>``. The traces are the rate series stored in
    ``rate_df_cache`` under ``<folder name><keyword>``.

    Args:
        keyword: Metric keyword the rates were (or will be) computed for.
        title: Prefix of each figure title; the model name is appended.
    """
    # Fill in any rate that has not been computed for this keyword yet, so the
    # cache lookups below cannot fail; already cached folders are skipped.
    precompute_rates(keyword)

    grouped_df_dict = {}
    for key in prom_data_paths.keys():
        _, model_name, resolution = key.split('_')
        grouped_df_dict.setdefault(model_name, {})[resolution] = key + keyword

    def _resolution_order(res):
        # Resolutions are strings; order them by numeric value so that '160'
        # comes before '1280'. Non-numeric labels sort after the numeric ones.
        return (0, int(res)) if res.isdigit() else (1, res)

    # One figure per model, in alphabetical order.
    for model_name in sorted(grouped_df_dict):
        res_dict = grouped_df_dict[model_name]
        rate_df_dict = {
            resolution: rate_df_cache[res_dict[resolution]]
            for resolution in sorted(res_dict, key=_resolution_order)
        }

        rate_df = pd.DataFrame.from_dict(rate_df_dict)
        fig = px.line(rate_df, x=rate_df.index, y=rate_df.columns, title=f"{title} - {model_name}")
        fig.update_layout(xaxis_title='Time')
        fig.show()

def plot_resolution_by_model(keyword, title):
    """Show one line chart per resolution, with one trace per model.

    Folder names in ``prom_data_paths`` are split as
    ``<prefix>_<model>_<resolution>``. The traces are the rate series stored in
    ``rate_df_cache`` under ``<folder name><keyword>``.

    Args:
        keyword: Metric keyword the rates were (or will be) computed for.
        title: Prefix of each figure title; the resolution is appended.
    """
    # Fill in any rate that has not been computed for this keyword yet, so the
    # cache lookups below cannot fail; already cached folders are skipped.
    precompute_rates(keyword)

    resolution_df_dict = {}
    for key in prom_data_paths.keys():
        _, model_name, resolution = key.split('_')
        resolution_df_dict.setdefault(resolution, {})[model_name] = key + keyword

    def _resolution_order(res):
        # Resolutions are strings; order them by numeric value so that '160'
        # comes before '1280'. Non-numeric labels sort after the numeric ones.
        return (0, int(res)) if res.isdigit() else (1, res)

    # One figure per resolution, smallest first; traces in alphabetical model order.
    for resolution in sorted(resolution_df_dict, key=_resolution_order):
        model_dict = resolution_df_dict[resolution]
        rate_df_dict = {
            model_name: rate_df_cache[model_dict[model_name]]
            for model_name in sorted(model_dict)
        }

        rate_df = pd.DataFrame.from_dict(rate_df_dict)
        fig = px.line(rate_df, x=rate_df.index, y=rate_df.columns, title=f"{title} - resolution: {resolution}")
        fig.update_layout(xaxis_title='Time')
        fig.show()

In [49]:
# Plot the rate matched by this Kafka bytes-in keyword, one figure per resolution.
# The rate series are read from `rate_df_cache`, which `precompute_rates` fills for the same keyword.
plot_resolution_by_model("kafka_server_brokertopicmetrics_bytesin_total yolo_output", "yolo output rate")

In [58]:
# Rank every cached rate series by its peak value, highest first.
sorted_items = sorted(rate_df_cache.items(), key=lambda x: x[1].max(), reverse=True)

for key, item in sorted_items:
    # Cache keys are "<folder name><keyword>"; dropping the keyword leaves the
    # folder name, which splits into prefix, model and resolution.
    cleaned_key = key.replace('kafka_server_brokertopicmetrics_bytesin_total yolo_output', '')
    timestamp, model, resolution = cleaned_key.split('_')
    print(f"Model: {model}, Res: {resolution},  Max: {item.max()}")

Model: yolov8m, Res: 640,  Max: 7304.8
Model: yolo11x, Res: 320,  Max: 6983.6
Model: yolov9m, Res: 640,  Max: 6935.8
Model: yolo11n, Res: 1280,  Max: 6931.0
Model: yolov8x, Res: 320,  Max: 6921.0
Model: yolov10m, Res: 160,  Max: 6885.6
Model: yolov9t, Res: 1280,  Max: 6878.0
Model: yolov10l, Res: 320,  Max: 6856.4
Model: yolov9e, Res: 320,  Max: 6847.2
Model: yolov10m, Res: 640,  Max: 6835.4
Model: yolov8l, Res: 320,  Max: 6829.0
Model: yolov8s, Res: 640,  Max: 6826.4
Model: yolov8m, Res: 320,  Max: 6822.0
Model: yolo11s, Res: 640,  Max: 6821.8
Model: yolov8n, Res: 1280,  Max: 6809.2
Model: yolov10n, Res: 640,  Max: 6806.6
Model: yolov10m, Res: 320,  Max: 6804.6
Model: yolov10s, Res: 640,  Max: 6802.6
Model: yolov8n, Res: 640,  Max: 6768.6
Model: yolo11s, Res: 320,  Max: 6764.6
Model: yolov9t, Res: 640,  Max: 6759.6
Model: yolo11n, Res: 640,  Max: 6759.4
Model: yolov8s, Res: 320,  Max: 6757.0
Model: yolo11m, Res: 320,  Max: 6756.2
Model: yolo11l, Res: 320,  Max: 6752.2
Model: yolov9s, 