In [1]:
from codeflash.verification.test_results import InvocationId
from codeflash.models.models import OriginalCodeBaseline
import dill as pickle

# Location of the pickled baseline runs.
# NOTE(review): machine-specific absolute path -- point it at your own copy before re-running.
BASELINE_PICKLE_PATH = "/Users/renaud/Desktop/baseline100.pkl"

# NOTE(security): unpickling can execute arbitrary code; only load files you produced yourself.
# The context manager guarantees the file handle is closed once loading is done.
with open(BASELINE_PICKLE_PATH, "rb") as baseline_file:
    data: list[OriginalCodeBaseline] = pickle.load(baseline_file)

# The first run represents the Oracle: its invocation ids define the set analysed in every run.
invocation_ids = {
    function_test_invocation.id for function_test_invocation in data[0].overall_test_results
}

# Timing results where the test passed, and the runtime is not None or 0.
# Layout: one entry per run -> invocation id -> {loop_index: runtime}.
usable_runtime_results: list[dict[InvocationId, dict[int, int]]] = [
    {
        invocation_id: {
            function_test_invocation.loop_index: runtime
            for function_test_invocation in result.overall_test_results
            if (runtime := function_test_invocation.runtime)
            and function_test_invocation.id == invocation_id
            and function_test_invocation.did_pass
        }
        for invocation_id in invocation_ids
    }
    for result in data
]

# Only invocations that have at least one usable timing in the first run are considered.
valid_invocation_ids = {
    invocation_id for invocation_id in invocation_ids if usable_runtime_results[0][invocation_id]
}

# A run is invalid if one of its test invocations has no valid result.
nonempty_runtime_results: list[dict[InvocationId, dict[int, int]]] = [
    {invocation_id: run_runtimes[invocation_id] for invocation_id in valid_invocation_ids}
    for run_runtimes in usable_runtime_results
    if all(run_runtimes[invocation_id] for invocation_id in valid_invocation_ids)
]

# Per run: the smallest runtime observed for each invocation across all of its loops.
run_min_runtimes = [
    {invocation_id: min(runtimes[invocation_id].values()) for invocation_id in runtimes}
    for runtimes in nonempty_runtime_results
]
# Per run: the sum of those per-invocation minima.
run_total_runtimes = [
    sum(test_invocation_runtimes.values()) for test_invocation_runtimes in run_min_runtimes
]
# The total runtime stored on each baseline object, used to cross-check the sums above.
run_total_runtimes2 = [result.runtime for result in data]

print(f"Timing calculations are consistent: {run_total_runtimes == run_total_runtimes2}")
print(run_total_runtimes)

Timing calculations are consistent: True
[3427917, 3397916, 3395124, 3390412, 3303873, 3468999, 3434249, 3445252, 3342791, 3444248, 3549496, 3285623, 3617039, 3444914, 3540250, 3410374, 3539542, 3292583, 3413747, 3453915, 3597335, 3391166, 3355912, 3691122, 3467460, 3682375, 3522458, 3463334, 3717790, 3595706, 3375916, 3307706, 3388250, 3403586, 3393580, 3393750, 3369835, 3489581, 3363870, 3478123, 3417915, 3427583, 3390582, 3588542, 3508168, 3461457, 3479496, 3316957, 3461040, 3429001, 3650874, 3484789, 3667246, 3483750, 3358830, 3448291, 3456958, 3415290, 3181582, 3443668, 3361624, 3640580, 3410539, 3475081, 3510458, 3516707, 3369163, 3379706, 3694418, 3376625, 3485831, 3372290, 3424334, 3461540, 3630829, 3665957, 3474542, 3289749, 3358750, 3204707, 3449957, 3335665, 3364667, 3466831, 3616958, 3614122, 3543041, 3316167, 3466373, 3423167, 3403418, 3409211, 3402127, 3360996, 3388913, 3662916, 3423126, 3655789, 3287874, 3470374]


In [2]:
from typing import Callable, SupportsFloat
from codeflash.code_utils.time_utils import humanize_runtime
import numpy as np
from numpy.typing import ArrayLike

# A numeric *value* convertible to float (plain ints/floats as well as NumPy scalars).
NumberType = SupportsFloat

def analyze_num_array(
        num_array: ArrayLike,
        formatter: Callable[[NumberType], str]
)-> None:
    """Print summary statistics, percentiles and IQR-based outliers of ``num_array``.

    Args:
        num_array: Non-empty collection of numbers (anything ``np.asarray`` accepts).
        formatter: Turns a single number into its display string (e.g. ``str``).

    Raises:
        ValueError: If ``num_array`` contains no elements.
    """
    array = np.asarray(num_array)
    if array.size == 0:
        # Fail early with a clear message instead of NumPy's reduction error.
        raise ValueError("analyze_num_array() cannot analyze an empty array")

    mean = np.mean(array)
    max_value = np.max(array)
    min_value = np.min(array)
    median = np.median(array)
    std_dev = np.std(array)

    percentages = [0, 5, 25, 50, 75, 95, 100]
    percentiles = np.percentile(array, percentages)
    # Look the quartiles up by percentage so they stay correct if the list above changes.
    q1 = percentiles[percentages.index(25)]
    q3 = percentiles[percentages.index(75)]
    iqr = q3 - q1
    # Tukey fences: values further than 1.5 * IQR outside the quartiles count as outliers.
    outlier_min = q1 - 1.5 * iqr
    outlier_max = q3 + 1.5 * iqr
    small_outliers = np.sort(array[array < outlier_min])
    large_outliers = np.sort(array[array > outlier_max])

    print(f"Mean +- std dev: {formatter(mean)} +- {formatter(std_dev)}")
    print(f"Max: {formatter(max_value)}")
    print(f"Median: {formatter(median)}")
    print(f"Min: {formatter(min_value)}")
    print()
    for percentage, percentile in zip(percentages, percentiles):
        # The relative deviation from the mean is undefined when the mean is zero.
        relative = f"{(percentile - mean) / mean:.0%}" if mean != 0 else "n/a"
        print(f"{percentage}th percentile: {formatter(percentile)} ("
              f"{relative} of the mean)")
    print()
    # Outliers
    print(f"Small outliers (< {formatter(outlier_min)}): {[formatter(outlier) for outlier in small_outliers]}")
    print()
    print(f"Large outliers (> {formatter(outlier_max)}): {[formatter(outlier) for outlier in large_outliers]}")
    print()
    print(f"Total number of outliers: {len(small_outliers) + len(large_outliers)}")
    print(f"Number of small outliers: {len(small_outliers)}")
    print(f"Number of large outliers: {len(large_outliers)}")
    
# Summarise the per-run total runtimes, rendering each value with humanize_runtime.
analyze_num_array(num_array=run_total_runtimes, formatter=humanize_runtime)

Mean +- std dev: 3.45 milliseconds +- 110 microseconds
Max: 3.72 milliseconds
Median: 3.44 milliseconds
Min: 3.18 milliseconds

0th percentile: 3.18 milliseconds (-8% of the mean)
5th percentile: 3.29 milliseconds (-5% of the mean)
25th percentile: 3.39 milliseconds (-2% of the mean)
50th percentile: 3.44 milliseconds (-0% of the mean)
75th percentile: 3.49 milliseconds (1% of the mean)
95th percentile: 3.67 milliseconds (6% of the mean)
100th percentile: 3.72 milliseconds (8% of the mean)

Small outliers (< 3.22 milliseconds): ['3.18 milliseconds', '3.20 milliseconds']

Large outliers (> 3.66 milliseconds): ['3.66 milliseconds', '3.67 milliseconds', '3.67 milliseconds', '3.68 milliseconds', '3.69 milliseconds', '3.69 milliseconds', '3.72 milliseconds']

Total number of outliers: 9
Number of small outliers: 2
Number of large outliers: 7


In [3]:
# For every valid run, the largest loop index recorded across all of its test invocations.
run_loop_counts = [
    max(max(loop_runtimes) for loop_runtimes in run_runtimes.values())
    for run_runtimes in nonempty_runtime_results
]

print(f"Loop counts: {run_loop_counts}")
print()
analyze_num_array(run_loop_counts, str)

Loop counts: [106, 127, 116, 127, 123, 136, 132, 131, 130, 130, 127, 131, 130, 133, 123, 126, 133, 129, 120, 135, 114, 121, 121, 125, 127, 123, 116, 134, 122, 118, 128, 118, 128, 124, 116, 126, 130, 124, 118, 131, 124, 130, 123, 93, 115, 131, 127, 128, 132, 133, 131, 128, 132, 124, 131, 132, 130, 135, 130, 129, 128, 124, 129, 121, 129, 131, 129, 126, 128, 115, 131, 120, 122, 115, 125, 119, 130, 126, 124, 87, 126, 123, 108, 124, 130, 119, 121, 125, 119, 128, 121, 108, 90, 121, 125, 109, 104, 128, 123, 104]

Mean +- std dev: 123.62 +- 9.040774303122493
Max: 136
Median: 126.0
Min: 87

0th percentile: 87.0 (-30% of the mean)
5th percentile: 105.9 (-14% of the mean)
25th percentile: 121.0 (-2% of the mean)
50th percentile: 126.0 (2% of the mean)
75th percentile: 130.0 (5% of the mean)
95th percentile: 133.0 (8% of the mean)
100th percentile: 136.0 (10% of the mean)

Small outliers (< 107.5): ['87', '90', '93', '104', '104', '106']

Large outliers (> 143.5): []

Total number of outliers: 6
N

In [4]:
# For every valid run: find, per invocation, the first loop index whose runtime equals that
# invocation's minimum runtime, then keep the largest of those indices, i.e. the loop by which
# every invocation has reached its minimum.
run_reach_min_loop = [
    max(
        min(
            loop_index
            for loop_index, loop_runtime in run_runtimes[invocation_id].items()
            if loop_runtime == min_runtimes[invocation_id]
        )
        for invocation_id in valid_invocation_ids
    )
    for run_runtimes, min_runtimes in zip(nonempty_runtime_results, run_min_runtimes)
]

print(f"Loop count to reach min runtime: {run_reach_min_loop}")
print()
analyze_num_array(run_reach_min_loop, str)

Loop count to reach min runtime: [106, 127, 116, 125, 106, 135, 116, 131, 122, 107, 126, 127, 127, 131, 123, 124, 129, 122, 115, 127, 113, 112, 114, 108, 120, 114, 111, 127, 111, 113, 110, 118, 108, 77, 104, 121, 130, 116, 105, 111, 119, 127, 121, 93, 113, 117, 121, 98, 118, 110, 121, 118, 127, 114, 126, 125, 119, 129, 126, 84, 118, 122, 119, 101, 110, 114, 103, 119, 124, 108, 114, 115, 117, 103, 110, 93, 128, 119, 121, 72, 123, 90, 101, 110, 122, 111, 121, 107, 118, 126, 114, 107, 80, 99, 116, 104, 71, 101, 112, 86]

Mean +- std dev: 113.6 +- 12.804686642007294
Max: 135
Median: 116.0
Min: 71

0th percentile: 71.0 (-37% of the mean)
5th percentile: 85.9 (-24% of the mean)
25th percentile: 108.0 (-5% of the mean)
50th percentile: 116.0 (2% of the mean)
75th percentile: 122.0 (7% of the mean)
95th percentile: 129.0 (14% of the mean)
100th percentile: 135.0 (19% of the mean)

Small outliers (< 87.0): ['71', '72', '77', '80', '84', '86']

Large outliers (> 143.0): []

Total number of outli

In [8]:
# Allocated benchmarking time is 5 seconds (5_000_000_000 when expressed in nanoseconds).
# Each run's time to reach its minimum is estimated as the fraction of loops needed to get
# there, multiplied by that allocated time.
time_reach_min_loop = [
    reached_at_loop / total_loops * 5_000_000_000
    for reached_at_loop, total_loops in zip(run_reach_min_loop, run_loop_counts)
]
print(f"Times to reach min loop: {time_reach_min_loop}")
print()
analyze_num_array(time_reach_min_loop, humanize_runtime)

Times to reach min loop: [5000000000.0, 5000000000.0, 5000000000.0, 4921259842.519685, 4308943089.430895, 4963235294.117647, 4393939393.939394, 5000000000.0, 4692307692.307693, 4115384615.3846154, 4960629921.259843, 4847328244.274809, 4884615384.615384, 4924812030.075188, 5000000000.0, 4920634920.63492, 4849624060.150376, 4728682170.542636, 4791666666.666667, 4703703703.703704, 4956140350.8771925, 4628099173.5537195, 4710743801.652892, 4320000000.0, 4724409448.818897, 4634146341.463415, 4784482758.620689, 4738805970.149254, 4549180327.868853, 4788135593.220339, 4296875000.0, 5000000000.0, 4218750000.0, 3104838709.677419, 4482758620.689655, 4801587301.587302, 5000000000.0, 4677419354.83871, 4449152542.372881, 4236641221.374046, 4798387096.774194, 4884615384.615384, 4918699186.99187, 5000000000.0, 4913043478.26087, 4465648854.961832, 4763779527.559055, 3828125000.0, 4469696969.69697, 4135338345.864661, 4618320610.687023, 4609375000.0, 4810606060.606061, 4596774193.548387, 4809160305.3435