In [48]:
import pandas as pd
import re

from pathlib import Path
from datetime import datetime
from math import ceil, log10

In [26]:
# Directory scanned by collect_and_parse_data(); each sub-directory is one test
# run, and its name is parsed as that run's timestamp.
TEST_DIR = "results"
# Result files read from inside every run directory, one per schema under test.
ONE_TABLE_FILENAME = "one_table_results.txt"
MULTI_TABLE_FILENAME = "multi_table_results.txt"
# strptime format used to parse the run directory names.
TIMESTAMP_FORMAT = "%Y-%m-%d_%H:%M:%S.%f"

# Baseline run time (4m:25s) that each test's total time is compared against.
TEST_BASELINE_MS = 265000

In [45]:
def parse_explain_file(path: str, test_timestamp: str, testing_schema: str) -> list[dict]:
    """
    Extract one record per test case from an EXPLAIN ANALYZE results file.

    The file is cut into test sections on the ``TEST_NAME:`` marker, and each
    test section is cut into case sections on the ``TEST_CASE_NUM:`` marker.
    From every case section the following are pulled out with regular
    expressions:
        - the case number (first run of digits in the section),
        - the factor list (the first ``{...}`` group of comma separated digits),
        - the planning time in ms, and
        - the execution time in ms.

    The algorithm chosen by the planner is deliberately not extracted.

    Args:
        path: Location of the results file (read as UTF-8).
        test_timestamp: Timestamp of the run, formatted as TIMESTAMP_FORMAT.
        testing_schema: Label of the schema the file belongs to.

    Returns:
        A list of dictionaries with the keys ``schema``, ``timestamp``,
        ``name``, ``case_num``, ``factors``, ``planning_time_ms`` and
        ``execution_time_ms``. Any value that could not be found in the text
        is ``None``.
    """

    def first_capture(pattern: str, haystack: str):
        """Return group 1 of the first match of `pattern`, or None if absent."""
        found = re.search(pattern, haystack)
        return found.group(1) if found else None

    def convert_if_present(convert, raw):
        """Apply `convert` to `raw`, passing None through untouched."""
        return None if raw is None else convert(raw)

    with open(path, "r", encoding="utf-8") as handle:
        contents = handle.read()

    rows = []

    # re.split() puts whatever precedes the first marker at index 0; that
    # preamble is not a test section, hence the [1:].
    for test_section in re.split(r"=+\s*TEST_NAME:", contents)[1:]:
        test_name = convert_if_present(
            str.strip, first_capture(r"(.*?)\n=+", test_section)
        )

        for case_section in re.split(r"=+\s*TEST_CASE_NUM:", test_section)[1:]:
            case_num = convert_if_present(
                int, first_capture(r"(\d+)", case_section)
            )
            factors_str = convert_if_present(
                lambda braced: braced.strip("{}"),
                first_capture(r"({[\d,]*\d})", case_section),
            )
            planning_time = convert_if_present(
                float, first_capture(r"Planning Time:\s*([\d.]+)\s*ms", case_section)
            )
            execution_time = convert_if_present(
                float, first_capture(r"Execution Time:\s*([\d.]+)\s*ms", case_section)
            )

            record = {
                "schema": testing_schema,
                "timestamp": datetime.strptime(test_timestamp, TIMESTAMP_FORMAT),
                "name": test_name,
                "case_num": case_num,
                "factors": factors_str,
                "planning_time_ms": planning_time,
                "execution_time_ms": execution_time,
            }
            rows.append(record)

    return rows

In [28]:
def collect_and_parse_data() -> list[dict]:
    """
    Parse every test run stored under TEST_DIR.

    Each sub-directory of TEST_DIR is treated as one test run: its name is
    handed to parse_explain_file() as the run timestamp, and both the
    one-table and the multi-table result files inside it are parsed. Entries
    of TEST_DIR that are not directories are ignored.

    Returns:
        One flat list holding the records of every run, in run-directory name
        order; within a run the "One Table" records precede the "Multi Table"
        records.

    Raises:
        FileNotFoundError: if TEST_DIR does not exist. Errors raised by
            parse_explain_file() (e.g. a run directory missing one of the
            result files) are propagated unchanged.
    """
    results: list[dict] = []

    # Path.iterdir() yields entries in arbitrary, filesystem-dependent order;
    # sorting makes the returned list reproducible between machines and runs.
    for run_dir in sorted(Path(TEST_DIR).iterdir()):
        if not run_dir.is_dir():
            continue

        # Add the results from both schemata
        results.extend(parse_explain_file(run_dir / ONE_TABLE_FILENAME, run_dir.name, "One Table"))
        results.extend(parse_explain_file(run_dir / MULTI_TABLE_FILENAME, run_dir.name, "Multi Table"))

    return results

In [None]:
def main():
    """
    Load the parsed test results into a DataFrame, enrich it and display it.

    Steps:
        - Parse the data from the explain analyze queries via
          collect_and_parse_data(). The name of each run directory indicates
          when the test was run; everything else comes from the result files.
        - Organize the data into a pandas DataFrame and add the derived
          columns used for analysis:
            * total_time_ms       planning time + execution time
            * improvement_ms      TEST_BASELINE_MS - total_time_ms
            * improvement_percent improvement_ms / TEST_BASELINE_MS, stored
                                  as a fraction (0.93 means 93%)
            * num_of_factors      number of comma separated factors
            * min_ord_mag10 /     ceil(log10(x)) of the smallest / largest
              max_ord_mag10       factor
            * balanced            "Balanced" when both magnitudes are equal,
                                  otherwise "Unbalanced"
        - Sort the rows and display the table.

    The baseline the results are compared against is 4m:25s, i.e. 265000ms.

    TODO: the summary statistics and the PNG visualizations planned for the
    final report are not implemented in this function yet.

    Returns:
        None. When no test results are found a message is printed and the
        function returns without displaying a table.
    """
    print("Analyzing database tests ran.")
    print(f"Comparing against the baseline of {TEST_BASELINE_MS:,}ms.")
    results_df = pd.DataFrame(collect_and_parse_data())

    # With no records the frame has no columns at all, so every column lookup
    # below would fail with a KeyError; stop early with a clear message.
    if results_df.empty:
        print("No test results were found; nothing to analyze.")
        return

    # Add calculated columns for analysis
    results_df["total_time_ms"] = results_df["planning_time_ms"] + results_df["execution_time_ms"]
    results_df["improvement_ms"] = TEST_BASELINE_MS - results_df["total_time_ms"]
    results_df["improvement_percent"] = results_df["improvement_ms"] / TEST_BASELINE_MS
    results_df["num_of_factors"] = results_df["factors"].str.count(",") + 1

    # Parse each factor string once and reuse it for both magnitude columns.
    # NOTE(review): assumes every case has a factor list of positive integers;
    # a missing list (None) or a 0 would raise here - confirm against the data.
    factor_lists = results_df["factors"].apply(
        lambda text: [int(part) for part in text.split(",")]
    )

    # Get the min magnitude and max magnitude of each number in the tests
    results_df["min_ord_mag10"] = factor_lists.apply(lambda values: ceil(log10(min(values))))
    results_df["max_ord_mag10"] = factor_lists.apply(lambda values: ceil(log10(max(values))))

    # Balanced if the order of magnitudes equal from min to max
    same_magnitude = results_df["min_ord_mag10"] == results_df["max_ord_mag10"]
    results_df["balanced"] = same_magnitude.map({True: "Balanced", False: "Unbalanced"})

    # Sort the table by timestamp (asc), name (asc), case_num (asc) and
    # schema (desc), which lists "One Table" before "Multi Table" for a case.
    results_df = results_df.sort_values(
        ["timestamp", "name", "case_num", "schema"],
        ascending=[True, True, True, False],
        ignore_index=True,
    )

    display(results_df)

    # TODO: Summary statistics on the data


# Entry point: run the analysis when this code is executed directly. The guard
# also holds inside the notebook kernel, so executing this cell calls main().
if __name__ == "__main__":
    main()

Analyzing database tests ran.
Comparing against the baseline of 265,000ms.


Unnamed: 0,schema,timestamp,name,case_num,factors,planning_time_ms,execution_time_ms,total_time_ms,improvement_ms,improvement_percent,num_of_factors,min_ord_mag10,max_ord_mag10,balanced
0,One Table,2025-11-25 05:14:41.451604,1_primes.txt,1,5,1.501,19531.366,19532.867,245467.133,0.926291,1,1,1,Balanced
1,Multi Table,2025-11-25 05:14:41.451604,1_primes.txt,1,5,2.266,16939.897,16942.163,248057.837,0.936067,1,1,1,Balanced
2,One Table,2025-11-25 05:14:41.451604,1_primes.txt,2,11,1.517,56726.908,56728.425,208271.575,0.785930,1,2,2,Balanced
3,Multi Table,2025-11-25 05:14:41.451604,1_primes.txt,2,11,3.346,43709.184,43712.530,221287.470,0.835047,1,2,2,Balanced
4,One Table,2025-11-25 05:14:41.451604,1_primes.txt,3,29,1.497,57021.331,57022.828,207977.172,0.784820,1,2,2,Balanced
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
355,Multi Table,2025-11-25 06:00:15.790149,7_3_unbalanced.txt,1,111011009,3.229,15904.884,15908.113,249091.887,0.939969,3,2,4,Unbalanced
356,One Table,2025-11-25 06:00:15.790149,7_3_unbalanced.txt,2,171972749,1.549,342.765,344.314,264655.686,0.998701,3,2,4,Unbalanced
357,Multi Table,2025-11-25 06:00:15.790149,7_3_unbalanced.txt,2,171972749,3.237,15756.487,15759.724,249240.276,0.940529,3,2,4,Unbalanced
358,One Table,2025-11-25 06:00:15.790149,7_3_unbalanced.txt,3,234018009,1.550,438.246,439.796,264560.204,0.998340,3,2,4,Unbalanced
