In [2]:
import time
import uuid

from great_tables import GT, loc, style
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import uuid_utils as uuid_ut

# UUIDs with Python

Python has a built-in library for UUIDs.
But there's also the [uuid-utils](https://pypi.org/project/uuid-utils/) library, written in a lower-level language (Rust), that's about 10 times faster and supports UUID 7, which Python's built-in module doesn't, so I wanted to give it a try.


In [3]:
# Standard-library implementation: one sample of each version it offers.
# Versions 3 and 5 are derived from a namespace plus a name.
print(f"uuid 1: {uuid.uuid1()}")
print(f"uuid 3: {uuid.uuid3(uuid.NAMESPACE_DNS, 'Hello, how you do?')}")
print(f"uuid 4: {uuid.uuid4()}")
print(f"uuid 5: {uuid.uuid5(uuid.NAMESPACE_DNS, 'Hello, how you do?')}")

uuid 1: 352396fc-6026-11f0-a4de-04d3b0d1c8fb
uuid 3: 47981fdb-ea35-39a6-940a-98904125b7e1
uuid 4: d9c76bf0-6e9f-49be-9bc2-35345492324d
uuid 5: 99bf4c71-0980-5f0a-9209-d6f8f0d43810


In [4]:
# Same name as before, but hashed under the URL namespace instead of DNS.
print(f"uuid 3: {uuid.uuid3(uuid.NAMESPACE_URL, 'Hello, how you do?')}")
print(f"uuid 5: {uuid.uuid5(uuid.NAMESPACE_URL, 'Hello, how you do?')}")

uuid 3: 1a636465-fa91-3ddd-86db-ce7c36bfb644
uuid 5: 2a479d81-9b51-5302-8ebe-de4c1f83dc84


In [6]:
# Parse the dashed text form, then render the same value two other ways.
u = uuid.UUID(hex="123e4567-e89b-12d3-a456-426614174000")
print(u.hex)  # 32 hex digits, dashes removed
print(u.urn)  # the dashed form prefixed with "urn:uuid:"

123e4567e89b12d3a456426614174000
urn:uuid:123e4567-e89b-12d3-a456-426614174000


In [16]:
# The constructor also accepts the 32-digit form without dashes.
u = uuid.UUID(hex="123e4567e89b12d3a456426614174000")
print(u.urn)

urn:uuid:123e4567-e89b-12d3-a456-426614174000


In [None]:
# A random UUID rendered as bare hex digits.
print(f"uuid 4: {uuid.uuid4().hex}")

uuid 4: d70141102cae40509bc28862196ac634


In [8]:
# The .hex attribute is a plain Python string.
type(uuid.uuid4().hex)

str

In [None]:
# The .int attribute exposes the UUID as a single Python integer.
uuid.uuid4().int

181343017944126517047175740967256527944

In [20]:
# The first positional parameter of uuid.UUID is `hex`, so an integer passed
# positionally is handled as a hex string and rejected. The keyword form,
# uuid.UUID(int=...), is the one that accepts an integer.
try:
    u = uuid.UUID(181343017944126517047175740967256527944)
    print(u.urn)
except (AttributeError, TypeError, ValueError):
    # Catch only what the constructor can raise for a bad argument; a bare
    # `except:` would also swallow KeyboardInterrupt and SystemExit.
    print("I can't do that")

I can't do that


In [19]:
# The first positional parameter of uuid.UUID is `hex`, so raw bytes passed
# positionally are handled as a hex string and rejected. The keyword form,
# uuid.UUID(bytes=...), is the one that accepts a 16-byte value.
try:
    u = uuid.UUID(b'\x1c\xd6\x9ee\xb1\xabF\xe5\xa1\xa4^A\xdahMv')
    print(u.urn)
except (AttributeError, TypeError, ValueError):
    # Catch only what the constructor can raise for a bad argument; a bare
    # `except:` would also swallow KeyboardInterrupt and SystemExit.
    print("I can't do that")

I can't do that


In [15]:
# Raw byte representation of a random UUID, followed by the type of that
# attribute (each argument draws its own fresh UUID).
print(uuid.uuid4().bytes, type(uuid.uuid4().bytes), sep="\n")

b'\x1c\xd6\x9ee\xb1\xabF\xe5\xa1\xa4^A\xdahMv'
<class 'bytes'>


In [None]:
# uuid-utils (Rust) implementation: the same four calls as the stdlib demo.
print(f"uuid 1: {uuid_ut.uuid1()}")
print(f"uuid 3: {uuid_ut.uuid3(uuid_ut.NAMESPACE_DNS, 'Hello, how you do?')}")
print(f"uuid 4: {uuid_ut.uuid4()}")
print(f"uuid 5: {uuid_ut.uuid5(uuid_ut.NAMESPACE_DNS, 'Hello, how you do?')}")

## Quick Comparison

In [None]:
# Number of UUIDs each implementation generates in this single timing pass
x = 100000

# Standard library: wall-clock time for x calls of uuid.uuid4().
# The result is bound to `_` only to discard it.
start_std = time.perf_counter()
for _ in range(x):
    _ = uuid.uuid4()
end_std = time.perf_counter()
print(f"Standard uuid.uuid4(): {end_std - start_std:.4f} seconds")

# uuid-utils: the identical loop, so the two totals are directly comparable.
start_ut = time.perf_counter()
for _ in range(x):
    _ = uuid_ut.uuid4()
end_ut = time.perf_counter()
print(f"uuid_utils.uuid4():   {end_ut - start_ut:.4f} seconds")

## Benchmark

Let's benchmark both implementations. I'll run 50 iterations in alternating order (not randomized); each iteration makes 100K calls to one of the two functions.

Let's start with UUID 4.

In [None]:
def time_uuid_function(fn, label, n_uuids, *args, **kwargs):
    """Time ``n_uuids`` consecutive calls of ``fn`` and report the mean cost per call.

    Parameters
    ----------
    fn : callable
        Function under test; invoked as ``fn(*args, **kwargs)``.
    label : str
        Identifier stored alongside the measurement.
    n_uuids : int
        Number of calls to time. Must be at least 1.
    *args, **kwargs
        Forwarded unchanged to every call of ``fn``.

    Returns
    -------
    dict
        ``{"label": label, "uuid_count": n_uuids, "time": t}`` where ``t`` is
        the average duration of one call in microseconds.

    Raises
    ------
    ValueError
        If ``n_uuids`` is smaller than 1. Zero would divide by zero and a
        negative count would silently produce a meaningless negative timing.
    """
    if n_uuids < 1:
        raise ValueError(f"n_uuids must be at least 1, got {n_uuids!r}")

    # The timed region is kept minimal: only the loop and the calls themselves.
    start = time.perf_counter()
    for _ in range(n_uuids):
        _ = fn(*args, **kwargs)
    end = time.perf_counter()

    total_time = end - start
    us_per_uuid = (total_time / n_uuids) * 1_000_000  # seconds -> microseconds
    return {"label": label, "uuid_count": n_uuids, "time": us_per_uuid}


def benchmark_uuid(fn_python, fn_rust, label_python, label_rust, n_uuids, n_runs):
    """Measure two generators back to back, ``n_runs`` times, into one DataFrame.

    Within every run ``fn_python`` is timed first and ``fn_rust`` second, each
    through ``time_uuid_function`` with ``n_uuids`` calls. The returned frame
    holds one row per measurement, in the order the measurements were taken.
    """
    contenders = ((fn_python, label_python), (fn_rust, label_rust))
    measurements = [
        time_uuid_function(generator, tag, n_uuids)
        for _ in range(n_runs)
        for generator, tag in contenders
    ]
    return pd.DataFrame(measurements)

In [None]:
# UUID 4 head-to-head: 50 alternating runs of 100K calls per implementation.
df_compare_uuid_4 = benchmark_uuid(
    fn_python=uuid.uuid4,
    fn_rust=uuid_ut.uuid4,
    label_python="uuid4_python",
    label_rust="uuid4_rust",
    n_uuids=100_000,
    n_runs=50,
)

In [None]:
# Display the raw per-run measurements (one row per timed run).
df_compare_uuid_4

In [None]:
def summarize_benchmark(df):
    """Return descriptive statistics of the ``time`` column for every ``label``.

    The result has one row per label with the columns ``label``, ``mean``,
    ``median``, ``std``, ``min``, ``q1`` (25th percentile), ``q3``
    (75th percentile) and ``max``.
    """

    def lower_quartile(values):
        return values.quantile(0.25)

    def upper_quartile(values):
        return values.quantile(0.75)

    timings_by_label = df.groupby(["label"])["time"]
    stats = timings_by_label.agg(
        mean="mean",
        median="median",
        std="std",
        min="min",
        q1=lower_quartile,
        q3=upper_quartile,
        max="max",
    )
    # Turn the group key back into an ordinary column.
    return stats.reset_index()

In [None]:
# Per-implementation statistics for the UUID 4 runs.
summary_uuid_4 = df_compare_uuid_4.pipe(summarize_benchmark)

In [None]:
def plot_violin(df, title="UUID Benchmark Violin Plot"):
    """Show the timing distribution of every ``label`` as a violin plot.

    ``df`` needs a ``label`` and a ``time`` column. The individual measurements
    are drawn on top of the violins as small semi-transparent black dots.
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.violinplot(data=df, x="label", y="time", inner="box", ax=ax)
    sns.swarmplot(data=df, x="label", y="time", color="k", size=2, alpha=0.5, ax=ax)

    ax.set_title(title)
    ax.set_ylabel("Time (µs)")
    ax.set_xlabel("UUID Generator")
    ax.grid(True, axis="y", linestyle="--", alpha=0.3)

    fig.tight_layout()
    plt.show()

In [None]:
# Distribution of the per-call time for the two UUID 4 generators.
plot_violin(df=df_compare_uuid_4, title="UUID Benchmark UUID4")

## Compare Them All!

Only UUID versions 1, 3, 4 and 5 exist in both libraries.

In [None]:
def run_all_uuid_benchmarks(n_uuids=100_000, n_runs=50):
    """Benchmark UUID versions 1, 3, 4 and 5 for both libraries.

    Each version is passed through ``benchmark_uuid`` with the labels
    ``"<version>_python"`` and ``"<version>_rust"``. The per-version frames
    are tagged with a ``uuid_type`` column and stacked into one DataFrame
    with a fresh index.
    """
    # (version, stdlib callable, uuid-utils callable). Every generator is
    # wrapped in a zero-argument lambda so all versions share one call shape.
    contenders = [
        ("uuid1", lambda: uuid.uuid1(), lambda: uuid_ut.uuid1()),
        (
            "uuid3",
            lambda: uuid.uuid3(uuid.NAMESPACE_DNS, "Benchmarking"),
            lambda: uuid_ut.uuid3(uuid_ut.NAMESPACE_DNS, "Benchmarking"),
        ),
        ("uuid4", lambda: uuid.uuid4(), lambda: uuid_ut.uuid4()),
        (
            "uuid5",
            lambda: uuid.uuid5(uuid.NAMESPACE_DNS, "Benchmarking"),
            lambda: uuid_ut.uuid5(uuid_ut.NAMESPACE_DNS, "Benchmarking"),
        ),
    ]

    frames = [
        benchmark_uuid(
            py_fn, rs_fn, f"{version}_python", f"{version}_rust", n_uuids, n_runs
        ).assign(uuid_type=version)
        for version, py_fn, rs_fn in contenders
    ]
    return pd.concat(frames, ignore_index=True)

In [None]:
# Full benchmark with the default workload, spelled out for the reader.
df_all = run_all_uuid_benchmarks(n_uuids=100_000, n_runs=50)

In [None]:
# Derive the implementation and the UUID version from each row's label
# (labels look like "uuid4_python" / "uuid4_rust").
df_all = df_all.assign(
    library=df_all["label"].map(lambda tag: "python" if "python" in tag else "rust"),
    uuid_type=df_all["label"].str.extract(r"(uuid\d)", expand=False),
)

In [None]:
# Peek at the first two rows to check the derived columns.
df_all.head(2)

In [None]:
def plot_violin_all(
    df,
    title="UUID Benchmark — Python vs. Rust implementation",
    save_path="comparison_plot.png",
):
    """Plot per-version timing distributions, split by library, and save the figure.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``uuid_type`` (x axis), ``time`` (y axis) and
        ``library`` (hue).
    title : str
        Figure title.
    save_path : str or path-like or None
        Where the figure is written at 300 dpi. Defaults to the file name this
        function has always used; pass ``None`` to display without saving.
    """
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.violinplot(
        data=df,
        x="uuid_type",
        y="time",
        hue="library",
        split=False,
        inner="box",
        palette="Set3",
        ax=ax,
    )
    # Individual measurements on top of the violins; dodge=True keeps the dots
    # aligned with their library's violin, and the duplicate legend is hidden.
    sns.swarmplot(
        data=df,
        x="uuid_type",
        y="time",
        hue="library",
        dodge=True,
        size=2,
        alpha=0.4,
        legend=False,
        palette="Set3",
        ax=ax,
    )
    ax.set_title(title)
    ax.set_ylabel("Time per UUID (μs)")
    ax.set_xlabel("UUID Version")
    ax.grid(True, axis="y", linestyle="--", alpha=0.3)
    fig.tight_layout()

    # Save before showing so the file is written from the still-open figure.
    if save_path is not None:
        fig.savefig(save_path, dpi=300)
    plt.show()

In [None]:
# Draw every UUID version for both libraries in one figure.
plot_violin_all(df=df_all)

In [None]:
# One statistics row per (version, library), with the label split into a
# version column and a display name for the library.
summary_all = (
    summarize_benchmark(df_all)
    .assign(
        uuid_type=lambda stats: stats["label"].str.extract(r"(uuid\d)", expand=False),
        library=lambda stats: stats["label"].map(
            lambda tag: "🐍 Python" if "python" in tag else "🦀 Rust"
        ),
    )
    .loc[:, ["uuid_type", "library", "mean", "median", "std", "min", "q1", "q3", "max"]]
)

In [None]:
# Presentation table: statistics rounded to two decimals, rows named by
# library and grouped by UUID version.
summary_table = GT(
    summary_all.round(2), rowname_col="library", groupname_col="uuid_type"
).tab_header(
    title="UUID implementations - 🐍 Python std. lib vs uuid-utils (Rust 🦀)",
    subtitle="Units: µs | 50 alternating iterations, average of 100K repetitions",
)
# Optional colour styling, currently disabled:
#    ).data_color(
#         domain=[0, 7],
#         palette=["#77DD77", "white", "#FF6961"],
#         na_color="white",
#     ).tab_style(
#         style=style.fill(color="#E8EAF5"),
#         locations=loc.body(columns="std"),
#     ).tab_style(
#         style=style.text(color="black"),
#         locations=loc.body(columns="std"),
#     )

In [None]:
# Export the table as an image file (relative path).
# NOTE(review): scale=10 is presumably a resolution multiplier — confirm
# against the great_tables documentation.
summary_table.save("comparison_table.png", scale=10)