In [None]:
import time
import uuid

import duckdb
import numpy as np
import pandas as pd
import polars as pl
import uuid_utils as uuid_ut

# Using UUIDs in tabular datasets

I want to compare different methods to generate and store UUIDs in tabular structures (DataFrames).

In [None]:
# Relative path of the CSV file that the pandas, Polars and DuckDB cells all read from.
csv_file_path = "./data/test_data.csv"

In [None]:
# Load the same CSV once per library so each benchmark works on its own frame.
pandas_df = pd.read_csv(csv_file_path)
polars_df = pl.read_csv(csv_file_path)

In [None]:
# Preview the first two rows of the Polars frame.
polars_df.head(2)

In [None]:
# Baseline: call the standard-library uuid.uuid4() once per row in a list
# comprehension. The values are stored as returned (no str() conversion).
start_std = time.perf_counter()

pandas_df["id_comprehension_native"] = [uuid.uuid4() for _ in range(len(pandas_df))]

end_std = time.perf_counter()
total_time = end_std - start_std

# Elapsed wall-clock time for the column assignment above.
print(f"Total time: {total_time} seconds")

In [None]:
# Vectorize UUID generation --> Slower
# np.vectorize is a convenience wrapper around a Python-level loop, so the
# lambda is still called once per element.

start_std = time.perf_counter()

# otypes is given explicitly for two reasons:
#   * without it, np.vectorize raises on size-0 input (an empty CSV);
#   * without it, NumPy makes an extra probing call on the first element to
#     infer the output type, which would be counted in the timed region.
# For non-empty input the resulting array is the same as with inferred otypes.
vectorized_uuid = np.vectorize(lambda _: str(uuid.uuid4()), otypes=[str])

# The lambda ignores its argument; the input array only fixes how many UUIDs
# are produced. NOTE: this overwrites the "id_comprehension_native" column.
pandas_df["id_comprehension_native"] = vectorized_uuid(np.empty(len(pandas_df)))

end_std = time.perf_counter()
total_time = end_std - start_std

print(f"Total time: {total_time} seconds")

In [None]:
# Precomputing UUIDs in a NumPy array --> slower/same as list comprehension
# The UUIDs are converted to str, collected in a list, wrapped in a NumPy array
# and only then assigned to the DataFrame.

start_std = time.perf_counter()

n = len(pandas_df)
uuids = np.array([str(uuid.uuid4()) for _ in range(n)])
# NOTE: writes to the same "id_comprehension_native" column as the cells above,
# replacing whatever values it held.
pandas_df["id_comprehension_native"] = uuids

end_std = time.perf_counter()
total_time = end_std - start_std

print(f"Total time: {total_time} seconds")

#### Using the Rust implementation (`uuid-utils`):

In [None]:
# Same list-comprehension approach as the native baseline, but calling uuid4()
# from the uuid_utils package (imported as uuid_ut). Values are stored as
# returned (no str() conversion), in a separate "id_comprehension_rust" column.
start_std = time.perf_counter()

pandas_df["id_comprehension_rust"] = [uuid_ut.uuid4() for _ in range(len(pandas_df))]

end_std = time.perf_counter()
total_time = end_std - start_std

# Elapsed wall-clock time for the column assignment above.
print(f"Total time: {total_time} seconds")

## Using Polars

We could use a list comprehension here as well. In that case, the result would be roughly the same (there is no major difference between pandas and Polars if we generate a list and then turn it into a DataFrame column).

The difference Polars could bring is for bigger DataFrames: we could leverage Lazy evaluation and get faster results, let's see if that's true:

In [None]:
# Time UUID generation through a Polars lazy query, using the standard-library
# uuid.uuid4() as the per-row Python callback.
start_std = time.perf_counter()

# One integer per row feeds map_elements; the callback ignores the value and
# returns a fresh UUID rendered as a string.
native_uuid_expr = pl.int_range(pl.len()).map_elements(
    lambda _: str(uuid.uuid4()),
    return_dtype=pl.String,
)
lazy_df = polars_df.lazy().with_columns(native_uuid_expr.alias("uuid_v4"))

# The lazy plan only runs here, so collect() must stay inside the timed region.
polars_df = lazy_df.collect()
end_std = time.perf_counter()
total_time = end_std - start_std

total_time

In [None]:
# Time UUID generation through a Polars lazy query, using uuid4() from the
# uuid_utils package (imported as uuid_ut) as the per-row Python callback.
start_std = time.perf_counter()

# One integer per row feeds map_elements; the callback ignores the value and
# returns a fresh UUID rendered as a string.
rust_uuid_expr = pl.int_range(pl.len()).map_elements(
    lambda _: str(uuid_ut.uuid4()),
    return_dtype=pl.String,
)
lazy_df = polars_df.lazy().with_columns(rust_uuid_expr.alias("uuid_v4"))

# The lazy plan only runs here, so collect() must stay inside the timed region.
polars_df = lazy_df.collect()
end_std = time.perf_counter()
total_time = end_std - start_std

total_time

Using Polars in this way doesn't seem to increase performance (we see the same very noticeable difference between the Python and the Rust implementations).

However, Lazy evaluation enables working with larger datasets.

## Using Duck DB

DuckDB does a fantastic job generating the UUIDs. It looks 5 to 7 times faster. Notice that the comparison isn't equivalent:

Here, all the generation happens within the "DuckDB world", in the pandas/Polars example, we add the data from outside (Python functions). 

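Also, the time we measured includes reading the data from the CSV file!! Keep in mind that `duckdb.sql()` returns a lazily evaluated relation, so the query (CSV read plus UUID generation) only runs in full when the result is materialized, e.g. with `.df()`.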

In [None]:
# Both queries differ only in the UUID version, so they share one template.
# The template keeps the exact whitespace of the final SQL text.
_UUID_QUERY_TEMPLATE = """
            SELECT
                *,
                uuid{version}() AS uuid_{version}
            FROM read_csv('{path}');
        """

# Select every CSV column plus one generated UUID column per row.
query_v4 = _UUID_QUERY_TEMPLATE.format(version="v4", path=csv_file_path)
query_v7 = _UUID_QUERY_TEMPLATE.format(version="v7", path=csv_file_path)

In [None]:
# Time the DuckDB uuidv4() query end to end.
start_std = time.perf_counter()

# duckdb.sql() only builds a lazily evaluated relation; the query is not
# executed until a result is requested. The relation is kept under its original
# name because later cells call relation methods on it.
duckdb_df_with_uuid = duckdb.sql(query_v4)

# Materialize inside the timed region so the measurement covers reading the CSV
# and generating the UUIDs (plus the conversion to a pandas DataFrame).
duckdb_uuid_v4_pandas = duckdb_df_with_uuid.df()

end_std = time.perf_counter()
total_time = end_std - start_std

total_time

In [None]:
# Time the DuckDB uuidv7() query end to end.
start_std = time.perf_counter()

# duckdb.sql() only builds a lazily evaluated relation; the query is not
# executed until a result is requested. The relation is kept under its original
# name because later cells call relation methods on it.
duckdb_df_with_uuid = duckdb.sql(query_v7)

# Materialize inside the timed region so the measurement covers reading the CSV
# and generating the UUIDs (plus the conversion to a pandas DataFrame).
duckdb_uuid_v7_pandas = duckdb_df_with_uuid.df()

end_std = time.perf_counter()
total_time = end_std - start_std

total_time

In [None]:
# Inspect the column types DuckDB reports for the relation assigned in the
# most recently run query cell (the uuidv7 query when run top to bottom).
duckdb_df_with_uuid.dtypes

In [None]:
# Show a small preview instead of materializing the whole result set: the LIMIT
# is applied to the relation before it is converted to a pandas DataFrame.
duckdb_df_with_uuid.limit(5).df()