From 699100f17ce8df99da36a943f4f7d86ce5eca74a Mon Sep 17 00:00:00 2001 From: Paul Timmins Date: Fri, 19 Sep 2025 14:24:10 +0000 Subject: [PATCH 1/7] test: add pytest-xdist (multi-process) and pytest-randomly (random order) --- pyproject.toml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index bcbb24f6..fd34ccc0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -227,6 +227,8 @@ test = [ # dependencies used for running tests "pytest", "pytest-reraise", "pytest-timeout", + "pytest-xdist", # multi-processed tests, if `-n | auto` + "pytest-randomly", # randomizes test order to ensure no test dependencies, enabled on install "mypy", "coverage", "gcovr; python_version < '3.14'", @@ -306,6 +308,7 @@ filterwarnings = [ "ignore:distutils Version classes are deprecated:DeprecationWarning", "ignore:is_datetime64tz_dtype is deprecated:DeprecationWarning", ] +timeout = 600 # don't let individual tests "hang" [tool.coverage.run] branch = true From 5bac3795b7723dd90568e33ca07a16906d26ca67 Mon Sep 17 00:00:00 2001 From: Paul Timmins Date: Fri, 19 Sep 2025 14:26:49 +0000 Subject: [PATCH 2/7] tests: add a fixture for default_connection to avoid leakage across tests --- tests/conftest.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 5e297aee..27f1f8f8 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -267,10 +267,18 @@ def spark(): @pytest.fixture(scope='function') -def duckdb_cursor(): - connection = duckdb.connect('') - yield connection - connection.close() +def duckdb_cursor(tmp_path): + with duckdb.connect(tmp_path / "mytest") as connection: + yield connection + + +@pytest.fixture(scope='function') +def default_con(): + # ensures each test uses a fresh default connection to avoid test leakage + # threading_unsafe fixture + duckdb.default_connection().close() + with duckdb.default_connection() as conn: + yield conn @pytest.fixture(scope='function') From 3922560fa651d60e1ec52d3ea316f799f036322f Mon Sep 17 00:00:00 2001 From: Paul Timmins Date: Fri, 19 Sep 2025 14:27:39 +0000 Subject: [PATCH 3/7] tests: use a tmp_path fixture to isolate test data --- tests/fast/api/test_to_csv.py | 182 ++++++++++++++------------ tests/fast/test_many_con_same_file.py | 29 ++-- tests/slow/test_h2oai_arrow.py | 16 +-- 3 files changed, 113 insertions(+), 114 deletions(-) diff --git a/tests/fast/api/test_to_csv.py b/tests/fast/api/test_to_csv.py index e48ae1b8..8a791c14 100644 --- a/tests/fast/api/test_to_csv.py +++ b/tests/fast/api/test_to_csv.py @@ -1,5 +1,4 @@ import duckdb -import tempfile import os import pandas._testing as tm import datetime @@ -10,63 +9,63 @@ class TestToCSV(object): @pytest.mark.parametrize('pandas', [NumpyPandas(), ArrowPandas()]) - def test_basic_to_csv(self, pandas): - temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names())) + def test_basic_to_csv(self, pandas, tmp_path, default_con): + temp_file_name = str(tmp_path / "test.csv") df = pandas.DataFrame({'a': [5, 3, 23, 2], 'b': [45, 234, 234, 2]}) - rel = duckdb.from_df(df) + rel = default_con.from_df(df) rel.to_csv(temp_file_name) - csv_rel = duckdb.read_csv(temp_file_name) + csv_rel = default_con.read_csv(temp_file_name) assert rel.execute().fetchall() == csv_rel.execute().fetchall() @pytest.mark.parametrize('pandas', [NumpyPandas(), ArrowPandas()]) - def test_to_csv_sep(self, pandas): - temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names())) + def test_to_csv_sep(self, pandas, tmp_path, default_con): + temp_file_name = str(tmp_path / "test.csv") df = pandas.DataFrame({'a': [5, 3, 23, 2], 'b': [45, 234, 234, 2]}) - rel = duckdb.from_df(df) + rel = default_con.from_df(df) rel.to_csv(temp_file_name, sep=',') - csv_rel = duckdb.read_csv(temp_file_name, sep=',') + csv_rel = default_con.read_csv(temp_file_name, sep=',') assert rel.execute().fetchall() == csv_rel.execute().fetchall() @pytest.mark.parametrize('pandas', [NumpyPandas(), ArrowPandas()]) - def test_to_csv_na_rep(self, pandas): - temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names())) + def test_to_csv_na_rep(self, pandas, tmp_path, default_con): + temp_file_name = str(tmp_path / "test.csv") df = pandas.DataFrame({'a': [5, None, 23, 2], 'b': [45, 234, 234, 2]}) - rel = duckdb.from_df(df) + rel = default_con.from_df(df) rel.to_csv(temp_file_name, na_rep="test") - csv_rel = duckdb.read_csv(temp_file_name, na_values="test") + csv_rel = default_con.read_csv(temp_file_name, na_values="test") assert rel.execute().fetchall() == csv_rel.execute().fetchall() @pytest.mark.parametrize('pandas', [NumpyPandas(), ArrowPandas()]) - def test_to_csv_header(self, pandas): - temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names())) + def test_to_csv_header(self, pandas, tmp_path, default_con): + temp_file_name = str(tmp_path / "test.csv") df = pandas.DataFrame({'a': [5, None, 23, 2], 'b': [45, 234, 234, 2]}) - rel = duckdb.from_df(df) + rel = default_con.from_df(df) rel.to_csv(temp_file_name) - csv_rel = duckdb.read_csv(temp_file_name) + csv_rel = default_con.read_csv(temp_file_name) assert rel.execute().fetchall() == csv_rel.execute().fetchall() @pytest.mark.parametrize('pandas', [NumpyPandas(), ArrowPandas()]) - def test_to_csv_quotechar(self, pandas): - temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names())) + def test_to_csv_quotechar(self, pandas, tmp_path, default_con): + temp_file_name = str(tmp_path / "test.csv") df = pandas.DataFrame({'a': ["\'a,b,c\'", None, "hello", "bye"], 'b': [45, 234, 234, 2]}) - rel = duckdb.from_df(df) + rel = default_con.from_df(df) rel.to_csv(temp_file_name, quotechar='\'', sep=',') - csv_rel = duckdb.read_csv(temp_file_name, sep=',', quotechar='\'') + csv_rel = default_con.read_csv(temp_file_name, sep=',', quotechar='\'') assert rel.execute().fetchall() == csv_rel.execute().fetchall() @pytest.mark.parametrize('pandas', [NumpyPandas(), ArrowPandas()]) - def test_to_csv_escapechar(self, pandas): - temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names())) + def test_to_csv_escapechar(self, pandas, tmp_path, default_con): + temp_file_name = str(tmp_path / "test.csv") df = pandas.DataFrame( { "c_bool": [True, False], @@ -75,97 +74,102 @@ def test_to_csv_escapechar(self, pandas): "c_string": ["a", "b,c"], } ) - rel = duckdb.from_df(df) + rel = default_con.from_df(df) rel.to_csv(temp_file_name, quotechar='"', escapechar='!') - csv_rel = duckdb.read_csv(temp_file_name, quotechar='"', escapechar='!') + csv_rel = default_con.read_csv(temp_file_name, quotechar='"', escapechar='!') assert rel.execute().fetchall() == csv_rel.execute().fetchall() @pytest.mark.parametrize('pandas', [NumpyPandas(), ArrowPandas()]) - def test_to_csv_date_format(self, pandas): - temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names())) + def test_to_csv_date_format(self, pandas, tmp_path, default_con): + temp_file_name = str(tmp_path / "test.csv") df = pandas.DataFrame(getTimeSeriesData()) dt_index = df.index df = pandas.DataFrame({"A": dt_index, "B": dt_index.shift(1)}, index=dt_index) - rel = duckdb.from_df(df) + rel = default_con.from_df(df) rel.to_csv(temp_file_name, date_format="%Y%m%d") - csv_rel = duckdb.read_csv(temp_file_name, date_format="%Y%m%d") + csv_rel = default_con.read_csv(temp_file_name, date_format="%Y%m%d") assert rel.execute().fetchall() == csv_rel.execute().fetchall() @pytest.mark.parametrize('pandas', [NumpyPandas(), ArrowPandas()]) - def test_to_csv_timestamp_format(self, pandas): - temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names())) + def test_to_csv_timestamp_format(self, pandas, tmp_path, default_con): + temp_file_name = str(tmp_path / "test.csv") data = [datetime.time(hour=23, minute=1, second=34, microsecond=234345)] df = pandas.DataFrame({'0': pandas.Series(data=data, dtype='object')}) - rel = duckdb.from_df(df) + rel = default_con.from_df(df) rel.to_csv(temp_file_name, timestamp_format='%m/%d/%Y') - csv_rel = duckdb.read_csv(temp_file_name, timestamp_format='%m/%d/%Y') + csv_rel = default_con.read_csv(temp_file_name, timestamp_format='%m/%d/%Y') assert rel.execute().fetchall() == csv_rel.execute().fetchall() @pytest.mark.parametrize('pandas', [NumpyPandas(), ArrowPandas()]) - def test_to_csv_quoting_off(self, pandas): - temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names())) + def test_to_csv_quoting_off(self, pandas, tmp_path, default_con): + temp_file_name = str(tmp_path / "test.csv") df = pandas.DataFrame({'a': ['string1', 'string2', 'string3']}) - rel = duckdb.from_df(df) + rel = default_con.from_df(df) rel.to_csv(temp_file_name, quoting=None) - csv_rel = duckdb.read_csv(temp_file_name) + csv_rel = default_con.read_csv(temp_file_name) assert rel.execute().fetchall() == csv_rel.execute().fetchall() @pytest.mark.parametrize('pandas', [NumpyPandas(), ArrowPandas()]) - def test_to_csv_quoting_on(self, pandas): - temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names())) + def test_to_csv_quoting_on(self, pandas, tmp_path, default_con): + temp_file_name = str(tmp_path / "test.csv") df = pandas.DataFrame({'a': ['string1', 'string2', 'string3']}) - rel = duckdb.from_df(df) + rel = default_con.from_df(df) rel.to_csv(temp_file_name, quoting="force") - csv_rel = duckdb.read_csv(temp_file_name) + csv_rel = default_con.read_csv(temp_file_name) assert rel.execute().fetchall() == csv_rel.execute().fetchall() @pytest.mark.parametrize('pandas', [NumpyPandas(), ArrowPandas()]) - def test_to_csv_quoting_quote_all(self, pandas): - temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names())) + def test_to_csv_quoting_quote_all(self, pandas, tmp_path, default_con): + temp_file_name = str(tmp_path / "test.csv") + df = pandas.DataFrame({'a': ['string1', 'string2', 'string3']}) - rel = duckdb.from_df(df) + rel = default_con.from_df(df) rel.to_csv(temp_file_name, quoting=csv.QUOTE_ALL) - csv_rel = duckdb.read_csv(temp_file_name) + csv_rel = default_con.read_csv(temp_file_name) assert rel.execute().fetchall() == csv_rel.execute().fetchall() @pytest.mark.parametrize('pandas', [NumpyPandas(), ArrowPandas()]) - def test_to_csv_encoding_incorrect(self, pandas): - temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names())) + def test_to_csv_encoding_incorrect(self, pandas, tmp_path, default_con): + temp_file_name = str(tmp_path / "test.csv") + df = pandas.DataFrame({'a': ['string1', 'string2', 'string3']}) - rel = duckdb.from_df(df) + rel = default_con.from_df(df) with pytest.raises( duckdb.InvalidInputException, match="Invalid Input Error: The only supported encoding option is 'UTF8" ): rel.to_csv(temp_file_name, encoding="nope") @pytest.mark.parametrize('pandas', [NumpyPandas(), ArrowPandas()]) - def test_to_csv_encoding_correct(self, pandas): - temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names())) + def test_to_csv_encoding_correct(self, pandas, tmp_path, default_con): + temp_file_name = str(tmp_path / "test.csv") + df = pandas.DataFrame({'a': ['string1', 'string2', 'string3']}) - rel = duckdb.from_df(df) + rel = default_con.from_df(df) rel.to_csv(temp_file_name, encoding="UTF-8") - csv_rel = duckdb.read_csv(temp_file_name) + csv_rel = default_con.read_csv(temp_file_name) assert rel.execute().fetchall() == csv_rel.execute().fetchall() @pytest.mark.parametrize('pandas', [NumpyPandas(), ArrowPandas()]) - def test_compression_gzip(self, pandas): - temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names())) + def test_compression_gzip(self, pandas, tmp_path, default_con): + temp_file_name = str(tmp_path / "test.csv") + df = pandas.DataFrame({'a': ['string1', 'string2', 'string3']}) - rel = duckdb.from_df(df) + rel = default_con.from_df(df) rel.to_csv(temp_file_name, compression="gzip") - csv_rel = duckdb.read_csv(temp_file_name, compression="gzip") + csv_rel = default_con.read_csv(temp_file_name, compression="gzip") assert rel.execute().fetchall() == csv_rel.execute().fetchall() @pytest.mark.parametrize('pandas', [NumpyPandas(), ArrowPandas()]) - def test_to_csv_partition(self, pandas): - temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names())) + def test_to_csv_partition(self, pandas, tmp_path, default_con): + temp_file_name = str(tmp_path / "test.csv") + df = pandas.DataFrame( { "c_category": ['a', 'a', 'b', 'b'], @@ -175,9 +179,9 @@ def test_to_csv_partition(self, pandas): "c_string": ["a", "b,c", "e", "f"], } ) - rel = duckdb.from_df(df) + rel = default_con.from_df(df) rel.to_csv(temp_file_name, header=True, partition_by=["c_category"]) - csv_rel = duckdb.sql( + csv_rel = default_con.sql( f'''FROM read_csv_auto('{temp_file_name}/*/*.csv', hive_partitioning=TRUE, header=TRUE);''' ) expected = [ @@ -190,8 +194,9 @@ def test_to_csv_partition(self, pandas): assert csv_rel.execute().fetchall() == expected @pytest.mark.parametrize('pandas', [NumpyPandas(), ArrowPandas()]) - def test_to_csv_partition_with_columns_written(self, pandas): - temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names())) + def test_to_csv_partition_with_columns_written(self, pandas, tmp_path, default_con): + temp_file_name = str(tmp_path / "test.csv") + df = pandas.DataFrame( { "c_category": ['a', 'a', 'b', 'b'], @@ -201,17 +206,18 @@ def test_to_csv_partition_with_columns_written(self, pandas): "c_string": ["a", "b,c", "e", "f"], } ) - rel = duckdb.from_df(df) - res = duckdb.sql("FROM rel order by all") + rel = default_con.from_df(df) + res = default_con.sql("FROM rel order by all") rel.to_csv(temp_file_name, header=True, partition_by=["c_category"], write_partition_columns=True) - csv_rel = duckdb.sql( + csv_rel = default_con.sql( f'''FROM read_csv_auto('{temp_file_name}/*/*.csv', hive_partitioning=TRUE, header=TRUE) order by all;''' ) assert res.execute().fetchall() == csv_rel.execute().fetchall() @pytest.mark.parametrize('pandas', [NumpyPandas(), ArrowPandas()]) - def test_to_csv_overwrite(self, pandas): - temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names())) + def test_to_csv_overwrite(self, pandas, tmp_path, default_con): + temp_file_name = str(tmp_path / "test.csv") + df = pandas.DataFrame( { "c_category_1": ['a', 'a', 'b', 'b'], @@ -222,10 +228,10 @@ def test_to_csv_overwrite(self, pandas): "c_string": ["a", "b,c", "e", "f"], } ) - rel = duckdb.from_df(df) + rel = default_con.from_df(df) rel.to_csv(temp_file_name, header=True, partition_by=["c_category_1"]) # csv to be overwritten rel.to_csv(temp_file_name, header=True, partition_by=["c_category_1"], overwrite=True) - csv_rel = duckdb.sql( + csv_rel = default_con.sql( f'''FROM read_csv_auto('{temp_file_name}/*/*.csv', hive_partitioning=TRUE, header=TRUE);''' ) # When partition columns are read from directory names, column order become different from original @@ -238,8 +244,9 @@ def test_to_csv_overwrite(self, pandas): assert csv_rel.execute().fetchall() == expected @pytest.mark.parametrize('pandas', [NumpyPandas(), ArrowPandas()]) - def test_to_csv_overwrite_with_columns_written(self, pandas): - temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names())) + def test_to_csv_overwrite_with_columns_written(self, pandas, tmp_path, default_con): + temp_file_name = str(tmp_path / "test.csv") + df = pandas.DataFrame( { "c_category_1": ['a', 'a', 'b', 'b'], @@ -250,22 +257,23 @@ def test_to_csv_overwrite_with_columns_written(self, pandas): "c_string": ["a", "b,c", "e", "f"], } ) - rel = duckdb.from_df(df) + rel = default_con.from_df(df) rel.to_csv( temp_file_name, header=True, partition_by=["c_category_1"], write_partition_columns=True ) # csv to be overwritten rel.to_csv( temp_file_name, header=True, partition_by=["c_category_1"], overwrite=True, write_partition_columns=True ) - csv_rel = duckdb.sql( + csv_rel = default_con.sql( f'''FROM read_csv_auto('{temp_file_name}/*/*.csv', hive_partitioning=TRUE, header=TRUE) order by all;''' ) - res = duckdb.sql("FROM rel order by all") + res = default_con.sql("FROM rel order by all") assert res.execute().fetchall() == csv_rel.execute().fetchall() @pytest.mark.parametrize('pandas', [NumpyPandas(), ArrowPandas()]) - def test_to_csv_overwrite_not_enabled(self, pandas): - temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names())) + def test_to_csv_overwrite_not_enabled(self, pandas, tmp_path, default_con): + temp_file_name = str(tmp_path / "test.csv") + df = pandas.DataFrame( { "c_category_1": ['a', 'a', 'b', 'b'], @@ -276,15 +284,16 @@ def test_to_csv_overwrite_not_enabled(self, pandas): "c_string": ["a", "b,c", "e", "f"], } ) - rel = duckdb.from_df(df) + rel = default_con.from_df(df) rel.to_csv(temp_file_name, header=True, partition_by=["c_category_1"]) with pytest.raises(duckdb.IOException, match="OVERWRITE"): rel.to_csv(temp_file_name, header=True, partition_by=["c_category_1"]) @pytest.mark.parametrize('pandas', [NumpyPandas(), ArrowPandas()]) - def test_to_csv_per_thread_output(self, pandas): - temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names())) - num_threads = duckdb.sql("select current_setting('threads')").fetchone()[0] + def test_to_csv_per_thread_output(self, pandas, tmp_path, default_con): + temp_file_name = str(tmp_path / "test.csv") + + num_threads = default_con.sql("select current_setting('threads')").fetchone()[0] print('num_threads:', num_threads) df = pandas.DataFrame( { @@ -295,14 +304,15 @@ def test_to_csv_per_thread_output(self, pandas): "c_string": ["a", "b,c", "e", "f"], } ) - rel = duckdb.from_df(df) + rel = default_con.from_df(df) rel.to_csv(temp_file_name, header=True, per_thread_output=True) - csv_rel = duckdb.read_csv(f'{temp_file_name}/*.csv', header=True) + csv_rel = default_con.read_csv(f'{temp_file_name}/*.csv', header=True) assert rel.execute().fetchall() == csv_rel.execute().fetchall() @pytest.mark.parametrize('pandas', [NumpyPandas(), ArrowPandas()]) - def test_to_csv_use_tmp_file(self, pandas): - temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names())) + def test_to_csv_use_tmp_file(self, pandas, tmp_path, default_con): + temp_file_name = str(tmp_path / "test.csv") + df = pandas.DataFrame( { "c_category_1": ['a', 'a', 'b', 'b'], @@ -313,8 +323,8 @@ def test_to_csv_use_tmp_file(self, pandas): "c_string": ["a", "b,c", "e", "f"], } ) - rel = duckdb.from_df(df) + rel = default_con.from_df(df) rel.to_csv(temp_file_name, header=True) # csv to be overwritten rel.to_csv(temp_file_name, header=True, use_tmp_file=True) - csv_rel = duckdb.read_csv(temp_file_name, header=True) + csv_rel = default_con.read_csv(temp_file_name, header=True) assert rel.execute().fetchall() == csv_rel.execute().fetchall() diff --git a/tests/fast/test_many_con_same_file.py b/tests/fast/test_many_con_same_file.py index 6b7362a6..fd825c76 100644 --- a/tests/fast/test_many_con_same_file.py +++ b/tests/fast/test_many_con_same_file.py @@ -10,29 +10,20 @@ def get_tables(con): return tbls -def test_multiple_writes(): - try: - os.remove("test.db") - except: - pass - con1 = duckdb.connect("test.db") - con2 = duckdb.connect("test.db") +def test_multiple_writes(tmp_path): + con1 = duckdb.connect(tmp_path / "test.db") + con2 = duckdb.connect(tmp_path / "test.db") con1.execute("CREATE TABLE foo1 as SELECT 1 as a, 2 as b") con2.execute("CREATE TABLE bar1 as SELECT 2 as a, 3 as b") con2.close() con1.close() - con3 = duckdb.connect("test.db") + con3 = duckdb.connect(tmp_path / "test.db") tbls = get_tables(con3) assert tbls == ['bar1', 'foo1'] del con1 del con2 del con3 - try: - os.remove("test.db") - except: - pass - def test_multiple_writes_memory(): con1 = duckdb.connect() @@ -64,23 +55,23 @@ def test_multiple_writes_named_memory(): del con3 -def test_diff_config(): - con1 = duckdb.connect("test.db", False) +def test_diff_config(tmp_path): + con1 = duckdb.connect(tmp_path / "test.db", False) with pytest.raises( duckdb.ConnectionException, match="Can't open a connection to same database file with a different configuration than existing connections", ): - con2 = duckdb.connect("test.db", True) + con2 = duckdb.connect(tmp_path / "test.db", True) con1.close() del con1 -def test_diff_config_extended(): - con1 = duckdb.connect("test.db", config={'null_order': 'NULLS FIRST'}) +def test_diff_config_extended(tmp_path): + con1 = duckdb.connect(tmp_path / "test.db", config={'null_order': 'NULLS FIRST'}) with pytest.raises( duckdb.ConnectionException, match="Can't open a connection to same database file with a different configuration than existing connections", ): - con2 = duckdb.connect("test.db") + con2 = duckdb.connect(tmp_path / "test.db") con1.close() del con1 diff --git a/tests/slow/test_h2oai_arrow.py b/tests/slow/test_h2oai_arrow.py index 40bde07b..7ff37d01 100644 --- a/tests/slow/test_h2oai_arrow.py +++ b/tests/slow/test_h2oai_arrow.py @@ -194,8 +194,10 @@ def test_join(self, threads, function, large_data): @fixture(scope="module") -def arrow_dataset_register(): +def arrow_dataset_register(tmp_path_factory): """Single fixture to download files and register them on the given connection""" + temp_dir = tmp_path_factory.mktemp("h2oai_data") + session = requests.Session() retries = urllib3_util.Retry( allowed_methods={'GET'}, # only retry on GETs (all we do) @@ -212,19 +214,15 @@ def arrow_dataset_register(): respect_retry_after_header=True, # respect Retry-After headers ) session.mount('https://', requests_adapters.HTTPAdapter(max_retries=retries)) - saved_filenames = set() def _register(url, filename, con, tablename): + file_path = temp_dir / filename r = session.get(url) - with open(filename, 'wb') as f: - f.write(r.content) - con.register(tablename, read_csv(filename)) - saved_filenames.add(filename) + file_path.write_bytes(r.content) + con.register(tablename, read_csv(str(file_path))) yield _register - for filename in saved_filenames: - os.remove(filename) session.close() @@ -269,4 +267,4 @@ def group_by_data(arrow_dataset_register): "x", ) yield con - con.close() + con.close() \ No newline at end of file From 5ca7b584194932cab69bd0006a15b890652583d8 Mon Sep 17 00:00:00 2001 From: Paul Timmins Date: Fri, 19 Sep 2025 14:28:07 +0000 Subject: [PATCH 4/7] tests: use a "clean" default_connection due to tests contaminating each other --- tests/fast/api/test_duckdb_connection.py | 119 +++++++++++------------ 1 file changed, 59 insertions(+), 60 deletions(-) diff --git a/tests/fast/api/test_duckdb_connection.py b/tests/fast/api/test_duckdb_connection.py index 4cb565c1..6ebd948e 100644 --- a/tests/fast/api/test_duckdb_connection.py +++ b/tests/fast/api/test_duckdb_connection.py @@ -24,23 +24,23 @@ def tmp_database(tmp_path_factory): # wrapped by the 'duckdb' module, to execute with the 'default_connection' class TestDuckDBConnection(object): @pytest.mark.parametrize('pandas', [NumpyPandas(), ArrowPandas()]) - def test_append(self, pandas): - duckdb.execute("Create table integers (i integer)") + def test_append(self, pandas, default_con): + default_con.execute("Create table integers (i integer)") df_in = pandas.DataFrame( { 'numbers': [1, 2, 3, 4, 5], } ) - duckdb.append('integers', df_in) - assert duckdb.execute('select count(*) from integers').fetchone()[0] == 5 + default_con.append('integers', df_in) + assert default_con.execute('select count(*) from integers').fetchone()[0] == 5 # cleanup - duckdb.execute("drop table integers") + default_con.execute("drop table integers") - def test_default_connection_from_connect(self): - duckdb.sql('create or replace table connect_default_connect (i integer)') + def test_default_connection_from_connect(self, default_con): + default_con.sql('create or replace table connect_default_connect (i integer)') con = duckdb.connect(':default:') con.sql('select i from connect_default_connect') - duckdb.sql('drop table connect_default_connect') + default_con.sql('drop table connect_default_connect') with pytest.raises(duckdb.Error): con.sql('select i from connect_default_connect') @@ -57,31 +57,31 @@ def test_arrow(self): def test_begin_commit(self): duckdb.begin() - duckdb.execute("create table tbl as select 1") + duckdb.execute("create table tbl_1 as select 1") duckdb.commit() - res = duckdb.table("tbl") - duckdb.execute("drop table tbl") + res = duckdb.table("tbl_1") + duckdb.execute("drop table tbl_1") - def test_begin_rollback(self): - duckdb.begin() - duckdb.execute("create table tbl as select 1") - duckdb.rollback() + def test_begin_rollback(self, default_con): + default_con.begin() + default_con.execute("create table tbl_1rb as select 1") + default_con.rollback() with pytest.raises(duckdb.CatalogException): # Table does not exist - res = duckdb.table("tbl") + res = default_con.table("tbl_1rb") - def test_cursor(self): - duckdb.execute("create table tbl as select 3") + def test_cursor(self, default_con): + default_con.execute("create table tbl_3 as select 3") duckdb_cursor = duckdb.cursor() - res = duckdb_cursor.table("tbl").fetchall() + res = duckdb_cursor.table("tbl_3").fetchall() assert res == [(3,)] - duckdb_cursor.execute("drop table tbl") + duckdb_cursor.execute("drop table tbl_3") with pytest.raises(duckdb.CatalogException): # 'tbl' no longer exists - duckdb.table("tbl") + default_con.table("tbl_3") - def test_cursor_lifetime(self): - con = duckdb.connect() + def test_cursor_lifetime(self, default_con): + con = default_con def use_cursors(): cursors = [] @@ -103,12 +103,12 @@ def test_df(self): assert res == ref def test_duplicate(self): - duckdb.execute("create table tbl as select 5") + duckdb.execute("create table tbl_5 as select 5") dup_conn = duckdb.duplicate() - dup_conn.table("tbl").fetchall() - duckdb.execute("drop table tbl") + dup_conn.table("tbl_5").fetchall() + duckdb.execute("drop table tbl_5") with pytest.raises(duckdb.CatalogException): - dup_conn.table("tbl").fetchall() + dup_conn.table("tbl_5").fetchall() def test_readonly_properties(self): duckdb.execute("select 42") @@ -123,11 +123,11 @@ def test_execute(self): def test_executemany(self): # executemany does not keep an open result set # TODO: shouldn't we also have a version that executes a query multiple times with different parameters, returning all of the results? - duckdb.execute("create table tbl (i integer, j varchar)") - duckdb.executemany("insert into tbl VALUES (?, ?)", [(5, 'test'), (2, 'duck'), (42, 'quack')]) - res = duckdb.table("tbl").fetchall() + duckdb.execute("create table tbl_many (i integer, j varchar)") + duckdb.executemany("insert into tbl_many VALUES (?, ?)", [(5, 'test'), (2, 'duck'), (42, 'quack')]) + res = duckdb.table("tbl_many").fetchall() assert res == [(5, 'test'), (2, 'duck'), (42, 'quack')] - duckdb.execute("drop table tbl") + duckdb.execute("drop table tbl_many") def test_pystatement(self): with pytest.raises(duckdb.ParserException, match='seledct'): @@ -163,8 +163,8 @@ def test_pystatement(self): duckdb.execute(statements[0]) assert duckdb.execute(statements[0], {'1': 42}).fetchall() == [(42,)] - duckdb.execute("create table tbl(a integer)") - statements = duckdb.extract_statements('insert into tbl select $1') + duckdb.execute("create table tbl_a(a integer)") + statements = duckdb.extract_statements('insert into tbl_a select $1') assert statements[0].expected_result_type == [ duckdb.ExpectedResultType.CHANGED_ROWS, duckdb.ExpectedResultType.QUERY_RESULT, @@ -174,23 +174,23 @@ def test_pystatement(self): ): duckdb.executemany(statements[0]) duckdb.executemany(statements[0], [(21,), (22,), (23,)]) - assert duckdb.table('tbl').fetchall() == [(21,), (22,), (23,)] - duckdb.execute("drop table tbl") + assert duckdb.table('tbl_a').fetchall() == [(21,), (22,), (23,)] + duckdb.execute("drop table tbl_a") def test_fetch_arrow_table(self): # Needed for 'fetch_arrow_table' pyarrow = pytest.importorskip("pyarrow") - duckdb.execute("Create Table test (a integer)") + duckdb.execute("Create Table test_arrow_tble (a integer)") for i in range(1024): for j in range(2): - duckdb.execute("Insert Into test values ('" + str(i) + "')") - duckdb.execute("Insert Into test values ('5000')") - duckdb.execute("Insert Into test values ('6000')") + duckdb.execute("Insert Into test_arrow_tble values ('" + str(i) + "')") + duckdb.execute("Insert Into test_arrow_tble values ('5000')") + duckdb.execute("Insert Into test_arrow_tble values ('6000')") sql = ''' SELECT a, COUNT(*) AS repetitions - FROM test + FROM test_arrow_tble GROUP BY a ''' @@ -200,7 +200,7 @@ def test_fetch_arrow_table(self): arrow_df = arrow_table.to_pandas() assert result_df['repetitions'].sum() == arrow_df['repetitions'].sum() - duckdb.execute("drop table test") + duckdb.execute("drop table test_arrow_tble") def test_fetch_df(self): ref = [([1, 2, 3],)] @@ -210,22 +210,22 @@ def test_fetch_df(self): assert res == ref def test_fetch_df_chunk(self): - duckdb.execute("CREATE table t as select range a from range(3000);") - query = duckdb.execute("SELECT a FROM t") + duckdb.execute("CREATE table t_df_chunk as select range a from range(3000);") + query = duckdb.execute("SELECT a FROM t_df_chunk") cur_chunk = query.fetch_df_chunk() assert cur_chunk['a'][0] == 0 assert len(cur_chunk) == 2048 cur_chunk = query.fetch_df_chunk() assert cur_chunk['a'][0] == 2048 assert len(cur_chunk) == 952 - duckdb.execute("DROP TABLE t") + duckdb.execute("DROP TABLE t_df_chunk") def test_fetch_record_batch(self): # Needed for 'fetch_arrow_table' pyarrow = pytest.importorskip("pyarrow") - duckdb.execute("CREATE table t as select range a from range(3000);") - duckdb.execute("SELECT a FROM t") + duckdb.execute("CREATE table t_record_batch as select range a from range(3000);") + duckdb.execute("SELECT a FROM t_record_batch") record_batch_reader = duckdb.fetch_record_batch(1024) chunk = record_batch_reader.read_all() assert len(chunk) == 3000 @@ -286,13 +286,13 @@ def test_query(self): def test_register(self): assert None != duckdb.register - def test_register_relation(self): - con = duckdb.connect() + def test_register_relation(self, default_con): + con = default_con rel = con.sql('select [5,4,3]') - con.register("relation", rel) + con.register("relation_rr", rel) - con.sql("create table tbl as select * from relation") - assert con.table('tbl').fetchall() == [([5, 4, 3],)] + con.sql("create table tbl_reg_rel as select * from relation_rr") + assert con.table('tbl_reg_rel').fetchall() == [([5, 4, 3],)] def test_unregister_problematic_behavior(self, duckdb_cursor): # We have a VIEW called 'vw' in the Catalog @@ -314,10 +314,10 @@ def test_unregister_problematic_behavior(self, duckdb_cursor): assert duckdb_cursor.execute("select * from vw").fetchone() == (0,) @pytest.mark.parametrize('pandas', [NumpyPandas(), ArrowPandas()]) - def test_relation_out_of_scope(self, pandas): + def test_relation_out_of_scope(self, pandas, default_con): def temporary_scope(): # Create a connection, we will return this - con = duckdb.connect() + con = default_con # Create a dataframe df = pandas.DataFrame({'a': [1, 2, 3]}) # The dataframe has to be registered as well @@ -333,8 +333,8 @@ def temporary_scope(): def test_table(self): con = duckdb.connect() - con.execute("create table tbl as select 1") - assert [(1,)] == con.table("tbl").fetchall() + con.execute("create table tbl_test_table as select 1") + assert [(1,)] == con.table("tbl_test_table").fetchall() def test_table_function(self): assert None != duckdb.table_function @@ -356,16 +356,15 @@ def test_close(self): def test_interrupt(self): assert None != duckdb.interrupt - def test_wrap_shadowing(self): + def test_wrap_shadowing(self, default_con): pd = NumpyPandas() - import duckdb df = pd.DataFrame({"a": [1, 2, 3]}) - res = duckdb.sql("from df").fetchall() + res = default_con.sql("from df").fetchall() assert res == [(1,), (2,), (3,)] - def test_wrap_coverage(self): - con = duckdb.default_connection + def test_wrap_coverage(self, default_con): + con = default_con # Skip all of the initial __xxxx__ methods connection_methods = dir(con) From a9b8ba27c98e4ccce7c36040bdaef0ec36b111dd Mon Sep 17 00:00:00 2001 From: Paul Timmins Date: Fri, 19 Sep 2025 14:28:33 +0000 Subject: [PATCH 5/7] tests: move 10M row test to tests/slow - takes 1min --- tests/fast/test_relation.py | 8 -------- tests/slow/test_relation_slow.py | 20 ++++++++++++++++++++ 2 files changed, 20 insertions(+), 8 deletions(-) create mode 100644 tests/slow/test_relation_slow.py diff --git a/tests/fast/test_relation.py b/tests/fast/test_relation.py index 8e68c149..2d9b3b4b 100644 --- a/tests/fast/test_relation.py +++ b/tests/fast/test_relation.py @@ -1,6 +1,5 @@ import duckdb import numpy as np -import platform import tempfile import os import pandas as pd @@ -527,13 +526,6 @@ def test_relation_print(self): 2048, 5000, 1000000, - pytest.param( - 10000000, - marks=pytest.mark.skipif( - condition=platform.system() == "Emscripten", - reason="Emscripten/Pyodide builds run out of memory at this scale, and error might not thrown reliably", - ), - ), ], ) def test_materialized_relation(self, duckdb_cursor, num_rows): diff --git a/tests/slow/test_relation_slow.py b/tests/slow/test_relation_slow.py new file mode 100644 index 00000000..cd892985 --- /dev/null +++ b/tests/slow/test_relation_slow.py @@ -0,0 +1,20 @@ +import platform +import pytest + + +class TestRelationSlow(object): + @pytest.mark.skipif( + condition=platform.system() == "Emscripten", + reason="Emscripten/Pyodide builds run out of memory at this scale, and error might not thrown reliably", + ) + def test_materialized_relation_large(self, duckdb_cursor): + """Test materialized relation with 10M rows - moved from fast tests due to 1+ minute runtime""" + # Import the implementation function from the fast test + import sys + import os + sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'fast')) + from test_relation import TestRelation + + # Create instance and call the test with large parameter + test_instance = TestRelation() + test_instance.test_materialized_relation(duckdb_cursor, 10000000) \ No newline at end of file From 8b228be3cf269bc3247d9e7304c6d7c5215478a7 Mon Sep 17 00:00:00 2001 From: Paul Timmins Date: Fri, 19 Sep 2025 14:29:19 +0000 Subject: [PATCH 6/7] tests: query interrupt was both slow and possibly incorrect - added a timeout and use pytest.raises with a more interruptable query --- tests/fast/api/test_query_interrupt.py | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/tests/fast/api/test_query_interrupt.py b/tests/fast/api/test_query_interrupt.py index 6334e475..86274e7f 100644 --- a/tests/fast/api/test_query_interrupt.py +++ b/tests/fast/api/test_query_interrupt.py @@ -1,35 +1,31 @@ import duckdb import time import pytest - import platform import threading import _thread as thread def send_keyboard_interrupt(): - # Wait a little, so we're sure the 'execute' has started time.sleep(0.1) - # Send an interrupt to the main thread thread.interrupt_main() class TestQueryInterruption(object): + @pytest.mark.xfail( condition=platform.system() == "Emscripten", reason="Emscripten builds cannot use threads", ) - def test_query_interruption(self): + @pytest.mark.timeout(15) + def test_keyboard_interruption(self): con = duckdb.connect() thread = threading.Thread(target=send_keyboard_interrupt) # Start the thread thread.start() try: - res = con.execute('select count(*) from range(100000000000)').fetchall() - except RuntimeError: - # If this is not reached, we could not cancel the query before it completed - # indicating that the query interruption functionality is broken - assert True - except KeyboardInterrupt: - pytest.fail() - thread.join() + with pytest.raises((KeyboardInterrupt, RuntimeError)): + res = con.execute('select * from range(100000) t1,range(100000) t2').fetchall() + finally: + # Ensure the thread completes regardless of what happens + thread.join() From ca15b9ee55624c2cc5f003eb6181161c756b4a30 Mon Sep 17 00:00:00 2001 From: Paul Timmins Date: Fri, 19 Sep 2025 14:32:46 +0000 Subject: [PATCH 7/7] ci: multiprocessing testing w/ xdist -n 2 --- .github/workflows/packaging_wheels.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/packaging_wheels.yml b/.github/workflows/packaging_wheels.yml index ea13b674..5f1a1494 100644 --- a/.github/workflows/packaging_wheels.yml +++ b/.github/workflows/packaging_wheels.yml @@ -57,7 +57,7 @@ jobs: uv export --only-group test --no-emit-project --output-file pylock.toml --directory {project} && uv pip install -r pylock.toml CIBW_TEST_COMMAND: > - uv run -v pytest ${{ inputs.testsuite == 'fast' && './tests/fast' || './tests' }} --verbose --ignore=./tests/stubs + uv run -v pytest -n 2 ${{ inputs.testsuite == 'fast' && './tests/fast' || './tests' }} --verbose --ignore=./tests/stubs steps: - name: Checkout DuckDB Python