duckdb · Mytherin · May 17, 2024 · May 2, 2024 · May 7, 2024 · May 7, 2024
diff --git a/.github/workflows/Pyodide.yml b/.github/workflows/Pyodide.yml
@@ -44,8 +44,10 @@ jobs:
         version:
           - python: "3.10"
             pyodide-build: "0.22.1"
+            node: "16"
           - python: "3.11"
             pyodide-build: "0.25.1"
+            node: "18"
     steps:
       - uses: actions/checkout@v4
         with:
@@ -76,22 +78,30 @@ jobs:
           CFLAGS: "-fexceptions"
           LDFLAGS: "-fexceptions"
 
-      - name: smoke test duckdb on pyodide
+      - name: install node
+        uses: actions/setup-node@v4
+        with:
+          node-version: ${{ matrix.version.node }}
+
+      - name: create pyodide environment
+        run: pyodide venv .venv-pyodide
+
+      - name: install deps into environment
         run: |
-          pyodide venv .venv-pyodide
           source .venv-pyodide/bin/activate
-          pip install ./tools/pythonpkg/dist/*.whl
-
-          python -V
+          pip install pytest numpy pandas mypy
 
-          python <<EOF
-          import duckdb
-          print(duckdb.__version__)
-          print(duckdb.sql("SELECT 1 AS a"))
+      - name: install duckdb wasm wheel into environment
+        run: |
+          source .venv-pyodide/bin/activate
+          pip install ./tools/pythonpkg/dist/*.whl
 
-          (platform,) = duckdb.execute("PRAGMA platform").fetchone()
-          assert platform == "wasm_eh_pyodide", platform
-          EOF
+      - name: run tests using pyodide
+        run: |
+          source .venv-pyodide/bin/activate
+          python -m pytest ./tools/pythonpkg/tests
+        env:
+          TZ: UTC
 
       - name: Wheel sizes
         run: |

diff --git a/tools/pythonpkg/tests/extensions/test_extensions_loading.py b/tools/pythonpkg/tests/extensions/test_extensions_loading.py
@@ -1,10 +1,17 @@
 import os
+import platform
 
 import duckdb
 from pytest import raises
 import pytest
 
 
+pytestmark = pytest.mark.skipif(
+    platform.system() == "Emscripten",
+    reason="Extensions are not supported on Emscripten",
+)
+
+
 def test_extension_loading(require):
     if not os.getenv('DUCKDB_PYTHON_TEST_EXTENSION_REQUIRED', False):
         return

diff --git a/tools/pythonpkg/tests/fast/api/test_connection_interrupt.py b/tools/pythonpkg/tests/fast/api/test_connection_interrupt.py
@@ -1,3 +1,4 @@
+import platform
 import threading
 import time
 
@@ -6,6 +7,10 @@
 
 
 class TestConnectionInterrupt(object):
+    @pytest.mark.xfail(
+        condition=platform.system() == "Emscripten",
+        reason="threads not allowed on Emscripten",
+    )
     def test_connection_interrupt(self):
         conn = duckdb.connect()
 

diff --git a/tools/pythonpkg/tests/fast/api/test_query_interrupt.py b/tools/pythonpkg/tests/fast/api/test_query_interrupt.py
@@ -2,6 +2,7 @@
 import time
 import pytest
 
+import platform
 import threading
 import _thread as thread
 
@@ -14,6 +15,10 @@ def send_keyboard_interrupt():
 
 
 class TestQueryInterruption(object):
+    @pytest.mark.xfail(
+        condition=platform.system() == "Emscripten",
+        reason="Emscripten builds cannot use threads",
+    )
     def test_query_interruption(self):
         con = duckdb.connect()
         thread = threading.Thread(target=send_keyboard_interrupt)

diff --git a/tools/pythonpkg/tests/fast/api/test_read_csv.py b/tools/pythonpkg/tests/fast/api/test_read_csv.py
@@ -1,11 +1,9 @@
 from multiprocessing.sharedctypes import Value
-import numpy
 import datetime
-import pandas
 import pytest
+import platform
 import duckdb
 from io import StringIO, BytesIO
-from duckdb.typing import BIGINT, VARCHAR, INTEGER
 
 
 def TestFile(name):
@@ -280,7 +278,8 @@ def test_read_pathlib_path(self, duckdb_cursor):
         assert res == (1, 'Action', datetime.datetime(2006, 2, 15, 4, 46, 27))
 
     def test_read_filelike(self, duckdb_cursor):
-        _ = pytest.importorskip("fsspec")
+        pytest.importorskip("fsspec")
+
         string = StringIO("c1,c2,c3\na,b,c")
         res = duckdb_cursor.read_csv(string).fetchall()
         assert res == [('a', 'b', 'c')]
@@ -441,6 +440,7 @@ def test_read_csv_glob(self, tmp_path, create_temp_csv):
         res = con.sql("select * from rel order by all").fetchall()
         assert res == [(1,), (2,), (3,), (4,), (5,), (6,)]
 
+    @pytest.mark.xfail(condition=platform.system() == "Emscripten", reason="time zones not working")
     def test_read_csv_combined(self, duckdb_cursor):
         CSV_FILE = TestFile('stress_test.csv')
         COLUMNS = {
@@ -469,10 +469,12 @@ def test_read_csv_combined(self, duckdb_cursor):
         assert rel.columns == rel2.columns
         assert rel.types == rel2.types
 
-    def test_read_csv_names(self):
+    def test_read_csv_names(self, tmp_path):
+        file = tmp_path / "file.csv"
+        file.write_text('one,two,three,four\n1,2,3,4\n1,2,3,4\n1,2,3,4')
+
         con = duckdb.connect()
-        file = StringIO('one,two,three,four\n1,2,3,4\n1,2,3,4\n1,2,3,4')
-        rel = con.read_csv(file, names=['a', 'b', 'c'])
+        rel = con.read_csv(str(file), names=['a', 'b', 'c'])
         assert rel.columns == ['a', 'b', 'c', 'four']
 
         with pytest.raises(duckdb.InvalidInputException, match="read_csv only accepts 'names' as a list of strings"):
@@ -487,9 +489,11 @@ def test_read_csv_names(self):
             rel = con.read_csv(file, names=['a', 'b', 'a', 'b'])
             assert rel.columns == ['a', 'b', 'a', 'b']
 
-    def test_read_csv_names_mixed_with_dtypes(self):
+    def test_read_csv_names_mixed_with_dtypes(self, tmp_path):
+        file = tmp_path / "file.csv"
+        file.write_text('one,two,three,four\n1,2,3,4\n1,2,3,4\n1,2,3,4')
+
         con = duckdb.connect()
-        file = StringIO('one,two,three,four\n1,2,3,4\n1,2,3,4\n1,2,3,4')
         rel = con.read_csv(
             file,
             names=['a', 'b', 'c'],
@@ -517,12 +521,18 @@ def test_read_csv_names_mixed_with_dtypes(self):
                 },
             )
 
-    def test_read_csv_multi_file(self):
+    def test_read_csv_multi_file(self, tmp_path):
+        file1 = tmp_path / "file1.csv"
+        file1.write_text('one,two,three,four\n1,2,3,4\n1,2,3,4\n1,2,3,4')
+
+        file2 = tmp_path / "file2.csv"
+        file2.write_text('one,two,three,four\n5,6,7,8\n5,6,7,8\n5,6,7,8')
+
+        file3 = tmp_path / "file3.csv"
+        file3.write_text('one,two,three,four\n9,10,11,12\n9,10,11,12\n9,10,11,12')
+
         con = duckdb.connect()
-        file1 = StringIO('one,two,three,four\n1,2,3,4\n1,2,3,4\n1,2,3,4')
-        file2 = StringIO('one,two,three,four\n5,6,7,8\n5,6,7,8\n5,6,7,8')
-        file3 = StringIO('one,two,three,four\n9,10,11,12\n9,10,11,12\n9,10,11,12')
-        files = [file1, file2, file3]
+        files = [str(file1), str(file2), str(file3)]
         rel = con.read_csv(files)
         res = rel.fetchall()
         assert res == [
@@ -546,13 +556,16 @@ def test_read_csv_empty_list(self):
             rel = con.read_csv(files)
             res = rel.fetchall()
 
-    def test_read_csv_list_invalid_path(self):
+    def test_read_csv_list_invalid_path(self, tmp_path):
         con = duckdb.connect()
-        files = [
-            StringIO('one,two,three,four\n1,2,3,4\n1,2,3,4\n1,2,3,4'),
-            'not_valid_path',
-            StringIO('one,two,three,four\n9,10,11,12\n9,10,11,12\n9,10,11,12'),
-        ]
+
+        file1 = tmp_path / "file1.csv"
+        file1.write_text('one,two,three,four\n1,2,3,4\n1,2,3,4\n1,2,3,4')
+
+        file3 = tmp_path / "file3.csv"
+        file3.write_text('one,two,three,four\n9,10,11,12\n9,10,11,12\n9,10,11,12')
+
+        files = [str(file1), 'not_valid_path', str(file3)]
         with pytest.raises(duckdb.IOException, match='No files found that match the pattern "not_valid_path"'):
             rel = con.read_csv(files)
             res = rel.fetchall()
diff --git a/tools/pythonpkg/tests/fast/pandas/test_df_object_resolution.py b/tools/pythonpkg/tests/fast/pandas/test_df_object_resolution.py
@@ -1,6 +1,7 @@
 import duckdb
 import datetime
 import numpy as np
+import platform
 import pytest
 import decimal
 import math
@@ -528,6 +529,10 @@ def test_double_object_conversion(self, pandas, duckdb_cursor):
         assert isinstance(converted_col['0'].dtype, double_dtype.__class__) == True
 
     @pytest.mark.parametrize('pandas', [NumpyPandas(), ArrowPandas()])
+    @pytest.mark.xfail(
+        condition=platform.system() == "Emscripten",
+        reason="older numpy raises a warning when running with Pyodide",
+    )
     def test_numpy_object_with_stride(self, pandas, duckdb_cursor):
         df = pandas.DataFrame(columns=["idx", "evens", "zeros"])
 

diff --git a/tools/pythonpkg/tests/fast/pandas/test_timedelta.py b/tools/pythonpkg/tests/fast/pandas/test_timedelta.py
@@ -1,7 +1,7 @@
+import platform
 import pandas as pd
 import duckdb
 import datetime
-import numpy as np
 import pytest
 
 
@@ -47,6 +47,7 @@ def test_timedelta_negative(self, duckdb_cursor):
     @pytest.mark.parametrize('minutes', [0, 60])
     @pytest.mark.parametrize('hours', [0, 24])
     @pytest.mark.parametrize('weeks', [0, 51])
+    @pytest.mark.skipif(platform.system() == "Emscripten", reason="Bind parameters are broken when running on Pyodide")
     def test_timedelta_coverage(self, duckdb_cursor, days, seconds, microseconds, milliseconds, minutes, hours, weeks):
         def create_duck_interval(days, seconds, microseconds, milliseconds, minutes, hours, weeks) -> str:
             instant = f"""

diff --git a/tools/pythonpkg/tests/fast/pandas/test_timestamp.py b/tools/pythonpkg/tests/fast/pandas/test_timestamp.py
@@ -1,8 +1,9 @@
 import duckdb
-import os
 import datetime
+import os
 import pytest
 import pandas as pd
+import platform
 from conftest import pandas_2_or_higher
 
 
@@ -64,6 +65,10 @@ def test_timestamp_timedelta(self):
         df_from_duck = duckdb.from_df(df).df()
         assert df_from_duck.equals(df)
 
+    @pytest.mark.xfail(
+        condition=platform.system() == "Emscripten" and os.environ.get("TZ") != "UTC",
+        reason="time zones other than UTC don't seem to work on Pyodide",
+    )
     def test_timestamp_timezone(self, duckdb_cursor):
         rel = duckdb_cursor.query(
             """

diff --git a/tools/pythonpkg/tests/fast/relational_api/test_rapi_query.py b/tools/pythonpkg/tests/fast/relational_api/test_rapi_query.py
@@ -1,5 +1,7 @@
 import duckdb
 import pytest
+import platform
+import sys
 
 
 @pytest.fixture()
@@ -117,11 +119,12 @@ def test_query_non_select_result(self, duckdb_cursor):
 
     def test_replacement_scan_recursion(self, duckdb_cursor):
         depth_limit = 1000
-        import sys
 
-        if sys.platform.startswith('win'):
-            # With the default we reach a stack overflow in the CI
+        if sys.platform.startswith('win') or platform.system() == "Emscripten":
+            # With the default depth we hit a stack overflow in CI on Windows,
+            # and on Pyodide even when running outside of CI
             depth_limit = 250
+
         duckdb_cursor.execute(f"SET max_expression_depth TO {depth_limit}")
         rel = duckdb_cursor.sql('select 42')
         rel = duckdb_cursor.sql('select * from rel')

diff --git a/tools/pythonpkg/tests/fast/spark/test_spark_union.py b/tools/pythonpkg/tests/fast/spark/test_spark_union.py
@@ -1,21 +1,10 @@
+import platform
 import pytest
 
 _ = pytest.importorskip("duckdb.experimental.spark")
 
-from duckdb.experimental.spark.sql.types import (
-    LongType,
-    StructType,
-    BooleanType,
-    StructField,
-    StringType,
-    IntegerType,
-    LongType,
-    Row,
-    ArrayType,
-    MapType,
-)
-from duckdb.experimental.spark.sql.functions import col, struct, when, lit, array_contains
-from duckdb.experimental.spark.sql.functions import sum, avg, max, min, mean, count
+from duckdb.experimental.spark.sql.types import Row
+from duckdb.experimental.spark.sql.functions import col
 
 
 @pytest.fixture
@@ -65,6 +54,7 @@ def test_merge_with_union(self, df, df2):
         res2 = unionDF.collect()
         assert res == res2
 
+    @pytest.mark.xfail(condition=platform.system() == "Emscripten", reason="Broken on Pyodide")
     def test_merge_without_duplicates(self, df, df2):
         # 'sort' has been added to make the result deterministic
         disDF = df.union(df2).distinct().sort(col("employee_name"))

diff --git a/tools/pythonpkg/tests/fast/test_alex_multithread.py b/tools/pythonpkg/tests/fast/test_alex_multithread.py
@@ -1,10 +1,16 @@
+import platform
 import duckdb
 from threading import Thread, current_thread
-import pandas as pd
-import os
 import pytest
 
 
+pytestmark = pytest.mark.xfail(
+    condition=platform.system() == "Emscripten",
+    reason="Emscripten builds cannot use threads",
+    raises=RuntimeError,
+)
+
+
 @pytest.fixture(scope="session")
 def tmp_database(tmp_path_factory):
     database = tmp_path_factory.mktemp("databases", numbered=True) / "tmp.duckdb"

diff --git a/tools/pythonpkg/tests/fast/test_memory_leaks.py b/tools/pythonpkg/tests/fast/test_memory_leaks.py
@@ -1,9 +1,10 @@
 import gc
-import duckdb
 import pytest
-import os, psutil
+import os
 import pandas as pd
 
+psutil = pytest.importorskip("psutil")
+
 
 @pytest.fixture
 def check_leaks():