Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Run python tests in Pyodide build #11914

Merged
merged 6 commits into from
May 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
34 changes: 22 additions & 12 deletions .github/workflows/Pyodide.yml
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,10 @@ jobs:
version:
- python: "3.10"
pyodide-build: "0.22.1"
node: "16"
- python: "3.11"
pyodide-build: "0.25.1"
node: "18"
cpcloud marked this conversation as resolved.
Show resolved Hide resolved
steps:
- uses: actions/checkout@v4
with:
Expand Down Expand Up @@ -76,22 +78,30 @@ jobs:
CFLAGS: "-fexceptions"
LDFLAGS: "-fexceptions"

- name: smoke test duckdb on pyodide
- name: install node
uses: actions/setup-node@v4
with:
node-version: ${{ matrix.version.node }}

- name: create pyodide environment
run: pyodide venv .venv-pyodide

- name: install deps into environment
run: |
pyodide venv .venv-pyodide
source .venv-pyodide/bin/activate
pip install ./tools/pythonpkg/dist/*.whl

python -V
pip install pytest numpy pandas mypy

python <<EOF
import duckdb
print(duckdb.__version__)
print(duckdb.sql("SELECT 1 AS a"))
- name: install duckdb wasm wheel into environment
run: |
source .venv-pyodide/bin/activate
pip install ./tools/pythonpkg/dist/*.whl

(platform,) = duckdb.execute("PRAGMA platform").fetchone()
assert platform == "wasm_eh_pyodide", platform
EOF
- name: run tests using pyodide
run: |
source .venv-pyodide/bin/activate
python -m pytest ./tools/pythonpkg/tests
env:
TZ: UTC

- name: Wheel sizes
run: |
Expand Down
7 changes: 7 additions & 0 deletions tools/pythonpkg/tests/extensions/test_extensions_loading.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,17 @@
import os
import platform

import duckdb
from pytest import raises
import pytest


pytestmark = pytest.mark.skipif(
platform.system() == "Emscripten",
reason="Extensions are not supported on Emscripten",
)


def test_extension_loading(require):
if not os.getenv('DUCKDB_PYTHON_TEST_EXTENSION_REQUIRED', False):
return
Expand Down
5 changes: 5 additions & 0 deletions tools/pythonpkg/tests/fast/api/test_connection_interrupt.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import platform
import threading
import time

Expand All @@ -6,6 +7,10 @@


class TestConnectionInterrupt(object):
@pytest.mark.xfail(
condition=platform.system() == "Emscripten",
reason="threads not allowed on Emscripten",
)
def test_connection_interrupt(self):
conn = duckdb.connect()

Expand Down
5 changes: 5 additions & 0 deletions tools/pythonpkg/tests/fast/api/test_query_interrupt.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import time
import pytest

import platform
import threading
import _thread as thread

Expand All @@ -14,6 +15,10 @@ def send_keyboard_interrupt():


class TestQueryInterruption(object):
@pytest.mark.xfail(
condition=platform.system() == "Emscripten",
reason="Emscripten builds cannot use threads",
)
def test_query_interruption(self):
con = duckdb.connect()
thread = threading.Thread(target=send_keyboard_interrupt)
Expand Down
53 changes: 33 additions & 20 deletions tools/pythonpkg/tests/fast/api/test_read_csv.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,9 @@
from multiprocessing.sharedctypes import Value
import numpy
import datetime
import pandas
import pytest
import platform
import duckdb
from io import StringIO, BytesIO
from duckdb.typing import BIGINT, VARCHAR, INTEGER


def TestFile(name):
Expand Down Expand Up @@ -280,7 +278,8 @@ def test_read_pathlib_path(self, duckdb_cursor):
assert res == (1, 'Action', datetime.datetime(2006, 2, 15, 4, 46, 27))

def test_read_filelike(self, duckdb_cursor):
_ = pytest.importorskip("fsspec")
pytest.importorskip("fsspec")

string = StringIO("c1,c2,c3\na,b,c")
res = duckdb_cursor.read_csv(string).fetchall()
assert res == [('a', 'b', 'c')]
Expand Down Expand Up @@ -441,6 +440,7 @@ def test_read_csv_glob(self, tmp_path, create_temp_csv):
res = con.sql("select * from rel order by all").fetchall()
assert res == [(1,), (2,), (3,), (4,), (5,), (6,)]

@pytest.mark.xfail(condition=platform.system() == "Emscripten", reason="time zones not working")
def test_read_csv_combined(self, duckdb_cursor):
CSV_FILE = TestFile('stress_test.csv')
COLUMNS = {
Expand Down Expand Up @@ -469,10 +469,12 @@ def test_read_csv_combined(self, duckdb_cursor):
assert rel.columns == rel2.columns
assert rel.types == rel2.types

def test_read_csv_names(self):
def test_read_csv_names(self, tmp_path):
Copy link
Contributor

@Tishj Tishj May 8, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am not entirely confident this will be unique for every test

I have more faith in this fixture, do you mind using this?

@pytest.fixture(scope="function")
def temp_file_name(request, tmp_path_factory):
    """Return, as a string, the path of ``file.csv`` inside a fresh temporary path.

    The temporary path is created through ``tmp_path_factory.mktemp`` with
    ``numbered=True`` and is named after the requesting test function.
    """
    test_name = request.function.__name__
    base_path = tmp_path_factory.mktemp(test_name, numbered=True)
    return str(base_path / 'file.csv')

tmp_path_factory creates unique paths, and numbers them, + for good measure I included the name of the test as another unique token

We create more than one file in certain places, so perhaps this fixture should return a generator instead.

Copy link
Contributor Author

@cpcloud cpcloud May 8, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am not entirely confident this will be unique for every test

The pytest documentation for tmp_path says almost verbatim exactly that, which is why I chose it :)

image

Copy link
Contributor Author

@cpcloud cpcloud May 8, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The implementation of tmp_path is very similar to your proposed implementation (I removed the docstring and clean up code that follows the yield):

def _mk_tmp(request: FixtureRequest, factory: TempPathFactory) -> Path:
    """Create a numbered temporary path named after the requesting test node.

    Every non-word character in the node name is replaced with an underscore,
    and the result is cut to at most ``MAXVAL`` characters before it is handed
    to ``factory.mktemp``.
    """
    MAXVAL = 30
    sanitized = re.sub(r"[\W]", "_", request.node.name)
    return factory.mktemp(sanitized[:MAXVAL], numbered=True)


@fixture
def tmp_path(request: FixtureRequest, tmp_path_factory: TempPathFactory) -> Generator[Path, None, None]:
    """Yield the temporary path that ``_mk_tmp`` creates for the requesting test."""
    yield _mk_tmp(request, tmp_path_factory)

file = tmp_path / "file.csv"
file.write_text('one,two,three,four\n1,2,3,4\n1,2,3,4\n1,2,3,4')

con = duckdb.connect()
file = StringIO('one,two,three,four\n1,2,3,4\n1,2,3,4\n1,2,3,4')
rel = con.read_csv(file, names=['a', 'b', 'c'])
rel = con.read_csv(str(file), names=['a', 'b', 'c'])
assert rel.columns == ['a', 'b', 'c', 'four']

with pytest.raises(duckdb.InvalidInputException, match="read_csv only accepts 'names' as a list of strings"):
Expand All @@ -487,9 +489,11 @@ def test_read_csv_names(self):
rel = con.read_csv(file, names=['a', 'b', 'a', 'b'])
assert rel.columns == ['a', 'b', 'a', 'b']

def test_read_csv_names_mixed_with_dtypes(self):
def test_read_csv_names_mixed_with_dtypes(self, tmp_path):
file = tmp_path / "file.csv"
file.write_text('one,two,three,four\n1,2,3,4\n1,2,3,4\n1,2,3,4')

con = duckdb.connect()
file = StringIO('one,two,three,four\n1,2,3,4\n1,2,3,4\n1,2,3,4')
rel = con.read_csv(
file,
names=['a', 'b', 'c'],
Expand Down Expand Up @@ -517,12 +521,18 @@ def test_read_csv_names_mixed_with_dtypes(self):
},
)

def test_read_csv_multi_file(self):
def test_read_csv_multi_file(self, tmp_path):
file1 = tmp_path / "file1.csv"
file1.write_text('one,two,three,four\n1,2,3,4\n1,2,3,4\n1,2,3,4')

file2 = tmp_path / "file2.csv"
file2.write_text('one,two,three,four\n5,6,7,8\n5,6,7,8\n5,6,7,8')

file3 = tmp_path / "file3.csv"
file3.write_text('one,two,three,four\n9,10,11,12\n9,10,11,12\n9,10,11,12')

con = duckdb.connect()
file1 = StringIO('one,two,three,four\n1,2,3,4\n1,2,3,4\n1,2,3,4')
file2 = StringIO('one,two,three,four\n5,6,7,8\n5,6,7,8\n5,6,7,8')
file3 = StringIO('one,two,three,four\n9,10,11,12\n9,10,11,12\n9,10,11,12')
files = [file1, file2, file3]
files = [str(file1), str(file2), str(file3)]
rel = con.read_csv(files)
res = rel.fetchall()
assert res == [
Expand All @@ -546,13 +556,16 @@ def test_read_csv_empty_list(self):
rel = con.read_csv(files)
res = rel.fetchall()

def test_read_csv_list_invalid_path(self):
def test_read_csv_list_invalid_path(self, tmp_path):
con = duckdb.connect()
files = [
StringIO('one,two,three,four\n1,2,3,4\n1,2,3,4\n1,2,3,4'),
'not_valid_path',
StringIO('one,two,three,four\n9,10,11,12\n9,10,11,12\n9,10,11,12'),
]

file1 = tmp_path / "file1.csv"
file1.write_text('one,two,three,four\n1,2,3,4\n1,2,3,4\n1,2,3,4')

file3 = tmp_path / "file3.csv"
file3.write_text('one,two,three,four\n9,10,11,12\n9,10,11,12\n9,10,11,12')

files = [str(file1), 'not_valid_path', str(file3)]
with pytest.raises(duckdb.IOException, match='No files found that match the pattern "not_valid_path"'):
rel = con.read_csv(files)
res = rel.fetchall()
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import duckdb
import datetime
import numpy as np
import platform
import pytest
import decimal
import math
Expand Down Expand Up @@ -528,6 +529,10 @@ def test_double_object_conversion(self, pandas, duckdb_cursor):
assert isinstance(converted_col['0'].dtype, double_dtype.__class__) == True

@pytest.mark.parametrize('pandas', [NumpyPandas(), ArrowPandas()])
@pytest.mark.xfail(
condition=platform.system() == "Emscripten",
reason="older numpy raises a warning when running with Pyodide",
)
def test_numpy_object_with_stride(self, pandas, duckdb_cursor):
df = pandas.DataFrame(columns=["idx", "evens", "zeros"])

Expand Down
3 changes: 2 additions & 1 deletion tools/pythonpkg/tests/fast/pandas/test_timedelta.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import platform
import pandas as pd
import duckdb
import datetime
import numpy as np
import pytest


Expand Down Expand Up @@ -47,6 +47,7 @@ def test_timedelta_negative(self, duckdb_cursor):
@pytest.mark.parametrize('minutes', [0, 60])
@pytest.mark.parametrize('hours', [0, 24])
@pytest.mark.parametrize('weeks', [0, 51])
@pytest.mark.skipif(platform.system() == "Emscripten", reason="Bind parameters are broken when running on Pyodide")
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The error message is a bit coarse-grained here:

What fails is:

con.execute("SELECT $2::date - $1::date", ['2024-01-02', '2024-03-04'])

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you expand on this?

It seems to be working on https://duckdb.github.io/duckdb-pyodide/console by doing:

import duckdb
duckdb.execute("SELECT $2::date - $1::date", ['2024-01-02', '2024-03-04'])

I'm unsure whether the problem is in the test setup (possibly the handling of time zones, or something else) or whether this is actually broken.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry, try the code from the test this is marking.

Regardless, I was hoping this could be fixed in a follow-up, as this bug existed before we were running the test suite, so adding the tests is not making anything worse 😅

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@carlopi Here's an example that fails:

import datetime

import duckdb

# Reproduction script: subtract the same interval from a date twice in one
# query -- once spelled out inline as SQL INTERVAL arithmetic, and once bound
# as a Python ``datetime.timedelta`` through the ``$1`` parameter.
days = 1
weeks = 2
seconds = 3
microseconds = 4
milliseconds = 5
minutes = 6
hours = 7

# SQL spelling of the interval; weeks are folded into the day count.
instant = f"""
                (INTERVAL {days + (weeks * 7)} DAYS +
                INTERVAL {seconds} SECONDS +
                INTERVAL {microseconds} MICROSECONDS +
                INTERVAL {milliseconds} MILLISECONDS +
                INTERVAL {minutes} MINUTE +
                INTERVAL {hours} HOURS)
            """
query = f"select '1990/02/11'::DATE - {instant}, '1990/02/11'::DATE - $1"
con = duckdb.connect()

# Positional arguments follow the ``datetime.timedelta`` signature order:
# days, seconds, microseconds, milliseconds, minutes, hours, weeks.
val = datetime.timedelta(days, seconds, microseconds, milliseconds, minutes, hours, weeks)

con.execute(query, [val])

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The error I get is:

Pyodide has suffered a fatal error. Please report this to the Pyodide maintainers.
The cause of the fatal error was:
TypeError: Cannot convert undefined to a BigInt
Look in the browser console for more details.

def test_timedelta_coverage(self, duckdb_cursor, days, seconds, microseconds, milliseconds, minutes, hours, weeks):
def create_duck_interval(days, seconds, microseconds, milliseconds, minutes, hours, weeks) -> str:
instant = f"""
Expand Down
7 changes: 6 additions & 1 deletion tools/pythonpkg/tests/fast/pandas/test_timestamp.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
import duckdb
import os
import datetime
import os
import pytest
import pandas as pd
import platform
from conftest import pandas_2_or_higher


Expand Down Expand Up @@ -64,6 +65,10 @@ def test_timestamp_timedelta(self):
df_from_duck = duckdb.from_df(df).df()
assert df_from_duck.equals(df)

@pytest.mark.xfail(
condition=platform.system() == "Emscripten" and os.environ.get("TZ") != "UTC",
reason="time zones other than UTC don't seem to work on Pyodide",
)
def test_timestamp_timezone(self, duckdb_cursor):
rel = duckdb_cursor.query(
"""
Expand Down
9 changes: 6 additions & 3 deletions tools/pythonpkg/tests/fast/relational_api/test_rapi_query.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import duckdb
import pytest
import platform
import sys


@pytest.fixture()
Expand Down Expand Up @@ -117,11 +119,12 @@ def test_query_non_select_result(self, duckdb_cursor):

def test_replacement_scan_recursion(self, duckdb_cursor):
depth_limit = 1000
import sys

if sys.platform.startswith('win'):
# With the default we reach a stack overflow in the CI
if sys.platform.startswith('win') or platform.system() == "Emscripten":
# With the default we reach a stack overflow in the CI for windows
# and also outside of it for Pyodide
depth_limit = 250

duckdb_cursor.execute(f"SET max_expression_depth TO {depth_limit}")
rel = duckdb_cursor.sql('select 42')
rel = duckdb_cursor.sql('select * from rel')
Expand Down
18 changes: 4 additions & 14 deletions tools/pythonpkg/tests/fast/spark/test_spark_union.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,10 @@
import platform
import pytest

_ = pytest.importorskip("duckdb.experimental.spark")

from duckdb.experimental.spark.sql.types import (
LongType,
StructType,
BooleanType,
StructField,
StringType,
IntegerType,
LongType,
Row,
ArrayType,
MapType,
)
from duckdb.experimental.spark.sql.functions import col, struct, when, lit, array_contains
from duckdb.experimental.spark.sql.functions import sum, avg, max, min, mean, count
from duckdb.experimental.spark.sql.types import Row
from duckdb.experimental.spark.sql.functions import col


@pytest.fixture
Expand Down Expand Up @@ -65,6 +54,7 @@ def test_merge_with_union(self, df, df2):
res2 = unionDF.collect()
assert res == res2

@pytest.mark.xfail(condition=platform.system() == "Emscripten", reason="Broken on Pyodide")
def test_merge_without_duplicates(self, df, df2):
# 'sort' has been added to make the result deterministic
disDF = df.union(df2).distinct().sort(col("employee_name"))
Expand Down
10 changes: 8 additions & 2 deletions tools/pythonpkg/tests/fast/test_alex_multithread.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,16 @@
import platform
import duckdb
from threading import Thread, current_thread
import pandas as pd
import os
import pytest


pytestmark = pytest.mark.xfail(
condition=platform.system() == "Emscripten",
reason="Emscripten builds cannot use threads",
raises=RuntimeError,
)


@pytest.fixture(scope="session")
def tmp_database(tmp_path_factory):
database = tmp_path_factory.mktemp("databases", numbered=True) / "tmp.duckdb"
Expand Down
5 changes: 3 additions & 2 deletions tools/pythonpkg/tests/fast/test_memory_leaks.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
import gc
import duckdb
import pytest
import os, psutil
import os
import pandas as pd

psutil = pytest.importorskip("psutil")


@pytest.fixture
def check_leaks():
Expand Down