diff --git a/.gitignore b/.gitignore
index 3a2f2a59f..81012e898 100644
--- a/.gitignore
+++ b/.gitignore
@@ -125,3 +125,5 @@
 docs/source/auto_examples/
 docs/source/examples/mydask.png
 dask-worker-space
+.direnv
+dask_ml/_version.py
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index e2319a077..6e4534116 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,19 +1,10 @@
 repos:
-- repo: https://github.com/psf/black
-  rev: 23.12.1
-  hooks:
-  - id: black
-    language_version: python3
-    args:
-    - --target-version=py39
-- repo: https://github.com/pycqa/flake8
-  rev: 7.0.0
-  hooks:
-  - id: flake8
-    language_version: python3
-    args: ["--ignore=E501,W503,E203,E741,E731"]
-- repo: https://github.com/pycqa/isort
-  rev: 5.13.2
-  hooks:
-  - id: isort
-    language_version: python3
+- repo: https://github.com/astral-sh/ruff-pre-commit
+  # Ruff version.
+  rev: v0.3.4
+  hooks:
+  # Run the linter.
+  - id: ruff
+    args: [ --fix ]
+  # Run the formatter.
+  - id: ruff-format
\ No newline at end of file
diff --git a/dask_ml/__init__.py b/dask_ml/__init__.py
index ab3113573..aea5b72e1 100644
--- a/dask_ml/__init__.py
+++ b/dask_ml/__init__.py
@@ -1,18 +1,6 @@
-from pkg_resources import DistributionNotFound, get_distribution
-
 # Ensure we always register tokenizers
-from dask_ml.model_selection import _normalize
-
-__all__ = []
-
-try:
-    __version__ = get_distribution(__name__).version
-    __all__.append("__version__")
-except DistributionNotFound:
-    # package is not installed
-    pass
+from dask_ml.model_selection import _normalize  # noqa: F401
+
+from ._version import __version__
 
-
-del DistributionNotFound
-del get_distribution
-del _normalize
+__all__ = ["__version__"]
diff --git a/dask_ml/cluster/spectral.py b/dask_ml/cluster/spectral.py
index 3f1ef1ea1..5171695e9 100644
--- a/dask_ml/cluster/spectral.py
+++ b/dask_ml/cluster/spectral.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
-"""Algorithms for spectral clustering
-"""
+"""Algorithms for spectral clustering"""
+
 import logging
 
 import dask.array as da
@@ -272,9 +272,7 @@ def fit(self, X, y=None):
         # Eq 16. This is OK when V2 is orthogonal
         V2 = da.sqrt(float(n_components) / n) * da.vstack([A2, B2.T]).dot(
             U_A[:, :n_clusters]
-        ).dot(
-            da.diag(1.0 / da.sqrt(S_A[:n_clusters]))
-        )  # (n, k)
+        ).dot(da.diag(1.0 / da.sqrt(S_A[:n_clusters])))  # (n, k)
         _log_array(logger, V2, "V2.1")
 
         if isinstance(B2, da.Array):
@@ -366,9 +364,9 @@ def _slice_mostly_sorted(array, keep, rest, ind=None):
     slices.append([keep[0]])
 
     windows = zip(keep[:-1], keep[1:])
-    for l, r in windows:
-        if r > l + 1:  # avoid creating empty slices
-            slices.append(slice(l + 1, r))
+    for left, r in windows:
+        if r > left + 1:  # avoid creating empty slices
+            slices.append(slice(left + 1, r))
         slices.append([r])
 
     if keep[-1] < len(array) - 1:  # avoid creating empty slices
diff --git a/dask_ml/decomposition/truncated_svd.py b/dask_ml/decomposition/truncated_svd.py
index a9c0b2be8..fe426bcb5 100644
--- a/dask_ml/decomposition/truncated_svd.py
+++ b/dask_ml/decomposition/truncated_svd.py
@@ -148,8 +148,9 @@ def fit(self, X, y=None):
     def _check_array(self, X):
         if self.n_components >= X.shape[1]:
             raise ValueError(
-                "n_components must be < n_features; "
-                "got {} >= {}".format(self.n_components, X.shape[1])
+                "n_components must be < n_features; " "got {} >= {}".format(
+                    self.n_components, X.shape[1]
+                )
             )
         return X
diff --git a/dask_ml/ensemble/_blockwise.py b/dask_ml/ensemble/_blockwise.py
index f559b941a..359f96346 100644
--- a/dask_ml/ensemble/_blockwise.py
+++ b/dask_ml/ensemble/_blockwise.py
@@ -41,7 +41,7 @@ def fit(self, X, y, **kwargs):
         ]
         results = [
             estimator_.fit(X_, y_, **kwargs)
-            for estimator_, X_, y_, in zip(estimators, Xs, ys)
+            for estimator_, X_, y_ in zip(estimators, Xs, ys)
         ]
         results = list(dask.compute(*results))
         self.estimators_ = results
diff --git a/dask_ml/impute.py b/dask_ml/impute.py
index 76ef02578..2bee08873 100644
--- a/dask_ml/impute.py
+++ b/dask_ml/impute.py
@@ -35,8 +35,9 @@ def fit(self, X, y=None):
         allowed_strategies = ["mean", "median", "most_frequent", "constant"]
         if self.strategy not in allowed_strategies:
             raise ValueError(
-                "Can only use these strategies: {0} "
-                " got strategy={1}".format(allowed_strategies, self.strategy)
+                "Can only use these strategies: {0} " " got strategy={1}".format(
+                    allowed_strategies, self.strategy
+                )
             )
 
         if not (pd.isna(self.missing_values) or self.strategy == "constant"):
diff --git a/dask_ml/linear_model/glm.py b/dask_ml/linear_model/glm.py
index 070e6aa8e..9bf675f69 100644
--- a/dask_ml/linear_model/glm.py
+++ b/dask_ml/linear_model/glm.py
@@ -1,5 +1,6 @@
 # -*- coding: utf-8 -*-
 """Generalized Linear Models for large datasets."""
+
 import textwrap
 
 from dask_glm import algorithms, families
diff --git a/dask_ml/linear_model/utils.py b/dask_ml/linear_model/utils.py
index 761841468..e2bcad463 100644
--- a/dask_ml/linear_model/utils.py
+++ b/dask_ml/linear_model/utils.py
@@ -1,5 +1,4 @@
-"""
-"""
+""" """
 
 import dask.array as da
 import dask.dataframe as dd
diff --git a/dask_ml/metrics/scorer.py b/dask_ml/metrics/scorer.py
index 314abddb8..84b3c0648 100644
--- a/dask_ml/metrics/scorer.py
+++ b/dask_ml/metrics/scorer.py
@@ -39,8 +39,9 @@ def get_scorer(scoring: Union[str, Callable], compute: bool = True) -> Callable:
         scorer, kwargs = SCORERS[scoring]
     except KeyError:
         raise ValueError(
-            "{} is not a valid scoring value. "
-            "Valid options are {}".format(scoring, sorted(SCORERS))
+            "{} is not a valid scoring value. " "Valid options are {}".format(
" "Valid options are {}".format( + scoring, sorted(SCORERS) + ) ) else: scorer = scoring diff --git a/dask_ml/model_selection/_split.py b/dask_ml/model_selection/_split.py index bc2ae064b..ed1ac30ca 100644 --- a/dask_ml/model_selection/_split.py +++ b/dask_ml/model_selection/_split.py @@ -1,5 +1,4 @@ -"""Utilities for splitting datasets. -""" +"""Utilities for splitting datasets.""" import itertools import logging diff --git a/dask_ml/model_selection/utils_test.py b/dask_ml/model_selection/utils_test.py index ab7b0ca63..6cd247e80 100644 --- a/dask_ml/model_selection/utils_test.py +++ b/dask_ml/model_selection/utils_test.py @@ -171,16 +171,18 @@ def fit(self, X, y, **fit_params): self.classes_ = np.unique(check_array(y, ensure_2d=False, allow_nd=True)) if self.expected_fit_params: missing = set(self.expected_fit_params) - set(fit_params) - assert ( - len(missing) == 0 - ), "Expected fit parameter(s) %s not " "seen." % list(missing) + assert len(missing) == 0, ( + "Expected fit parameter(s) %s not " "seen." % list(missing) + ) for key, value in fit_params.items(): - assert len(value) == len( - X - ), "Fit parameter %s has length" "%d; expected %d." % ( - key, - len(value), - len(X), + assert len(value) == len(X), ( + "Fit parameter %s has length" + "%d; expected %d." + % ( + key, + len(value), + len(X), + ) ) return self diff --git a/dask_ml/preprocessing/__init__.py b/dask_ml/preprocessing/__init__.py index 0abe6b632..ebc532472 100644 --- a/dask_ml/preprocessing/__init__.py +++ b/dask_ml/preprocessing/__init__.py @@ -1,5 +1,4 @@ -"""Utilties for Preprocessing data. -""" +"""Utilties for Preprocessing data.""" from ._block_transformer import BlockTransformer from ._encoders import OneHotEncoder diff --git a/dask_ml/preprocessing/label.py b/dask_ml/preprocessing/label.py index 906d14aaa..866837789 100644 --- a/dask_ml/preprocessing/label.py +++ b/dask_ml/preprocessing/label.py @@ -219,8 +219,9 @@ def _check_and_search_block(arr, uniques, onehot_dtype=None, block_info=None): if diff: msg = ( - "Block contains previously unseen values {}.\nBlock info:\n\n" - "{}".format(diff, block_info) + "Block contains previously unseen values {}.\nBlock info:\n\n" "{}".format( + diff, block_info + ) ) raise ValueError(msg) diff --git a/dask_ml/utils.py b/dask_ml/utils.py index 53a12815f..6d3a56058 100644 --- a/dask_ml/utils.py +++ b/dask_ml/utils.py @@ -119,9 +119,9 @@ def assert_estimator_equal(left, right, exclude=None, **kwargs): assert left_attrs2 == right_attrs2, left_attrs2 ^ right_attrs2 for attr in left_attrs2: - l = getattr(left, attr) - r = getattr(right, attr) - _assert_eq(l, r, name=attr, **kwargs) + lattr = getattr(left, attr) + rattr = getattr(right, attr) + _assert_eq(lattr, rattr, name=attr, **kwargs) def check_array( @@ -218,7 +218,7 @@ def check_array( return sk_validation.check_array(array, *args, **kwargs) -def _assert_eq(l, r, name=None, **kwargs): +def _assert_eq(lattr, rattr, name=None, **kwargs): array_types = (np.ndarray, da.Array) if getattr(dd, "_dask_expr_enabled", lambda: False)(): from dask_expr import FrameBase @@ -226,19 +226,19 @@ def _assert_eq(l, r, name=None, **kwargs): frame_types = (pd.core.generic.NDFrame, FrameBase) else: frame_types = (pd.core.generic.NDFrame, dd._Frame) - if isinstance(l, array_types): - assert_eq_ar(l, r, **kwargs) - elif isinstance(l, frame_types): - assert_eq_df(l, r, **kwargs) - elif isinstance(l, Sequence) and any( - isinstance(x, array_types + frame_types) for x in l + if isinstance(lattr, array_types): + assert_eq_ar(lattr, rattr, **kwargs) + 
+    elif isinstance(lattr, frame_types):
+        assert_eq_df(lattr, rattr, **kwargs)
+    elif isinstance(lattr, Sequence) and any(
+        isinstance(x, array_types + frame_types) for x in lattr
     ):
-        for a, b in zip(l, r):
+        for a, b in zip(lattr, rattr):
             _assert_eq(a, b, **kwargs)
-    elif np.isscalar(r) and np.isnan(r):
-        assert np.isnan(l), (name, l, r)
+    elif np.isscalar(rattr) and np.isnan(rattr):
+        assert np.isnan(lattr), (name, lattr, rattr)
     else:
-        assert l == r, (name, l, r)
+        assert lattr == rattr, (name, lattr, rattr)
 
 
 def check_random_state(random_state):
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 000000000..f364d8332
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,106 @@
+[build-system]
+requires = ["hatchling", "hatch-vcs"]
+build-backend = "hatchling.build"
+
+[project]
+name = "dask-ml"
+dynamic = ["version"]
+description = "A library for distributed and parallel machine learning"
+readme = "README.rst"
+license = {file = 'LICENSE.txt'}
+requires-python = ">=3.8"
+authors = [{ name = "Tom Augspurger", email = "taugspurger@anaconda.com" }]
+classifiers = [
+    "Development Status :: 5 - Production/Stable",
+    "Intended Audience :: Developers",
+    "License :: OSI Approved :: BSD License",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.8",
+    "Programming Language :: Python :: 3.9",
+    "Programming Language :: Python :: 3.10",
+    "Topic :: Database",
+    "Topic :: Scientific/Engineering",
+]
+dependencies = [
+    "dask-glm>=0.2.0",
+    "dask[array,dataframe]>=2.4.0",
+    "distributed>=2.4.0",
+    "multipledispatch>=0.4.9",
+    "numba>=0.51.0",
+    "numpy>=1.20.0",
+    "packaging",
+    "pandas>=0.24.2",
+    "scikit-learn>=1.2.0",
+    "scipy",
+]
+
+[project.optional-dependencies]
+complete = ["dask-xgboost", "xgboost"]
+dev = [
+    "black",
+    "coverage",
+    "flake8",
+    "isort",
+    "nbsphinx",
+    "numpydoc",
+    "pytest",
+    "pytest-cov",
+    "pytest-mock",
+    "sphinx",
+    "sphinx-gallery",
+    "sphinx-rtd-theme",
+]
+docs = ["nbsphinx", "numpydoc", "sphinx", "sphinx-gallery", "sphinx-rtd-theme"]
+test = [
+    "black",
+    "coverage",
+    "flake8",
+    "isort",
+    "pytest",
+    "pytest-cov",
+    "pytest-mock",
+]
+xgboost = ["dask-xgboost", "xgboost"]
+
+[project.urls]
+Homepage = "https://github.com/dask/dask-ml"
+
+[tool.hatch.version]
+source = "vcs"
+
+[tool.hatch.build.hooks.vcs]
+version-file = "dask_ml/_version.py"
+
+[tool.hatch.build.targets.sdist]
+include = ["/dask_ml"]
+
+[tool.mypy]
+ignore_missing_imports = true
+no_implicit_optional = true
+check_untyped_defs = true
+strict_equality = true
+
+[[tool.mypy.overrides]]
+module = "dask_ml.metrics"
+check_untyped_defs = false
+
+[[tool.mypy.overrides]]
+module = "dask_ml.model_selection.*"
+follow_imports = "skip"
+
+[tool.coverage.run]
+source = ["dask_ml"]
+
+[tool.pytest.ini_options]
+addopts = "-rsx -v --durations=10 --color=yes"
+minversion = "3.2"
+xfail_strict = true
+junit_family = "xunit2"
+filterwarnings = [
+    "error:::dask_ml[.*]",
+    "error:::sklearn[.*]",
+]
+
+
+[tool.ruff.lint]
+ignore = ["E721", "E731", "E741"]
\ No newline at end of file
diff --git a/setup.cfg b/setup.cfg
deleted file mode 100644
index c01a43d30..000000000
--- a/setup.cfg
+++ /dev/null
@@ -1,47 +0,0 @@
-[flake8]
-exclude = tests/data,docs,benchmarks,scripts,.tox,env,.eggs,build
-max-line-length = 88
-ignore =
-    # Assigning lambda expression
-    E731
-    # Ambiguous variable names
-    E741
-    # line break before binary operator
-    W503
-    # whitespace before :
-    E203
-
-[isort]
-known_first_party=dask_ml
-known_third_party=sklearn,dask,distributed,dask_glm,pandas,coloredlogs,git,packaging.version,packaging,numpy,pytest,scipy,toolz,multipledispatch,numba,tornado
-multi_line_output=3
-include_trailing_comma=True
-force_grid_wrap=0
-combine_as_imports=True
-line_length=88
-skip=
-    docs/source/conf.py
-
-[coverage:run]
-source=dask_ml
-
-[mypy]
-ignore_missing_imports=True
-no_implicit_optional=True
-check_untyped_defs=True
-strict_equality=True
-
-[mypy-dask_ml.metrics]
-check_untyped_defs=False
-
-[mypy-dask_ml.model_selection.*]
-follow_imports=skip
-
-[tool:pytest]
-addopts = -rsx -v --durations=10 --color=yes
-minversion = 3.2
-xfail_strict = true
-junit_family = xunit2
-filterwarnings =
-    error:::dask_ml[.*]
-    error:::sklearn[.*]
diff --git a/setup.py b/setup.py
deleted file mode 100644
index 960413ddc..000000000
--- a/setup.py
+++ /dev/null
@@ -1,74 +0,0 @@
-import os
-from codecs import open
-
-from setuptools import find_packages, setup
-
-here = os.path.dirname(__file__)
-
-
-# Get the long description from the README file
-with open(os.path.join(here, "README.rst"), encoding="utf-8") as f:
-    long_description = f.read()
-
-install_requires = [
-    "dask[array,dataframe]>=2.4.0",
-    "distributed>=2.4.0",
-    "numba>=0.51.0",
-    "numpy>=1.20.0",
-    "pandas>=0.24.2",
-    "scikit-learn>=1.2.0",
-    "scipy",
-    "dask-glm>=0.2.0",
-    "multipledispatch>=0.4.9",
-    "packaging",
-]
-
-# Optional Requirements
-doc_requires = ["sphinx", "numpydoc", "sphinx-rtd-theme", "nbsphinx", "sphinx-gallery"]
-test_requires = [
-    "black",
-    "coverage",
-    "flake8",
-    "isort",
-    "pytest",
-    "pytest-cov",
-    "pytest-mock",
-]
-dev_requires = doc_requires + test_requires
-xgboost_requires = ["dask-xgboost", "xgboost"]
-complete_requires = xgboost_requires
-
-extras_require = {
-    "docs": doc_requires,
-    "test": test_requires,
-    "dev": dev_requires,
-    "xgboost": xgboost_requires,
-    "complete": complete_requires,
-}
-
-setup(
-    name="dask-ml",
-    description="A library for distributed and parallel machine learning",
-    long_description=long_description,
-    url="https://github.com/dask/dask-ml",
-    author="Tom Augspurger",
-    author_email="taugspurger@anaconda.com",
-    license="BSD",
-    classifiers=[
-        "Development Status :: 5 - Production/Stable",
-        "Intended Audience :: Developers",
-        "Topic :: Database",
-        "Topic :: Scientific/Engineering",
-        "License :: OSI Approved :: BSD License",
-        "Programming Language :: Python :: 3",
-        "Programming Language :: Python :: 3.8",
-        "Programming Language :: Python :: 3.9",
-        "Programming Language :: Python :: 3.10",
-    ],
-    packages=find_packages(exclude=["docs", "tests", "tests.*", "docs.*"]),
-    use_scm_version=True,
-    setup_requires=["setuptools_scm"],
-    install_requires=install_requires,
-    extras_require=extras_require,
-    python_requires=">=3.8",
-)
diff --git a/tests/conftest.py b/tests/conftest.py
index 8f9c16c01..5666eb2a1 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -54,8 +54,10 @@ def Xl_blobs():
     Tuple of (X, labels) for a classification task. `X` and `l` are
     both dask arrays
     """
-    X, l = make_classification(n_samples=1000, n_features=4, chunks=500, random_state=1)
-    return X, l
+    X, label = make_classification(
+        n_samples=1000, n_features=4, chunks=500, random_state=1
+    )
+    return X, label
 
 
 @pytest.fixture