verde/model_selection.py

"""
Functions for automating model selection through cross-validation.
"""
import warnings

import numpy as np
from sklearn.model_selection import KFold, ShuffleSplit
from sklearn.base import clone
from sklearn.utils import check_random_state

from .base import check_fit_input, n_1d_arrays, BaseBlockCrossValidator
from .base.utils import score_estimator
from .coordinates import block_split
from .utils import dispatch, partition_by_sum


# Pylint doesn't like X, y scikit-learn argument names.
# pylint: disable=invalid-name,unused-argument


class BlockShuffleSplit(BaseBlockCrossValidator):
    """
    Random permutation of spatial blocks cross-validator.

    Yields indices to split data into training and test sets. Data are first
    grouped into rectangular blocks of size given by the *spacing* argument.
    Alternatively, blocks can be defined by the number of blocks in each
    dimension using the *shape* argument instead of *spacing*. The blocks are
    then split into testing and training sets randomly.

    The proportion of blocks assigned to each set is controlled by *test_size*
    and/or *train_size*. However, the total amount of actual data points in
    each set could be different from these values since blocks can have
    a different number of data points inside them. To guarantee that the
    proportion of actual data is as close as possible to the proportion of
    blocks, this cross-validator generates an extra number of splits and
    selects the one with proportion of data points in each set closer to the
    desired amount [Valavi_etal2019]_. The number of balancing splits per
    iteration is controlled by the *balancing* argument.

    This cross-validator is preferred over
    :class:`sklearn.model_selection.ShuffleSplit` for spatial data to avoid
    overestimating cross-validation scores. This can happen because of the
    inherent autocorrelation that is usually associated with this type of data
    (points that are close together are more likely to have similar values).
    See [Roberts_etal2017]_ for an overview of this topic.

    .. note::

        Like :class:`sklearn.model_selection.ShuffleSplit`, this
        cross-validator cannot guarantee that all folds will be different,
        although this is still very likely for sizeable datasets.

    Parameters
    ----------
    spacing : float, tuple = (s_north, s_east), or None
        The block size in the South-North and West-East directions,
        respectively. A single value means that the spacing is equal in both
        directions. If None, then *shape* **must be provided**.
    shape : tuple = (n_north, n_east) or None
        The number of blocks in the South-North and West-East directions,
        respectively. If None, then *spacing* **must be provided**.
    n_splits : int, default 10
        Number of re-shuffling & splitting iterations.
    test_size : float, int, None, default=None
        If float, should be between 0.0 and 1.0 and represent the proportion
        of the dataset to include in the test split. If int, represents the
        absolute number of test samples. If None, the value is set to the
        complement of the train size. If ``train_size`` is also None, it will
        be set to 0.1.
    train_size : float, int, or None, default=None
        If float, should be between 0.0 and 1.0 and represent the
        proportion of the dataset to include in the train split. If
        int, represents the absolute number of train samples. If None,
        the value is automatically set to the complement of the test size.
    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.
    balancing : int
        The number of splits generated per iteration to try to balance the
        amount of data in each set so that *test_size* and *train_size* are
        respected. If 1, then no extra splits are generated (essentially
        disabling the balancing). Must be >= 1.

    See also
    --------
    train_test_split : Split a dataset into a training and a testing set.
    cross_val_score : Score an estimator/gridder using cross-validation.

    Examples
    --------

    >>> from verde import grid_coordinates
    >>> import numpy as np
    >>> # Make a regular grid of data points
    >>> coords = grid_coordinates(region=(0, 3, -10, -7), spacing=1)
    >>> # Need to convert the coordinates into a feature matrix
    >>> X = np.transpose([i.ravel() for i in coords])
    >>> shuffle = BlockShuffleSplit(spacing=1.5, n_splits=3, random_state=0)
    >>> # These are the 1D indices of the points belonging to each set
    >>> for train, test in shuffle.split(X):
    ...     print("Train: {} Test: {}".format(train, test))
    Train: [ 0  1  2  3  4  5  6  7 10 11 14 15] Test: [ 8  9 12 13]
    Train: [ 2  3  6  7  8  9 10 11 12 13 14 15] Test: [0 1 4 5]
    Train: [ 0  1  4  5  8  9 10 11 12 13 14 15] Test: [2 3 6 7]
    >>> # A better way to visualize this is to create a 2D array and put
    >>> # "train" or "test" in the corresponding locations.
    >>> shape = coords[0].shape
    >>> mask = np.full(shape=shape, fill_value="     ")
    >>> for iteration, (train, test) in enumerate(shuffle.split(X)):
    ...     # The index needs to be converted to 2D so we can index our matrix.
    ...     mask[np.unravel_index(train, shape)] = "train"
    ...     mask[np.unravel_index(test, shape)] = " test"
    ...     print("Iteration {}:".format(iteration))
    ...     print(mask)
    Iteration 0:
    [['train' 'train' 'train' 'train']
     ['train' 'train' 'train' 'train']
     [' test' ' test' 'train' 'train']
     [' test' ' test' 'train' 'train']]
    Iteration 1:
    [[' test' ' test' 'train' 'train']
     [' test' ' test' 'train' 'train']
     ['train' 'train' 'train' 'train']
     ['train' 'train' 'train' 'train']]
    Iteration 2:
    [['train' 'train' ' test' ' test']
     ['train' 'train' ' test' ' test']
     ['train' 'train' 'train' 'train']
     ['train' 'train' 'train' 'train']]

    """

    def __init__(
        self,
        spacing=None,
        shape=None,
        n_splits=10,
        test_size=0.1,
        train_size=None,
        random_state=None,
        balancing=10,
    ):
        super().__init__(spacing=spacing, shape=shape, n_splits=n_splits)
        if balancing < 1:
            raise ValueError(
                "The *balancing* argument must be >= 1. To disable balancing, use 1."
            )
        self.test_size = test_size
        self.train_size = train_size
        self.random_state = random_state
        self.balancing = balancing

    def _iter_test_indices(self, X=None, y=None, groups=None):
        """
        Generates integer indices corresponding to test sets.

        Runs several iterations until a split is found that yields blocks with
        the right amount of data points in it.

        Parameters
        ----------
        X : array-like, shape (n_samples, 2)
            Columns should be the easting and northing coordinates of data
            points, respectively.
        y : array-like, shape (n_samples,)
            The target variable for supervised learning problems. Always
            ignored.
        groups : array-like, with shape (n_samples,), optional
            Group labels for the samples used while splitting the dataset into
            train/test set. Always ignored.

        Yields
        ------
        test : ndarray
            The testing set indices for that split.

        """
        labels = block_split(
            coordinates=(X[:, 0], X[:, 1]),
            spacing=self.spacing,
            shape=self.shape,
            region=None,
            adjust="spacing",
        )[1]
        block_ids = np.unique(labels)
        # Generate many more splits so that we can pick and choose the ones
        # that have the right balance of training and testing data.
        shuffle = ShuffleSplit(
            n_splits=self.n_splits * self.balancing,
            test_size=self.test_size,
            train_size=self.train_size,
            random_state=self.random_state,
        ).split(block_ids)
        for _ in range(self.n_splits):
            test_sets, balance = [], []
            for _ in range(self.balancing):
                # This is a false positive in pylint which is why the warning
                # is disabled at the top of this file:
                # https://github.com/PyCQA/pylint/issues/1830
                # pylint: disable=stop-iteration-return
                train_blocks, test_blocks = next(shuffle)
                # pylint: enable=stop-iteration-return
                train_points = np.where(np.isin(labels, block_ids[train_blocks]))[0]
                test_points = np.where(np.isin(labels, block_ids[test_blocks]))[0]
                # The proportion of data points assigned to each group should
                # be close the proportion of blocks assigned to each group.
                balance.append(
                    abs(
                        train_points.size / test_points.size
                        - train_blocks.size / test_blocks.size
                    )
                )
                test_sets.append(test_points)
            best = np.argmin(balance)
            yield test_sets[best]


class BlockKFold(BaseBlockCrossValidator):
    """
    K-Folds over spatial blocks cross-validator.

    Yields indices to split data into training and test sets. Data are first
    grouped into rectangular blocks of size given by the *spacing* argument.
    Alternatively, blocks can be defined by the number of blocks in each
    dimension using the *shape* argument instead of *spacing*. The blocks are
    then split into testing and training sets iteratively along k folds of the
    data (k is given by *n_splits*).

    By default, the blocks are split into folds in a way that makes each fold
    have approximately the same number of data points. Sometimes this might not
    be possible, which can happen if the number of splits is close to the
    number of blocks. In these cases, each fold will have the same number of
    blocks, regardless of how many data points are in each block. This
    behaviour can also be disabled by setting ``balance=False``.

    Shuffling the blocks prior to splitting is strongly encouraged. Not
    shuffling will essentially lead to the creation of *n_splits* large blocks
    since blocks are spatially adjacent when not shuffled. The default
    behaviour is not to shuffle for compatibility with similar cross-validators
    in scikit-learn.

    This cross-validator is preferred over
    :class:`sklearn.model_selection.KFold` for spatial data to avoid
    overestimating cross-validation scores. This can happen because of the
    inherent autocorrelation that is usually associated with this type of data
    (points that are close together are more likely to have similar values).
    See [Roberts_etal2017]_ for an overview of this topic.

    Parameters
    ----------
    spacing : float, tuple = (s_north, s_east), or None
        The block size in the South-North and West-East directions,
        respectively. A single value means that the spacing is equal in both
        directions. If None, then *shape* **must be provided**.
    shape : tuple = (n_north, n_east) or None
        The number of blocks in the South-North and West-East directions,
        respectively. If None, then *spacing* **must be provided**.
    n_splits : int
        Number of folds. Must be at least 2.
    shuffle : bool
        Whether to shuffle the data before splitting into batches.
    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.
    balance : bool
        Whether or not to split blocks into fold with approximately equal
        number of data points. If False, each fold will have the same number of
        blocks (which can have different number of data points in them).

    See also
    --------
    train_test_split : Split a dataset into a training and a testing set.
    cross_val_score : Score an estimator/gridder using cross-validation.

    Examples
    --------

    >>> from verde import grid_coordinates
    >>> import numpy as np
    >>> # Make a regular grid of data points
    >>> coords = grid_coordinates(region=(0, 3, -10, -7), spacing=1)
    >>> # Need to convert the coordinates into a feature matrix
    >>> X = np.transpose([i.ravel() for i in coords])
    >>> kfold = BlockKFold(spacing=1.5, n_splits=4)
    >>> # These are the 1D indices of the points belonging to each set
    >>> for train, test in kfold.split(X):
    ...     print("Train: {} Test: {}".format(train, test))
    Train: [ 2  3  6  7  8  9 10 11 12 13 14 15] Test: [0 1 4 5]
    Train: [ 0  1  4  5  8  9 10 11 12 13 14 15] Test: [2 3 6 7]
    Train: [ 0  1  2  3  4  5  6  7 10 11 14 15] Test: [ 8  9 12 13]
    Train: [ 0  1  2  3  4  5  6  7  8  9 12 13] Test: [10 11 14 15]
    >>> # A better way to visualize this is to create a 2D array and put
    >>> # "train" or "test" in the corresponding locations.
    >>> shape = coords[0].shape
    >>> mask = np.full(shape=shape, fill_value="     ")
    >>> for iteration, (train, test) in enumerate(kfold.split(X)):
    ...     # The index needs to be converted to 2D so we can index our matrix.
    ...     mask[np.unravel_index(train, shape)] = "train"
    ...     mask[np.unravel_index(test, shape)] = " test"
    ...     print("Iteration {}:".format(iteration))
    ...     print(mask)
    Iteration 0:
    [[' test' ' test' 'train' 'train']
     [' test' ' test' 'train' 'train']
     ['train' 'train' 'train' 'train']
     ['train' 'train' 'train' 'train']]
    Iteration 1:
    [['train' 'train' ' test' ' test']
     ['train' 'train' ' test' ' test']
     ['train' 'train' 'train' 'train']
     ['train' 'train' 'train' 'train']]
    Iteration 2:
    [['train' 'train' 'train' 'train']
     ['train' 'train' 'train' 'train']
     [' test' ' test' 'train' 'train']
     [' test' ' test' 'train' 'train']]
    Iteration 3:
    [['train' 'train' 'train' 'train']
     ['train' 'train' 'train' 'train']
     ['train' 'train' ' test' ' test']
     ['train' 'train' ' test' ' test']]

    For spatial data, it's often good to shuffle the blocks before assigning
    them to folds:

    >>> # Set the random_state to make sure we always get the same result.
    >>> kfold = BlockKFold(
    ...     spacing=1.5, n_splits=4, shuffle=True, random_state=123,
    ... )
    >>> for train, test in kfold.split(X):
    ...     print("Train: {} Test: {}".format(train, test))
    Train: [ 0  1  2  3  4  5  6  7  8  9 12 13] Test: [10 11 14 15]
    Train: [ 2  3  6  7  8  9 10 11 12 13 14 15] Test: [0 1 4 5]
    Train: [ 0  1  4  5  8  9 10 11 12 13 14 15] Test: [2 3 6 7]
    Train: [ 0  1  2  3  4  5  6  7 10 11 14 15] Test: [ 8  9 12 13]

    These should be the same splits as we got before but in a different order.
    This only happens because in this example we have the number of splits
    equal to the number of blocks (4). With real data the effects would be more
    dramatic.

    """

    def __init__(
        self,
        spacing=None,
        shape=None,
        n_splits=5,
        shuffle=False,
        random_state=None,
        balance=True,
    ):
        super().__init__(spacing=spacing, shape=shape, n_splits=n_splits)
        if n_splits < 2:
            raise ValueError(
                "Number of splits must be >=2 for BlockKFold. Given {}.".format(
                    n_splits
                )
            )
        self.shuffle = shuffle
        self.random_state = random_state
        self.balance = balance

    def _iter_test_indices(self, X=None, y=None, groups=None):
        """
        Generates integer indices corresponding to test sets.

        Parameters
        ----------
        X : array-like, shape (n_samples, 2)
            Columns should be the easting and northing coordinates of data
            points, respectively.
        y : array-like, shape (n_samples,)
            The target variable for supervised learning problems. Always
            ignored.
        groups : array-like, with shape (n_samples,), optional
            Group labels for the samples used while splitting the dataset into
            train/test set. Always ignored.

        Yields
        ------
        test : ndarray
            The testing set indices for that split.

        """
        labels = block_split(
            coordinates=(X[:, 0], X[:, 1]),
            spacing=self.spacing,
            shape=self.shape,
            region=None,
            adjust="spacing",
        )[1]
        block_ids = np.unique(labels)
        if self.n_splits > block_ids.size:
            raise ValueError(
                "Number of k-fold splits ({}) cannot be greater than the number of "
                "blocks ({}). Either decrease n_splits or increase the number of "
                "blocks.".format(self.n_splits, block_ids.size)
            )
        if self.shuffle:
            check_random_state(self.random_state).shuffle(block_ids)
        if self.balance:
            block_sizes = [np.isin(labels, i).sum() for i in block_ids]
            try:
                split_points = partition_by_sum(block_sizes, parts=self.n_splits)
                folds = np.split(np.arange(block_ids.size), split_points)
            except ValueError:
                warnings.warn(
                    "Could not balance folds to have approximately the same "
                    "number of data points. Dividing into folds with equal "
                    "number of blocks instead. Decreasing n_splits or increasing "
                    "the number of blocks may help.",
                    UserWarning,
                )
                folds = [i for _, i in KFold(n_splits=self.n_splits).split(block_ids)]
        else:
            folds = [i for _, i in KFold(n_splits=self.n_splits).split(block_ids)]
        for test_blocks in folds:
            test_points = np.where(np.isin(labels, block_ids[test_blocks]))[0]
            yield test_points


# pylint: enable=invalid-name,unused-argument


def train_test_split(
    coordinates, data, weights=None, spacing=None, shape=None, **kwargs
):
    r"""
    Split a dataset into a training and a testing set for cross-validation.

    Similar to :func:`sklearn.model_selection.train_test_split` but is tuned to
    work on single- or multi-component spatial data with optional weights.

    If arguments *shape* or *spacing* are provided, will group the data by
    spatial blocks before random splitting (using
    :class:`verde.BlockShuffleSplit` instead of
    :class:`sklearn.model_selection.ShuffleSplit`). The argument *spacing*
    specifies the size of the spatial blocks. Alternatively, use *shape* to
    specify the number of blocks in each dimension.

    Extra keyword arguments will be passed to the cross-validation class. The
    exception is ``n_splits`` which is always 1.

    Grouping by spatial blocks is preferred over plain random splits for
    spatial data to avoid overestimating validation scores. This can happen
    because of the inherent autocorrelation that is usually associated with
    this type of data (points that are close together are more likely to have
    similar values). See [Roberts_etal2017]_ for an overview of this topic. To
    use spatial blocking, you **must provide** a *spacing* or *shape* argument
    (see below).

    Parameters
    ----------
    coordinates : tuple of arrays
        Arrays with the coordinates of each data point. Should be in the
        following order: (easting, northing, vertical, ...).
    data : array or tuple of arrays
        the data values of each data point. If the data has more than one
        component, *data* must be a tuple of arrays (one for each component).
    weights : none or array or tuple of arrays
        if not none, then the weights assigned to each data point. If more than
        one data component is provided, you must provide a weights array for
        each data component (if not none).
    spacing : float, tuple = (s_north, s_east), or None
        The spatial block size in the South-North and West-East directions,
        respectively. A single value means that the spacing is equal in both
        directions. If None, then *shape* must be provided in order to use
        spatial blocking.
    shape : tuple = (n_north, n_east) or None
        The number of blocks in the South-North and West-East directions,
        respectively. If None, then *spacing* must be provided in order to use
        spatial blocking.

    Returns
    -------
    train, test : tuples
        Each is a tuple = (coordinates, data, weights) generated by separating
        the input values randomly.

    See also
    --------
    cross_val_score : Score an estimator/gridder using cross-validation.
    BlockShuffleSplit : Random permutation of spatial blocks cross-validator.

    Examples
    --------

    To randomly split the data between training and testing sets:

    >>> import numpy as np
    >>> # Make some data
    >>> data = np.array([10, 30, 50, 70])
    >>> coordinates = (np.arange(4), np.arange(-4, 0))
    >>> train, test = train_test_split(coordinates, data, random_state=0)
    >>> # The training set:
    >>> print("coords:", train[0])
    coords: (array([3, 1, 0]), array([-1, -3, -4]))
    >>> print("data:", train[1])
    data: (array([70, 30, 10]),)
    >>> # The testing set:
    >>> print("coords:", test[0])
    coords: (array([2]), array([-2]))
    >>> print("data:", test[1])
    data: (array([50]),)

    If weights are given, they will also be split among the sets:

    >>> weights = np.array([4, 3, 2, 5])
    >>> train, test = train_test_split(
    ...     coordinates, data, weights, random_state=0,
    ... )
    >>> # The training set:
    >>> print("coords:", train[0])
    coords: (array([3, 1, 0]), array([-1, -3, -4]))
    >>> print("data:", train[1])
    data: (array([70, 30, 10]),)
    >>> print("weights:", train[2])
    weights: (array([5, 3, 4]),)
    >>> # The testing set:
    >>> print("coords:", test[0])
    coords: (array([2]), array([-2]))
    >>> print("data:", test[1])
    data: (array([50]),)
    >>> print("weights:", test[2])
    weights: (array([2]),)

    Data with multiple components can also be split:

    >>> data = (np.array([10, 30, 50, 70]), np.array([-70, -50, -30, -10]))
    >>> train, test = train_test_split(coordinates, data, random_state=0)
    >>> # The training set:
    >>> print("coords:", train[0])
    coords: (array([3, 1, 0]), array([-1, -3, -4]))
    >>> print("data:", train[1])
    data: (array([70, 30, 10]), array([-10, -50, -70]))
    >>> # The testing set:
    >>> print("coords:", test[0])
    coords: (array([2]), array([-2]))
    >>> print("data:", test[1])
    data: (array([50]), array([-30]))

    To split data grouped in spatial blocks:

    >>> from verde import grid_coordinates
    >>> # Make a regular grid of data points
    >>> coordinates = grid_coordinates(region=(0, 3, 4, 7), spacing=1)
    >>> data = np.arange(16).reshape((4, 4))
    >>> # We must specify the size of the blocks via the spacing argument.
    >>> # Blocks of 1.5 will split the domain into 4 blocks.
    >>> train, test = train_test_split(
    ...     coordinates, data, random_state=0, spacing=1.5,
    ... )
    >>> # The training set:
    >>> print("coords:", train[0][0], train[0][1], sep="\n")
    coords:
    [0. 1. 2. 3. 0. 1. 2. 3. 2. 3. 2. 3.]
    [4. 4. 4. 4. 5. 5. 5. 5. 6. 6. 7. 7.]
    >>> print("data:", train[1])
    data: (array([ 0,  1,  2,  3,  4,  5,  6,  7, 10, 11, 14, 15]),)
    >>> # The testing set:
    >>> print("coords:", test[0][0], test[0][1])
    coords: [0. 1. 0. 1.] [6. 6. 7. 7.]
    >>> print("data:", test[1])
    data: (array([ 8,  9, 12, 13]),)

    """
    args = check_fit_input(coordinates, data, weights, unpack=False)
    if spacing is None and shape is None:
        indices = np.arange(args[1][0].size)
        shuffle = ShuffleSplit(n_splits=1, **kwargs).split(indices)
    else:
        feature_matrix = np.transpose(n_1d_arrays(coordinates, 2))
        shuffle = BlockShuffleSplit(
            n_splits=1, spacing=spacing, shape=shape, **kwargs
        ).split(feature_matrix)
    split = next(shuffle)
    train, test = (tuple(select(i, index) for i in args) for index in split)
    return train, test


def cross_val_score(
    estimator,
    coordinates,
    data,
    weights=None,
    cv=None,
    client=None,
    delayed=False,
    scoring=None,
):
    """
    Score an estimator/gridder using cross-validation.

    Similar to :func:`sklearn.model_selection.cross_val_score` but modified to
    accept spatial multi-component data with weights.

    By default, will use :class:`sklearn.model_selection.KFold` with
    ``n_splits=5`` and ``random_state=0`` to split the dataset. Any other
    cross-validation class from scikit-learn or Verde can be passed in through
    the *cv* argument.

    The score is calculated using the estimator/gridder's ``.score`` method by
    default. Alternatively, use the *scoring* parameter to specify a different
    scoring function (e.g., mean square error, mean absolute error, etc).

    Can optionally run in parallel using :mod:`dask`. To do this, use
    ``delayed=True`` to dispatch computations with :func:`dask.delayed` instead
    of running them. The returned scores will be "lazy" objects instead of the
    actual scores. To trigger the computation (which Dask will run in parallel)
    call the `.compute()` method of each score or :func:`dask.compute` with the
    entire list of scores.

    .. warning::

        The ``client`` parameter is deprecated and will be removed in Verde
        v2.0.0. Use ``delayed`` instead.

    Parameters
    ----------
    estimator : verde gridder
        Any verde gridder class that has the ``fit`` and ``score`` methods.
    coordinates : tuple of arrays
        Arrays with the coordinates of each data point. Should be in the
        following order: (easting, northing, vertical, ...).
    data : array or tuple of arrays
        the data values of each data point. If the data has more than one
        component, *data* must be a tuple of arrays (one for each component).
    weights : none or array or tuple of arrays
        if not none, then the weights assigned to each data point. If more than
        one data component is provided, you must provide a weights array for
        each data component (if not none).
    cv : None or cross-validation generator
        Any scikit-learn or Verde cross-validation generator. Defaults to
        :class:`sklearn.model_selection.KFold`.
    client : None or dask.distributed.Client
        **DEPRECATED:** This option is deprecated and will be removed in Verde
        v2.0.0. If None, then computations are run serially. Otherwise, should
        be a dask ``Client`` object. It will be used to dispatch computations
        to the dask cluster.
    delayed : bool
        If True, will use :func:`dask.delayed` to dispatch computations without
        actually executing them. The returned scores will be a list of delayed
        objects. Call `.compute()` on each score or :func:`dask.compute` on the
        entire list to trigger the actual computations.
    scoring : None, str, or callable
        A scoring function (or name of a function) known to scikit-learn. See
        the description of *scoring* in
        :func:`sklearn.model_selection.cross_val_score` for details. If None,
        will fall back to the estimator's ``.score`` method.

    Returns
    -------
    scores : array
        Array of scores for each split of the cross-validation generator. If
        *delayed*, will be a list of Dask delayed objects (see the *delayed*
        option). If *client* is not None, then the scores will be futures.

    See also
    --------
    train_test_split : Split a dataset into a training and a testing set.
    BlockShuffleSplit : Random permutation of spatial blocks cross-validator.

    Examples
    --------

    As an example, we can score :class:`verde.Trend` on data that actually
    follows a linear trend.

    >>> from verde import grid_coordinates, Trend
    >>> coords = grid_coordinates((0, 10, -10, -5), spacing=0.1)
    >>> data = 10 - coords[0] + 0.5*coords[1]
    >>> model = Trend(degree=1)

    In this case, the model should perfectly predict the data and R² scores
    should be equal to 1.

    >>> scores = cross_val_score(model, coords, data)
    >>> print(', '.join(['{:.2f}'.format(score) for score in scores]))
    1.00, 1.00, 1.00, 1.00, 1.00

    There are 5 scores because the default cross-validator is
    :class:`sklearn.model_selection.KFold` with ``n_splits=5``.

    To calculate the score with a different metric, use the *scoring* argument:

    >>> scores = cross_val_score(
    ...     model, coords, data, scoring="neg_mean_squared_error",
    ... )
    >>> print(', '.join(['{:.2f}'.format(-score) for score in scores]))
    0.00, 0.00, 0.00, 0.00, 0.00

    In this case, we calculated the (negative) mean squared error (MSE) which
    measures the distance between test data and predictions. This way, 0 is the
    best possible value meaning that the data and prediction are the same. The
    "neg" part indicates that this is the negative mean square error. This is
    required because scikit-learn assumes that higher scores are always treated
    as better (which is the opposite for MSE). For display, we take the
    negative of the score to get the actual MSE.

    We can use different cross-validators by assigning them to the ``cv``
    argument:

    >>> from sklearn.model_selection import ShuffleSplit
    >>> # Set the random state to get reproducible results
    >>> cross_validator = ShuffleSplit(n_splits=3, random_state=0)
    >>> scores = cross_val_score(model, coords, data, cv=cross_validator)
    >>> print(', '.join(['{:.2f}'.format(score) for score in scores]))
    1.00, 1.00, 1.00

    Often, spatial data are autocorrelated (points that are close together are
    more likely to have similar values), which can cause cross-validation with
    random splits to overestimate the prediction accuracy [Roberts_etal2017]_.
    To account for the autocorrelation, we can split the data in blocks rather
    than randomly with :class:`verde.BlockShuffleSplit`:

    >>> from verde import BlockShuffleSplit
    >>> # spacing controls the size of the spatial blocks
    >>> cross_validator = BlockShuffleSplit(
    ...     spacing=2, n_splits=3, random_state=0
    ... )
    >>> scores = cross_val_score(model, coords, data, cv=cross_validator)
    >>> print(', '.join(['{:.2f}'.format(score) for score in scores]))
    1.00, 1.00, 1.00

    We didn't see a difference here since our model and data are perfect. See
    :ref:`model_evaluation` for an example with real data.

    If using many splits, we can speed up computations by running them in
    parallel with Dask:

    >>> cross_validator = ShuffleSplit(n_splits=10, random_state=0)
    >>> scores_delayed = cross_val_score(
    ...     model, coords, data, cv=cross_validator, delayed=True
    ... )
    >>> # The scores are delayed objects.
    >>> # To actually run the computations, call dask.compute
    >>> import dask
    >>> scores = dask.compute(*scores_delayed)
    >>> print(', '.join(['{:.2f}'.format(score) for score in scores]))
    1.00, 1.00, 1.00, 1.00, 1.00, 1.00, 1.00, 1.00, 1.00, 1.00

    Note that you must have enough RAM to fit multiple models simultaneously.
    So this is best used when fitting several smaller models.

    """
    if client is not None:
        warnings.warn(
            "The 'client' parameter of 'verde.cross_val_score' is deprecated "
            "and will be removed in Verde 2.0.0. "
            "Use the 'delayed' parameter instead.",
            FutureWarning,
        )
    coordinates, data, weights = check_fit_input(
        coordinates, data, weights, unpack=False
    )
    if cv is None:
        cv = KFold(shuffle=True, random_state=0, n_splits=5)
    feature_matrix = np.transpose(n_1d_arrays(coordinates, 2))
    fit_args = (coordinates, data, weights)
    scores = []
    for train_index, test_index in cv.split(feature_matrix):
        # Clone the estimator to avoid fitting the same object simultaneously
        # when delayed=True.
        score = dispatch(fit_score, client=client, delayed=delayed)(
            clone(estimator),
            tuple(select(i, train_index) for i in fit_args),
            tuple(select(i, test_index) for i in fit_args),
            scoring,
        )
        scores.append(score)
    if not delayed and client is None:
        scores = np.asarray(scores)
    return scores


def fit_score(estimator, train_data, test_data, scoring):
    """
    Fit an estimator on the training data and then score it on the testing data
    """
    estimator.fit(*train_data)
    if scoring is None:
        score = estimator.score(*test_data)
    else:
        score = score_estimator(scoring, estimator, *test_data)
    return score


def select(arrays, index):
    """
    Index each array in a tuple of arrays.

    If the arrays tuple contains a ``None``, the entire tuple will be returned
    as is.

    Parameters
    ----------
    arrays : tuple of arrays
    index : array
        An array of indices to select from arrays.

    Returns
    -------
    indexed_arrays : tuple of arrays

    Examples
    --------

    >>> import numpy as np
    >>> select((np.arange(5), np.arange(-3, 2, 1)), [1, 3])
    (array([1, 3]), array([-2,  0]))
    >>> select((None, None, None, None), [1, 2])
    (None, None, None, None)

    """
    if arrays is None or any(i is None for i in arrays):
        return arrays
    return tuple(i.ravel()[index] for i in arrays)