Log loss metric (#318)
* log_loss sample weight

* log_loss sample weight

* cleanup

* sample_weight

* sort
TomAugspurger committed Jul 27, 2018
1 parent 6c6eeb4 commit 3834756
Showing 7 changed files with 107 additions and 20 deletions.
2 changes: 1 addition & 1 deletion dask_ml/metrics/__init__.py
@@ -4,6 +4,6 @@
euclidean_distances,
)
from .regression import mean_absolute_error, mean_squared_error, r2_score # noqa
from .classification import accuracy_score # noqa
from .classification import accuracy_score, log_loss # noqa

from .scorer import get_scorer, check_scoring, SCORERS # noqa
48 changes: 47 additions & 1 deletion dask_ml/metrics/classification.py
@@ -1,6 +1,7 @@

import dask.array as da
import numpy as np
import packaging.version
import sklearn.metrics

from .._compat import DASK_VERSION

@@ -91,3 +92,48 @@ def accuracy_score(y_true, y_pred, normalize=True, sample_weight=None, compute=T
    if compute:
        score = score.compute()
    return score


def _log_loss_inner(x, y, sample_weight, **kwargs):
    # da.map_blocks wasn't able to concatenate together the results
    # when we reduce down to a scalar per block. So we make an
    # array with 1 element.
    if sample_weight is not None:
        sample_weight = sample_weight.ravel()
    return np.array(
        [sklearn.metrics.log_loss(x, y, sample_weight=sample_weight, **kwargs)]
    )


def log_loss(
    y_true, y_pred, eps=1e-15, normalize=True, sample_weight=None, labels=None
):
    if y_true.ndim == 1:
        y_true = y_true.reshape(-1, 1)
        if sample_weight is not None:
            sample_weight = sample_weight.reshape(-1, 1)
    assert y_pred.ndim == 2

    result = da.map_blocks(
        _log_loss_inner,
        y_true,
        y_pred,
        sample_weight,
        chunks=(1,),
        drop_axis=1,
        dtype="f8",
        eps=eps,
        normalize=normalize,
        labels=labels,
    )
    if normalize and sample_weight is not None:
        sample_weight = sample_weight.ravel()
        block_weights = sample_weight.map_blocks(np.sum, chunks=(1,), keepdims=True)
        return da.average(result, 0, weights=block_weights)
    elif normalize:
        return result.mean()
    else:
        return result.sum()


log_loss.__doc__ = getattr(sklearn.metrics.log_loss, "__doc__")
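
As a quick illustration (not part of the diff), here is how the new metric might be called on chunked dask arrays. The data, chunk sizes, and variable names below are made up for the example; the result should agree with scikit-learn's log_loss on the concatenated data, which is what the test further down checks:

import dask.array as da
import numpy as np
import sklearn.metrics

import dask_ml.metrics

rng = np.random.RandomState(0)

# 1,000 samples split into four chunks of 250, with two classes.
y_true = rng.randint(0, 2, size=1000)
proba = rng.uniform(size=(1000, 2))
proba /= proba.sum(axis=1, keepdims=True)  # rows sum to 1
weights = rng.uniform(size=1000)

dy_true = da.from_array(y_true, chunks=250)
dy_pred = da.from_array(proba, chunks=(250, 2))
dweights = da.from_array(weights, chunks=250)

# Each chunk contributes a length-1 block (see _log_loss_inner); the
# blocks are then combined into a single weighted average.
dask_loss = dask_ml.metrics.log_loss(
    dy_true, dy_pred, labels=[0, 1], sample_weight=dweights
)
numpy_loss = sklearn.metrics.log_loss(y_true, proba, sample_weight=weights)

print(dask_loss.compute(), numpy_loss)

Because the per-chunk results are reduced with an average weighted by each chunk's total sample weight, the chunked value matches the global one.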
4 changes: 3 additions & 1 deletion dask_ml/metrics/scorer.py
@@ -2,18 +2,20 @@
from sklearn.metrics import make_scorer
from sklearn.metrics.scorer import check_scoring as sklearn_check_scoring

from . import accuracy_score, mean_squared_error, r2_score
from . import accuracy_score, log_loss, mean_squared_error, r2_score

# Scorers
accuracy_scorer = make_scorer(accuracy_score)
neg_mean_squared_error_scorer = make_scorer(mean_squared_error, greater_is_better=False)
r2_scorer = make_scorer(r2_score)
neg_log_loss_scorer = make_scorer(log_loss, greater_is_better=False, needs_proba=True)


SCORERS = dict(
accuracy=accuracy_scorer,
neg_mean_squared_error=neg_mean_squared_error_scorer,
r2=r2_scorer,
neg_log_loss=neg_log_loss_scorer,
)
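
For reference (again not part of the diff), the new scorer can be retrieved by name just as in scikit-learn; a minimal check, using only the names defined or imported in this file and the metrics package:

import dask_ml.metrics

# The scorer is registered under the same name scikit-learn uses.
assert "neg_log_loss" in dask_ml.metrics.SCORERS

# Look it up by name; per this file it is a make_scorer wrapper around
# the dask-aware log_loss with greater_is_better=False, needs_proba=True.
scorer = dask_ml.metrics.get_scorer("neg_log_loss")

Passing scoring="neg_log_loss" to dask-ml utilities that resolve scorer names through get_scorer or check_scoring should therefore pick up this dask-aware implementation rather than scikit-learn's.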


1 change: 1 addition & 0 deletions docs/source/changelog.rst
@@ -8,6 +8,7 @@ Enhancements
------------

- Automatically replace default scikit-learn scorers with dask-aware versions in Incremental (:issue:`200`)
- Added the :func:`dask_ml.metrics.log_loss` loss function and ``neg_log_loss`` scorer (:pr:`318`)

Bug Fixes
---------
1 change: 1 addition & 0 deletions docs/source/modules/api.rst
@@ -157,6 +157,7 @@ Classification Metrics
:toctree: generated/

metrics.accuracy_score
metrics.log_loss


:mod:`dask_ml.tensorflow`: Tensorflow
2 changes: 1 addition & 1 deletion setup.cfg
@@ -16,7 +16,7 @@ ignore =

[isort]
known_first_party=dask_ml
known_third_party=sklearn,dask,distributed,dask_glm,pandas,coloredlogs,git,packaging.version,packaging,numpy,pytest,scipy
known_third_party=sklearn,dask,distributed,dask_glm,pandas,coloredlogs,git,packaging.version,packaging,numpy,pytest,scipy,six
multi_line_output=3
include_trailing_comma=True
force_grid_wrap=0
69 changes: 53 additions & 16 deletions tests/metrics/test_metrics.py
@@ -5,17 +5,17 @@
import packaging.version
import pytest
import sklearn
import sklearn.metrics as sm
import sklearn.metrics
from dask.array.utils import assert_eq

import dask_ml.metrics as dm
import dask_ml.metrics
from dask_ml._compat import SK_VERSION, dummy_context


def test_pairwise_distances(X_blobs):
    centers = X_blobs[::100].compute()
    result = dm.pairwise_distances(X_blobs, centers)
    expected = sm.pairwise_distances(X_blobs.compute(), centers)
    result = dask_ml.metrics.pairwise_distances(X_blobs, centers)
    expected = sklearn.metrics.pairwise_distances(X_blobs.compute(), centers)
    assert_eq(result, expected, atol=1e-4)


@@ -32,8 +32,10 @@ def test_pairwise_distances_argmin_min(X_blobs):
        ctx = dummy_context()

    with ctx:
        a_, b_ = sm.pairwise_distances_argmin_min(X_blobs.compute(), centers)
        a, b = dm.pairwise_distances_argmin_min(X_blobs, centers)
        a_, b_ = sklearn.metrics.pairwise_distances_argmin_min(
            X_blobs.compute(), centers
        )
        a, b = dask_ml.metrics.pairwise_distances_argmin_min(X_blobs, centers)
        a, b = dask.compute(a, b)

    npt.assert_array_equal(a, a_)
@@ -43,25 +45,25 @@ def test_pairwise_distances_argmin_min(X_blobs):
def test_euclidean_distances():
    X = da.random.uniform(size=(100, 4), chunks=50)
    Y = da.random.uniform(size=(100, 4), chunks=50)
    a = dm.euclidean_distances(X, Y)
    b = sm.euclidean_distances(X, Y)
    a = dask_ml.metrics.euclidean_distances(X, Y)
    b = sklearn.metrics.euclidean_distances(X, Y)
    assert_eq(a, b)

    x_norm_squared = (X ** 2).sum(axis=1).compute()[:, np.newaxis]
    a = dm.euclidean_distances(X, Y, X_norm_squared=x_norm_squared)
    b = sm.euclidean_distances(X, Y, X_norm_squared=x_norm_squared)
    a = dask_ml.metrics.euclidean_distances(X, Y, X_norm_squared=x_norm_squared)
    b = sklearn.metrics.euclidean_distances(X, Y, X_norm_squared=x_norm_squared)
    assert_eq(a, b)

    y_norm_squared = (Y ** 2).sum(axis=1).compute()[np.newaxis, :]
    a = dm.euclidean_distances(X, Y, Y_norm_squared=y_norm_squared)
    b = sm.euclidean_distances(X, Y, Y_norm_squared=y_norm_squared)
    a = dask_ml.metrics.euclidean_distances(X, Y, Y_norm_squared=y_norm_squared)
    b = sklearn.metrics.euclidean_distances(X, Y, Y_norm_squared=y_norm_squared)
    assert_eq(a, b)


def test_euclidean_distances_same():
    X = da.random.uniform(size=(100, 4), chunks=50)
    a = dm.euclidean_distances(X, X)
    b = sm.euclidean_distances(X, X)
    a = dask_ml.metrics.euclidean_distances(X, X)
    b = sklearn.metrics.euclidean_distances(X, X)
    assert_eq(a, b, atol=1e-4)

    x_norm_squared = (X ** 2).sum(axis=1).compute()[:, np.newaxis]
@@ -71,10 +73,45 @@ def test_euclidean_distances_same():
@pytest.mark.parametrize("kernel", ["linear", "polynomial", "rbf", "sigmoid"])
def test_pairwise_kernels(kernel):
    X = da.random.uniform(size=(100, 4), chunks=(50, 4))
    a = dm.pairwise.PAIRWISE_KERNEL_FUNCTIONS[kernel]
    b = sm.pairwise.PAIRWISE_KERNEL_FUNCTIONS[kernel]
    a = dask_ml.metrics.pairwise.PAIRWISE_KERNEL_FUNCTIONS[kernel]
    b = sklearn.metrics.pairwise.PAIRWISE_KERNEL_FUNCTIONS[kernel]

    r1 = a(X)
    r2 = b(X.compute())
    assert isinstance(X, da.Array)
    assert_eq(r1, r2)


@pytest.mark.parametrize("sample_weight", [True, False])
@pytest.mark.parametrize("normalize", [True, False])
@pytest.mark.parametrize("labels", [[0, 1], [0, 1, 3], [1, 0]])
def test_log_loss(labels, normalize, sample_weight):
n = 100
c = 25
y_true = np.random.choice(labels, size=n)
y_pred = np.random.uniform(size=(n, len(labels)))
y_pred /= y_pred.sum(1, keepdims=True)

if sample_weight:
sample_weight = np.random.uniform(size=n)
sample_weight /= sample_weight.sum()
dsample_weight = da.from_array(sample_weight, chunks=c)
else:
sample_weight = None
dsample_weight = None

dy_true = da.from_array(y_true, chunks=c)
dy_pred = da.from_array(y_pred, chunks=c)

a = sklearn.metrics.log_loss(
y_true, y_pred, normalize=normalize, sample_weight=sample_weight
)
b = dask_ml.metrics.log_loss(
dy_true,
dy_pred,
labels=labels,
normalize=normalize,
sample_weight=dsample_weight,
)

assert_eq(a, b)
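
The test above checks that the chunked computation matches scikit-learn. The weighted case works because averaging per-block weighted means, weighted by each block's total sample weight, recovers the global weighted mean; a small NumPy check of that identity (illustrative numbers only, not part of the diff):

import numpy as np

# Per-sample losses and weights for two hypothetical chunks.
losses = [np.array([0.2, 0.4]), np.array([0.6])]
weights = [np.array([1.0, 3.0]), np.array([2.0])]

# What each chunk reports: its weighted mean loss and its total weight.
block_means = np.array([np.average(l, weights=w) for l, w in zip(losses, weights)])
block_totals = np.array([w.sum() for w in weights])

# Combining the chunk results with da.average-style weighting gives the
# same number as the global weighted mean over all samples.
combined = np.average(block_means, weights=block_totals)
overall = np.average(np.concatenate(losses), weights=np.concatenate(weights))
assert np.isclose(combined, overall)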
