Add Masked NLL interface #466

Open · wants to merge 6 commits into master

Changes from all commits
72 changes: 46 additions & 26 deletions src/loss.jl
@@ -84,18 +84,18 @@
typedef enum
{
CUDNN_SOFTMAX_MODE_INSTANCE = 0, /* compute the softmax over all C, H, W for each N */
CUDNN_SOFTMAX_MODE_CHANNEL = 1 /* compute the softmax over all C for each H, W, N */
} cudnnSoftmaxMode_t;

=#


"""

softmax(x; dims=1, algo=1)

The softmax function typically used in classification.
Gives the same results as `exp.(logp(x, dims))`.

If `algo=1` the computation is more accurate; if `algo=0` it is
faster.

See also `logsoftmax`.

@@ -108,7 +108,7 @@ function _softmax(x; dims=:, algo=1)
@assert algo ∈ [0, 1]
if algo == 1
x = x .- maximum(x, dims=dims)
end
x = exp.(x)
return x ./ sum(x;dims=dims)
end
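To see why `algo=1` (subtracting the maximum before exponentiating) is the numerically safer choice, here is a minimal plain-Julia sketch; it is illustrative only and not part of the diff:

naive_softmax(x)  = exp.(x) ./ sum(exp.(x))                   # overflows for large x
stable_softmax(x) = (e = exp.(x .- maximum(x)); e ./ sum(e))  # algo=1 style

x = [1000.0, 1000.5, 1001.0]
naive_softmax(x)    # [NaN, NaN, NaN]: exp(1000) overflows to Inf
stable_softmax(x)   # ≈ [0.186, 0.307, 0.506]: finite and correct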
@@ -122,7 +122,7 @@ end
"""
logsoftmax(x; dims=:)

Equivalent to `logp(x; dims=:)`. See also `softmax`.
"""
const logsoftmax = logp

@@ -177,7 +177,7 @@ function csb1(y,dy,dx,ddx;algo=0,mode=0,o...)
cdim = ndims(y) - 1
dims = (mode == 0 ? ((1:cdim)...,) : (cdim,))
if algo==0 || algo==1
ddx .* dy - dy .* sum(y .* ddx, dims=dims) - ddx .* sum(y .* dy, dims=dims)
elseif algo==2
-ddx .* exp.(y) .* sum(dy,dims=dims)
else
@@ -229,23 +229,38 @@ end
@primitive logsumexp(x;dims=:),dy,y (dy .* exp.(x .- y))


"""
Mask = Union{Nothing, AbstractArray{<:Bool}}
Ignore = Union{Nothing,<:Integer,AbstractArray{<:Integer},
Tuple{<:Integer, Vararg{<:Integer}}}
Answers = AbstractArray{<:Integer}
build_nll_mask(a::Answers, ignore::Nothing) = nothing
build_nll_mask(a::Answers, ignore::Ignore) = map(ai->ai ∉ ignore, a)
MaskError = DimensionMismatch("nll mask must have the same size with answers")

nll(scores, answers; dims=1, average=true)

Given an unnormalized `scores` matrix and an `Integer` array of
correct `answers`, return the per-instance negative log
likelihood. `dims=1` means instances are in columns, `dims=2` means
instances are in rows. Use `average=false` to return the sum instead
of per-instance average.
"""
nll(scores, answers; dims=1, average=true, mask, ignore)

Given an unnormalized `scores` matrix, an `Integer` array of correct `answers` return per-instance negative log likelihood. `dims=1` means instances are in columns, `dims=2` means instances are in rows. Use `average=false` to return the sum instead of per-instance average. Masked loss computation can be implemented by using `mask` and `ignore` keyword arguments and they cannot be used at the same time. `mask` can be `nothing` (default value, means no masking) or `Bool` array has same size with `answers`. If `mask[i]` is `false`, then `answers[i]` does not affect the computation. `ignore` can be `nothing` (default value, means no masking), `Integer`, `Integer` array or `Integer` tuple. If `ignore` contains `answers[i]` or ignore equals to `answers[i]`, then `answers[i]` does not affect the computation. `
p
"""
function nll(y,a::AbstractArray{<:Integer}; dims=1, average=true)
function nll(y, a::Answers; dims=1, average=true, mask::Mask=nothing,
ignore::Ignore=nothing)
if !isnothing(ignore) && !isnothing(mask)
error("ignore and mask arguments cannot be used at the same time.")
elseif !isnothing(ignore)
mask = map(ai->ai ∉ ignore, a)
elseif !isnothing(mask) && size(a) != size(mask)
throw(DimensionMismatch("mask must have the same size as answers"))
end

indices = findindices(y,a,dims=dims)
lp = logp(y,dims=dims)[indices]
lp = isnothing(mask) ? lp : lp[mask]
average ? -mean(lp) : -sum(lp)
end
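As a quick illustration of the two masking styles and how averaging interacts with them, consider this toy sketch (hypothetical data, assuming the `nll` defined above; not part of the diff):

scores  = randn(3, 4)             # 3 classes, 4 instances in columns
answers = [1, 2, 3, 3]            # suppose label 3 marks padding

loss_i = nll(scores, answers; ignore=3)              # drop answers equal to 3
loss_m = nll(scores, answers; mask=answers .!= 3)    # same thing via a mask
loss_i ≈ loss_m                                      # true

# average=true divides by the number of unmasked instances (2 here):
nll(scores, answers; ignore=3, average=false) ≈ 2 * loss_i   # true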


"""
logistic(scores, answers; average=true)
Computes logistic loss given scores (predicted values) and answer labels.
@@ -265,7 +280,7 @@ Computes binary cross entropy given scores (predicted values) and answer labels.
Answer values should be in {0,1}. It returns the negative of `mean|sum(answers .* log.(p) .+ (1 .- answers) .* log.(1 .- p))`
where `p` equals `1 ./ (1 .+ exp.(-scores))`. See also `logistic`.
"""
function bce(x̂,x;average=true)
ε = eltype(x̂)(1e-12)
p = 1 ./ (1 .+ exp.(-x̂))
l = x .* log.(p .+ ε) .+ (1 .- x).*log.((1-ε) .- p)
@@ -283,7 +298,7 @@ answer has the maximum score. `dims=1` means instances are in columns,
the number of correct answers instead of the ratio.

"""
function accuracy(y, a::Answers; dims=1, average=true)
indices = findindices(y,a,dims=dims)
ycpu = convert(Array,y)
(maxval,maxind) = findmax(ycpu,dims=dims)
@@ -292,7 +307,8 @@ function accuracy(y,a::AbstractArray{<:Integer}; dims=1, average=true)
average ? mean(correct) : sum(correct)
end
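As a toy illustration of `accuracy` (hypothetical data, not part of the diff):

scores  = [0.9 0.2; 0.1 0.8]      # 2 classes (rows) x 2 instances (columns)
answers = [1, 2]
accuracy(scores, answers; dims=1)                  # 1.0: both columns correct
accuracy(scores, answers; dims=1, average=false)   # 2: count instead of ratio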


function findindices(y, a::Answers; dims=1)
n = length(a)
indices = Vector{Int}(undef,n)
if dims == 1 # instances in first dimension
@@ -318,21 +334,21 @@ end


"""
nll(model, data; dims=1, average=true, ignore=nothing, o...)

Compute `nll(model(x; o...), y; dims, ignore)` for `(x,y)` in `data` and
return the per-instance average (if `average=true`) or total (if
`average=false`) negative log likelihood. If `y[i]` is an element of `ignore`,
it does not affect the computation. `ignore` can be `nothing` (the default,
meaning no masking), an `Integer`, an `Integer` array, or an `Integer` tuple.
"""
function nll(model, data; dims=1, average=true, ignore::Ignore=nothing, o...)
sum = cnt = 0
for (x,y) in data
sum += nll(model(x; o...), y; dims=dims, average=false, ignore=ignore)
cnt += isnothing(ignore) ? length(y) : count(i->i ∉ ignore, y)
end
average ? sum / cnt : sum
end
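A self-contained sketch of this iterator form (again hypothetical data; `identity` stands in for a real model): with `ignore` set, padded targets contribute neither to the loss sum nor to the instance count used for averaging.

model  = identity
scores = randn(3, 4)              # 3 classes, 4 instances in columns
labels = [1, 2, 3, 3]             # label 3 marks padding
data   = [(scores, labels)]       # a one-minibatch "dataset"

# the average is taken over the 2 real instances, not all 4:
nll(model, data; ignore=3) ≈ nll(scores, labels; ignore=3)   # true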



"""
accuracy(model, data; dims=1, average=true, o...)

@@ -352,9 +368,13 @@ end
zeroone(x...; o...) = 1 - accuracy(x...; o...)

# We need the (model,x,y) interface to implement regularization:
function nll(f, x, y; ignore::Ignore=nothing, mask::Mask=nothing,
dims=1, average=true, o...)
nll(f(x; o...), y; ignore=ignore, mask=mask, dims=dims, average=average)
end
accuracy(f, x, y; dims=1, average=true, o...)=accuracy(f(x; o...), y; dims=dims, average=average)

# We need the (weights,data,predict) interface to support the old interface:
nll(w,data,f::Function;ignore::Ignore=nothing,dims=1,average=true,o...) = nll(
x->f(w,x;o...), data; ignore=ignore, dims=dims, average=average)
accuracy(w, data, f::Function; dims=1, average=true, o...)=accuracy(x->f(w,x;o...), data; dims=dims, average=average)
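And a matching sketch for the legacy `(weights, data, predict)` interface (hypothetical `f` and `w`, mirroring the tests below):

f(w, x; o...) = x                 # a stand-in predict function; ignores w
w = nothing
scores, labels = randn(3, 4), [1, 2, 3, 3]
data = [(scores, labels)]
nll(w, data, f; ignore=3) ≈ nll(scores, labels; ignore=3)    # true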
97 changes: 85 additions & 12 deletions test/loss.jl
@@ -15,7 +15,7 @@ include("header.jl")
@test isapprox(f(a,dims=1),f(k,dims=1))
@test isapprox(f(a,dims=2),f(k,dims=2))
end

a = rand(10,10,10)
@test gradcheck(f,a)
@test gradcheck(f,a,kw=(:dims=>1,))
@@ -24,7 +24,7 @@ include("header.jl")
@test gradcheck(f,a,kw=(:dims=>(1,2),))
@test gradcheck(f,a,kw=(:dims=>(3,2),))
@test gradcheck(f,a,kw=(:dims=>(1,3),))

if gpu() >= 0
k = KnetArray(a)
@test gradcheck(f,k)
@@ -50,17 +50,90 @@
end
end

# nll tests
N = 10
as = Any[rand(N,N)]
gpu() >= 0 && push!(as, KnetArray(as[1]))
indices = rand(1:N,N)
indices[1:2] = [1,2];
ignore = [1, [1,], [1,2], (1,2)]
mask = [indices .!= 1, indices .> 2]
for (i,ai) in enumerate(as), d in 1:2, avg in (true,false)
# gradcheck tests
kw = (:dims => d,:average => avg)
@test gradcheck(nll, ai, indices, kw=kw, args=1)
for (ki,k) in enumerate(ignore)
@test gradcheck(nll, ai, indices, kw=(:ignore => k, kw...), args=1)
end
for (ki,k) in enumerate(mask)
@test gradcheck(nll, ai, indices, kw=(:mask => k, kw...), args=1)
end

# test different array types
if length(as) > 1 && i == 1
aj = as[end]
@test isapprox(nll(ai, indices; kw...), nll(aj, indices; kw...))
end

# tests whether the masking and averaging mechanisms work
@test isapprox(nll(ai, indices; kw..., ignore=-1),
nll(ai, indices; kw...))
for (ki,k) in enumerate(mask)
@test !isapprox(nll(ai, indices; kw..., mask=k),
nll(ai, indices; kw...))
!avg && continue
@test isless(nll(ai, indices; mask=k, dims=d, average=false),
nll(ai, indices; dims=d, average=false))
@test isapprox(nll(ai, indices; mask=k, dims=d) * sum(k),
nll(ai, indices; mask=k, dims=d, average=false))
@test isapprox(nll(ai, indices; dims=d) * length(indices),
nll(ai, indices; dims=d, average=false))
end

# tests for different masking mechanisms with different array types
for (j,aj) in enumerate(as)
i == 2 && j == 1 && continue
@test isapprox(nll(ai, indices; kw..., ignore=ignore[1]),
nll(aj, indices; kw..., ignore=ignore[2]))
@test isapprox(nll(ai, indices; kw..., ignore=ignore[3]),
nll(aj, indices; kw..., ignore=ignore[4]))
@test isapprox(nll(aj, indices; kw..., ignore=ignore[1]),
nll(ai, indices; kw..., mask=mask[1]))
@test isapprox(nll(aj, indices; kw..., ignore=ignore[3]),
nll(ai, indices; kw..., mask=mask[2]))
end

# tests for nll(model, data, [ignore]; kw...)
# nll(model, x, y, [ignore]; kw...)
model, data = identity, [(ai,indices)]
x, y = first(data)
desired = nll(ai, indices; kw...)
@test isapprox(nll(model, data; kw...), desired)
@test isapprox(nll(model, data; ignore=0, kw...), desired)
@test isapprox(nll(model, x, y; kw...), desired)
@test isapprox(nll(model, x, y; ignore=0, kw...), desired)
@test isapprox(nll(model, x, y; kw..., mask=mask[1]),
nll(model, x, y; kw..., ignore=ignore[1]))
for k in ignore
@test isapprox(nll(model, data; ignore=k, kw...),
nll(ai, indices; ignore=k, kw...))
@test isapprox(nll(model, x, y; ignore=k, kw...),
nll(ai, indices; ignore=k, kw...))
end

# tests for mask/ignore errors
@test_throws ErrorException nll(ai, indices; kw...,
mask=mask[1], ignore=0)
@test_throws DimensionMismatch nll(ai, indices; kw...,
mask=mask[1][1:end-1])

# tests for old interface
f(w, x; o...) = identity(x)
w = nothing
@test isapprox(nll(w, data, f; kw..., ignore=ignore[1]),
nll(ai, indices; kw..., ignore=ignore[1]))
end

@test gradcheck(logistic,a[:],a[:])
@test gradcheck(bce,a[:],a[:])
