Add Masked NLL interface #466

Open · wants to merge 6 commits into master

Changes from all commits
72 changes: 46 additions & 26 deletions src/loss.jl
@@ -84,18 +84,18 @@
typedef enum
{
CUDNN_SOFTMAX_MODE_INSTANCE = 0, /* compute the softmax over all C, H, W for each N */
CUDNN_SOFTMAX_MODE_CHANNEL = 1 /* compute the softmax over all C for each H, W, N */
} cudnnSoftmaxMode_t;

=#


"""

softmax(x; dims=1, algo=1)

The softmax function typically used in classification.
Gives the same results as `exp.(logp(x, dims))`.

If `algo=1` the computation is more accurate; if `algo=0` it is
faster.

See also `logsoftmax`.

@@ -108,7 +108,7 @@ function _softmax(x; dims=:, algo=1)
@assert algo ∈ [0, 1]
if algo == 1
x = x .- maximum(x, dims=dims)
end
x = exp.(x)
return x ./ sum(x;dims=dims)
end
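To see why `algo=1` (subtracting the maximum before exponentiating) is the numerically safer choice, here is a minimal plain-Julia sketch; it is illustrative only and not part of the diff:

naive_softmax(x)  = exp.(x) ./ sum(exp.(x))                   # overflows for large x
stable_softmax(x) = (e = exp.(x .- maximum(x)); e ./ sum(e))  # algo=1 style

x = [1000.0, 1000.5, 1001.0]
naive_softmax(x)    # [NaN, NaN, NaN]: exp(1000) overflows to Inf
stable_softmax(x)   # ≈ [0.186, 0.307, 0.506]: finite and correct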
@@ -122,7 +122,7 @@ end
"""
logsoftmax(x; dims=:)

Equivalent to `logp(x; dims=:)`. See also `softmax`.
"""
const logsoftmax = logp

@@ -177,7 +177,7 @@ function csb1(y,dy,dx,ddx;algo=0,mode=0,o...)
cdim = ndims(y) - 1
dims = (mode == 0 ? ((1:cdim)...,) : (cdim,))
if algo==0 || algo==1
ddx .* dy - dy .* sum(y .* ddx, dims=dims) - ddx .* sum(y .* dy, dims=dims)
elseif algo==2
-ddx .* exp.(y) .* sum(dy,dims=dims)
else
@@ -229,23 +229,38 @@ end
@primitive logsumexp(x;dims=:),dy,y (dy .* exp.(x .- y))


"""
Mask = Union{Nothing, AbstractArray{<:Bool}}
Ignore = Union{Nothing,<:Integer,AbstractArray{<:Integer},
Tuple{<:Integer, Vararg{<:Integer}}}
Answers = AbstractArray{<:Integer}
build_nll_mask(a::Answers, ignore::Nothing) = nothing
build_nll_mask(a::Answers, ignore::Ignore) = map(ai->ai ∉ ignore, a)
MaskError = DimensionMismatch("nll mask must have the same size with answers")

nll(scores, answers; dims=1, average=true)

Given an unnormalized `scores` matrix and an `Integer` array of
correct `answers`, return the per-instance negative log
likelihood. `dims=1` means instances are in columns, `dims=2` means
instances are in rows. Use `average=false` to return the sum instead
of per-instance average.
"""
nll(scores, answers; dims=1, average=true, mask, ignore)

Given an unnormalized `scores` matrix, an `Integer` array of correct `answers` return per-instance negative log likelihood. `dims=1` means instances are in columns, `dims=2` means instances are in rows. Use `average=false` to return the sum instead of per-instance average. Masked loss computation can be implemented by using `mask` and `ignore` keyword arguments and they cannot be used at the same time. `mask` can be `nothing` (default value, means no masking) or `Bool` array has same size with `answers`. If `mask[i]` is `false`, then `answers[i]` does not affect the computation. `ignore` can be `nothing` (default value, means no masking), `Integer`, `Integer` array or `Integer` tuple. If `ignore` contains `answers[i]` or ignore equals to `answers[i]`, then `answers[i]` does not affect the computation. `
p
"""
function nll(y,a::AbstractArray{<:Integer}; dims=1, average=true)
function nll(y, a::Answers; dims=1, average=true, mask::Mask=nothing,
ignore::Ignore=nothing)
if !isnothing(ignore) && !isnothing(mask)
error("ignore and mask arguments cannot be used at the same time.")
elseif !isnothing(ignore)
mask = map(ai->ai ∉ ignore, a)
elseif !isnothing(mask) && size(a) != size(mask)
throw(DimensionMismatch("mask must have the same size as answers"))
end

indices = findindices(y,a,dims=dims)
lp = logp(y,dims=dims)[indices]
lp = isnothing(mask) ? lp : lp[mask]
average ? -mean(lp) : -sum(lp)
end
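As a quick illustration of the two masking styles and how averaging interacts with them, consider this toy sketch (hypothetical data, assuming the `nll` defined above; not part of the diff):

scores  = randn(3, 4)             # 3 classes, 4 instances in columns
answers = [1, 2, 3, 3]            # suppose label 3 marks padding

loss_i = nll(scores, answers; ignore=3)              # drop answers equal to 3
loss_m = nll(scores, answers; mask=answers .!= 3)    # same thing via a mask
loss_i ≈ loss_m                                      # true

# average=true divides by the number of unmasked instances (2 here):
nll(scores, answers; ignore=3, average=false) ≈ 2 * loss_i   # true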


"""
logistic(scores, answers; average=true)
Computes logistic loss given scores (predicted values) and answer labels.
@@ -265,7 +280,7 @@ Computes binary cross entropy given scores (predicted values) and answer labels.
Answer values should be in {0,1}. It returns the negative of `mean|sum(answers .* log.(p) .+ (1 .- answers) .* log.(1 .- p))`
where `p` equals `1 ./ (1 .+ exp.(-scores))`. See also `logistic`.
"""
function bce(x̂,x;average=true)
ε = eltype(x̂)(1e-12)
p = 1 ./ (1 .+ exp.(-x̂))
l = x .* log.(p .+ ε) .+ (1 .- x).*log.((1-ε) .- p)
@@ -283,7 +298,7 @@ answer has the maximum score. `dims=1` means instances are in columns,
the number of correct answers instead of the ratio.

"""
function accuracy(y, a::Answers; dims=1, average=true)
indices = findindices(y,a,dims=dims)
ycpu = convert(Array,y)
(maxval,maxind) = findmax(ycpu,dims=dims)
@@ -292,7 +307,8 @@ function accuracy(y,a::AbstractArray{<:Integer}; dims=1, average=true)
average ? mean(correct) : sum(correct)
end
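As a toy illustration of `accuracy` (hypothetical data, not part of the diff):

scores  = [0.9 0.2; 0.1 0.8]      # 2 classes (rows) x 2 instances (columns)
answers = [1, 2]
accuracy(scores, answers; dims=1)                  # 1.0: both columns correct
accuracy(scores, answers; dims=1, average=false)   # 2: count instead of ratio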


function findindices(y, a::Answers; dims=1)
n = length(a)
indices = Vector{Int}(undef,n)
if dims == 1 # instances in first dimension
@@ -318,21 +334,21 @@ end


"""
nll(model, data; dims=1, average=true, ignore=nothing, o...)

Compute `nll(model(x; o...), y; dims, ignore)` for `(x,y)` in `data` and
return the per-instance average (if `average=true`) or total (if
`average=false`) negative log likelihood. If `y[i]` is an element of `ignore`,
it does not affect the computation. `ignore` can be `nothing` (the default,
meaning no masking), an `Integer`, an `Integer` array, or an `Integer` tuple.
"""
function nll(model, data; dims=1, average=true, ignore::Ignore=nothing, o...)
sum = cnt = 0
for (x,y) in data
sum += nll(model(x; o...), y; dims=dims, average=false, ignore=ignore)
cnt += isnothing(ignore) ? length(y) : count(i->i ∉ ignore, y)
end
average ? sum / cnt : sum
end
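A self-contained sketch of this iterator form (again hypothetical data; `identity` stands in for a real model): with `ignore` set, padded targets contribute neither to the loss sum nor to the instance count used for averaging.

model  = identity
scores = randn(3, 4)              # 3 classes, 4 instances in columns
labels = [1, 2, 3, 3]             # label 3 marks padding
data   = [(scores, labels)]       # a one-minibatch "dataset"

# the average is taken over the 2 real instances, not all 4:
nll(model, data; ignore=3) ≈ nll(scores, labels; ignore=3)   # true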



"""
accuracy(model, data; dims=1, average=true, o...)

@@ -352,9 +368,13 @@ end
zeroone(x...; o...) = 1 - accuracy(x...; o...)

# We need the (model,x,y) interface to implement regularization:
function nll(f, x, y; ignore::Ignore=nothing, mask::Mask=nothing,
dims=1, average=true, o...)
nll(f(x; o...), y; ignore=ignore, mask=mask, dims=dims, average=average)
end
accuracy(f, x, y; dims=1, average=true, o...)=accuracy(f(x; o...), y; dims=dims, average=average)

# We need the (weights,data,predict) interface to support the old interface:
nll(w,data,f::Function;ignore::Ignore=nothing,dims=1,average=true,o...) = nll(
x->f(w,x;o...), data; ignore=ignore, dims=dims, average=average)
accuracy(w, data, f::Function; dims=1, average=true, o...)=accuracy(x->f(w,x;o...), data; dims=dims, average=average)
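And a matching sketch for the legacy `(weights, data, predict)` interface (hypothetical `f` and `w`, mirroring the tests below):

f(w, x; o...) = x                 # a stand-in predict function; ignores w
w = nothing
scores, labels = randn(3, 4), [1, 2, 3, 3]
data = [(scores, labels)]
nll(w, data, f; ignore=3) ≈ nll(scores, labels; ignore=3)    # true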
97 changes: 85 additions & 12 deletions test/loss.jl
@@ -15,7 +15,7 @@ include("header.jl")
@test isapprox(f(a,dims=1),f(k,dims=1))
@test isapprox(f(a,dims=2),f(k,dims=2))
end

a = rand(10,10,10)
@test gradcheck(f,a)
@test gradcheck(f,a,kw=(:dims=>1,))
@@ -24,7 +24,7 @@ include("header.jl")
@test gradcheck(f,a,kw=(:dims=>(1,2),))
@test gradcheck(f,a,kw=(:dims=>(3,2),))
@test gradcheck(f,a,kw=(:dims=>(1,3),))

if gpu() >= 0
k = KnetArray(a)
@test gradcheck(f,k)
@@ -50,17 +50,90 @@
end
end

# nll tests
N = 10
as = Any[rand(N,N)]
gpu() >= 0 && push!(as, KnetArray(as[1]))
indices = rand(1:N,N)
indices[1:2] = [1,2];
ignore = [1, [1,], [1,2], (1,2)]
mask = [indices .!= 1, indices .> 2]
for (i,ai) in enumerate(as), d in 1:2, avg in (true,false)
# gradcheck tests
kw = (:dims => d,:average => avg)
@test gradcheck(nll, ai, indices, kw=kw, args=1)
for (ki,k) in enumerate(ignore)
@test gradcheck(nll, ai, indices, kw=(:ignore => k, kw...), args=1)
end
for (ki,k) in enumerate(mask)
@test gradcheck(nll, ai, indices, kw=(:mask => k, kw...), args=1)
end

# test different array types
if length(as) > 1 && i == 1
aj = as[end]
@test isapprox(nll(ai, indices; kw...), nll(aj, indices; kw...))
end

# tests whether the masking and averaging mechanisms work
@test isapprox(nll(ai, indices; kw..., ignore=-1),
nll(ai, indices; kw...))
for (ki,k) in enumerate(mask)
@test !isapprox(nll(ai, indices; kw..., mask=k),
nll(ai, indices; kw...))
!avg && continue
@test isless(nll(ai, indices; mask=k, dims=d, average=false),
nll(ai, indices; dims=d, average=false))
@test isapprox(nll(ai, indices; mask=k, dims=d) * sum(k),
nll(ai, indices; mask=k, dims=d, average=false))
@test isapprox(nll(ai, indices; dims=d) * length(indices),
nll(ai, indices; dims=d, average=false))
end

# tests for different masking mechanisms with different array types
for (j,aj) in enumerate(as)
i == 2 && j == 1 && continue
@test isapprox(nll(ai, indices; kw..., ignore=ignore[1]),
nll(aj, indices; kw..., ignore=ignore[2]))
@test isapprox(nll(ai, indices; kw..., ignore=ignore[3]),
nll(aj, indices; kw..., ignore=ignore[4]))
@test isapprox(nll(aj, indices; kw..., ignore=ignore[1]),
nll(ai, indices; kw..., mask=mask[1]))
@test isapprox(nll(aj, indices; kw..., ignore=ignore[3]),
nll(ai, indices; kw..., mask=mask[2]))
end

# tests for nll(model, data, [ignore]; kw...)
# nll(model, x, y, [ignore]; kw...)
model, data = identity, [(ai,indices)]
x, y = first(data)
desired = nll(ai, indices; kw...)
@test isapprox(nll(model, data; kw...), desired)
@test isapprox(nll(model, data; ignore=0, kw...), desired)
@test isapprox(nll(model, x, y; kw...), desired)
@test isapprox(nll(model, x, y; ignore=0, kw...), desired)
@test isapprox(nll(model, x, y; kw..., mask=mask[1]),
nll(model, x, y; kw..., ignore=ignore[1]))
for k in ignore
@test isapprox(nll(model, data; ignore=k, kw...),
nll(ai, indices; ignore=k, kw...))
@test isapprox(nll(model, x, y; ignore=k, kw...),
nll(ai, indices; ignore=k, kw...))
end

# tests for mask/ignore errors
@test_throws ErrorException nll(ai, indices; kw...,
mask=mask[1], ignore=0)
@test_throws DimensionMismatch nll(ai, indices; kw...,
mask=mask[1][1:end-1])

# tests for old interface
f(w, x; o...) = identity(x)
w = nothing
@test isapprox(nll(w, data, f; kw..., ignore=ignore[1]),
nll(ai, indices; kw..., ignore=ignore[1]))
end

@test gradcheck(logistic,a[:],a[:])
@test gradcheck(bce,a[:],a[:])
