From 6b23b7bf7f5ebfbae320371a01fafd1e3410efad Mon Sep 17 00:00:00 2001
From: alisafaya
Date: Fri, 3 Apr 2020 17:55:19 +0300
Subject: [PATCH] Add gelu CPU implementations (forward and backward) to
 unary.jl

---
 src/Knet.jl  |  1 +
 src/ops.jl   |  2 ++
 src/unary.jl | 49 +++++++++++++++++++++++++++++++++++--------------
 3 files changed, 38 insertions(+), 14 deletions(-)

diff --git a/src/Knet.jl b/src/Knet.jl
index a1fdabe02..910c4a299 100644
--- a/src/Knet.jl
+++ b/src/Knet.jl
@@ -38,6 +38,7 @@ export # ref:reference.md tut:tutorial
     gaussian, # ref
     #gc, # ref, tut, use Knet.gc
     #@gheck, # ref, use AutoGrad.@gcheck
+    gelu,
     goldensection, # ref
     gpu, # ref, tut
     gpucopy, # ref
diff --git a/src/ops.jl b/src/ops.jl
index b51edb485..798f7e7b9 100644
--- a/src/ops.jl
+++ b/src/ops.jl
@@ -35,6 +35,7 @@ binary_ops = [
 ("seluback","seluback","(yi>0?1.0507009873554805*xi:xi*(1.7580993408473773+yi))"),
 ("sigmback","sigmback","(xi*yi*(1-yi))"),
 ("tanhback","tanhback","(xi*(1-yi*yi))"),
+("geluback","geluback","(xi*(0.5*tanh(0.035677408136300125*pow(yi,3)+0.7978845608028654*yi)+(0.0535161*pow(yi,3)+0.3989422804014327*yi)*(1-pow(tanh(0.035677408136300125*pow(yi,3)+0.7978845608028654*yi),2))+0.5))"),
 # ("rpow","rpow","pow(yi,xi)"),   # need this for Array.^Scalar -> cuda bug #108 switching to CuArrays for pow
 ]
 
@@ -96,6 +97,7 @@ unary_ops = [
 ("relu", "relu", "(xi>0?xi:0)"),
 ("elu", "elu", "(xi>0?xi:exp(xi)-1)"),
 ("selu", "selu", "1.0507009873554805*(xi>0?xi:1.6732632423543778*(exp(xi)-1))"),
+("gelu", "gelu", "0.5*xi*(1+tanh(0.035677408136300125*pow(xi,3)+0.7978845608028654*xi))"),
 # "rint",
 # "round",
 # "rsqrt",
diff --git a/src/unary.jl b/src/unary.jl
index f33607717..8f91689fa 100644
--- a/src/unary.jl
+++ b/src/unary.jl
@@ -3,7 +3,7 @@
 
 using SpecialFunctions
 import Base.Broadcast: broadcasted
-import NNlib: relu, selu, elu
+import NNlib: relu, selu, elu, gelu
 
 function unary_op(f, j=f, o...)
     J=Symbol(j)
@@ -33,25 +33,36 @@ const λ01 = (1-erfc(1/sqrt(2))*sqrt(exp(1)))*sqrt(2pi)*(2*erfc(sqrt(2))*exp(2)+
 const α01 = -sqrt(2/pi)/(erfc(1/sqrt(2))*exp(1/2)-1)
 const λα01 = λ01 * α01
 
+# Constants for the gelu activation function from https://arxiv.org/pdf/1606.08415v3.pdf
+const GConstant01 = sqrt(2/pi)
+const GConstant02 = 0.044715 * sqrt(2/pi)
+const GConstant03 = GConstant01 / 2
+
 # Define some common operations as primitives for efficiency:
 # 1. Avoid creating intermediate arrays
 # 2. Avoid taking derivatives of intermediate operations
 for (f,g,y,dx) in
     ((:invx, :invxback, :(one(T)/xi), :(-yi*yi*dyi)),
-     (:relu, :reluback, :(max(zero(T),xi)), :(ifelse(yi>0,dyi,zero(T)))),
-     (:selu, :seluback, :(xi >= 0 ? T(λ01)*xi : T(λα01)*(exp(xi)-1)), :(yi >= 0 ? dyi * T(λ01) : dyi * (yi + T(λα01)))),
-     (:elu, :eluback, :(xi >= 0 ? xi : exp(xi)-1), :(yi >= 0 ? dyi : dyi * (1+yi))),
-     (:tanx, :tanhback, :(tanh(xi)), :(dyi*(one(T)-yi*yi))),
-     (:sigm, :sigmback,
-      # Numerically stable implementation from
-      # http://timvieira.github.io/blog/post/2014/02/11/exp-normalize-trick
-      :(if xi>=0; z=exp(-xi); one(T)/(one(T)+z); else; z=exp(xi); z/(one(T)+z); end),
-      :(dyi*yi*(one(T)-yi))),
-     )
+     (:relu, :reluback, :(max(zero(T),xi)), :(ifelse(yi>0,dyi,zero(T)))),
+     (:selu, :seluback, :(xi >= 0 ? T(λ01)*xi : T(λα01)*(exp(xi)-1)), :(yi >= 0 ? dyi * T(λ01) : dyi * (yi + T(λα01)))),
+     (:elu, :eluback, :(xi >= 0 ? xi : exp(xi)-1), :(yi >= 0 ? dyi : dyi * (1+yi))),
+     (:tanx, :tanhback, :(tanh(xi)), :(dyi*(one(T)-yi*yi))),
+     (:sigm, :sigmback,
+      # Numerically stable implementation from
+      # http://timvieira.github.io/blog/post/2014/02/11/exp-normalize-trick
+      :(if xi>=0; z=exp(-xi); one(T)/(one(T)+z); else; z=exp(xi); z/(one(T)+z); end),
+      :(dyi*yi*(one(T)-yi))),
+     (:gelu, :geluback,
+      # 0.5x(1+tanh(0.0356774 x^3 + 0.797885 x))
+      :(T(0.5)*xi*( one(T) + tanh( (T(GConstant02)*xi^3) + (T(GConstant01)*xi) ) )),
+      # 0.5 tanh(0.0356774 x^3 + 0.797885 x) + (0.0535161 x^3 + 0.398942 x) sech^2(0.0356774 x^3 + 0.797885 x) + 0.5
+      :(dyi*(T(0.5)*tanh(T(GConstant02)*xi^3 + T(GConstant01)*xi) + (T(0.0535161)*xi^3 + T(GConstant03)*xi)*(sech(T(GConstant02)*xi^3 + T(GConstant01)*xi))^2 + T(0.5)))),
+     )
+
 @eval begin
     $f(xi::T) where {T<:Number}=$y
-    $g(dyi::T,yi::T) where {T<:Number}=$dx
+    $g(dyi::T,yi::T,xi::T) where {T<:Number}=$dx
     function broadcasted(::typeof($f),x::Array{T}) where {T<:AbstractFloat}
         y = similar(x)
         @inbounds for i=1:length(y)
@@ -60,16 +71,17 @@ for (f,g,y,dx) in
         end
         return y
     end
-    function broadcasted(::typeof($g),dy::Array{T},y::Array{T}) where {T<:AbstractFloat}
+    function broadcasted(::typeof($g),dy::Array{T},y::Array{T},x::Array{T}) where {T<:AbstractFloat}
         dx = similar(dy)
        @inbounds for i=1:length(dx)
             yi = y[i]
+            xi = x[i]
             dyi = dy[i]
             dx[i] = $dx
         end
         return dx
     end
-    @primitive $f(x),dy,y $g.(dy,y)
+    @primitive $f(x),dy,y $g.(dy,y,x)
 end
 end
 
@@ -104,6 +116,15 @@ Reference: Self-Normalizing Neural Networks (https://arxiv.org/abs/1706.02515).
 """
 selu
 
+"""
+    gelu(x)
+
+Return `0.5 * x * (1 + tanh( √(2/π) * (0.044715 x^3 + x) ))`.
+
+Reference: Gaussian Error Linear Units (https://arxiv.org/pdf/1606.08415v3.pdf).
+"""
+gelu
+
 # To avoid conflict with AutoGrad:
 import Base: tanh
 @primitive tanh(x::Array),dy,y tanhback.(dy,y)
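
Note (not part of the diff above): the sketch below restates the tanh-based gelu approximation and the hand-derived gradient used in this patch as self-contained Julia, and checks the analytic gradient against a central finite difference. It does not depend on Knet; the names `gelu_ref` and `geluback_ref` and the test points are illustrative only.

# Reference forward pass: 0.5x(1 + tanh(sqrt(2/pi)*(x + 0.044715x^3))).
gelu_ref(x) = 0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x^3)))

# Hand-derived gradient of the approximation, matching the geluback formula in the patch:
# dy * (0.5*tanh(u) + (0.0535161x^3 + 0.398942x)*sech(u)^2 + 0.5), with u = 0.0356774x^3 + 0.797885x.
function geluback_ref(dy, x)
    u = 0.044715 * sqrt(2 / pi) * x^3 + sqrt(2 / pi) * x
    dy * (0.5 * tanh(u) + (1.5 * 0.044715 * sqrt(2 / pi) * x^3 + 0.5 * sqrt(2 / pi) * x) * sech(u)^2 + 0.5)
end

# Central finite-difference check of the analytic gradient at a few points.
for x in (-2.0, -0.5, 0.0, 0.5, 2.0)
    h = 1e-6
    fd = (gelu_ref(x + h) - gelu_ref(x - h)) / (2h)
    @assert isapprox(fd, geluback_ref(1.0, x); atol = 1e-5)
end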
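
A rough end-to-end usage sketch on a build of this branch (also not part of the diff): the array sizes are arbitrary, and `Param`, `@diff`, `grad`, and `value` come from AutoGrad, following the same pattern Knet uses for `relu`.

using AutoGrad, Knet             # assumes this branch, which exports gelu
x = Param(randn(Float32, 3, 4))  # AutoGrad parameter wrapping a CPU array
J = @diff sum(gelu.(x))          # forward pass records the tape through the new broadcast
g = grad(J, x)                   # backward pass runs the new geluback broadcast
# g should equal the geluback formula applied elementwise with dy = 1 at value(x).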