In [1]:
using Knet

In [303]:
"""
    model

Bundle of the three trainable pieces used by the forward pass in this notebook.
All fields are untyped, so any layer or array type can be stored in them.
"""
mutable struct model
    # recurrent layer (Knet rnn)
    lstm
    # weights of the final layer
    output
    # weight used to compute a new hidden state at each time step
    w1
end

In [322]:
"""
    model(embed::Int, hidden::Int, output::Int)

Build a `model` from an `RNN(embed, hidden)` recurrent layer, an `output × hidden`
final-layer weight created with `param`, and a length-`hidden` vector `w1`.

`w1` is sized from `hidden` (rather than a fixed 20) so that it always matches the
recurrent state it is added to in the forward pass.
"""
function model(embed::Int, hidden::Int, output::Int)
    lstm = RNN(embed, hidden)
    outw = param(output, hidden)
    # NOTE(review): w1 is pinned to KnetArray{Float32} while `outw` uses param's
    # default array type — confirm this asymmetry is intended.
    w1 = param(hidden; init=xavier, atype=KnetArray{Float32})
    return model(lstm, outw, w1)
end

model

MODEL 1

The gradient of w1 is computed correctly when w1 is used only after the LSTM loop has finished.

In [306]:
# MODEL 1 forward pass.
#
# Feeds the columns of `input` to `m.lstm` one at a time, adds `m.w1` to the final
# hidden state and returns `m.output * ht`.
#
# - Each column `input[:, t]` is one time step, so the number of steps is
#   `size(input, 2)` instead of a fixed 20.
# - The state length is `size(m.output, 2)` (the hidden size) instead of a fixed 20.
# - The initial hidden and cell states are drawn from `randn` on every call, so two
#   calls on the same input give different results.
# - Side effect: `m.lstm.h` and `m.lstm.c` are overwritten.
function (m::model)(input)
    # state length = number of columns of the final-layer weights
    hidden = size(m.output, 2)

    ht = KnetArray(randn(Float32, hidden))
    ct = KnetArray(randn(Float32, hidden))

    for t in 1:size(input, 2)
        # take the word corresponding to timestep t
        xt = input[:, t]

        # hand the state from the previous timestep to the lstm unchanged
        m.lstm.h = ht
        m.lstm.c = ct

        # feed in the word; the lstm's h and c fields are read back right below
        # (presumably updated by this call — see Knet's RNN documentation)
        m.lstm(xt)

        # flatten the state (e.g. (hidden,1,1)) back into a vector of length hidden
        ht = reshape(m.lstm.h, hidden)
        ct = reshape(m.lstm.c, hidden)
    end

    # use w1 only after the lstm loop is done; gradients for all weights return fine
    ht = ht .+ m.w1

    return m.output * ht
end

# Loss: `nll` of the model's scores for `input` against the labels in `output`.
function (m::model)(input, output)
    scores = m(input)
    return nll(scores, output)
end

In [307]:
# embedding size 100, hidden size 20, 2 output classes
model1 = model(100, 20, 2)

model(LSTM(input=100,hidden=20), P(KnetArray{Float32,2}(2,20)), P(KnetArray{Float32,1}(20)))

In [312]:
# randomly generated sentence: 20 words (columns), each a 100-element vector
data = KnetArray(randn(Float32, 100, 20));

# single label for the whole sentence (the prediction comes from the final hidden state)
label = [1];

In [313]:
# forward pass only: returns the final-layer scores (`m.output * ht`) for `data`
model1(data)

2-element KnetArray{Float32,1}:
 -0.25322473
 -0.21963474

In [314]:
# two-argument call: returns `nll` of the scores for `data` against `label`
model1(data,label)

0.7100831f0

In [316]:
# differentiate the loss, then print the gradient type for every parameter
# (all 3 gradients are computed correctly here)
J1 = @diff model1(data, label)
foreach(params(model1)) do p
    @show typeof(grad(J1, p))
end

typeof(grad(J1, p)) = KnetArray{Float32,3}
typeof(grad(J1, p)) = KnetArray{Float32,2}
typeof(grad(J1, p)) = KnetArray{Float32,1}


MODEL 2

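The gradient of w1 comes back as `nothing`.

Here w1 is used to compute new_h (and new_c) at every timestep.

lstm.h is set to new_h (and lstm.c to new_c) at each timestep, before the LSTM is called.

For some reason, the gradient is not backpropagated to w1 properly.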

In [323]:
# MODEL 2 forward pass.
#
# Same loop as MODEL 1, except that `m.w1` is added to the hidden and cell states
# before every lstm call rather than once after the loop. Returns `m.output * ht`.
#
# - Each column `input[:, t]` is one time step, so the number of steps is
#   `size(input, 2)` instead of a fixed 20.
# - The state length is `size(m.output, 2)` (the hidden size) instead of a fixed 20.
# - The initial hidden and cell states are drawn from `randn` on every call.
# - Side effect: `m.lstm.h` and `m.lstm.c` are overwritten.
#
# NOTE(review): with this variant the notebook reports `grad(J1, w1)` as `nothing`.
# The cause is not visible in this file; check how Knet's RNN treats externally
# assigned `h`/`c` fields during differentiation.
function (m::model)(input)
    # state length = number of columns of the final-layer weights
    hidden = size(m.output, 2)

    ht = KnetArray(randn(Float32, hidden))
    ct = KnetArray(randn(Float32, hidden))

    for t in 1:size(input, 2)
        # take the word corresponding to timestep t
        xt = input[:, t]

        # at each iteration, use w1 to compute the new hidden and cell states and
        # install them on the lstm before it sees the word
        m.lstm.h = ht + m.w1
        m.lstm.c = ct + m.w1

        # feed in the word; the lstm's h and c fields are read back right below
        m.lstm(xt)

        # flatten the state (e.g. (hidden,1,1)) back into a vector of length hidden
        ht = reshape(m.lstm.h, hidden)
        ct = reshape(m.lstm.c, hidden)
    end

    return m.output * ht
end

# Loss: `nll` of the model's scores for `input` against the labels in `output`.
function (m::model)(input, output)
    scores = m(input)
    return nll(scores, output)
end

In [324]:
# build a fresh model (embedding 100, hidden 20, 2 classes) and evaluate its loss
model1 = model(100, 20, 2)
model1(data, label)

0.606835f0

In [325]:
# differentiate the loss, then print the gradient type for every parameter;
# the gradient for w1 is Nothing — w1 was used to compute the new hidden states
# at each timestep
J1 = @diff model1(data, label)
foreach(params(model1)) do p
    @show typeof(grad(J1, p))
end

typeof(grad(J1, p)) = KnetArray{Float32,3}
typeof(grad(J1, p)) = KnetArray{Float32,2}
typeof(grad(J1, p)) = Nothing
