In [None]:
using Pkg
using Knet: Knet, param, param0, @diff, grad,RNN,mat, params, KnetArray, conv4, Data, relu, pool, dropout
using Random: shuffle!
using IterTools: ncycle
using TestImages,Images, ImageView
using PyCall
pickle = pyimport("pickle")
numpy = pyimport("numpy")

In [None]:
using Base.Iterators

In [None]:
function prepare_sentence(sent, to_ix)
    sent = split(strip(lowercase(sent),' '))
    idxs = [to_ix[w] for w in sent]
    return idxs
end

In [None]:
function mask!(a,pad)
    x,y = size(a)
    for i = 1:x
        for j = 1:y
            if a[i, j] == pad
                a[i, j] = 0
            end
        end
    end
    return a
end

In [None]:
function mypickle(filename, obj)
    out = open(filename,"w")
    pickle.dump(obj, out)
    close(out)
 end

function myunpickle(filename)
    r = nothing
    @pywith pybuiltin("open")(filename,"rb") as f begin
        r = pickle.load(f)
    end
    return r
end

In [None]:
true_dataset = myunpickle("dataset/dataset_true.pickle")
# false_dataset = myunpickle("dataset/dataset_false.pickle")

wdict = Dict()
w2i(x) = get!(wdict, x, 1+length(wdict))
UNK = w2i("<unk>")
EOS = w2i("<eos>")
#w2i(x) = get(wdict, x, UNK), for test time

In [None]:
EMBEDDING_DIM = 20
HIDDEN_DIM_LSTM = 10
VOCAB_SIZE = length(wdict)
BATCHSIZE = 6

In [None]:
images = []
labels = []
for i in 1:length(true_dataset)
    (f1,f2), sent = true_dataset[i]
    both_frames = hcat(f1,f2)
    both_frames = Float32.(both_frames)
    wordids = w2i.(split(sent))
    push!(labels, wordids)
    push!(images,both_frames)
end

In [99]:
mutable struct Conv 
    w 
    b 
    f_activation
    p_drop
    f_pool
end
(c::Conv)(x) = c.f_activation.(c.f_pool(conv4(c.w, dropout(x,c.p_drop)) .+ c.b))


Conv(w1::Int,w2::Int,cx::Int,cy::Int;f=relu, pdrop=0, f_pool=pool) = Conv(param(w1,w2,cx,cy), param0(1,1,cy,1), f, pdrop, f_pool)

Conv

In [None]:
struct Dense; w; b; f; p; end
(d::Dense)(x) = d.f.(d.w * mat(dropout(x,d.p)) .+ d.b) # mat reshapes 4-D tensor to 2-D matrix so we can use matmul
Dense(i::Int,o::Int,f=relu;pdrop=0) = Dense(param(o,i), param0(o), f, pdrop)

In [None]:
struct Projection; w; b; f; p; end
(d::Projection)(x) = d.f.(d.w * mat(dropout(x,d.p)) .+ d.b) # mat reshapes 4-D tensor to 2-D matrix so we can use matmul
Projection(i::Int,o::Int,f=relu;pdrop=0) = Projection(param(o,i), param0(o), f, pdrop)

In [None]:
struct Embed; w; end
Embed(vocabsize::Int,embedsize::Int) = Embed(param(embedsize,vocabsize))
(e::Embed)(x) = e.w[:,x]

In [97]:
mutable struct frame_head
    conv1
    conv2
    conv3
    conv4
    fc
    output
end

function frame_head(w1,c1,w2,c2,w3,c3,w4,c4,hidden, outdims)
    conv1 = Conv(w1, w1, 3, c1)
    conv2 = Conv(w2, w2, c1, c2)
    conv3 = Conv(w3, w3, c2, c3)
    conv4 = Conv(w4, w4, c3, c4; f_pool = identity)
    fc = Dense(47040, hidden)
    output = Dense(hidden, outdims)
    frame_head(conv1, conv2, conv3, conv4, fc, output)
end
    

function (f::frame_head)(x)
    f.output(f.fc(f.conv4(f.conv3(f.conv2(f.conv1(x))))))
end

In [92]:
mutable struct sentence_head
    embed
    encoder
end


function sentence_head(vocabsize::Int, embeddingsize::Int, hiddensize::Int)
    embed = Embed(vocabsize, embeddingsize)
    encoder = RNN(embeddingsize, hiddensize, rnnType = :lstm, h = 0)
    sentence_head(embed, encoder)
end

sentence_head

In [90]:
function (s::sentence_head)(x)
    
    src_embed_tensor = s.embed(x)
    s.encoder.h = 0
    s.encoder.c = 0
    y_enc = s.encoder(src_embed_tensor)
end

In [None]:
mutable struct bimodalEncoder
    fh
    sh
end


function bimodalEncoder(w1,c1,w2,c2,w3,c3,w4,c4,hidden, outdims, vocabsize, embeddingsize, hiddensize)
    fh = frame_head(w1,c1,w2,c2,w3,c3,w4,c4,hidden, outdims)
    sh = sentence_head(vocabsize, embeddingsize, hiddensize)
    bimodalEncoder(fh, sh)
end

In [101]:
function cosine_similarity(b::bimodalEncoder, frame_pairs, sentences)
   
    sentence_representations = b.sh(sentences)[:,:]
    frame_representations = b.fh(frame_pairs)
    numerator = sum(sentence_representations .* frame_representations, dims = 1)
    denominator = sqrt.(sum(sentence_representations.^2, dims = 1)) .* sqrt.(sum(frame_representations.^2, dims = 1))
    return numerator ./ denominator
end

cosine_similarity (generic function with 1 method)

In [93]:
function findmaxlength(sentences)
    maxsize = 0
    count = 0
    for sent in sentences
        count = count + 1
        if(length(sent) > maxsize)
            
            maxsize = length(sent)
        end
    end
    return maxsize
end

findmaxlength (generic function with 1 method)

In [102]:
function seqbatch(sentences)
    batchsize = size(sentences, 1)
    maxlength = findmaxlength(sentences)
    for sent in sentences
        if(length(sent) < maxlength)
            for i = 1:maxlength-length(sent)
                push!(sent, 2)
            end
        end
    end
    permutedims(reshape(collect(flatten(sentences)), (8,347)), (2,1))
end

seqbatch (generic function with 1 method)

In [103]:
image_batch = reshape(collect(flatten(images)), (210,320,3,347))
summary(image_batch)

"210×320×3×347 Array{Float32,4}"

In [104]:
b = bimodalEncoder(5,32,5,32,4,64,3,64, 100, 10, length(wdict), 12, 10)

bimodalEncoder(frame_head(Conv(P(Array{Float32,4}(5,5,3,32)), P(Array{Float32,4}(1,1,32,1)), NNlib.relu, 0, Knet.pool), Conv(P(Array{Float32,4}(5,5,32,32)), P(Array{Float32,4}(1,1,32,1)), NNlib.relu, 0, Knet.pool), Conv(P(Array{Float32,4}(4,4,32,64)), P(Array{Float32,4}(1,1,64,1)), NNlib.relu, 0, Knet.pool), Conv(P(Array{Float32,4}(3,3,64,64)), P(Array{Float32,4}(1,1,64,1)), NNlib.relu, 0, identity), Dense(P(Array{Float32,2}(100,47040)), P(Array{Float32,1}(100)), NNlib.relu, 0), Dense(P(Array{Float32,2}(10,100)), P(Array{Float32,1}(10)), NNlib.relu, 0)), sentence_head(Embed(P(Array{Float32,2}(12,21))), LSTM(input=12,hidden=10)))

In [105]:
cosine_similarity(b, image_batch, seqbatch(labels))

sentence rep: (10, 347, 8)
frame rep: (10, 347)


1×347×8 Array{Float32,3}:
[:, :, 1] =
 -0.118038  -0.107908  -0.0797857  -0.0904838  …  0.52989  0.536415  0.548458

[:, :, 2] =
 0.0081733  0.00578424  0.0267582  …  0.46213  0.49325  0.504588  0.51652

[:, :, 3] =
 0.074913  0.0857655  0.112887  0.11035  …  0.446556  0.437023  0.418254

[:, :, 4] =
 0.195273  0.209828  0.229917  0.218314  …  0.425147  0.439768  0.436326

[:, :, 5] =
 0.303071  0.322774  0.342415  0.331482  …  -0.251736  -0.264469  -0.26805

[:, :, 6] =
 0.325411  0.34792  0.365377  0.35577  …  -0.16953  -0.184082  -0.209256

[:, :, 7] =
 0.311041  0.335934  0.351619  0.342798  …  -0.368751  -0.368247  -0.374371

[:, :, 8] =
 0.280387  0.307391  0.321864  0.31341  …  -0.302818  -0.302033  -0.325656

In [None]:
# For running experiments we will use the Adam algorithm which typically converges faster than SGD.
function trainresults(file,maker,savemodel)
    if (print("Train from scratch? "); readline()[1]=='y')
        model = maker()
        results = ((nll(model,dtst), zeroone(model,dtst))
                   for x in takenth(progress(adam(model,ncycle(dtrn,5))),100))
        results = reshape(collect(Float32,flatten(results)),(2,:))
        Knet.save(file,"model",(savemodel ? model : nothing),"results",results)
        Knet.gc() # To save gpu memory
    else
        isfile(file) || download("http://people.csail.mit.edu/deniz/models/tutorial/$file",file)
        model,results = Knet.load(file,"model","results")
    end
    println(minimum(results,dims=2))
    return model,results
end

In [None]:
Tagger2(vocab,embed,hidden,output)=  # biRNN Tagger
    Chain(Embed(vocab,embed),RNN(embed,hidden,rnnType=:relu,bidirectional=true),Dense(2hidden,output));

In [None]:
VOCABSIZE = length(wdict)
EMBEDSIZE = 128
HIDDENSIZE = 128
OUTPUTSIZE = length(tdict)

In [None]:
t2maker() = Tagger2(VOCABSIZE,EMBEDSIZE,HIDDENSIZE,OUTPUTSIZE)
(t2,r2) = trainresults("instructions.txt",t2maker,true);