In [1]:
using Knet
import Base: length, size, iterate, eltype, IteratorSize, IteratorEltype, haslength, @propagate_inbounds, repeat, rand, tail
import .Iterators: cycle, Cycle, take, repeat
using Plots; default(fmt=:png,ls=:auto)

STANFORD GLOVE EMBEDDINGS 

In [2]:
# Open the GloVe embeddings text file; the handle `f` is read by the next cell.
f = open("glove.42B.300d.txt")

IOStream(<file glove.42B.300d.txt>)

In [3]:
# Read every line of the embeddings file into memory. `lines` is indexed by row number
# later when the embedding matrix is built, so the whole file has to stay available.
lines = readlines(f)
# The handle is not needed once the lines are in memory; release it instead of leaking it.
close(f);

In [4]:
# Map each embedding word (the first whitespace-separated token of a line) to the
# 1-based row number of that line in `lines`. A later duplicate overwrites an earlier one.
embeddingdict = Dict()
for (rownumber, line) in enumerate(lines)
    embeddingdict[split(line)[1]] = rownumber
end
# The counter ends one past the last row, exactly as a manually incremented index would.
embeddingindex = length(lines) + 1;

SENTIMENT TREEBANK DATA

In [5]:
# Load the sentences and their labels, one entry per line. Each handle is closed as soon
# as its contents are in memory so no file descriptor is left open for the session.
txt1 = open("sentences.txt")
lines1 = readlines(txt1)
close(txt1)
txt2 = open("labels.txt")
lines2 = readlines(txt2)
close(txt2);

In [6]:
# Tokenise every sentence on whitespace and lowercase each token.
all_sentences = [lowercase.(split(line)) for line in lines1]

# Fixed row ranges of the train / test / validation portions of the corpus.
train_sentences2, test_sentences2, val_sentences2 =
    (all_sentences[rows] for rows in (1:8534, 8535:10744, 10745:11844))

# Only the training labels are parsed as Float32 here; the test and validation
# labels are kept as the raw strings read from the file.
train_labels = parse.(Float32, lines2[1:8534])
test_labels = lines2[8535:10744]
val_labels = lines2[10745:11844];

In [7]:
# Convert a sentiment score in [0, 1] to one of five class indices:
#   [0, 0.2) -> 1, [0.2, 0.4) -> 2, [0.4, 0.6) -> 3, [0.6, 0.8) -> 4, [0.8, 1.0] -> 5.
# Any score outside [0, 1] (NaN included) yields `nothing`.
function mapfloat(label)
    if !(0 <= label <= 1.0)
        return nothing
    end
    # Upper bounds of the first four bins; the first bound the score falls below wins.
    for (class, upper) in enumerate((0.2, 0.4, 0.6, 0.8))
        label < upper && return class
    end
    return 5
end
# Replace every training score by its class index.
train_labels = map(mapfloat, train_labels);

In [8]:
# Translate a short textual label ("very neg" ... "very pos") into its class index 1-5.
# Labels that match none of the five names yield `nothing`.
function mapx(label)
    for (class, name) in enumerate(("very neg", "neg", "neu", "pos", "very pos"))
        label == name && return class
    end
    return nothing
end

# Translate a class index 1-5 into its human-readable sentiment name.
# Any other value yields `nothing`.
function map2x(i)
    names = ("very negative", "negative", "neutral", "positive", "very positive")
    for (class, name) in enumerate(names)
        i == class && return name
    end
    return nothing
end
                    
# Replace every textual test label by its class index.
test_labels = map(mapx, test_labels);

In [9]:
# Collect the distinct tokens of the corpus in first-occurrence order. `unique` keeps a
# hash set of what it has already seen, so this is linear in the number of tokens instead
# of rescanning the growing vocabulary vector for every single token.
vocab = unique(Iterators.flatten(all_sentences))
# Reserve the last slot for the unknown-word / padding token.
push!(vocab, "UNK");

In [10]:
# Build the word -> index and index -> word lookup tables from the vocabulary order.
# "UNK" is an ordinary vocabulary entry, so it receives its index here like any other
# word; no hard-coded index is needed (a literal would silently break the tables as soon
# as the vocabulary size changes).
w2i = Dict{String,Int}()
i2w = Dict{Int,String}()
for (index, word) in enumerate(vocab)
    w2i[word] = index
    i2w[index] = word
end
# One past the last assigned index, matching what a manual counter would end on.
dictindex = length(vocab) + 1;

In [11]:
# Replace every token by its vocabulary index, sentence by sentence.
all_sentences = [getindex.(Ref(w2i), sentence) for sentence in all_sentences];

In [12]:
# Convert a raw sentence into an N×1 matrix of vocabulary indices.
#
# Arguments:
#   s          - the sentence; it is split on whitespace and each word is lowercased.
#   vocabindex - word -> index table that must contain the key "UNK"; defaults to the
#                notebook-level `w2i`.
#
# Words missing from the table map to the "UNK" index. A single `get` performs the
# membership test and the lookup together; writing the integer UNK index back into the
# word variable and looking it up a second time raises a KeyError.
function strings_to_indices(s; vocabindex=w2i)
    unk = vocabindex["UNK"]
    indices = [get(vocabindex, lowercase(word), unk) for word in split(s)]
    return hcat(indices)
end

# Quick check of the conversion on a two-word sentence.
strings_to_indices("hey there")

2×1 Array{Int64,2}:
 1804
   54

In [13]:
# Split the index-encoded corpus into its train / test / validation portions.
train_sentences, test_sentences, val_sentences =
    (all_sentences[rows] for rows in (1:8534, 8535:10744, 10745:11844));

In [14]:
# Left-pad every training sequence with the "UNK" index up to `maxlength` tokens.
# The number of missing slots is computed once per sequence; a sequence that is already
# `maxlength` or longer is left untouched (a `!=` length test would keep padding a
# longer sequence forever).
maxlength = 56
for sentence in train_sentences
    deficit = maxlength - length(sentence)
    deficit > 0 && prepend!(sentence, fill(w2i["UNK"], deficit))
end

In [15]:
# Left-pad every test sequence with the "UNK" index up to `maxlength` tokens.
# The number of missing slots is computed once per sequence; a sequence that is already
# `maxlength` or longer is left untouched (a `!=` length test would keep padding a
# longer sequence forever).
maxlength = 56
for sentence in test_sentences
    deficit = maxlength - length(sentence)
    deficit > 0 && prepend!(sentence, fill(w2i["UNK"], deficit))
end

CONSTRUCTING THE EMBEDDING MATRIX

In [16]:
# Build one 300×1 column per vocabulary word, in vocabulary order.
# Words with a pre-trained row are parsed from that row of `lines` (tokens 2..301 hold the
# 300 vector components); the others get an xavier-initialised vector and are also
# recorded in `no_embeddings` together with that vector.
embedmatrix = []
no_embeddings = []
for word in vocab
    row = get(embeddingdict, word, nothing)
    if row === nothing
        wordvector = xavier(Float32, 300)
        push!(no_embeddings, (word, wordvector))
    else
        wordvector = parse.(Float32, split(lines[row])[2:301])
    end
    push!(embedmatrix, hcat(wordvector))
end
# Every word is either pre-trained or listed in `no_embeddings`, so the number of
# pre-trained hits is the difference.
count = length(vocab) - length(no_embeddings)

In [17]:
# Concatenate the per-word 300×1 columns into a single matrix. Converting to a concretely
# typed vector first lets `reduce(hcat, …)` take its single-allocation path, which avoids
# splatting one argument per vocabulary word into a single `hcat` call.
embedmatrix = reduce(hcat, Vector{Matrix{Float32}}(embedmatrix));

In [18]:
# Display the dimensions of the embedding matrix (one column per vocabulary word).
size(embedmatrix)

(300, 19507)

In [19]:
# Report how many vocabulary words had a pre-trained vector; the rest were drawn with xavier.
println(count, " out of ", length(vocab), " words are in Stanford Glove Embeddings. The rest is initialized randomly.")

18593 out of 19507 words are in Stanford Glove Embeddings. The rest is initialized randomly.


SET UP MODEL

In [20]:
#Hyperparameters of the Model
BATCHSIZE=5               # Number of instances in a minibatch
EMBEDSIZE=300             # Word embedding size
NUMHIDDEN=100             # Hidden layer size
MAXLEN=150                # NOTE(review): not referenced by the code visible in this notebook; padding uses 56 — confirm whether this is still needed
VOCABSIZE=length(vocab)   # Number of vocabulary entries ("UNK" included); the vocabulary cell applies no frequency cut-off
NUMCLASS=5                # number of output classes
DROPOUT=0.5               # Dropout rate
LR=0.002                  # Learning rate
BETA_1=0.9                # Adam optimization parameter
BETA_2=0.999              # Adam optimization parameter
EPS=1e-08                 # Adam optimization parameter
MAXLENGTH = 56            # Padding length; NOTE(review): the padding cells and `predict` read the separate global `maxlength` (also 56)

56

In [21]:
# Group the padded index sequences with their class labels into minibatches of BATCHSIZE.
# Only the training iterator is shuffled.
dtrn = minibatch(train_sentences,train_labels,BATCHSIZE;shuffle=true)
dtst = minibatch(test_sentences,test_labels ,BATCHSIZE)
# Number of minibatches in each iterator.
length(dtrn), length(dtst)

(1706, 442)

In [23]:
#model struct
# Holds everything the forward pass needs: the embedding table, the recurrent layer, the
# output projection, the dropout probability and three one-row attention weight matrices.
# The struct is mutable because the notebook later overwrites fields with loaded weights.
mutable struct LSTMN
    embeds       # embedding parameter; the constructor builds it from `embedmatrix`
    lstm         # recurrent layer; the constructor creates it with RNN(embed, hidden)
    output       # output projection parameter, applied to the final hidden state
    pdrop        # dropout probability handed to `dropout`
    Wh           # attention weights applied to the stored hidden states
    Wx           # attention weights applied to the current input embedding
    Whh          # attention weights applied to the previous summarised hidden state
    memory_tape  # starts empty; NOTE(review): the forward pass keeps its own local tape, so this field looks unused
    hidden_tape  # starts empty; NOTE(review): same as above, looks unused
end

In [24]:
#model constructor
# Build an LSTMN.
#
# Arguments:
#   input  - accepted for interface symmetry but not used; the embedding table (and thus
#            the vocabulary size) is taken from the notebook-level `embedmatrix`.
#   embed  - embedding size; sets the recurrent layer input size and the width of Wx.
#   hidden - hidden size; sets the recurrent layer size and the widths of Wh and Whh.
#   output - number of output classes.
#   pdrop  - dropout probability stored on the model (default 0).
#
# The attention weight shapes follow `embed` and `hidden` instead of the literals 300 and
# 100, so they stay consistent with the recurrent layer for any sizes passed in.
# NOTE: the forward pass defined in this notebook still hard-codes hidden size 100.
function LSTMN(input::Int, embed::Int, hidden::Int, output::Int; pdrop=0)
    embeds = param(KnetArray(embedmatrix))
    lstm = RNN(embed, hidden)
    output = param(output, hidden)
    Wh = param(KnetArray(xavier(Float32, 1, hidden)))
    Wx = param(KnetArray(xavier(Float32, 1, embed)))
    Whh = param(KnetArray(xavier(Float32, 1, hidden)))
    # Both tapes start empty.
    memory_tape = []
    hidden_tape = []
    return LSTMN(embeds, lstm, output, pdrop, Wh, Wx, Whh, memory_tape, hidden_tape)
end

LSTMN

In [25]:
# Forward pass. Embeds a batch of index sequences and runs the recurrent layer one step at
# a time. Before every step after the first, the layer's hidden and cell state are replaced
# by a softmax-weighted sum over all previously stored hidden / cell states. Returns the
# output-layer scores computed from the last step's hidden state.
#
# Assumes `input` is a collection of 5 equal-length index vectors of length 56, matching
# the literals 5, 56 and 100 used below — TODO confirm against callers.
function (lstmn::LSTMN)(input)
    # hcat puts one sequence per column; permutedims swaps that to one sequence per row, so
    # the third axis of `embed` is the position in the sequence.
    embed = lstmn.embeds[:, permutedims(hcat(input...))]
    embed = dropout(embed,lstmn.pdrop)
    
    # Local tapes of every cell / hidden state produced so far (the struct fields of the
    # same name are not touched here).
    memory_tape = []
    hidden_tape = []
 
    xt = embed[:,:,1]
    
    # Reset the recurrent state to zeros; the sizes 100 and 5 are hard-coded.
    lstmn.lstm.h = KnetArray(zeros(Float32,100,5))
    lstmn.lstm.c = KnetArray(zeros(Float32,100,5))
    # NOTE(review): `ht` and `ct` assigned here are overwritten by the first step below
    # before being read; only `hprev` (the zero initial state) is used later.
    ht = lstmn.lstm.h
    ct = lstmn.lstm.c[:,:,1]
    hprev = ht
    
    # First position: a plain recurrent step from the zero state.
    xt = embed[:,:,1]
    ht = lstmn.lstm(xt)
    ct = lstmn.lstm.c[:,:,1]
    
    push!(hidden_tape, ht)
    push!(memory_tape, ct)
    
    # Remaining positions; the sequence length 56 is hard-coded.
    for t in 2:56
        xt = embed[:,:,t]
        # NOTE(review): these two assignments are overwritten after the recurrent step
        # below without being read in between.
        ht = lstmn.lstm.h 
        ct = lstmn.lstm.c 
        
        # Stack all stored states side by side: the columns of step 1 come first, then the
        # columns of step 2, and so on.
        h = hcat(hidden_tape[1:t-1]...)
        c = hcat(memory_tape[1:t-1]...)
    
        # One score per stored hidden column.
        dot1 = lstmn.Wh * h
        # NOTE(review): the scores are laid out step-major (5 entries per step), but this
        # reshape reads them as (t-1) rows × 5 columns, while the reshapes of new_h / new_c
        # below use 5 × (t-1). The two layouts only agree for particular t — confirm that
        # the softmax really normalises over time steps per batch element as intended.
        dot1 = reshape(dot1, (t-1), 5)
        
        # Score contribution of the current input.
        dot2 = lstmn.Wx * xt        

        # Score contribution of the previous summarised hidden state.
        dot3 = lstmn.Whh * hprev
        
        at = tanh.(dot1 .+ dot2 .+ dot3) 
        
        # Normalise along the first dimension, then flatten to one weight per stored column.
        soft = softmax(at; dims=1)
        soft = reshape(soft, 5*(t-1))
        
        # Scale every stored hidden column (a row of h') by its weight, then sum over the
        # step axis to obtain one summarised hidden state per batch element.
        new_h = soft .* (h')
        new_h = reshape(new_h, 5, (t-1), 100)
        new_h = sum(new_h; dims = 2)
        new_h = reshape(new_h, 5,100)
        new_h = new_h'
                
        hprev = new_h
        
        # Same weighted sum for the stored cell states.
        new_c = soft .* (c')
        new_c = reshape(new_c, 5, (t-1), 100)
        new_c = sum(new_c; dims = 2)
        new_c = reshape(new_c, 5,100)
        new_c = new_c'
    
        # Install the summarised states as the recurrent state before taking the step.
        lstmn.lstm.h = reshape(new_h, 100, 5, 1)
        lstmn.lstm.c = reshape(new_c, 100, 5, 1)
        
        ht = lstmn.lstm(xt)
        ct = lstmn.lstm.c[:,:,1]
        
        push!(hidden_tape, ht)
        push!(memory_tape, ct)
    end

    # Scores come from the hidden state of the last position only.
    hidden = dropout(ht,lstmn.pdrop)
    return lstmn.output * hidden    
end

# model(input, output): negative log-likelihood of the scores for `input` against `output`.
function (l::LSTMN)(input, output)
    scores = l(input)
    return nll(scores, output)
end

# model(data): mean of the per-minibatch losses over the whole data iterator.
function (l::LSTMN)(d::Knet.Data)
    return Knet.mean(l(x, y) for (x, y) in d)
end

In [26]:
# Instantiate the model with the hyperparameters defined above.
model = LSTMN(VOCABSIZE,EMBEDSIZE,NUMHIDDEN,NUMCLASS,pdrop=DROPOUT);

In [28]:
# Smoke test: compute the loss of the current model on the first training minibatch.
x1, y1 = first(dtrn)
model(x1, y1)

1.6387999f0

In [61]:
# Persist the six weight-carrying fields of the model under their field names
# (the dropout probability and the tape fields are not saved).
Knet.save("lstmn46.jld2", "embeds", model.embeds, "lstm", model.lstm, "output", model.output, "Wh", model.Wh, "Wx", model.Wx, "Whh", model.Whh)

In [33]:
# Train `lstmn` with Adam for `max_iters + 1` minibatch updates, cycling through `dtrn`
# as often as needed, and show a progress bar. No losses are recorded along the way;
# `dtst` is accepted for signature parity with `mytrain!` but is not used.
function fasttrain!(lstmn::LSTMN, dtrn, dtst, max_iters=500)
    batches = take(cycle(dtrn), max_iters + 1)
    updates = adam(lstmn, batches; lr=LR, beta1=BETA_1, beta2=BETA_2, eps=EPS)
    return progress!(updates)
end

fasttrain! (generic function with 2 methods)

In [30]:
# Train `lstmn` with Adam for `max_iters + 1` minibatch updates while recording the loss
# over the full training and test iterators before the first update and then after every
# `valid` updates.
#
# Arguments:
#   lstmn     - the model to train (updated in place).
#   dtrn      - training minibatch iterator; cycled as often as needed.
#   dtst      - test minibatch iterator, used only for the recorded losses.
#   valid     - number of updates between two recordings (default 10).
#   max_iters - number of updates after the initial one (default 500).
#
# Returns `(0:valid:max_iters, trnloss, tstloss)`.
function mytrain!(lstmn::LSTMN, dtrn, dtst,valid=10, max_iters=500)
    trnloss = []
    tstloss = []

    # Append the current full-dataset losses to both histories.
    function pusher(lstmn::LSTMN,dtrn,dtst,trnloss,tstloss)
        push!(trnloss, lstmn(dtrn))
        push!(tstloss, lstmn(dtst))
    end

    # Keep items 1, n+1, 2n+1, ... Written as `(i - 1) % n == 0` so that n == 1 keeps
    # every item; `i % n == 1` never matches when n == 1.
    takeevery(n, itr) = (x for (i, x) in enumerate(itr) if (i - 1) % n == 0)

    # The optimiser is a lazy iterator: each item consumed performs one update.
    # Swap `adam` for another Knet optimiser (e.g. `sgd`) here if desired.
    updates = adam(lstmn, take(cycle(dtrn), max_iters + 1); lr=LR, beta1=BETA_1, beta2=BETA_2, eps=EPS)
    snapshots = (pusher(lstmn, dtrn, dtst, trnloss, tstloss) for _ in takeevery(valid, updates))
    progress!(snapshots)
    return 0:valid:max_iters, trnloss, tstloss
end

mytrain! (generic function with 3 methods)

In [31]:
# Train `lstmn` via `mytrain!`, print the final training and test accuracy, record the
# trained model in the notebook-level `models` collection and plot both loss curves.
#
# Arguments mirror `mytrain!`. The global `models` must exist before this is called.
#
# The entry pushed to `models` is `(lstmn, test_accuracy)`: the model that was actually
# trained here together with a computed number, rather than the global `model` and the
# `accuracy` function object itself.
function tgraph(lstmn::LSTMN, dtrn, dtst, valid=10, max_iters=500)
    iters, trnloss, tstloss = mytrain!(lstmn,dtrn,dtst,valid,max_iters)

    # Each accuracy is a full pass over its dataset, so compute it once and reuse it.
    trnacc = accuracy(lstmn, dtrn)
    tstacc = accuracy(lstmn, dtst)
    println("Training Accuracy: ", trnacc)
    println("Test Accuracy: ", tstacc)

    push!(models, (lstmn, tstacc))

    return plot(iters, [trnloss, tstloss], labels=[:trn, :tst], xlabel="iterations", ylabel="loss")
end

tgraph (generic function with 3 methods)

In [66]:
# Load a previously saved set of weights; the result is a Dict keyed by field name.
bestd = Knet.load("fine-grained.jld2")

Dict{String,Any} with 6 entries:
  "lstm"   => LSTM(input=300,hidden=100)
  "Wh"     => P(KnetArray{Float32,2}(1,100))
  "output" => P(KnetArray{Float32,2}(5,100))
  "embeds" => P(KnetArray{Float32,2}(300,19507))
  "Whh"    => P(KnetArray{Float32,2}(1,100))
  "Wx"     => P(KnetArray{Float32,2}(1,300))

In [67]:
# Overwrite the model's weight-carrying fields with the loaded values; each field is
# stored in `bestd` under its own name.
for field in (:lstm, :Wh, :output, :embeds, :Whh, :Wx)
    setproperty!(model, field, bestd[String(field)])
end
# Show the last restored field as the cell's value.
model.Wx

P(KnetArray{Float32,2}(1,300))

In [40]:
# Classify one raw sentence with the notebook-level `model` and return its sentiment name.
#
# The text is lowercased, split on whitespace and mapped to vocabulary indices; words
# missing from `w2i` map to the "UNK" index. The sequence is then brought to exactly
# `maxlength` tokens: shorter input is left-padded with the "UNK" index, longer input is
# cut to its first `maxlength` tokens (padding with a `!=` length test would never
# terminate on longer input).
#
# Returns one of "very negative", "negative", "neutral", "positive", "very positive".
function predict(input)
    unk = w2i["UNK"]
    # `get` does the membership test and the lookup in one hash probe.
    tokens = Int[get(w2i, word, unk) for word in split(lowercase(input))]
    if length(tokens) > maxlength
        tokens = tokens[1:maxlength]
    end
    prepend!(tokens, fill(unk, maxlength - length(tokens)))

    # The forward pass is written for batches of five sequences, so four filler sequences
    # (all index 1) ride along; only the first column of the scores is read.
    toybatch = [tokens]
    for _ in 1:4
        push!(toybatch, ones(Int, maxlength))
    end

    scores = Array(model(toybatch))
    # argmax over dim 1 gives one CartesianIndex per column; take the row of column 1.
    r = argmax(scores, dims=1)[1][1]
    return ["very negative","negative","neutral","positive","very positive"][r]
end

predict (generic function with 1 method)

Try your own sentences!

In [79]:
# Read one sentence from standard input and classify it.
userinput = readline(stdin)
predict(userinput)

stdin> it is so bad it is an insult to any fan of the series


"very negative"