# Download the IMDB Dataset

In [5]:
function pretty_print_review_and_label(i)
    println(labels[i] * "\t\t" * reviews[i][:80]*"...")
end
g = open("reviews.txt", "r");
reviews = map(x -> x[1:end-1], readlines(g))
close(g)

g = open("labels.txt", "r")
labels = map(x -> x[1:end-1], readlines(g))
close(g)

# Capturing Word Correlation in Input Data

In [6]:
onehots = Dict()
onehots["cat"] = [1,0,0,0]
onehots["the"] = [0,1,0,0]
onehots["dog"] = [0,0,1,0]
onehots["sat"] = [0,0,0,1]

sentence = ["the","cat","sat"]
x = onehots[sentence[1]] +
    onehots[sentence[2]] +
    onehots[sentence[3]]

println("Sent Encoding:" ,x)

Sent Encoding:[1, 1, 0, 1]


# Predicting Movie Reviews

In [78]:
f = open("reviews.txt")
raw_reviews = readlines(f)
close(f)

f = open("labels.txt")
raw_labels = readlines(f)
close(f)

tokens = collect(map(x -> Set(split(x, " ")), raw_reviews))

vocab = Set()
for sent in tokens
    for word in sent
        if length(word)>0
            push!(vocab, word)
        end
    end
end
vocab = collect(vocab)

word2index = Dict()
for (i,word) in enumerate(vocab)
    word2index[word] = i
end

input_dataset = []
for sent in tokens
    sent_indices = []
    for word in sent
        try
            push!(sent_indices, word2index[word])
        catch
            nothing
        end
    end
    push!(input_dataset, sent_indices)
end

target_dataset = []
for label in raw_labels
    if label == "positive"
        push!(target_dataset, 1)
    else
        push!(target_dataset, 0)
    end
end

In [85]:
using Random: seed!
using Statistics: mean
seed!(1);

sigmoid(x) = 1/(1 + exp(-x))

alpha, iterations = (0.01, 2)
hidden_size = 100

weights_0_1 = 0.2 .* rand(hidden_size, length(vocab)) .- 0.1
weights_1_2 = 0.2 .* rand(1, hidden_size) .- 0.1

for iter=1:iterations
    correct,total = (0,0)
    
    for i=1:length(input_dataset)-1000
        x,y = (input_dataset[i],target_dataset[i])
        layer_1 = sigmoid.(sum(weights_0_1[:,x]; dims=2)) #embed + sigmoid
        layer_2 = sigmoid.(weights_1_2 * layer_1) # linear + softmax

        layer_2_delta = layer_2[1] - y # compare pred with truth
        layer_1_delta = weights_1_2' * layer_2_delta #backprop
        weights_0_1[:,x] .-= layer_1_delta .* alpha
        weights_1_2 .-= layer_1' * layer_2_delta .* alpha
        
        if abs(layer_2_delta) < 0.5
            correct += 1
        end
        total += 1
        
        if (i%10 == 9)
            progress = string(i/length(input_dataset))
            print("Iter: $(iter) Progress: $(progress[3:4]).$(progress[5:6])% Training Accuracy: $(correct*100/total)% \r")
        end
    end
    println()
end

correct,total = (0,0)
for i=length(input_dataset)-1000+1:length(input_dataset)
    global correct,total
    x = input_dataset[i]
    y = target_dataset[i]
    
    layer_1 = sigmoid.(sum(weights_0_1[:,x]; dims=2))
    layer_2 = sigmoid.(weights_1_2 * layer_1)
    
    if abs(layer_2[1] - y) < 0.5
        correct += 1
    end
    total += 1 
end

println("Test Accuracy: $(correct*100 / total)%")

Iter: 1 Progress: 95.99% Training Accuracy: 83.29513729738738%  
Iter: 2 Progress: 95.99% Training Accuracy: 90.02875119796659% 
Test Accuracy: 84.6%


In [116]:
tokens[1]

Set{SubString{String}} with 93 elements:
  "ran"
  "high"
  "financially"
  "closer"
  "many"
  "that"
  "believe"
  "."
  "lead"
  "much"
  "right"
  "tried"
  "student"
  "teachers"
  "down"
  "of"
  "as"
  "life"
  "to"
  "burn"
  "repeatedly"
  "time"
  "is"
  "comedy"
  "school"
  â‹® 

# Comparing Word Embeddings

In [86]:
function similar(target = "beautiful")
    target_index = word2index[target]
    scores = Dict()
    for (word,index) in word2index
        raw_difference = weights_0_1[:,index] .- (weights_0_1[:,target_index])
        squared_difference = raw_difference .* raw_difference
        scores[word] = -sqrt(sum(squared_difference))
    end
    scores = sort(collect(scores), by = x -> x[2])
    return scores[end-10:end]
end

similar (generic function with 2 methods)

In [87]:
print(similar("beautiful"))

Pair{Any,Any}["lonely" => -0.7606872138205998, "very" => -0.757472400002682, "makes" => -0.7482008579252035, "atmosphere" => -0.7401130704185961, "realistic" => -0.7370231146359848, "enjoyed" => -0.7368032258956541, "outstanding" => -0.7191995643434201, "fascinating" => -0.7156162295886442, "tony" => -0.7145560051988458, "fun" => -0.7125931652622348, "beautiful" => -0.0]

In [88]:
print(similar("terrible"))

Pair{Any,Any}["boring" => -0.867778521263862, "lacks" => -0.8654050443143977, "annoying" => -0.8404860856552324, "dull" => -0.8344160597358742, "horrible" => -0.8339665570876666, "disappointment" => -0.8312817274328139, "badly" => -0.8258996837532668, "laughable" => -0.8199354332981658, "disappointing" => -0.787154935881353, "worse" => -0.7290208096213938, "terrible" => -0.0]

In [117]:
using Random: seed!, shuffle!
using Statistics: mean
seed!(1)

f = open("reviews.txt")
raw_reviews = readlines(f)
close(f)

tokens = collect(map(x -> split(x, " "), raw_reviews))

vocab = Set()
for sent in tokens
    for word in sent
        push!(vocab, word)
    end
end
vocab = collect(vocab)
pushfirst!(vocab, "")

word2index = Dict()
for (i,word) in enumerate(vocab)
    word2index[word] = i
end


concatenated = []
input_dataset = []

for sent in tokens
    sent_indices = []
    for word in sent
        try
            push!(sent_indices, word2index[word])
            push!(concatenated, word2index[word])
        catch
            nothing
        end
    end
    push!(input_dataset, sent_indices)
end
shuffle!(input_dataset);

In [119]:
alpha, iterations = (0.05, 2)
hidden_size,window,negative = (50,2,5)

weights_0_1 = (rand(hidden_size, length(vocab)) .- 0.5) .* 0.2
weights_1_2 = zeros(hidden_size, length(vocab))

layer_2_target = zeros(negative+1)
layer_2_target[1] = 1

function similar(target = "beautiful")
    target_index = word2index[target]
    scores = Dict()
    for (word,index) in word2index
        raw_difference = weights_0_1[:,index] .- (weights_0_1[:,target_index])
        squared_difference = raw_difference .* raw_difference
        scores[word] = -sqrt(sum(squared_difference))
    end
    scores = sort(collect(scores), by = x -> x[2])
    return scores[end-10:end]
end

sigmoid(x) = 1/(1 + exp(-x))

for (rev_i,review) in enumerate(repeat(input_dataset, iterations))
    for target_i=1:length(review)
    # since it's really expensive to predict every vocabulary
    # we're only going to predict a random subset  
        target_samples = cat([review[target_i]],
            concatenated[floor.(Int, rand(negative) .* length(concatenated)) .+ 1];dims=1)
        
        left_context = review[maximum([1,target_i-window]):target_i-1]
        right_context = review[target_i+1:minimum([length(review),target_i+window])]
        
        layer_1 = mean(weights_0_1[:,cat(left_context,right_context;dims=1)];dims=2)
        layer_2 = sigmoid.(weights_1_2[:,target_samples]' * layer_1)
        
        layer_2_delta = layer_2 .- layer_2_target
        layer_1_delta = weights_1_2[:,target_samples] * layer_2_delta
        
        weights_0_1[:,cat(left_context,right_context;dims=1)] .-= layer_1_delta .* alpha
#         weights_1_2[:,target_samples] .-= layer_2_delta' .* layer_1 .* alpha
#         println(size(weights_1_2[:,target_samples]),"   ", size(layer_1),"   ", size(layer_2_delta))
        weights_1_2[:,target_samples] .-= layer_1 * layer_2_delta' .* alpha
    end
    if ((rev_i-1)%250 ==0)
        progress = string(rev_i/(length(input_dataset)*iterations))
        print("Iter: $(rev_i) Progress: $(progress[3:4]).$(progress[5:6])% $(similar("terrible")) \r")
    end
end
println()
print(similar("terrible"))

Iter: 49751 Progress: 99.50% Pair{Any,Any}["pathetic" => -5.251849464500627, "poor" => -5.244059582063777, "horrid" => -5.224333439915089, "bad" => -5.025035807493965, "fabulous" => -4.900013910945217, "dire" => -4.862255655904565, "horrendous" => -4.815266730857877, "dreadful" => -4.633237439660956, "brilliant" => -4.35446600960425, "horrible" => -3.4853321926842233, "terrible" => -0.0]  -0.0] ] 0] 0]   ]  ] 0.0] 0.0] 
Pair{Any,Any}["phenomenal" => -5.24061492401048, "poor" => -5.1990643889362795, "bad" => -5.177405982619217, "pathetic" => -5.115065204777761, "dire" => -4.822512576226827, "fabulous" => -4.820005311662894, "dreadful" => -4.662066472024753, "horrendous" => -4.551263171793982, "brilliant" => -4.157469808834901, "horrible" => -3.326160370785526, "terrible" => -0.0]

# King - Man + Woman ~= Queen

In [120]:
function analogy(positive=["terrible","good"],negative=["bad"])
    norms = sum(weights_0_1 .* weights_0_1;dims=1)
    normed_weights = weights_0_1 .* norms
    
    query_vect = zeros(length(weights_0_1[:,1]))
    for word in positive
        query_vect .+= normed_weights[:,word2index[word]]
    end
    for word in negative
        query_vect .-= normed_weights[:,word2index[word]]
    end
    
    scores = Dict()
    for (word,index) in word2index
        raw_difference = weights_0_1[:,index] .- query_vect
        squared_difference = raw_difference .* raw_difference
        scores[word] = -sqrt(sum(squared_difference))
    end
    scores = sort(collect(scores), by = x -> x[2])
    return scores[end-10:end]
    
end

analogy (generic function with 3 methods)

In [121]:
analogy(["terrible","good"],["bad"])

11-element Array{Pair{Any,Any},1}:
 "wonderful" => -410.76347469946796
     "solid" => -410.5172999889353
  "terrific" => -410.46767384489203
 "fantastic" => -410.35283244490654
     "great" => -410.30986354459367
    "decent" => -410.16464895375964
  "terrible" => -409.83897055722974
      "nice" => -409.83306961734246
    "superb" => -409.6693237851858
      "fine" => -409.5907736118097
      "good" => -409.1190204707009

In [122]:
analogy(["elizabeth","he"],["she"])

11-element Array{Pair{Any,Any},1}:
     "mrs" => -291.00933784138675
 "william" => -291.00247975414936
  "claire" => -290.99774555556536
   "smith" => -290.97329667830707
    "paul" => -290.90533279249064
 "charles" => -290.8412218001043
  "rachel" => -290.8409379432931
      "br" => -290.6110910318185
    "jeff" => -290.4993442420866
    "alan" => -290.07338355489554
      "he" => -289.3412723735815