# Large Movie Review Dataset
# By Emre Can Acikgoz

## Imports

In [1]:
using Languages, Statistics
include("utils2.jl");
include("nb.jl");

## Configurations

In [2]:
PATH_TRAIN_POS = "/Users/emrecanacikgoz/Desktop/Comp442/data/aclImdb/train/pos";
PATH_TRAIN_NEG = "/Users/emrecanacikgoz/Desktop/Comp442/data/aclImdb/train/neg";
PATH_TEST_POS  = "/Users/emrecanacikgoz/Desktop/Comp442/data/aclImdb/test/pos";
PATH_TEST_NEG  = "/Users/emrecanacikgoz/Desktop/Comp442/data/aclImdb/test/neg";

## Data Pre-Processing

In [3]:
wdict = Dict()
w2i(x) = get!(wdict, x, 1+length(wdict))

w2i (generic function with 1 method)

In [4]:
function DataLoader(path::String, class::String)
    
    if lowercase(class) == "pos"
        tag = 1
    elseif lowercase(class) == "neg"
        tag = 2
    else
        error("class must be either 'pos' or 'neg'")
    end
    
    data = []
    for file in readdir(path)
        full_path = joinpath(path, file)
        f = open(full_path, "r")
        review = read(f, String)
        review = lowercase(review)
        #review = replace(review, stop_words => " ")
        review = replace.(review, "<br>" => " ", r"[^a-zA-Z\s-]" => " ", "--" => " ", "\u85" => " ", "-" => " ", "\t" => " ")
        #review = split(review, " ")
        #wordids = w2i.(split(review))
        words = split(review, " ")
        #words = setdiff(words, stop_words)
        words = setdiff(words, " ")
        words = w2i.(words)
        push!(data, (words, tag))
        close(f)
    end
    return data
end

DataLoader (generic function with 1 method)

In [5]:
trnPos  = DataLoader(PATH_TRAIN_POS, "pos"); trnNeg  = DataLoader(PATH_TRAIN_NEG, "neg"); 
testPos = DataLoader(PATH_TEST_POS,  "pos"); testNeg = DataLoader(PATH_TEST_NEG,  "neg"); 
trn = vcat(trnPos, trnNeg); test = vcat(testPos, testNeg);

# Naive Bayes Model

In [6]:
wordsPos = build_wordcount_dict(trnPos);
wordsNeg = build_wordcount_dict(trnNeg);

In [7]:
function dictFrequency(dictionary, scale)
    new_dict = Dict()
    for (key, values) in dictionary
        new_dict[key] = log(values / scale)
    end
    return new_dict
end

dictFrequency (generic function with 1 method)

In [8]:
WordPos_freq = dictFrequency(wordsPos, length(wordsPos));
WordNeg_freq = dictFrequency(wordsNeg, length(wordsNeg));

In [9]:
function prediction(review, WordPos_freq, WordNeg_freq)
    ProbPos, ProbNeg = [], []
    for word in review
        #println(word)
        if haskey(WordPos_freq, word)
            #println("1")
            push!(ProbPos, WordPos_freq[word])
            #println("2")
        else
            #println("3")
            push!(ProbPos, log(1/length(WordPos_freq)))
            #println("4")
        end
        #println("5") 
        if haskey(WordNeg_freq, word)
            #println("6")
            push!(ProbNeg, WordNeg_freq[word])
            #println("7")
        else
            #println("8")
            push!(ProbNeg, log(1/length(WordNeg_freq)))
            #println("9")
        end
        #println("10")
    end
    #println("11")
    if sum(ProbPos) > sum(ProbNeg)
        #println("12")
        label = 1
    else
        #println("13")
        label = 2
    end
    return label
end

prediction (generic function with 1 method)

In [10]:
function accuracy(data, WordPos_freq, WordNeg_freq)
    res = []
    for (review, label) in data
        flag = label==prediction(review, WordPos_freq, WordNeg_freq)
        push!(res, flag)
    end
    return mean(res)
end

accuracy (generic function with 1 method)

# Training and Evaluation

In [11]:
train_acc = accuracy(trn, WordPos_freq, WordNeg_freq)
println("Training set accuracy: $(train_acc)")

Training set accuracy: 0.88164


In [12]:
wordsPos_test = build_wordcount_dict(testPos);
wordsNeg_test = build_wordcount_dict(testNeg);
WordPos_freq_test = dictFrequency(wordsPos_test, length(wordsPos_test));
WordNeg_freq_test = dictFrequency(wordsNeg_test, length(wordsNeg_test));
val_acc = accuracy(test, WordPos_freq_test, WordNeg_freq_test)
println("Test set accuracy: $(val_acc)")

Test set accuracy: 0.87884
