# Large Movie Review Dataset
# By Emre Can Acikgoz

# Imports

In [1]:
include("utils.jl");

# Configurations

In [2]:
PATH_TRAIN_POS = "/Users/emrecanacikgoz/Desktop/Comp442/data/aclImdb/train/pos";
PATH_TRAIN_NEG = "/Users/emrecanacikgoz/Desktop/Comp442/data/aclImdb/train/neg";
PATH_TEST_POS  = "/Users/emrecanacikgoz/Desktop/Comp442/data/aclImdb/test/pos";
PATH_TEST_NEG  = "/Users/emrecanacikgoz/Desktop/Comp442/data/aclImdb/test/neg";

# Data Pre-Processing

In [3]:
wdict = Dict()
w2i(x) = get!(wdict, x, 1+length(wdict))

w2i (generic function with 1 method)

In [4]:
function DataLoader(path::String, class::String)
    
    if lowercase(class) == "pos"
        tag = 1
    elseif lowercase(class) == "neg"
        tag = 2
    else
        error("class must be either 'pos' or 'neg'")
    end
    
    data = []
    for file in readdir(path)
        full_path = joinpath(path, file)
        f = open(full_path, "r")
        review = read(f, String)
        review = lowercase(review)
        review = replace(review, r"<br>" => " ", r"[^a-zA-Z\s-]" => " ", r"--" => " ")
        #review = split(review, " ")
        wordids = w2i.(split(review))
        push!(data, (wordids, tag))
        close(f)
    end
    return data
end

DataLoader (generic function with 1 method)

In [5]:
trnPos  = DataLoader(PATH_TRAIN_POS, "pos"); trnNeg  = DataLoader(PATH_TRAIN_NEG, "neg"); 
testPos = DataLoader(PATH_TEST_POS,  "pos"); testNeg = DataLoader(PATH_TEST_POS,  "neg"); 

In [6]:
trn = vcat(trnPos, trnNeg);
test = vcat(testPos, testNeg);
println("Train Data: $(length(trn)), Test Data: $(length(test))")
println("Train Example:\n$(first(trn))")
println("Test Example:\n$(test[15000])")

Train Data: 25000, Test Data: 25000
Train Example:
([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 13, 21, 22, 23, 24, 10, 25, 26, 27, 28, 29, 30, 31, 1, 2, 32, 33, 3, 34, 35, 29, 36, 37, 3, 21, 10, 38, 29, 39, 40, 10, 41, 42, 43, 44, 45, 46, 47, 48, 49, 21, 50, 10, 51, 52, 10, 53, 54, 55, 56, 28, 52, 10, 57, 58, 59, 60, 48, 42, 61, 58, 62, 10, 63, 24, 64, 4, 65, 66, 67, 29, 68, 69, 10, 18, 58, 70, 71, 9, 2, 4, 72, 73, 74, 58, 75, 76, 29, 77, 78, 52, 79, 21, 65, 80, 29, 1, 2, 58, 81, 31, 82, 83, 52, 22, 84, 85, 31, 1, 2, 3, 86, 87, 88, 4, 89, 31, 7, 90, 91], 1)
Test Example:
([286, 3, 4, 2085, 354, 1480, 889, 424, 552, 24, 12, 102, 28, 7, 11407, 4, 1224, 52, 987, 7059, 62202, 60, 1936, 24, 31, 12, 7468, 5405, 19914, 60, 10, 15, 1705, 337, 367, 4, 1084, 434, 24, 286, 354, 10, 1936, 60, 3506, 110, 3988, 60, 988, 1193, 10, 354, 7, 3, 240, 409, 3131, 11491, 929, 5367, 987, 354, 103, 7, 410, 4, 385, 2357, 354, 2991, 102, 10, 429, 22, 1301, 410, 1084, 913, 52, 286, 3

In [7]:
function freqWords(data)
    """
    Calculate word frequencies in a dictionary.
    """
    words = Dict()
    for review in data
        for word in review[1]
            words[word] = get(words, word, 0) + 1
        end
    end
    return words
end

freqWords (generic function with 1 method)

In [8]:
words = freqWords(trn);

# Naive Bayes Model

In [9]:
function classPriors(data)
    class1 = 0
    class2 = 0
    for review in data
        if review[2] == 1
            class1 += 1
        else
            class2 += 1
        end
    end
    priors=[class1, class2]
end

classPriors (generic function with 1 method)

In [10]:
classPriors(trn)

2-element Vector{Int64}:
 12500
 12500