# Large Movie Review Dataset
# By Emre Can Acikgoz

# Imports

In [1]:
using Languages
include("utils.jl");

# Configurations

In [2]:
PATH_TRAIN_POS = "/Users/emrecanacikgoz/Desktop/Comp442/data/aclImdb/train/pos";
PATH_TRAIN_NEG = "/Users/emrecanacikgoz/Desktop/Comp442/data/aclImdb/train/neg";
PATH_TEST_POS  = "/Users/emrecanacikgoz/Desktop/Comp442/data/aclImdb/test/pos";
PATH_TEST_NEG  = "/Users/emrecanacikgoz/Desktop/Comp442/data/aclImdb/test/neg";

# Data Pre-Processing

In [3]:
wdict = Dict()
w2i(x) = get!(wdict, x, 1+length(wdict))

w2i (generic function with 1 method)

In [4]:
stopwords_dict = Dict()
stop_words = stopwords(Languages.English())

488-element Vector{String}:
 "a"
 "about"
 "above"
 "across"
 "after"
 "again"
 "against"
 "all"
 "almost"
 "alone"
 "along"
 "already"
 "also"
 ⋮
 "you'd"
 "you'll"
 "young"
 "younger"
 "youngest"
 "your"
 "you're"
 "yours"
 "yourself"
 "yourselves"
 "you've"
 "z"

In [5]:
function DataLoader(path::String, class::String)
    
    if lowercase(class) == "pos"
        tag = 1
    elseif lowercase(class) == "neg"
        tag = 2
    else
        error("class must be either 'pos' or 'neg'")
    end
    
    data = []
    for file in readdir(path)
        full_path = joinpath(path, file)
        f = open(full_path, "r")
        review = read(f, String)
        review = lowercase(review)
        #review = replace(review, stop_words => " ")
        review = replace.(review, "<br>" => " ", r"[^a-zA-Z\s-]" => " ", "--" => " ", "\u85" => " ", "-" => " ", "\t" => " ")
        #review = split(review, " ")
        #wordids = w2i.(split(review))
        words = split(review, " ")
        words = setdiff(words, stop_words)
        words = setdiff(words, " ")
        #words = w2i.(words)
        push!(data, (words, tag))
        close(f)
    end
    return data
end

DataLoader (generic function with 1 method)

In [6]:
trnPos  = DataLoader(PATH_TRAIN_POS, "pos"); trnNeg  = DataLoader(PATH_TRAIN_NEG, "neg"); 
testPos = DataLoader(PATH_TEST_POS,  "pos"); testNeg = DataLoader(PATH_TEST_POS,  "neg"); 

In [7]:
trn = vcat(trnPos, trnNeg);
test = vcat(testPos, testNeg);
println("Train Data: $(length(trn)), Test Data: $(length(test))")
println("Train Example:\n$(first(trn))")
println("Test Example:\n$(test[15000])");

Train Data: 25000, Test Data: 25000
Train Example:
(SubString{String}["bromwell", "cartoon", "comedy", "", "ran", "time", "programs", "school", "life", "teachers", "teaching", "profession", "lead", "believe", "satire", "closer", "reality", "scramble", "survive", "financially", "insightful", "students", "pathetic", "pomp", "pettiness", "situation", "remind", "schools", "episode", "student", "repeatedly", "tried", "burn", "immediately", "recalled", "classic", "line", "inspector", "sack", "welcome", "expect", "adults", "age", "fetched", "pity", "isn"], 1)
Test Example:
(SubString{String}["decent", "movie", "", "little", "bit", "short", "time", "packs", "lot", "action", "grit", "commonsense", "emotions", "frame", "matt", "dillon", "main", "character", "job", "intensity", "convincing", "tense", "throughout", "typical", "fancy", "expensive", "hollywood", "cgi", "satisfying", "indeed", "price", "evening", "straight", "traditional", "acting", "story", "directing", "recommend", "development", "

In [8]:
words = build_wordcount_dict(trn)

Dict{Any, Any} with 72840 entries:
  "redeemiing"     => 1
  "poulange"       => 1
  "inattentive"    => 1
  "sleepwalking"   => 20
  "photosynthesis" => 1
  "lunk"           => 1
  "henry"          => 245
  "whiz"           => 16
  "redresses"      => 1
  "gathered"       => 38
  "nikhilji"       => 1
  "cannibalistic"  => 22
  "jacqualine"     => 1
  "rainstorms"     => 1
  "underground"    => 152
  "sobre"          => 2
  "methods"        => 79
  "keg"            => 2
  "proportioned"   => 4
  "crib"           => 9
  "premature"      => 16
  "prerelease"     => 1
  "menotti"        => 1
  "burkley"        => 1
  "debie"          => 1
  ⋮                => ⋮

In [9]:
function wordFilter(dictionary, threshold)
    for (key, value) in dictionary
        if value < threshold
            dictionary = delete!(dictionary, key)
        end
        return dictionary
    end
end

wordFilter (generic function with 1 method)

In [10]:
words_filtered = wordFilter(words, 2);

In [11]:
words_filtered = delete!(words_filtered, "")

Dict{Any, Any} with 72838 entries:
  "poulange"       => 1
  "inattentive"    => 1
  "sleepwalking"   => 20
  "photosynthesis" => 1
  "lunk"           => 1
  "henry"          => 245
  "whiz"           => 16
  "redresses"      => 1
  "gathered"       => 38
  "nikhilji"       => 1
  "cannibalistic"  => 22
  "jacqualine"     => 1
  "rainstorms"     => 1
  "underground"    => 152
  "sobre"          => 2
  "methods"        => 79
  "keg"            => 2
  "proportioned"   => 4
  "crib"           => 9
  "premature"      => 16
  "prerelease"     => 1
  "menotti"        => 1
  "burkley"        => 1
  "debie"          => 1
  "hicks"          => 23
  ⋮                => ⋮

# Naive Bayes Model

In [12]:
# class probabilities
cp = classPriors(trn)

2-element Vector{Float64}:
 0.5
 0.5

In [13]:
# Find Probability of each word
probs = Dict()
function wordProbs(word_dict)
    freq_dict = Dict()
    for class in 1:2
        for (word,freq) in word_dict
            #freq_dict[class][word]= log.(word_dict[word]/sum(values(word_dict))
            neg_logs = log(word_dict[word]/sum(values(word_dict)))
            freq_dict[word] = neg_logs + log(0.5)
        end
        
    end
    return freq_dict
end

wordProbs (generic function with 1 method)

In [16]:
new_dict = wordProbs(words_filtered)

Dict{Any, Any} with 72838 entries:
  "poulange"       => -15.1942
  "inattentive"    => -15.1942
  "sleepwalking"   => -12.1984
  "photosynthesis" => -15.1942
  "lunk"           => -15.1942
  "henry"          => -9.69289
  "whiz"           => -12.4216
  "redresses"      => -15.1942
  "gathered"       => -11.5566
  "nikhilji"       => -15.1942
  "cannibalistic"  => -12.1031
  "jacqualine"     => -15.1942
  "rainstorms"     => -15.1942
  "underground"    => -10.1703
  "sobre"          => -14.501
  "methods"        => -10.8247
  "keg"            => -14.501
  "proportioned"   => -13.8079
  "crib"           => -12.9969
  "premature"      => -12.4216
  "prerelease"     => -15.1942
  "menotti"        => -15.1942
  "burkley"        => -15.1942
  "debie"          => -15.1942
  "hicks"          => -12.0587
  ⋮                => ⋮

In [17]:
argmax(new_dict)

"movie"