# Large Movie Review Dataset
# By Emre Can Acikgoz
### Naive Bayes Sentiment Analysis in 8 cells

## Imports

In [1]:
using Statistics
include("utils.jl");
include("nb.jl");

## Configurations

In [2]:
# Data paths
PATH_TRAIN_POS = "/Users/emrecanacikgoz/Desktop/Comp442/data/aclImdb/train/pos";
PATH_TRAIN_NEG = "/Users/emrecanacikgoz/Desktop/Comp442/data/aclImdb/train/neg";
PATH_TEST_POS  = "/Users/emrecanacikgoz/Desktop/Comp442/data/aclImdb/test/pos";
PATH_TEST_NEG  = "/Users/emrecanacikgoz/Desktop/Comp442/data/aclImdb/test/neg";

## Data Pre-Processing

In [3]:
# Convert word to IDs
wdict = Dict()
w2i(x) = get!(wdict, x, 1+length(wdict));

In [4]:
# Get Dataloader by using: lowercasing,igoring some punctuations, splitting, converting id's
trnPos  = DataLoader(PATH_TRAIN_POS, "pos"); trnNeg  = DataLoader(PATH_TRAIN_NEG, "neg"); 
testPos = DataLoader(PATH_TEST_POS,  "pos"); testNeg = DataLoader(PATH_TEST_NEG,  "neg"); 
trn     = vcat(trnPos, trnNeg); test = vcat(testPos, testNeg);

# Naive Bayes Approach

In [5]:
# Count each word for related class
wordsPos_train = build_wordcount_dict(trnPos);
wordsNeg_train = build_wordcount_dict(trnNeg);
wordsPos_test  = build_wordcount_dict(testPos);
wordsNeg_test  = build_wordcount_dict(testNeg);

In [6]:
# Get Logarithmic Frequencies for related class
WordPos_freq_train = dictFrequency(wordsPos_train, length(wordsPos_train));
WordNeg_freq_train = dictFrequency(wordsNeg_train, length(wordsNeg_train));
WordPos_freq_test  = dictFrequency(wordsPos_test,  length(wordsPos_test));
WordNeg_freq_test  = dictFrequency(wordsNeg_test,  length(wordsNeg_test));

# Training and Evaluation

In [7]:
# Training
train_acc = @time accuracy(trn, WordPos_freq_train, WordNeg_freq_train);
println("Training set accuracy: $(train_acc*100)%")

  0.916919 seconds (7.98 M allocations: 289.407 MiB, 30.26% gc time, 13.73% compilation time)
Training set accuracy: 88.164%


In [8]:
# Evaluation
val_acc = @time accuracy(test, WordPos_freq_test, WordNeg_freq_test);
println("Test set accuracy: $(val_acc*100)%")

  0.568173 seconds (7.57 M allocations: 269.820 MiB, 8.13% gc time)
Test set accuracy: 87.884%
