In [107]:
# Read data and store in arrays
## In order for the data to be read correctly, this notebook and aclImdb folder which contains 
## training and test data should be in the same directory.
function read_data(path)
    contents = String[]
    for (root, dirs, files) in walkdir(path)
        push!.(Ref(contents), read.(joinpath.(root, files), String))
    end
    contents
end

## First the positive training examples and then, the negative ones
trn_pos = read_data("aclImdb/train/pos")
trn_neg = read_data("aclImdb/train/neg")

## Remove punctuations, html <br > and special characters
## Convert to lowercase
function preprocess(text; remove_punc=true)
    text = lowercase(text)
    if remove_punc
        punc_and_spec = ['"','!','\'','^','#','+','%','&','/','(',')','[',']','{','}','=','*','?','@',';',',',':','.','\\','|']
        text = replace(text, punc_and_spec => "")
    end
    text = replace(text, "<br >" => " ")
    text = replace(text, ['-','_'] => " ")
    text
end

## Preprocess training data
trn_pos = preprocess.(trn_pos; remove_punc=false)
trn_neg = preprocess.(trn_neg)

## Tokenize the texts into individual words
function tokenize(text)
    tokens = split(text)
end

## Create dictionaries in the form of (word => frequency)
function calculate_freq(iter)
    word_dict = Dict{String, Integer}()
    for i in iter
        if haskey(word_dict, i)
            freq = get(word_dict, i, 1)
            word_dict[i] = freq + 1
        else
            word_dict[i] = 1
        end
    end
    word_dict
end

## Parse training data into words and calculate  word frequencies
trn_pos_tokens = tokenize.(trn_pos)
trn_pos_tokens = collect(Iterators.flatten(trn_pos_tokens))
trn_pos_words = calculate_freq(trn_pos_tokens)
trn_neg_tokens = tokenize.(trn_neg)
trn_neg_tokens = collect(Iterators.flatten(trn_neg_tokens))
trn_neg_words = calculate_freq(trn_neg_tokens)

# Read test data & preprocess
test_pos = read_data("aclImdb/test/pos")
test_neg = read_data("aclImdb/test/neg")
test_pos = preprocess.(test_pos; remove_punc=false)
test_neg = preprocess.(test_neg)

# Eliminate words with low frequency
## Threshold values have been optimized by trial and error strategy
threshold_pos = sum(values(trn_pos_words))/length(values(trn_pos_words)) * 3
trn_pos_words = filter(p->(last(p)>=threshold_pos), trn_pos_words)

threshold_neg = sum(values(trn_neg_words))/length(values(trn_neg_words)) * 2.2
trn_neg_words = filter(p->(last(p)>=threshold_neg), trn_neg_words)

# Total count of all features in the training set
d = length(unique(collect(Iterators.flatten(vcat(keys(trn_pos_words), keys(trn_neg_words))))))

# Total count of all features in class 'positive'
total_cnts_features_p = length(unique(keys(trn_pos_words)))

# Total count of all features in class 'negative'
total_cnts_features_n = length(unique(keys(trn_neg_words)))

# Calculate P(word|class)
function calculate_word_prob(word, dictionary, total_cnts_features, d, alpha)
    if haskey(dictionary, word)
        prob = (dictionary[word] + alpha)/(total_cnts_features + alpha*d)
    else
        prob = alpha/(total_cnts_features + alpha*d)
    end
    prob
end

# Predict class of a sentence
function naive_bayes_classifier(sentence; alpha = 1)
    prob_pos = Float64(1)
    prob_neg = Float64(1)
    tokens = tokenize(sentence)
    for token in tokens
        prob_pos *= calculate_word_prob(token, trn_pos_words, total_cnts_features_p, d, alpha)
        prob_neg *= calculate_word_prob(token, trn_neg_words, total_cnts_features_n, d, alpha)
    end
    if prob_pos >= prob_neg
        pred = 1
    else
        pred = 0
    end
    pred
end


############## MAIN ###############################

## Run Naive Bayes Classifier
pred_pos_test = naive_bayes_classifier.(test_pos)
pred_neg_test = naive_bayes_classifier.(test_neg)

## Calculate Accuracy
total = length(pred_pos_test) + length(pred_neg_test)
true_pos = length(filter(e-> e==1,pred_pos_test))
true_neg = length(filter(e-> e==0,pred_neg_test))
accuracy = (true_pos + true_neg)/total * 100
println("Accuracy is: $accuracy%")

Accuracy is: 92.704%
