In [31]:
library(future.apply)
library(magrittr)
library(data.table)
library(caret)

In [32]:
tidyverse <- c('ggplot2','dplyr','tidyr','stringr','forcats','tibble','readr','purrr')
lapply(tidyverse, library, character.only = T)

In [33]:
# Implemented binary search, as searching through
# the rack of words required too much time

# Locate `target` in the sorted vector `arr` by repeated halving.
#
# arr    : vector sorted in ascending order (as compared by `<`).
# target : value to look for.
#
# Returns the position of a matching element, or Inf when `target`
# is not present (including when `arr` is empty).
binary_search <- function(arr, target) {
  lo <- 1
  hi <- length(arr)

  repeat {
    # Search window exhausted: the target is not in the vector.
    if (lo > hi) {
      return(Inf)
    }

    mid <- (lo + hi) %/% 2
    probe <- arr[mid]

    if (probe == target) {
      return(mid)
    }

    # Discard the half that cannot contain the target.
    if (probe < target) {
      lo <- mid + 1
    } else {
      hi <- mid - 1
    }
  }
}

In [34]:
# Useful functions to be used afterwards

# Half the log-odds of `rand`; with the default argument this is a random
# draw that can be compared against differences of log-probabilities.
# (The non-log probabilities were used in the end.)
r <- function(rand = runif(1)) {
  0.5 * log(rand / (1 - rand))
}

# Full outer join of two tables on their shared `keys` column.
# Warnings raised by merge() are silenced.
merger <- function(df1, df2) {
  joined <- suppressWarnings(
    merge(x = df1, y = df2, by = 'keys', all = TRUE)
  )
  joined
}

In [35]:
# Regular expressions used to normalise the text.
# Numbers are collapsed into a single token: only the fact that a number
# occurs matters, not its value.
# Words of two letters or fewer are removed (together with the stop words).
# Every character that is not a Latin letter or a digit is removed, for simplicity.

# Patterns used by bag(): numeric literals (optionally signed, with decimals
# and exponent), standalone words of one or two letters, and any character
# that is not an ASCII letter or digit.
numbers_regex <- "[-+]?\\d+(?:\\.\\d+)?(?:[eE][-+]?\\d+)?"
two_letter_or_less_regex <- "\\b[A-Za-z]{1,2}\\b"
non_alphanumerical_regex <- "[^a-zA-Z0-9]"


# Turn a sentence into a bag of words.
#
# sentence : the text to tokenise. Only the first element is used when a
#            longer vector is supplied.
#
# Returns a named list: names are the distinct words (in order of first
# appearance), values are their occurrence counts as doubles. Returns an
# empty list() when no word survives the clean-up.
bag <- function(sentence) {

  # Replace every number with the keyword NUM (lower-cased to "num" below).
  sentence <- gsub(numbers_regex, " NUM ", sentence, perl = TRUE)

  sentence <- tolower(sentence)

  # Expand n't into "not" and split on remaining apostrophes.
  sentence <- gsub("n't", ' not ', sentence)
  sentence <- gsub("'", " ", sentence)

  # Drop non-alphanumerical characters and very short words, then
  # collapse the runs of spaces those substitutions leave behind.
  sentence <- gsub(non_alphanumerical_regex, " ", sentence, perl = TRUE)
  sentence <- gsub(two_letter_or_less_regex, " ", sentence, perl = TRUE)
  sentence <- gsub(" +", " ", sentence)

  # Nothing to split: avoid indexing [[1]] into an empty result.
  if (length(sentence) == 0) {
    return(list())
  }

  words <- strsplit(sentence, "\\W+")[[1]]

  # strsplit() emits a leading "" when the string starts with a separator
  # (e.g. the first word was removed above); such tokens are not words.
  words <- words[!is.na(words) & nzchar(words)]

  if (length(words) == 0) {
    return(list())
  }

  # Count occurrences in one pass while keeping first-appearance order.
  vocabulary <- unique(words)
  counts <- tabulate(match(words, vocabulary), nbins = length(vocabulary))

  bag_of_words <- as.list(as.numeric(counts))
  names(bag_of_words) <- vocabulary

  bag_of_words
}

In [42]:
# Read the StopWords from file (csv in our case)
# sw <- read_delim('data/sw1k.csv')

# Fraction of the rows that read() holds out as the test set
# (despite the name, this is the size of the TEST split).
train_perc <- 0.1

# Load the stop words and the labelled texts, shuffle the texts and split
# them into a training set and a held-out test set.
#
# sw_path   : path of the stop-word file; its `term` column is kept.
# data_path : path of the labelled data.
# c         : number of classes. With c == 6 the columns `Text` / `Labels`
#             are read and renamed to `text` / `label`; otherwise the
#             columns `text` / `label` are read as they are.
# test_frac : fraction of rows held out for testing (defaults to the
#             global `train_perc`).
#
# Returns an unnamed list: training data, test data, training labels,
# stop words (in that order).
read <- function(sw_path = 'data/sw1k.csv',
                 data_path = 'data/SixClasses/train.csv',
                 c = 2,
                 test_frac = train_perc) {

  sw <- read_delim(sw_path)
  sw <- na.omit(sw['term'])

  data <- na.omit(read_delim(data_path))

  # Shuffle once; the split below is positional. The provided test file
  # has no labels, so the test set is carved out of the training file.
  data <- data[sample(nrow(data)), ]

  n_rows <- nrow(data)
  n_test <- as.integer(test_frac * n_rows)

  # Logical mask keeps the two sets disjoint and is safe when n_test is 0.
  is_test <- seq_len(n_rows) <= n_test

  # `c` is the numeric argument here, so call the combine function explicitly.
  wanted <- if (c == 6) base::c('Text', 'Labels') else base::c('text', 'label')

  test <- data[is_test, wanted]
  data <- data[!is_test, wanted]

  colnames(test) <- base::c('text', 'label')
  colnames(data) <- base::c('text', 'label')

  labels <- data$label

  list(data, test, labels, sw)
}

# Load the six-class data set and unpack the pieces returned by read():
# training data, test data, training labels and stop words.
split <- read(c = 6)
data <- split[[1]]
test <- split[[2]]
labels <- split[[3]]
sw <- split[[4]]

[1mRows: [22m[34m1000[39m [1mColumns: [22m[34m5[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m (2): term, type
[32mdbl[39m (3): frequency, presence, doc_size_sum

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.
[1mRows: [22m[34m10240[39m [1mColumns: [22m[34m3[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m (2): Text, Text_Tag
[32mdbl[39m (1): Labels

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


function (x)  .Primitive("length")


In [43]:
# # This line is just needed to parallelize the lapply
# plan(multicore)

# # Just lapply, but parallelized.
# # bag(data$text[[i]]) is a vector of word:occurrence, which means that
# # Silo is a **list of vectors of word:occurrence**
# silo <- future_lapply(data$text,bag)

# One bag of words per training text (sequential version of the call above).
silo <- lapply(X = data$text, FUN = bag)



In [54]:
# Core function. Splits the silo by class, sums the word counts per class,
# applies smoothing and returns the relative frequency of every word within
# each class (occurrences / sum of occurrences).
#
# silo          : list of bags of words (one per training text, see bag()).
# sw            : stop words; either a character vector or a table whose
#                 first column holds the terms.
# epsilon       : smoothing amount.
# add_to_counts : TRUE  -> epsilon is added to the counts before normalising;
#                 FALSE -> epsilon is added to the relative frequencies.
# only_na       : TRUE  -> only zero entries are set to epsilon;
#                 FALSE -> epsilon is added to every entry.
# sw_rm         : remove the stop words from the table?
# class_labels  : class of every element of `silo`, coded 0, 1, ...;
#                 defaults to the global `labels`.
#
# Returns a data frame with columns words, C0, C1, ..., sorted by word.
calc_probs <- function(silo, sw, epsilon = 1, add_to_counts = TRUE,
                       only_na = FALSE, sw_rm = TRUE,
                       class_labels = labels) {

  n_classes <- length(unique(class_labels))

  # One rack per class: the summed occurrences of every word seen in the
  # texts of that class, stored in a column named after the class.
  racks <- lapply(seq_len(n_classes), function(i) {

    # Bags share indices with the labels, so a logical mask selects the
    # class; unlist() flattens them into one named vector of counts.
    class_counts <- unlist(silo[class_labels == i - 1])

    df <- data.table(keys = names(class_counts), values = class_counts)
    rack <- df %>% group_by(keys) %>% summarise(values = sum(values))

    rack <- as.data.frame(rack)
    names(rack)[2] <- paste0('C', i - 1)
    rack
  })

  # Outer-join all racks on the word. Every count column already has a
  # unique name, so no columns clash however many classes there are.
  word_freq <- Reduce(merger, racks)
  colnames(word_freq)[1] <- "words"

  # Drop the stop words. They go through bag() so that they are normalised
  # exactly like the words of the texts.
  if (sw_rm) {
    sw_terms <- if (is.list(sw)) sw[[1]] else sw
    stop_words <- names(bag(paste(sw_terms, collapse = " ")))
    word_freq <- word_freq[!(word_freq[["words"]] %in% stop_words), ]
  }

  # A word missing from a class simply has zero occurrences there.
  word_freq[is.na(word_freq)] <- 0

  value_cols <- names(word_freq)[-1]

  # Smoothing step, applied either to the counts or to the frequencies.
  smooth <- function(tbl) {
    if (!only_na) {
      tbl[value_cols] <- tbl[value_cols] + epsilon
    } else {
      tbl[(tbl == 0)] <- epsilon
    }
    tbl
  }

  if (add_to_counts) {
    word_freq <- smooth(word_freq)
  }

  # Relative frequency of each word within its class.
  word_freq[value_cols] <- lapply(
    word_freq[value_cols],
    function(col) col / sum(col)
  )

  if (!add_to_counts) {
    word_freq <- smooth(word_freq)
  }

  word_freq
}

# probss is a table with columns words, C0, C1, ..., Cn.
# Display it with the words most frequent in class 0 first.
probss <- calc_probs(silo = silo, sw = sw, epsilon = 0.5)
probss[order(probss[["C0"]], decreasing = TRUE), ]

Unnamed: 0_level_0,words,C0,C1,C2,C3,C4,C5
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
10505,voted,0.005910937,0.0035797082,0.0037986411,0.0035151184,0.0027090194,0.0034480145
9694,taxes,0.004823035,0.0026251193,0.0055281038,0.0046071940,0.0027090194,0.0040542589
1759,clinton,0.004460400,0.0035115233,0.0033662755,0.0024912975,0.0024965473,0.0012503789
6623,obamacare,0.004242820,0.0027614892,0.0023780111,0.0015357313,0.0029214916,0.0007956957
9786,texas,0.004242820,0.0036478931,0.0055898703,0.0048802130,0.0024965473,0.0046605032
805,barack,0.003807659,0.0060343652,0.0040457072,0.0026278070,0.0053649209,0.0024628675
4548,hillary,0.003735132,0.0023523797,0.0025633107,0.0020135144,0.0024965473,0.0008714762
6057,medicare,0.002864810,0.0028296741,0.0026868437,0.0014674766,0.0018591310,0.0012503789
10752,wisconsin,0.002864810,0.0048752216,0.0024397776,0.0029008259,0.0035589079,0.0018566232
8743,senate,0.002792283,0.0026933042,0.0023780111,0.0027643164,0.0014341868,0.0026902092


In [45]:
# Class priors P(c_i): the share of training labels in each class.
# Classes are assumed to be coded 0, 1, ..., so CC[k] belongs to class k - 1.
CC <- vapply(
  seq_along(unique(labels)),
  function(k) length(labels[labels == k - 1]),
  integer(1)
)

CC <- CC / sum(CC)

CC

In [46]:
# Classify a single text with the naive Bayes model.
#
# text   : the text to classify; it is tokenised with bag().
# probs  : table with a `words` column (must be sorted, it is searched with
#          binary_search()) followed by one column per class holding
#          P(word | class).
# static : TRUE  -> always return the most probable class;
#          FALSE -> return the most probable class when it clearly dominates,
#                   otherwise draw a class at random from the posterior.
# c      : number of classes.
#
# Uses the global prior vector `CC`.
#
# Returns the predicted class (0-based), or Inf when the text cannot be
# classified: no usable word, or more than 90% of its words are unknown.
test_a_text <- function(text, probs = probss, static = FALSE, c = 6) {

  bagged <- bag(text)

  if (length(bagged) == 0) {
    return(Inf)
  }

  words <- names(bagged)
  counts <- as.numeric(unlist(bagged, use.names = FALSE))

  # Look every word up once; the row does not depend on the class.
  vocabulary <- probs[["words"]]
  rows <- vapply(
    words,
    function(w) binary_search(vocabulary, w),
    numeric(1),
    USE.NAMES = FALSE
  )
  seen <- is.finite(rows)

  # Share of unknown words, computed once per text (not once per class).
  if (sum(!seen) / length(bagged) > 0.9) {
    return(Inf)
  }

  rows <- rows[seen]
  counts <- counts[seen]

  # Unnormalised log-posterior of every class. Unknown words are skipped,
  # which is the same as giving them probability 1.
  numerators <- vapply(
    seq_len(c),
    function(class) {
      sum(counts * log(probs[[class + 1]][rows])) + log(CC[class])
    },
    numeric(1)
  )

  best <- which.max(numerators)
  others <- numerators[-best]
  gap <- if (length(others) > 0) numerators[best] - max(others) else Inf

  # A log gap of 6 means the best class is about e^6 (~400) times more
  # probable than the runner-up: treat it as certain.
  if (static || gap >= 6) {
    return(best - 1)
  }

  # Posterior probabilities of C0, C1, ... Shifting by the maximum keeps
  # exp() from overflowing and does not change the normalised result.
  final_probabilities <- exp(numerators - max(numerators))
  final_probabilities <- final_probabilities / sum(final_probabilities)

  # Inverse-CDF draw; the clamp guards against rounding in the cumulative sum.
  pred <- sum(cumsum(final_probabilities) <= runif(1))

  min(pred, c - 1)
}



# test_a_text(test$text[1])
# plan(multicore)
# preds <- future_lapply(test$text , test_a_text)

# Classify every test text, then keep only the finite predictions.
# Dropping entries shifts positions: preds[i] no longer necessarily
# corresponds to row i of `test`.
preds <- sapply(test$text, test_a_text) %>%
  .[is.finite(.)] %>%
  unname()


In [47]:
# Display the predictions that were kept after filtering.
preds

In [None]:
# Build a confusion matrix from true labels and predictions.
#
# labels      : true classes, coded 0 .. c - 1.
# predictions : predicted classes, coded 0 .. c - 1; matched to `labels`
#               by position. Non-finite predictions are skipped.
# c           : number of classes.
#
# Returns a c x c matrix of counts with predictions on the rows and
# true labels on the columns.
do_cm <- function(labels = test[[2]], predictions = preds, c = 6) {

  cm <- matrix(data = 0, nrow = c, ncol = c)

  if (length(labels) != length(predictions)) {
    warning(
      "labels and predictions differ in length; pairs are matched by position",
      call. = FALSE
    )
  }

  n_pairs <- min(length(labels), length(predictions))

  for (i in seq_len(n_pairs)) {
    # Skip pairs that cannot be placed in the matrix.
    if (!is.finite(predictions[i]) || is.na(labels[i])) {
      next
    }

    pred <- as.integer(predictions[i])
    label <- as.integer(labels[i])
    cm[pred + 1, label + 1] <- cm[pred + 1, label + 1] + 1
  }

  cm
}
# Summarise the confusion matrix of the predictions above with caret.
cm <- do_cm() %>% confusionMatrix()
cm

In [None]:
# Display the training labels returned by read().
labels

In [None]:
# Core function: classify every text of the test set and tally the results.
# A class is picked either deterministically (threshold / most probable
# class) or by a random draw from the posterior.
#
# test      : the test SET (not a single text): first column holds the
#             texts, second column the true classes coded 0 .. c - 1.
# probs     : table with a `words` column (must be sorted, it is searched
#             with binary_search()) followed by one column per class
#             holding P(word | class).
# threshold : with c == 2 and static = TRUE, class 1 is predicted when its
#             posterior is at least this value.
# static    : TRUE -> deterministic decision; FALSE -> random draw.
# c         : number of classes.
#
# Uses the global prior vector `CC`. With c == 2 a histogram of the class-1
# posteriors of the non-clear-cut texts is drawn.
#
# Returns a c x c matrix of counts with the true class on the rows and the
# predicted class on the columns.
eval_NBC <- function(test, probs, threshold = 0.5, static = TRUE, c = 2) {

  res <- matrix(data = 0, nrow = c, ncol = c)
  prob_1 <- numeric(0)

  texts <- test[[1]]
  truth <- test[[2]]
  vocabulary <- probs[["words"]]

  for (k in seq_along(texts)) {

    bagged <- unlist(bag(texts[k]))

    # Find every word once; unseen words are ignored, which is the same
    # as giving them probability 1 in the product.
    words <- as.character(names(bagged))
    rows <- vapply(
      words,
      function(w) binary_search(vocabulary, w),
      numeric(1),
      USE.NAMES = FALSE
    )
    seen <- is.finite(rows)
    counts <- as.numeric(bagged)[seen]
    rows <- rows[seen]

    # Unnormalised log-posterior per class, in class order 0 .. c - 1, so
    # that position `cls` always belongs to class `cls - 1`.
    numerators <- vapply(
      seq_len(c),
      function(cls) {
        sum(counts * log(probs[[cls + 1]][rows])) + log(CC[cls])
      },
      numeric(1)
    )

    best <- which.max(numerators)
    others <- numerators[-best]
    gap <- if (length(others) > 0) numerators[best] - max(others) else Inf

    if (gap >= 6) {
      # A log gap of 6 means the best class is about e^6 (~400) times more
      # probable than the runner-up: treat it as certain.
      pred <- best - 1
    } else {

      # Posterior probabilities of C0, C1, ... Shifting by the maximum keeps
      # exp() from overflowing and does not change the normalised result.
      final_probabilities <- exp(numerators - max(numerators))
      final_probabilities <- final_probabilities / sum(final_probabilities)

      if (c == 2) {
        prob_1[length(prob_1) + 1] <- final_probabilities[2]
        cutoff <- if (static) threshold else runif(1)
        pred <- as.integer(final_probabilities[2] >= cutoff)
      } else if (static) {
        pred <- which.max(final_probabilities) - 1
      } else {
        # Inverse-CDF draw; the clamp guards against rounding in cumsum().
        pred <- sum(cumsum(final_probabilities) < runif(1))
        pred <- min(pred, c - 1)
      }
    }

    res[truth[k] + 1, pred + 1] <- res[truth[k] + 1, pred + 1] + 1
  }

  # hist() fails on an empty vector, e.g. when every text was clear-cut.
  if (c == 2 && length(prob_1) > 0) {
    hist(prob_1)
  }

  res
}


# Evaluate the test set as a two-class problem and display the tally.
# NOTE(review): `probs` is only assigned further down in this file
# (probs <- calc_probs(...)) — confirm the intended cell execution order.
res <- eval_NBC(test = test , probs = probs , c = 2)
res

In [None]:
# Plot a two-class confusion matrix as a heat map and report the accuracy.
#
# test_res : either a 2 x 2 matrix with the actual class on the rows and the
#            predicted class on the columns (as returned by eval_NBC()), or
#            a vector of four counts given row by row.
# epsilon  : smoothing value, shown in the plot title.
# sw_rm    : stop-word removal flag, shown in the plot title.
#
# Prints the plot and returns the accuracy computed by confusionMatrix().
do_plot <- function(test_res, epsilon, sw_rm) {

  stopifnot(length(test_res) == 4)

  # NOTE(review): index 1 is labelled "Positive" and index 2 "Negative";
  # confirm this matches the class coding of the data.
  class_names <- c("Positive", "Negative")

  # A matrix is already laid out as [actual, predicted]; refilling it with
  # byrow = TRUE would transpose it. Only a flat vector needs reshaping.
  if (is.matrix(test_res)) {
    confusion_matrix <- test_res
  } else {
    confusion_matrix <- matrix(test_res, nrow = 2, byrow = TRUE)
  }
  dimnames(confusion_matrix) <- list(Actual = class_names,
                                     Predicted = class_names)

  cm <- confusionMatrix(confusion_matrix)

  # One row per cell, listed row by row to line up with Actual / Predicted.
  cm_df <- data.frame(Actual = rep(class_names, each = 2),
                      Predicted = rep(class_names, times = 2),
                      Value = as.vector(t(confusion_matrix)))

  the_plot <- ggplot(cm_df, aes(x = Predicted, y = Actual, fill = Value)) +
    geom_tile(color = "white") +
    geom_text(aes(label = Value), color = "black", size = 15) +
    scale_fill_gradient(low = "#A0FFFF", high = "blue") +
    theme_minimal() +
    labs(title = paste0('epsilon ', epsilon, '\n Stop Word_rm: ', sw_rm),
         x = "Predicted",
         y = "Actual")

  print(the_plot)

  cm[['overall']]['Accuracy']
}



# # cm <- table(unlist(test[[2]]) , unlist(preds))
# # arr <- 1:36
# # cm <- confusionMatrix(matrix(data = arr , 6,6))

# cm <- do_cm()
# heatmap(cm, col = heat.colors(10),Rowv = NA , Colv = NA)

# for (i in 1:nrow(cm)) {
#   for (j in 1:ncol(cm)) {
#     text((j-0.9)/1.6, (i-0.7)/1.6, cm[i, j], col = "black", cex = 1.8)
#   }
# }


In [None]:
# Configuration for this run: stop words removed, epsilon added to the
# relative frequencies (not to the counts) of every word.
sw_rm <- TRUE
epsilon <- 1e-6

probs <- calc_probs(
  silo = silo,
  sw = sw,
  epsilon = epsilon,
  sw_rm = sw_rm,
  add_to_counts = FALSE,
  only_na = FALSE
)

In [None]:
# Evaluate the test set with the deterministic decision rule and show the tally.
test_res <- eval_NBC(test, probs = probs, static = TRUE)
print(test_res)

In [None]:
# Draw the confusion-matrix heat map and report the accuracy.
acc <- do_plot(test_res, epsilon, sw_rm)
print(acc)

In [None]:
# Scratch check: largest value once every copy of the maximum is excluded.
sth <- c(1, 2, 3, 4, 8, 99, 0, 63, 46, 45, 6, 5464, 564, 3, 4)
max(sth[sth < max(sth)])