<a href="https://colab.research.google.com/github/christianbentz/Workshop_DGfS2022/blob/main/Code/EntropyEstimation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Entropy Estimation with Relative Frequencies
Author: Chris Bentz

Date:

# Install Libraries
Some packages are already pre-installed on Jupyter, but others need to be installed first. Run this cell to make sure that all packages/libraries needed by this notebook are installed.

In [None]:
# Install the packages this notebook depends on, one after the other.
required.packages <- c("stringr", "entropy", "quanteda")
for (pkg in required.packages) {
  install.packages(pkg)
}

# Load Libraries

If the libraries are not installed yet, you need to install them first using, for example, the command: install.packages("stringr").

In [None]:
library(stringr)
library(entropy)
library(quanteda)

# List Files
Create a list (character vector) with the paths of all files that are to be processed.

In [None]:
# Collect the full paths of all files below the input directory, descending
# into subdirectories. TRUE is spelled out because the shorthand `T` is an
# ordinary variable that can be reassigned.
file.list <- list.files(path = "/content/processed",
                        recursive = TRUE, full.names = TRUE)
# show the first few paths and the total number of files found
head(file.list)
length(file.list)

# Character Entropy Estimation
Estimate character entropy from the relative frequencies of character unigrams, bigrams, and trigrams in each text.

In [None]:
# Estimate character-level entropy for every file in `file.list`.
#
# Each file is read as a sequence of symbols separated by single spaces (and
# line breaks). For the symbol unigrams, bigrams and trigrams the entropy is
# estimated with method = "ML" in unit = "log2". The result is one row per
# file in `entropy.df`.

# Entropy (method "ML", unit "log2") computed from the frequency counts of
# the values in `symbols`.
ml.entropy <- function(symbols) {
  entropy(as.vector(table(symbols)), method = "ML", unit = "log2")
}

# number of files processed so far; also used as the index of the result slot
counter <- 0

# empty data frame that defines the result columns; it is kept as the final
# result when `file.list` is empty
entropy.df <- data.frame(filename = character(0), subcorpus = character(0),
                         id = character(0), h.unigrams = numeric(0),
                         h.bigrams = numeric(0), h.trigrams = numeric(0))

# one preallocated slot per file: the rows are bound together once after the
# loop instead of copying the growing data frame in every iteration
results <- vector("list", length(file.list))

# start time
start_time <- Sys.time()
for (file in file.list) {
  # load the text file as a character vector of space-separated symbols
  # ("skip" is the number of lines to skip before reading; nmax = -1L sets
  # no limit on the number of values read, so the whole file is read)
  chars <- scan(file, what = character(), quote = "", comment.char = "",
                encoding = "UTF-8", sep = " ", skip = 0, nmax = -1L)
  # get filename
  filename <- basename(file)
  # get subcorpus category: everything before the first underscore
  subcorpus <- sub("_.*", "", filename)
  # get identifier from the file name: characters 2 to 4 of the span between
  # the first and the last underscore (NA if there is no such span)
  id <- substr(str_extract(filename, "_.*_"), 2, 4)

  # estimate entropy for character unigrams
  h.unigrams <- ml.entropy(chars)

  # estimate entropy for character bigrams
  chars.bi <- char_ngrams(chars, n = 2, concatenator = "")
  h.bigrams <- ml.entropy(chars.bi)

  # estimate entropy for character trigrams
  chars.tri <- char_ngrams(chars, n = 3, concatenator = "")
  h.trigrams <- ml.entropy(chars.tri)

  # store the result row for this file
  counter <- counter + 1
  results[[counter]] <- data.frame(filename = filename, subcorpus = subcorpus,
                                   id = id, h.unigrams = h.unigrams,
                                   h.bigrams = h.bigrams,
                                   h.trigrams = h.trigrams)
  # print(counter)
}

# combine all per-file rows in a single step
if (length(results) > 0) {
  entropy.df <- do.call(rbind, results)
}

# get end time and show the elapsed time
end_time <- Sys.time()
end_time - start_time

# show final data frame with results
print(entropy.df)

# Write Table to File
Write the table as comma-separated values (CSV) to a file (useful for checking the table in a regular program like Excel, and for loading it into another session later).

In [9]:
# Write the results table to a CSV file, without a row-name column.
output.file <- "/content/results/entropyEstimations.csv"
# make sure the target directory exists before writing; nothing happens
# (and no warning is shown) if it is already there
dir.create(dirname(output.file), recursive = TRUE, showWarnings = FALSE)
write.csv(entropy.df, output.file, row.names = FALSE)