# Literature/KEGG Data Reformatting:  Notebook 1
## Restructuring literature data
#### Mr. George L. Malone
#### 8<sup>th</sup> of December, 2020


### Contents
1.  *R*:  Function definition
2.  Cleaning chemical names from literature spreadsheet
3.  Getting DOI by compound name


### *R*:  Function definition
#### `chemname_cleaner_mk03.R`

A number of chemical names in the literature carry an appended section, such
as asterisks and bracketed components, or are combined into one entry
separated by a slash or plus.  This function cleans the chemical names given
in the spreadsheet of recorded literature data.  The input is the dataframe of
the spreadsheet once read into *R*, together with the index of the column
holding the names (the first column by default).

```r
#' Mk. III Chemical Name Cleaner
#'
#' Cleans the chemical-name column of a literature dataframe:
#' - Trim peripheral end brackets and asterisks
#' - Push to lower-case
#' - Unfold rows with multiple chemicals (e.g. slash, plus)
#' - Name becomes the first column
#'
#' Rows whose name is empty are kept (with an empty name) rather than raising
#' an error, and the remaining columns keep their names even when there is
#' only one of them.
#'
#' @param data Dataframe containing chemical names to clean
#' @param nameIndex Position (or column name) of the column holding the
#'   chemical names; defaults to the first column
#' @return Dataframe whose first column, `name`, holds the cleaned, lower-case
#'   chemical names (one row per chemical, with order preserved), followed by
#'   the remaining columns of `data`
chemname_cleaner_mk03 <- function(data, nameIndex = 1L) {
  # Resolve a column name to its position so that it can be excluded below
  if (is.character(nameIndex)) {
    nameIndex <- match(nameIndex, colnames(data))
  }
  stopifnot(length(nameIndex) == 1L, !is.na(nameIndex))

  # Target the ending bracketed part(s) and asterisks; the PCRE look-behind
  # stops a match from starting at the very beginning of the name
  pattern <- "(?<!^)((\\* | )((\\(|\\[).+(\\)|\\]))?\\*?|\\*)$"

  # gsub() and tolower() are vectorised, so the whole column is cleaned at once
  raw <- as.character(data[, nameIndex, drop = TRUE])
  clean <- tolower(gsub(pattern, "", raw, perl = TRUE))

  # Split by slash and plus, trimming whitespace around each chemical
  parts <- lapply(strsplit(clean, "/|\\+", perl = TRUE), trimws)
  # strsplit() yields character(0) for an empty name; keep such rows as-is
  parts[lengths(parts) == 0L] <- list("")

  # Repeat each source row once per chemical it lists, so the non-name
  # columns (e.g. regulation direction) stay attached to every chemical
  rowIndex <- rep(seq_len(nrow(data)), times = lengths(parts))
  otherCols <- setdiff(seq_len(ncol(data)), nameIndex)

  data.frame(
    "name" = as.character(unlist(parts, use.names = FALSE)),
    # drop = FALSE keeps a dataframe (and its column name) for a lone column
    data[rowIndex, otherCols, drop = FALSE],
    # Do not apply row names
    row.names = NULL,
    check.names = FALSE,
    stringsAsFactors = FALSE
  )
}
```

The intended usage of the function is as follows:

```r
# Location of the literature spreadsheet (example only)
path <- "anpc_public/rfData/R/data/literatureData.tsv"

# Load the tab-separated spreadsheet, keeping text columns as character
data <- read.delim(
  file = path,
  header = TRUE,
  sep = "\t",
  stringsAsFactors = FALSE
)

# Tidy the chemical names held in the first column
dataClean <- chemname_cleaner_mk03(data = data, nameIndex = 1L)

```

The dataframe can now be written out to avoid re-calculation if used further,
or can otherwise be used in the remainder of the script, such as for searching
for names present in the KEGG Compound data object.

### Cleaning chemical names from literature spreadsheet

The operations here are performed in *R*.  Initially, the literature
spreadsheet is read in from a TSV.  The data are then cleaned using the
function `chemname_cleaner_mk03`, as documented previously.


```r
# Set working directory
setwd("/anpc_public/rfData/R/")  # Set this as required.

# Source required functions
source("./functions/chemname_cleaner_mk03.R")

# Read in the spreadsheet from exactly one source.  Reading both would only
# fetch the remote copy to discard it again, so the choice is made explicit:
# FALSE reads the local TSV, TRUE fetches the copy named in the link file.
useRemote <- FALSE

if (useRemote) {
  # via remote source
  urlText <- readLines("./data/litInfoLink.txt")
  data <- read.delim(
    url(urlText),
    sep = "\t",
    header = TRUE,
    stringsAsFactors = FALSE
  )
} else {
  # or via local
  data <- read.delim(
    "./data/litInfo.tsv",
    sep = "\t",
    header = TRUE,
    stringsAsFactors = FALSE
  )
}

# Clean up the chemical names (held in the first column)
dataClean <- chemname_cleaner_mk03(data, nameIndex = 1L)

# Write out the cleaned table as TSV; missing values become empty cells
write.table(
  dataClean,
  file = "./data/litInfoClean.tsv",
  sep = "\t",
  eol = "\n",
  row.names = FALSE,
  na = ''
)
```

### Getting DOI by compound name

Using *R*, the following script produces a dataframe of two columns from the
cleaned literature data.  The first column is a vector of sorted, unique names
found in the cleaned literature data.  The second column is a vector of the
DOIs to which each name is attributed.  The script builds both arrangements --
DOI by compound name and compound by DOI -- but only DOI by compound name is
written out, as it was thought to be the more appropriate.  Chemicals reported
on more than one occasion occupy multiple rows -- that is, there is one row
for each combination of compound and DOI.

There may be duplicate rows.  This suggests that a restriction could be
applied whereby each article contributes at most one report per chemical.
Duplication likely occurs due to the method of recording the data -- if the
paper reports more than one comparison group, the chemical may be reported
more than once under the same DOI.

```r
# Set working directory
setwd("/anpc_public/rfData/R/")  # Set this as required.

# Source required functions (kept under ./functions/ relative to the
# working directory)
source("./functions/chemname_cleaner_mk03.R")

# Read in the data, keeping text columns as character rather than factors
# (the default before R 4.0), so names and DOIs are compared as strings
urlText <- readLines("./data/litInfoLink.txt")
data <- read.delim(
  url(urlText),
  sep = "\t",
  header = TRUE,
  stringsAsFactors = FALSE
)

# Clean up the names
dataClean <- chemname_cleaner_mk03(data, nameIndex = 1L)

# Column positions within the cleaned data
nameCol <- 1L  # the cleaner always places the name first
doiCol <- 8L   # NOTE(review): DOI assumed at column 8 of the sheet -- confirm

# Collect by DOI
# Compound per DOI:  list of compound names, one element per unique DOI
doiUniq <- sort(unique(dataClean[, doiCol]))
doiCompound <- lapply(
  doiUniq,
  function(x) dataClean[which(dataClean[, doiCol] == x), nameCol]
)
names(doiCompound) <- doiUniq

# DOI per compound:  list of DOIs, one element per unique compound name
nameUniq <- sort(unique(dataClean[, nameCol]))
compoundDoi <- lapply(
  nameUniq,
  function(x) dataClean[which(dataClean[, nameCol] == x), doiCol]
)
names(compoundDoi) <- nameUniq

#' Group the values of one column by the unique values of another
#' (function version of the collection steps -- could be of use)
#' @param data Dataframe to collect from
#' @param eachOf Column whose sorted, unique values define the groups
#' @param eachFor Column(s) whose entries are gathered within each group
#' @return List with one element per unique `eachOf` value, named by it
eachOfFor <- function(data, eachOf, eachFor) {
  keys <- data[, eachOf]
  groupKeys <- sort(unique(keys))
  # Pull the `eachFor` entries of every row whose key matches
  gather <- function(key) {
    rows <- which(keys == key)
    data[rows, eachFor]
  }
  groups <- lapply(groupKeys, gather)
  names(groups) <- groupKeys
  groups
}

# Open out the data:  one row per (group name, member) pair.
# The rows are built directly from the group sizes.  Collecting per-group
# matrices with sapply() is unsafe here, because sapply() collapses them into
# a single matrix whenever every group happens to have the same size.
unfoldGroups <- function(groups, keyName, valueName) {
  unfolded <- data.frame(
    # Repeat each group name once per member of that group
    rep(names(groups), times = lengths(groups)),
    as.character(unlist(groups, use.names = FALSE)),
    stringsAsFactors = FALSE
  )
  names(unfolded) <- c(keyName, valueName)
  unfolded
}

resultCd <- unfoldGroups(compoundDoi, "name", "doi")
resultDc <- unfoldGroups(doiCompound, "doi", "name")

# Whether compound by doi or doi by compound, the unfolded result will be the
# same, but in a different order.

# That is, the relevant rows will still be there, but the columns will be at
# different indices, and the rows will be in a different order.

# i.e. resultDc has doi at column index 1, and rows are sorted by doi;
# resultCd has name at column index 1, and rows are sorted by chemical name

# So in both cases, the dataframe is ordered by column 1, but column 1 depends
# on the method. The resulting dataframe contains the same data, ultimately.

# Write DOI by compound
write.table(
  resultCd,
  file = "./data/doiByCompound.tsv",
  sep = "\t",
  eol = "\n",
  row.names = FALSE,
  na = ''
)

# And finish with some measurements
# Number of compound rows recorded against each DOI
numCompoundDoi <- vapply(
  unique(resultCd[, 2]),
  function(x) length(which(resultCd[, 2] == x)),
  integer(1)
)

summary(numCompoundDoi)
# One paper reports 1212 compounds, but possible repeats (due to groups).
# This is probably <MASKED>

# Number of DOI rows recorded against each compound
numDoiCompound <- vapply(
  unique(resultCd[, 1]),
  function(x) length(which(resultCd[, 1] == x)),
  integer(1)
)

summary(numDoiCompound)
# At least one compound is reported 30 times, but most are sitting at around
# 1-3 times, so it's quite skewed.

# All of this is also prior to culling names that are not metabolites, and
# prior to checking for / reducing by names in KEGG.
```