Skip to content

Commit

Permalink
refactor: styler & linter
Browse files Browse the repository at this point in the history
  • Loading branch information
Curro Campuzano committed Jul 17, 2023
1 parent d0aeda3 commit edfc834
Show file tree
Hide file tree
Showing 37 changed files with 906 additions and 909 deletions.
2 changes: 2 additions & 0 deletions .lintr
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
linters: linters_with_defaults() # see vignette("lintr")
encoding: "UTF-8"
6 changes: 3 additions & 3 deletions R/HMMER.R
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ post_query <- function(query) {
# Parses results from HMMER into a tibble.
#
# @param results list with results (@seealso HMMERutils::post_query())
parse_results_into_tbl <- function(results) {
parse_results_into_tbl <- function(results) { # nolint
tibble::tibble(
"algorithm" = purrr::pluck(results, "algo", .default = NA),
"uuid" = purrr::pluck(results, "uuid", .default = NA),
Expand All @@ -67,7 +67,7 @@ parse_results_into_tbl <- function(results) {
) %>%
tidyr::unnest_wider("stats", names_sep = ".") %>%
tidyr::unnest_wider("hits", names_sep = ".") %>%
dplyr::mutate("hits.evalue" = fix_evalue_column(.[["hits.evalue"]]))
dplyr::mutate("hits.evalue" = fix_evalue_column(.[["hits.evalue"]])) # nolint
}

# Fix E-value column which sometimes can be a list, a character vector
Expand All @@ -91,7 +91,7 @@ fix_evalue_column <- function(column) {
# - seqdb a string with seqdb (for phmmer, hmmsearch or jackhmmer)
# - timeout_in_seconds an integer with the number of
# seconds to wait before exiting.
search_in_hmmer <- function(...) {
search_in_hmmer <- function(...) { # nolint
r <- params_into_query_list(...) %>%
post_query()
if (r$status != 200) {
Expand Down
10 changes: 5 additions & 5 deletions R/add_physicochemical_properties_to_HMMER_tbl.R
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
#' other proteins as receptors, to normalize it is divided by the number of
#' residues. A protein has a high binding potential if the index value is
#' higher than 2.48.
#' - hydrophobicity: GRAVY hydrophobicity index of an amino acids sequence
#' - hydrophobicity: GRAVY hydrophobicity index of an amino acids sequence
#' using the Kyte-Doolittle hydrophobicity scale.
#' - instaIndex: Guruprasad's instability index.
#' This index predicts the stability of a protein based
Expand All @@ -54,9 +54,9 @@
#' colname = "hits.fullfasta"
#' )
#' @export
#'
add_physicochemical_properties_to_HMMER_tbl <- function(
data, colname = "hits.fullfasta") {
#' @importFrom magrittr `%>%`

add_physicochemical_properties_to_HMMER_tbl <- function(data, colname = "hits.fullfasta") { # nolint
if (!requireNamespace("Peptides", quietly = TRUE)) {
stop("Package \"Peptides\" must be installed to use this function.",
call. = FALSE
Expand Down Expand Up @@ -88,7 +88,7 @@ add_physicochemical_properties_to_HMMER_tbl <- function(
})
}

calculate_peptides <- function(y) {
calculate_peptides <- function(y) { # nolint
Peptides::aaComp(y) %>%
purrr::map_dfr(~ {
as.data.frame(.x) %>%
Expand Down
9 changes: 4 additions & 5 deletions R/add_sequences_to_hmmer_tbl.R
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,7 @@
#'
#' @export
#'
add_sequences_to_hmmer_tbl <- function(data, extension = "fullfasta",
max_times = 3) {
add_sequences_to_hmmer_tbl <- function(data, extension = "fullfasta", max_times = 3) { # nolint
stopifnot(any("uuid" %in% colnames(data)))
stopifnot(any("hits.name" %in% colnames(data)))
inner_function <- purrr::insistently(
Expand All @@ -32,14 +31,14 @@ add_sequences_to_hmmer_tbl <- function(data, extension = "fullfasta",
dplyr::group_by(!!group_var) %>%
dplyr::group_split() %>%
purrr::map_dfr(inner_function) %>%
delete_na_rows
delete_na_rows()
}

# Drop the rows of a table in which every value is NA.
#
# @param data A data frame (or tibble).
# @return `data` without its all-NA rows; columns are left untouched.
#
# NOTE(review): the previous filter compared the number of NA values in a
# row with nrow(data). That count can never exceed ncol(data), so the test
# was TRUE for every row whenever the table had at least as many rows as
# columns, and nothing was removed. The function name suggests the intent
# is to discard rows made up entirely of NA, which is what is done here.
delete_na_rows <- function(data) {
    # With no columns there is nothing to inspect, so no row can be "all NA".
    if (ncol(data) == 0L) {
        return(data)
    }
    # Keep a row when at least one of its values is not NA.
    keep <- rowSums(is.na(data)) < ncol(data)
    # drop = FALSE keeps a single-column data.frame from collapsing
    # into a bare vector.
    data[keep, , drop = FALSE]
}

add_AAStringSet_to_tbl <- function(fasta, data, extension) {
add_AAStringSet_to_tbl <- function(fasta, data, extension) { # nolint
col_name <- paste0("hits.", extension)
x <- tibble::tibble("hits.name" = names(fasta))
x[c(col_name)] <- as.character(fasta)
Expand Down
5 changes: 2 additions & 3 deletions R/add_taxa_to_hmmer_tbl.R
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@

#' Add taxonomic information to a Data Frame obtained
#' from HMMER with a "hits.taxid" column.
#'
Expand All @@ -20,8 +19,8 @@
#' )
#' @export
#'
add_taxa_to_hmmer_tbl <- function(data, mode = "remote", rank_vc = NULL) {
inner_function <- function(x) {
add_taxa_to_hmmer_tbl <- function(data, mode = "remote", rank_vc = NULL) { # nolint
inner_function <- function(x) { # nolint
annotate_with_NCBI_taxid(
taxid = unique(x$hits.taxid),
mode = mode, rank_vc = rank_vc
Expand Down
2 changes: 1 addition & 1 deletion R/annotate_with_ncbi.R
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
#' @examples
#' annotate_with_NCBI_taxid(7955, mode = "remote")
#' @export
annotate_with_NCBI_taxid <- function(taxid, rank_vc = NULL, mode = "remote") {
annotate_with_NCBI_taxid <- function(taxid, rank_vc = NULL, mode = "remote") { # nolint
if (!requireNamespace("taxizedb", quietly = TRUE) && mode == "local") {
stop(
"Package \"taxizedb\" must be installed to use this function with a
Expand Down
26 changes: 13 additions & 13 deletions R/data.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,10 @@
#' \item{\code{algorithm }}{HMMER algorithm}
#' \item{\code{uuid}}{unique hmmer identifier}
#' \item{\code{stats.page}}{}
#' \item{\code{stats.nhits}}{The number of hits found above reporting
#' \item{\code{stats.nhits}}{The number of hits found above reporting
#' thresholds}
#' \item{\code{stats.elapsed}}{}
#' \item{\code{stats.Z}}{The number of sequences or models in the target
#' \item{\code{stats.Z}}{The number of sequences or models in the target
#' database}
#' \item{\code{stats.Z_setby}}{}
#' \item{\code{stats.n_past_msv}}{}
Expand All @@ -36,7 +36,7 @@
#' \item{\code{hits.ndom}}{Total number of domains identified in this sequence}
#' \item{\code{hits.extlink}}{}
#' \item{\code{hits.fullfasta}}{Protein sequences as a character vector.}
#' \item{\code{hits.taxid}}{The NCBI taxonomy identifier of the target (if
#' \item{\code{hits.taxid}}{The NCBI taxonomy identifier of the target (if
#' applicable)}
#' \item{\code{hits.acc}}{Accession of the target}
#' \item{\code{hits.taxlink}}{}
Expand All @@ -45,26 +45,26 @@
#' \item{\code{hits.flags}}{}
#' \item{\code{hits.nregions}}{Number of regions evaluated}
#' \item{\code{hits.niseqs}}{}
#' \item{\code{hits.name}}{Name of the target (sequence for phmmer/hmmsearch,
#' \item{\code{hits.name}}{Name of the target (sequence for phmmer/hmmsearch,
#' HMM for hmmscan)}
#' \item{\code{hits.species}}{The species name of the target (if applicable)}
#' \item{\code{hits.score}}{Bit score of the sequence (all domains, without
#' \item{\code{hits.score}}{Bit score of the sequence (all domains, without
#' correction)}
#' \item{\code{hits.bias}}{}
#' \item{\code{hits.sindex}}{}
#' \item{\code{hits.nincluded}}{Number of domains satisfying inclusion
#' \item{\code{hits.nincluded}}{Number of domains satisfying inclusion
#' thresholding}
#' \item{\code{hits.domains}}{The domain or hit hash contains the details of the
#' match, in particular the alignment between the query and the target.}
#' \item{\code{hits.pdbs}}{Array of pdb identifiers (which chains information)}
#' \item{\code{hits.evalue}}{E-value of the score}
#' \item{\code{hits.nreported}}{Number of domains satisfying reporting
#' \item{\code{hits.nreported}}{Number of domains satisfying reporting
#' thresholding}
#' \item{\code{hits.archindex}}{}
#' \item{\code{hits.acc2}}{Secondary accession of the target}
#' }
#'
#' For further details, see
#' For further details, see
#' \url{https://hmmer-web-docs.readthedocs.io/en/latest/appendices.html}
#'
"phmmer_2abl"
Expand All @@ -78,10 +78,10 @@
#' \item{\code{algorithm }}{HMMER algorithm}
#' \item{\code{uuid}}{unique hmmer identifier}
#' \item{\code{stats.page}}{}
#' \item{\code{stats.nhits}}{The number of hits found above reporting
#' \item{\code{stats.nhits}}{The number of hits found above reporting
#' thresholds}
#' \item{\code{stats.elapsed}}{}
#' \item{\code{stats.Z}}{The number of sequences or models in the target
#' \item{\code{stats.Z}}{The number of sequences or models in the target
#' database}
#' \item{\code{stats.Z_setby}}{}
#' \item{\code{stats.n_past_msv}}{}
Expand Down Expand Up @@ -109,9 +109,9 @@
#' \item{\code{hits.nreported}}{}
#' \item{\code{hits.hindex}}{}
#' \item{\code{hits.ndom}}{Total number of domains identified in this sequence}
#' \item{\code{hits.name}}{Name of the target (sequence for phmmer/hmmsearch,
#' \item{\code{hits.name}}{Name of the target (sequence for phmmer/hmmsearch,
#' HMM for hmmscan)}
#' \item{\code{hits.score}}{Bit score of the sequence (all domains, without
#' \item{\code{hits.score}}{Bit score of the sequence (all domains, without
#' correction)}
#' \item{\code{hits.bias}}{}
#' \item{\code{hits.domains}}{The domain or hit hash contains the details of the
Expand All @@ -120,7 +120,7 @@
#' \item{\code{hits.nincluded}}{E-value of the score}
#' }
#'
#' For further details, see
#' For further details, see
#' \url{https://hmmer-web-docs.readthedocs.io/en/latest/appendices.html}
#'
"hmmscan_2abl"
60 changes: 31 additions & 29 deletions R/extract_from_hmmer.R
Original file line number Diff line number Diff line change
Expand Up @@ -13,62 +13,62 @@
#' column = "hits.domains"
#' )
#' @export
extract_from_hmmer <- function(data, column = "hits.domains") { # nolint
    # Work on a plain data.frame copy of the input.
    data2 <- data.frame(data)
    n_rows <- nrow(data2)

    # Content of `column` for every row (one cell per row) and the number
    # of elements (domains/sequences) each cell holds.
    cells <- lapply(seq_len(n_rows), function(row) data2[row, column][[1]])
    n_elements <- lengths(cells)

    # Element `index` of a cell, or NA when the cell is too short or the
    # element is NULL. Empty cells therefore yield NA instead of a
    # "subscript out of bounds" error.
    element_or_na <- function(cell, index) {
        if (length(cell) < index || is.null(cell[[index]])) {
            return(NA)
        }
        cell[[index]]
    }

    # A row whose cell holds k > 1 elements is repeated k - 1 extra times.
    # The copies are placed after the original rows, in row order.
    extra_rows <- rep(seq_len(n_rows), times = pmax(n_elements - 1L, 0L))

    # Preallocate the new column: the first element of each cell stays on
    # the original row, remaining elements go on the appended copies.
    new_column <- vector("list", n_rows + length(extra_rows))
    for (row in seq_len(n_rows)) {
        new_column[[row]] <- element_or_na(cells[[row]], 1L)
    }
    next_slot <- n_rows
    for (row in which(n_elements > 1L)) {
        for (el in seq(2L, n_elements[[row]])) {
            next_slot <- next_slot + 1L
            new_column[[next_slot]] <- element_or_na(cells[[row]], el)
        }
    }

    # Build all rows in one indexing step rather than growing the data
    # frame one row at a time inside the loop.
    expanded <- data2[c(seq_len(n_rows), extra_rows), , drop = FALSE]
    if (length(extra_rows) > 0L) {
        if (.row_names_info(data2) < 0L) {
            # Automatic row names: keep them as a plain 1..n sequence.
            rownames(expanded) <- NULL
        } else {
            # Custom row names: keep the originals, number the copies.
            rownames(expanded) <- make.unique(c(
                rownames(data2),
                as.character(n_rows + seq_along(extra_rows))
            ))
        }
    }

    # Replace `column` with the new list column and unnest it into
    # multiple columns.
    bind_and_unnest(expanded, column, new_column)
}


bind_and_unnest <- function(data, old.column, new.column) {
bind_and_unnest <- function(data, old_column, new.column) { # nolint
data2 <- data.frame(data)
data2 <- cbind(data2, I(new.column))

data2 <- data2 %>%
dplyr::select(-c({
old.column
old_column
})) %>%
dplyr::rename({{ old.column }} := new.column)
dplyr::rename({{ old_column }} := new.column)

if (old.column != "hits.pdbs") {
if (old_column != "hits.pdbs") {
data2 <- data2 %>%
tidyr::unnest_wider({{ old.column }}, names_sep = ".")
tidyr::unnest_wider({{ old_column }}, names_sep = ".")
}

# Remove empty columns
Expand All @@ -78,16 +78,18 @@ bind_and_unnest <- function(data, old.column, new.column) {
# Remove 'hits.' prefix from colnames
colnames(data2) <- colnames(data2) %>%
stringr::str_replace_all(
old.column,
stringr::str_remove(old.column, "hits.")
old_column,
stringr::str_remove(old_column, "hits.")
)

# Coerce some columns to numeric
if (old.column == "hits.domains"){
to_coerce <- c("domains.ievalue", "domains.bias", "domains.cevalue",
"domains.oasc")
if (old_column == "hits.domains") {
to_coerce <- c(
"domains.ievalue", "domains.bias", "domains.cevalue",
"domains.oasc"
)

data2[to_coerce] <- lapply(data2[to_coerce], as.numeric)
data2[to_coerce] <- lapply(data2[to_coerce], as.numeric)
}

# Return new dataframe
Expand Down
12 changes: 6 additions & 6 deletions R/filter_hmmer.R
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
#' by = "hits.evalue"
#' )
#' @export
filter_hmmer <- function(data, threshold = 0.0005, by = "hits.evalue") {
filter_hmmer <- function(data, threshold = 0.0005, by = "hits.evalue") { # nolint
data2 <- data.frame(data)

# Extract type
Expand All @@ -39,18 +39,18 @@ extract_evalue_from_domains <- function(data, by = "ievalue") {
new_evalue <- c()

# Save number of rows
n.rows <- nrow(data2)
n_rows <- nrow(data2)

# Iterate over all rows
for (row in seq_len(n.rows)) {
for (row in seq_len(n_rows)) {
# Calculate number of domains in actual row
n.elements <- length(data2[row, "hits.domains"][[1]])
n_elements <- length(data2[row, "hits.domains"][[1]])

lowest <- as.double(data2[row, "hits.domains"][[1]][[1]][by])

# Iterate over each domain and keep lowest
if (n.elements > 1) {
for (el in seq_len(n.elements)) {
if (n_elements > 1) {
for (el in seq_len(n_elements)) {
element <- as.double(data2[row, "hits.domains"][[1]][[el]][by])

if (element < lowest) { # Keep lowest
Expand Down
13 changes: 7 additions & 6 deletions R/hmmer_evalues_cleveland_dot_plot.R
Original file line number Diff line number Diff line change
Expand Up @@ -15,18 +15,19 @@
#' threshold = 0.001
#' )
#'
hmmer_evalues_cleveland_dot_plot <- function(data,
threshold = 0.001) {
hmmer_evalues_cleveland_dot_plot <- function(data, threshold = 0.001) { # nolint
df <- data %>%
extract_from_hmmer()
df$domains.ievalue <- as.numeric(df$domains.ievalue)
df <- df %>%
dplyr::group_by(.data$uuid, .data$hits.name, .data$hits.acc) %>%
dplyr::mutate("best.ievalue" =
min(as.numeric(.data$domains.ievalue))) %>%
dplyr::group_by(.data$uuid, .data$hits.name, .data$hits.acc) %>% # nolint
dplyr::mutate(
"best.ievalue" =
min(as.numeric(.data$domains.ievalue))
) %>%
dplyr::ungroup()
df %>%
dplyr::arrange(-log(.data$best.ievalue)) %>%
dplyr::arrange(-log(.data$best.ievalue)) %>% # nolint
ggplot2::ggplot() +
ggplot2::geom_segment(
ggplot2::aes(
Expand Down
Loading

0 comments on commit edfc834

Please sign in to comment.