Skip to content

Commit

Permalink
version 0.2.5
Browse files Browse the repository at this point in the history
  • Loading branch information
traversc authored and cran-robot committed Sep 1, 2023
0 parents commit 67cbc97
Show file tree
Hide file tree
Showing 50 changed files with 9,154 additions and 0 deletions.
Empty file added ChangeLog
Empty file.
36 changes: 36 additions & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
Package: seqtrie
Title: Radix Tree and Trie-Based String Distances
Version: 0.2.5
Date: 2023-8-31
Authors@R: c(
person("Travers", "Ching", email = "traversc@gmail.com", role = c("aut", "cre", "cph")),
person("Martin", "Moene", role = c("ctb", "cph"), comment = "span-lite C++ library"),
person("Steve", "Hanov", role = c("ctb"), comment = "Trie levenshtein implementation in Python")
)
Description: A collection of Radix Tree and Trie algorithms for finding similar sequences and calculating sequence distances (Levenshtein and other distance metrics). This work was inspired by a trie implementation in Python: "Fast and Easy Levenshtein distance using a Trie." Hanov (2011) <http://stevehanov.ca/blog/index.php?id=114>.
License: GPL-3
Biarch: true
Encoding: UTF-8
Depends: R (>= 3.5.0)
LazyData: true
SystemRequirements: GNU make
LinkingTo: Rcpp, RcppParallel, BH
Imports: Rcpp (>= 0.12.18.3), RcppParallel (>= 5.1.3), R6
Suggests: knitr, rmarkdown, stringdist, stringfish, qs, dplyr,
Biostrings, igraph, ggplot2
VignetteBuilder: knitr
RoxygenNote: 7.2.3
Copyright: This package includes code from the 'span-lite' library
owned by Martin Moene under Boost Software License 1.0. This
package contains data derived from Adaptive Biotechnologies
"ImmuneCODE" dataset under Creative Commons Attribution 4.0.
URL: https://github.com/traversc/seqtrie
BugReports: https://github.com/traversc/seqtrie/issues
NeedsCompilation: yes
Packaged: 2023-08-31 16:59:08 UTC; tching
Author: Travers Ching [aut, cre, cph],
Martin Moene [ctb, cph] (span-lite C++ library),
Steve Hanov [ctb] (Trie levenshtein implementation in Python)
Maintainer: Travers Ching <traversc@gmail.com>
Repository: CRAN
Date/Publication: 2023-09-01 11:00:02 UTC
49 changes: 49 additions & 0 deletions MD5
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
d41d8cd98f00b204e9800998ecf8427e *ChangeLog
af9b0da618b57cc82b371b028024f60c *DESCRIPTION
23ce79206fc87da62e0e421116f87229 *NAMESPACE
7dcd39f24e5ffebe7877d39cca3bc4c8 *R/RadixForest.R
218683c44182355448cf01e5666d55bf *R/RadixTree.R
29f25473dda69d18bf351d2f4c2c0627 *R/RcppExports.R
895fdaf1cc727b0325c5d9ee422c99fc *R/covid_cdr3.R
fb0a8ebb910d7d11b6a20f3b4725b226 *R/pairwise.R
8658bac5185867dffa772b5524a66000 *R/utils.R
3c17f95ee97459d4d65c411dc1b2d56c *R/zzz.R
a43a67f5501ffbf794e3d98ac4f491fa *README.md
9e111a4eb8a2e399a1405292928be198 *build/vignette.rds
56038efc459f403884e488fe40e7af2c *data/covid_cdr3.RData
c2267718f507e07b89b6ab297129ed73 *inst/doc/vignette.R
a8d4e41c2a191d0a6c0c7ce5268b6272 *inst/doc/vignette.html
188df8b62b26b0d54c9bcd7dc7592193 *inst/doc/vignette.rmd
5ce7887238d5043d6eb8bdc6400dfc63 *inst/include/nonstd/span.hpp
e76f61960e0ca3c10e22f663c70c2fc3 *inst/include/pairwise/pairwise.h
ac06c0e302fc54770a3c928500e8eea3 *inst/include/seqtrie/radixmap.h
54956600f16f230c418f0f773fb76792 *inst/include/seqtrie/utility.h
3d83e67b0990145661d85a8b41f72f9c *inst/include/simple_array/nullable_array.h
2add6ca59e991fe6e0c990dc4fa13688 *inst/include/simple_array/simple_array.h
1e60ea9bb394775b50d6a39c3c7e0c79 *inst/include/simple_array/small_array.h
92130f015a006904ae58ba918cffd6c2 *inst/include/simple_array/small_nullable_array.h
acc552880e6407a6616726014351d87a *inst/include/simple_array_tests.cpp
a9ed9fc83c8c0710796ef87872c4053b *inst/include/simple_progress/simple_progress.h
8e8b8d55b5110531b239730be968a10f *inst/include/simple_progress_openmp_tests.cpp
9a7b67f36f7cde3f647f961de6b581cc *inst/include/simple_progress_tests.cpp
03267173608fe46a5349a795065632f7 *man/RadixForest.Rd
b2f5ea739d35fe5dcb31816d61286814 *man/RadixTree.Rd
4eb181dd1bc159b8de80d0933ef46b82 *man/covid_cdr3.Rd
92edbe8b59ff7279a20c232b094dab60 *man/dist_matrix.Rd
913f1430c3e8c55cf6ce1877aef8701f *man/dist_pairwise.Rd
926fff2b2cdb1b7e0e320769a5398148 *man/dist_search.Rd
5f0b686146c7df609e59e7a4a8de34bd *man/generate_cost_matrix.Rd
262275b6a0060a1a2c27feea62cec3ce *src/CharCounter.cpp
832f84aa523d429ea6b239fb22ea2d58 *src/Makevars
a8f9b10bc62f2e4d567ec0c82643c8e8 *src/Makevars.win
1f498bb2432ed880c42bd0bd93ed8c4d *src/RadixForest.cpp
36b876b18197dc2da0e8be1b2f4aceb6 *src/RadixTree.cpp
56c0a6e32881ac24b0b2f5f176aff280 *src/RcppExports.cpp
6ee1031e1db4e397e95021813e41b6a1 *src/pairwise.cpp
2450cf309bdbe3efc756e32ceab13b53 *src/seqtrie_types.h
60339ae56688b88b815302486a117c73 *src/seqtrie_utils.h
13a0bfe1f837a34db9955519fa784d65 *tests/test_RadixForest.R
d7723c8cab8e40fd0aef811f84850ba6 *tests/test_RadixTree.R
4a6483253823ae6f001dcfc61cafde02 *tests/test_pairwise.R
f6e1ceb285e458d8f3f8b7ee3af1336b *vignettes/simple_tree.png
188df8b62b26b0d54c9bcd7dc7592193 *vignettes/vignette.rmd
10 changes: 10 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
importFrom(Rcpp,sourceCpp)
importFrom(RcppParallel, RcppParallelLibs)
importFrom(R6, R6Class)
useDynLib(seqtrie, .registration=TRUE)
export("RadixTree")
export("RadixForest")
export("dist_matrix")
export("dist_pairwise")
export("dist_search")
export("generate_cost_matrix")
164 changes: 164 additions & 0 deletions R/RadixForest.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
#' @title RadixForest
#' @description Radix Forest class implementation
#'
#' @details
#' The RadixForest class is a specialization of the RadixTree implementation. Instead of putting sequences into
#' a single tree, the RadixForest class puts sequences into separate trees based on sequence length. This allows for faster
#' searching of similar sequences based on Hamming or Levenshtein distance metrics.
#' Unlike the RadixTree class, the RadixForest class does not support anchored searches or a custom cost matrix.
#' See *RadixTree* for additional details.
#' @examples
#' forest <- RadixForest$new()
#' forest$insert(c("ACGT", "AAAA"))
#' forest$erase("AAAA")
#' forest$search("ACG", max_distance = 1, mode = "levenshtein")
#' # query target distance
#' # 1 ACG ACGT 1
#'
#' forest$search("ACG", max_distance = 1, mode = "hamming")
#' # query target distance
#' # <0 rows> (or 0-length row.names)
RadixForest <- R6::R6Class("RadixForest", list(
  #' @field forest_pointer Map of sequence length to RadixTree
  forest_pointer = NULL,
  #' @field char_counter_pointer Character count data for the purpose of validating input
  # NOTE(review): no method in this class definition assigns this field, so it
  # stays NULL unless it is set from outside the class.
  char_counter_pointer = NULL,
  #' @description Create a new RadixForest object
  #' @param sequences A character vector of sequences to insert into the forest
  initialize = function(sequences = NULL) {
    self$forest_pointer <- RadixForest_create()
    if (!is.null(sequences)) {
      self$insert(sequences)
    }
  },
  #' @description Print the forest to screen
  show = function() {
    cat(RadixForest_print(self$forest_pointer))
  },
  #' @description Print the forest to a string
  to_string = function() {
    RadixForest_print(self$forest_pointer)
  },
  #' @description Plot of the forest using igraph
  #' @param depth The tree depth to plot for each tree in the forest.
  #' @param root_label The label of the root node(s) in the plot.
  #' @param plot Whether to create a plot or return the data used to generate the plot.
  #' @return A data frame of parent-child relationships used to generate the igraph plot OR a ggplot2 object
  graph = function(depth = -1, root_label = "root", plot = TRUE) {
    result <- RadixForest_graph(self$forest_pointer, depth)
    if (is.null(result)) {
      # A NULL result is mapped to an empty edge list, regardless of `plot`.
      return(data.frame(parent = character(0), child = character(0), stringsAsFactors = FALSE))
    }
    if (!plot) {
      return(result)
    }
    # igraph and ggplot2 are optional dependencies, only needed for plotting.
    # NOTE(review): stopf() is not defined in this file -- presumably a
    # package-internal helper; confirm.
    if (!requireNamespace("igraph", quietly = TRUE)) {
      stopf("igraph package is required to plot the tree.") # nocov
    }
    if (!requireNamespace("ggplot2", quietly = TRUE)) {
      stopf("ggplot2 package is required to plot the tree.") # nocov
    }

    # Edges whose parent is the empty string are attached to the root label.
    result$parent <- ifelse(result$parent == "", root_label, result$parent)
    gr <- igraph::graph_from_data_frame(result)

    # layout_with_fr() is the current name of the Fruchterman-Reingold layout;
    # the dotted alias layout.fruchterman.reingold() is deprecated in igraph.
    # Coercing the coordinate matrix yields columns V1 (x) and V2 (y).
    fr <- as.data.frame(igraph::layout_with_fr(gr))
    fr$node <- names(igraph::V(gr))
    is_root <- fr$node == root_label
    fr$fill <- ifelse(is_root, "white", "skyblue")
    fr$size <- ifelse(is_root, 16, 12)

    # Look up each edge endpoint's coordinates once.
    parent_idx <- match(result$parent, fr$node)
    child_idx <- match(result$child, fr$node)
    result$parent_x <- fr$V1[parent_idx]
    result$parent_y <- fr$V2[parent_idx]
    result$child_x <- fr$V1[child_idx]
    result$child_y <- fr$V2[child_idx]

    ggplot2::ggplot() +
      ggplot2::geom_segment(data = result, ggplot2::aes(x = parent_x, xend = child_x, y = parent_y, yend = child_y)) +
      ggplot2::geom_point(data = fr, ggplot2::aes(x = V1, y = V2, fill = fill, size = size), shape = 21, color = "black") +
      ggplot2::geom_text(data = fr, ggplot2::aes(x = V1, y = V2, label = node)) +
      ggplot2::scale_fill_identity() +
      ggplot2::scale_size_identity() +
      ggplot2::theme_bw() +
      ggplot2::theme(axis.title = ggplot2::element_blank())
  },
  #' @description Output all sequences held by the forest as a character vector
  #' @return A character vector of all sequences contained in the forest.
  to_vector = function() {
    RadixForest_to_vector(self$forest_pointer)
  },
  #' @description Output the size of the forest (i.e. how many sequences are contained)
  #' @return The size of the forest
  size = function() {
    RadixForest_size(self$forest_pointer)
  },
  #' @description Insert new sequences into the forest
  #' @param sequences A character vector of sequences to insert into the forest
  #' @return A logical vector indicating whether the sequence was inserted (TRUE) or already existing in the forest (FALSE)
  insert = function(sequences) {
    result <- RadixForest_insert(self$forest_pointer, sequences)
    invisible(result)
  },
  #' @description Erase sequences from the forest
  #' @param sequences A character vector of sequences to erase from the forest
  #' @return A logical vector indicating whether the sequence was erased (TRUE) or not found in the forest (FALSE)
  erase = function(sequences) {
    result <- RadixForest_erase(self$forest_pointer, sequences)
    invisible(result)
  },
  #' @description Find sequences in the forest
  #' @param query A character vector of sequences to find in the forest
  #' @return A logical vector indicating whether the sequence was found (TRUE) or not found in the forest (FALSE)
  find = function(query) {
    RadixForest_find(self$forest_pointer, query)
  },
  #' @description Search for sequences in the forest that start with a specified prefix.
  #' E.g.: a query of "CAR" will find "CART", "CARBON", "CARROT", etc. but not "CATS".
  #' @param query A character vector of sequences to search for in the forest
  #' @return A data frame of all matches with columns "query" and "target".
  prefix_search = function(query) {
    result <- RadixForest_prefix_search(self$forest_pointer, query)
    if (is.null(result)) {
      # A NULL result is mapped to an empty data frame with the documented columns.
      data.frame(query = character(0), target = character(0), stringsAsFactors = FALSE)
    } else {
      result
    }
  },
  #' @description Search for sequences in the forest that are within a specified distance metric to a specified query.
  #' @param query `r rdoc("query")`
  #' @param max_distance `r rdoc("max_distance")`
  #' @param max_fraction `r rdoc("max_fraction")`
  #' @param mode `r rdoc("mode")`
  #' @param nthreads `r rdoc("nthreads")`
  #' @param show_progress `r rdoc("show_progress")`
  #' @return The output is a data.frame of all matches with columns "query", "target" and "distance".
  search = function(query, max_distance = NULL, max_fraction = NULL, mode = "levenshtein", nthreads = 1, show_progress = FALSE) {
    check_alignment_params(mode, cost_matrix=NULL, gap_cost=NULL, gap_open_cost=NULL, charset = "", diag_must_be_zero = TRUE)
    mode <- normalize_mode_parameter(mode)
    if (!mode %in% c("hamming", "global")) {
      stop("mode must be one of hamming (hm) or global (gb, lv, levenshtein)")
    }

    # Resolve a per-query distance threshold. max_distance takes precedence
    # over max_fraction when both are supplied.
    if (!is.null(max_distance)) {
      if (length(max_distance) == 1) {
        max_distance <- rep(max_distance, length(query))
      } else if (length(max_distance) != length(query)) {
        # Reject mismatched lengths here instead of handing them to the
        # compiled search routine.
        stop("max_distance must have length 1 or the same length as query")
      }
    } else if (!is.null(max_fraction)) {
      # The threshold scales with each query's length, truncated to an integer.
      max_distance <- as.integer(nchar(query) * max_fraction)
    } else {
      stop("Either max_distance or max_fraction must be non-null")
    }
    if (any(max_distance < 0, na.rm = TRUE)) {
      stop("max_distance/max_fraction must be non-negative")
    }
    # NA thresholds (e.g. derived from NA queries) previously surfaced as
    # "missing value where TRUE/FALSE needed"; fail with a clear message instead.
    if (anyNA(max_distance)) {
      stop("max_distance/max_fraction must not resolve to NA")
    }
    RadixForest_search(self$forest_pointer, query, max_distance, mode, nthreads, show_progress)
  },
  #' @description Validate the forest
  #' @return A logical indicating whether the forest is valid (TRUE) or not (FALSE). This is mostly an internal function for debugging purposes and should always return TRUE.
  validate = function() {
    RadixForest_validate(self$forest_pointer)
  }
),
cloneable=FALSE)

0 comments on commit 67cbc97

Please sign in to comment.