-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit 67cbc97
Showing
50 changed files
with
9,154 additions
and
0 deletions.
There are no files selected for viewing
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
Package: seqtrie | ||
Title: Radix Tree and Trie-Based String Distances | ||
Version: 0.2.5 | ||
Date: 2023-8-31 | ||
Authors@R: c( | ||
person("Travers", "Ching", email = "traversc@gmail.com", role = c("aut", "cre", "cph")), | ||
person("Martin", "Moene", role = c("ctb", "cph"), comment = "span-lite C++ library"), | ||
person("Hanov", "Steve", role = c("ctb"), comment = "Trie levenshtein implementation in Python") | ||
) | ||
Description: A collection of Radix Tree and Trie algorithms for finding similar sequences and calculating sequence distances (Levenshtein and other distance metrics). This work was inspired by a trie implementation in Python: "Fast and Easy Levenshtein distance using a Trie." Hanov (2011) <http://stevehanov.ca/blog/index.php?id=114>. | ||
License: GPL-3 | ||
Biarch: true | ||
Encoding: UTF-8 | ||
Depends: R (>= 3.5.0) | ||
LazyData: true | ||
SystemRequirements: GNU make | ||
LinkingTo: Rcpp, RcppParallel, BH | ||
Imports: Rcpp (>= 0.12.18.3), RcppParallel (>= 5.1.3), R6 | ||
Suggests: knitr, rmarkdown, stringdist, stringfish, qs, dplyr, | ||
Biostrings, igraph, ggplot2 | ||
VignetteBuilder: knitr | ||
RoxygenNote: 7.2.3 | ||
Copyright: This package includes code from the 'span-lite' library | ||
owned by Martin Moene under Boost Software License 1.0. This | ||
package contains data derived from Adaptive Biotechnologies | ||
"ImmuneCODE" dataset under Creative Commons Attribution 4.0. | ||
URL: https://github.com/traversc/seqtrie | ||
BugReports: https://github.com/traversc/seqtrie/issues | ||
NeedsCompilation: yes | ||
Packaged: 2023-08-31 16:59:08 UTC; tching | ||
Author: Travers Ching [aut, cre, cph], | ||
Martin Moene [ctb, cph] (span-lite C++ library), | ||
Hanov Steve [ctb] (Trie levenshtein implementation in Python) | ||
Maintainer: Travers Ching <traversc@gmail.com> | ||
Repository: CRAN | ||
Date/Publication: 2023-09-01 11:00:02 UTC |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
d41d8cd98f00b204e9800998ecf8427e *ChangeLog | ||
af9b0da618b57cc82b371b028024f60c *DESCRIPTION | ||
23ce79206fc87da62e0e421116f87229 *NAMESPACE | ||
7dcd39f24e5ffebe7877d39cca3bc4c8 *R/RadixForest.R | ||
218683c44182355448cf01e5666d55bf *R/RadixTree.R | ||
29f25473dda69d18bf351d2f4c2c0627 *R/RcppExports.R | ||
895fdaf1cc727b0325c5d9ee422c99fc *R/covid_cdr3.R | ||
fb0a8ebb910d7d11b6a20f3b4725b226 *R/pairwise.R | ||
8658bac5185867dffa772b5524a66000 *R/utils.R | ||
3c17f95ee97459d4d65c411dc1b2d56c *R/zzz.R | ||
a43a67f5501ffbf794e3d98ac4f491fa *README.md | ||
9e111a4eb8a2e399a1405292928be198 *build/vignette.rds | ||
56038efc459f403884e488fe40e7af2c *data/covid_cdr3.RData | ||
c2267718f507e07b89b6ab297129ed73 *inst/doc/vignette.R | ||
a8d4e41c2a191d0a6c0c7ce5268b6272 *inst/doc/vignette.html | ||
188df8b62b26b0d54c9bcd7dc7592193 *inst/doc/vignette.rmd | ||
5ce7887238d5043d6eb8bdc6400dfc63 *inst/include/nonstd/span.hpp | ||
e76f61960e0ca3c10e22f663c70c2fc3 *inst/include/pairwise/pairwise.h | ||
ac06c0e302fc54770a3c928500e8eea3 *inst/include/seqtrie/radixmap.h | ||
54956600f16f230c418f0f773fb76792 *inst/include/seqtrie/utility.h | ||
3d83e67b0990145661d85a8b41f72f9c *inst/include/simple_array/nullable_array.h | ||
2add6ca59e991fe6e0c990dc4fa13688 *inst/include/simple_array/simple_array.h | ||
1e60ea9bb394775b50d6a39c3c7e0c79 *inst/include/simple_array/small_array.h | ||
92130f015a006904ae58ba918cffd6c2 *inst/include/simple_array/small_nullable_array.h | ||
acc552880e6407a6616726014351d87a *inst/include/simple_array_tests.cpp | ||
a9ed9fc83c8c0710796ef87872c4053b *inst/include/simple_progress/simple_progress.h | ||
8e8b8d55b5110531b239730be968a10f *inst/include/simple_progress_openmp_tests.cpp | ||
9a7b67f36f7cde3f647f961de6b581cc *inst/include/simple_progress_tests.cpp | ||
03267173608fe46a5349a795065632f7 *man/RadixForest.Rd | ||
b2f5ea739d35fe5dcb31816d61286814 *man/RadixTree.Rd | ||
4eb181dd1bc159b8de80d0933ef46b82 *man/covid_cdr3.Rd | ||
92edbe8b59ff7279a20c232b094dab60 *man/dist_matrix.Rd | ||
913f1430c3e8c55cf6ce1877aef8701f *man/dist_pairwise.Rd | ||
926fff2b2cdb1b7e0e320769a5398148 *man/dist_search.Rd | ||
5f0b686146c7df609e59e7a4a8de34bd *man/generate_cost_matrix.Rd | ||
262275b6a0060a1a2c27feea62cec3ce *src/CharCounter.cpp | ||
832f84aa523d429ea6b239fb22ea2d58 *src/Makevars | ||
a8f9b10bc62f2e4d567ec0c82643c8e8 *src/Makevars.win | ||
1f498bb2432ed880c42bd0bd93ed8c4d *src/RadixForest.cpp | ||
36b876b18197dc2da0e8be1b2f4aceb6 *src/RadixTree.cpp | ||
56c0a6e32881ac24b0b2f5f176aff280 *src/RcppExports.cpp | ||
6ee1031e1db4e397e95021813e41b6a1 *src/pairwise.cpp | ||
2450cf309bdbe3efc756e32ceab13b53 *src/seqtrie_types.h | ||
60339ae56688b88b815302486a117c73 *src/seqtrie_utils.h | ||
13a0bfe1f837a34db9955519fa784d65 *tests/test_RadixForest.R | ||
d7723c8cab8e40fd0aef811f84850ba6 *tests/test_RadixTree.R | ||
4a6483253823ae6f001dcfc61cafde02 *tests/test_pairwise.R | ||
f6e1ceb285e458d8f3f8b7ee3af1336b *vignettes/simple_tree.png | ||
188df8b62b26b0d54c9bcd7dc7592193 *vignettes/vignette.rmd |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
importFrom(Rcpp,sourceCpp) | ||
importFrom(RcppParallel, RcppParallelLibs) | ||
importFrom(R6, R6Class) | ||
useDynLib(seqtrie, .registration=TRUE) | ||
export("RadixTree") | ||
export("RadixForest") | ||
export("dist_matrix") | ||
export("dist_pairwise") | ||
export("dist_search") | ||
export("generate_cost_matrix") |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,164 @@ | ||
#' @title RadixForest
#' @description Radix Forest class implementation
#'
#' @details
#' The RadixForest class is a specialization of the RadixTree implementation. Instead of putting sequences into
#' a single tree, the RadixForest class puts sequences into separate trees based on sequence length. This allows for faster
#' searching of similar sequences based on Hamming or Levenshtein distance metrics.
#' Unlike the RadixTree class, the RadixForest class does not support anchored searches or a custom cost matrix.
#' See *RadixTree* for additional details.
#' @examples
#' forest <- RadixForest$new()
#' forest$insert(c("ACGT", "AAAA"))
#' forest$erase("AAAA")
#' forest$search("ACG", max_distance = 1, mode = "levenshtein")
#' #   query target distance
#' # 1   ACG   ACGT        1
#'
#' forest$search("ACG", max_distance = 1, mode = "hamming")
#' # query    target   distance
#' # <0 rows> (or 0-length row.names)
RadixForest <- R6::R6Class("RadixForest", list(
  #' @field forest_pointer Map of sequence length to RadixTree
  forest_pointer = NULL,
  #' @field char_counter_pointer Character count data for the purpose of validating input
  char_counter_pointer = NULL,
  #' @description Create a new RadixForest object
  #' @param sequences A character vector of sequences to insert into the forest
  initialize = function(sequences = NULL) {
    self$forest_pointer <- RadixForest_create()
    if (!is.null(sequences)) {
      self$insert(sequences)
    }
  },
  #' @description Print the forest to screen
  show = function() {
    cat(RadixForest_print(self$forest_pointer))
  },
  #' @description Print the forest to a string
  to_string = function() {
    RadixForest_print(self$forest_pointer)
  },
  #' @description Plot of the forest using igraph
  #' @param depth The tree depth to plot for each tree in the forest.
  #' @param root_label The label of the root node(s) in the plot.
  #' @param plot Whether to create a plot or return the data used to generate the plot.
  #' @return A data frame of parent-child relationships used to generate the igraph plot OR a ggplot2 object.
  #' If the underlying graph routine returns nothing, an empty data frame with columns "parent" and "child" is
  #' returned regardless of `plot`.
  graph = function(depth = -1, root_label = "root", plot = TRUE) {
    result <- RadixForest_graph(self$forest_pointer, depth)
    if (is.null(result)) {
      # Nothing to draw: hand back an empty edge list, even when plot = TRUE.
      return(data.frame(parent = character(0), child = character(0), stringsAsFactors = FALSE))
    }
    if (!plot) {
      return(result)
    }

    # igraph and ggplot2 are optional dependencies; fail early if missing.
    if (!requireNamespace("igraph", quietly = TRUE)) {
      stopf("igraph package is required to plot the tree.") # nocov
    }
    if (!requireNamespace("ggplot2", quietly = TRUE)) {
      stopf("ggplot2 package is required to plot the tree.") # nocov
    }

    # An empty parent string marks an edge from a root; give it a printable label.
    result$parent <- ifelse(result$parent == "", root_label, result$parent)
    gr <- igraph::graph_from_data_frame(result)

    # layout_with_fr() is the current name of the Fruchterman-Reingold layout;
    # the dotted layout.fruchterman.reingold() alias is deprecated in igraph.
    # The unnamed coordinate matrix becomes columns V1 (x) and V2 (y).
    fr <- as.data.frame(igraph::layout_with_fr(gr))
    fr$node <- names(igraph::V(gr))
    is_root <- fr$node == root_label
    fr$fill <- ifelse(is_root, "white", "skyblue")
    fr$size <- ifelse(is_root, 16, 12)

    # Look up the layout coordinates of both endpoints of every edge.
    parent_idx <- match(result$parent, fr$node)
    child_idx <- match(result$child, fr$node)
    result$parent_x <- fr$V1[parent_idx]
    result$parent_y <- fr$V2[parent_idx]
    result$child_x <- fr$V1[child_idx]
    result$child_y <- fr$V2[child_idx]

    ggplot2::ggplot() +
      ggplot2::geom_segment(data = result, ggplot2::aes(x = parent_x, xend = child_x, y = parent_y, yend = child_y)) +
      ggplot2::geom_point(data = fr, ggplot2::aes(x = V1, y = V2, fill = fill, size = size), shape = 21, color = "black") +
      ggplot2::geom_text(data = fr, ggplot2::aes(x = V1, y = V2, label = node)) +
      ggplot2::scale_fill_identity() +
      ggplot2::scale_size_identity() +
      ggplot2::theme_bw() +
      ggplot2::theme(axis.title = ggplot2::element_blank())
  },
  #' @description Output all sequences held by the forest as a character vector
  #' @return A character vector of all sequences contained in the forest.
  to_vector = function() {
    RadixForest_to_vector(self$forest_pointer)
  },
  #' @description Output the size of the forest (i.e. how many sequences are contained)
  #' @return The size of the forest
  size = function() {
    RadixForest_size(self$forest_pointer)
  },
  #' @description Insert new sequences into the forest
  #' @param sequences A character vector of sequences to insert into the forest
  #' @return A logical vector indicating whether the sequence was inserted (TRUE) or already existing in the forest (FALSE)
  insert = function(sequences) {
    result <- RadixForest_insert(self$forest_pointer, sequences)
    invisible(result)
  },
  #' @description Erase sequences from the forest
  #' @param sequences A character vector of sequences to erase from the forest
  #' @return A logical vector indicating whether the sequence was erased (TRUE) or not found in the forest (FALSE)
  erase = function(sequences) {
    result <- RadixForest_erase(self$forest_pointer, sequences)
    invisible(result)
  },
  #' @description Find sequences in the forest
  #' @param query A character vector of sequences to find in the forest
  #' @return A logical vector indicating whether the sequence was found (TRUE) or not found in the forest (FALSE)
  find = function(query) {
    RadixForest_find(self$forest_pointer, query)
  },
  #' @description Search for sequences in the forest that start with a specified prefix.
  #' E.g.: a query of "CAR" will find "CART", "CARBON", "CARROT", etc. but not "CATS".
  #' @param query A character vector of sequences to search for in the forest
  #' @return A data frame of all matches with columns "query" and "target".
  prefix_search = function(query) {
    result <- RadixForest_prefix_search(self$forest_pointer, query)
    if (is.null(result)) {
      # No matches: return an empty data frame with the documented columns.
      return(data.frame(query = character(0), target = character(0), stringsAsFactors = FALSE))
    }
    result
  },
  #' @description Search for sequences in the forest that are within a specified distance of a specified query.
  #' @param query `r rdoc("query")`
  #' @param max_distance `r rdoc("max_distance")`
  #' @param max_fraction `r rdoc("max_fraction")`
  #' @param mode `r rdoc("mode")`
  #' @param nthreads `r rdoc("nthreads")`
  #' @param show_progress `r rdoc("show_progress")`
  #' @return The output is a data.frame of all matches with columns "query", "target" and "distance".
  search = function(query, max_distance = NULL, max_fraction = NULL, mode = "levenshtein", nthreads = 1, show_progress = FALSE) {
    check_alignment_params(mode, cost_matrix = NULL, gap_cost = NULL, gap_open_cost = NULL, charset = "", diag_must_be_zero = TRUE)
    mode <- normalize_mode_parameter(mode)
    if (!mode %in% c("hamming", "global")) {
      stop("mode must be one of hamming (hm) or global (gb, lv, levenshtein)")
    }

    # Resolve one distance threshold per query; max_distance wins over max_fraction.
    if (!is.null(max_distance)) {
      if (length(max_distance) == 1) {
        max_distance <- rep(max_distance, length(query))
      } else if (length(max_distance) != length(query)) {
        # Reject mismatched lengths here rather than passing them to native code.
        stop("max_distance must have length 1 or the same length as query")
      }
    } else if (!is.null(max_fraction)) {
      # Threshold proportional to each query's length, truncated to an integer.
      max_distance <- as.integer(nchar(query) * max_fraction)
    } else {
      stop("Either max_distance or max_fraction must be non-null")
    }
    # An NA threshold would otherwise fail below with an uninformative
    # "missing value where TRUE/FALSE needed" error.
    if (anyNA(max_distance)) {
      stop("max_distance/max_fraction must not contain missing values")
    }
    if (any(max_distance < 0)) {
      stop("max_distance/max_fraction must be non-negative")
    }
    RadixForest_search(self$forest_pointer, query, max_distance, mode, nthreads, show_progress)
  },
  #' @description Validate the forest
  #' @return A logical indicating whether the forest is valid (TRUE) or not (FALSE). This is mostly an internal function for debugging purposes and should always return TRUE.
  validate = function() {
    RadixForest_validate(self$forest_pointer)
  }
),
cloneable = FALSE)
|
Oops, something went wrong.