Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit 42beae1
Showing
28 changed files
with
1,704 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
Package: tlsh | ||
Type: Package | ||
Title: Transitive Locality-Sensitive Hashing (LSH) for Record Linkage | ||
Version: 0.1.0 | ||
Authors@R: person("Rebecca", "Steorts", email = "beka@stat.duke.edu", | ||
role = c("aut", "cre")) | ||
Depends: R (>= 3.5.0), blink, stats, utils, plyr, igraph, bit64 | ||
Imports: | ||
Suggests: knitr, ggplot2, rmarkdown | ||
VignetteBuilder: knitr | ||
Description: An implementation of the blocking algorithm transitive locality-sensitive hashing (TLSH) of Steorts, Ventura, Sadinle, and Fienberg (2014) <DOI:10.1007/978-3-319-11257-2_20>, which is a k-means variant of locality-sensitive hashing. The method is illustrated with examples and a vignette. | ||
Encoding: UTF-8 | ||
LazyData: true | ||
License: GPL-3 | ||
RoxygenNote: 7.1.1.9000 | ||
NeedsCompilation: no | ||
Packaged: 2020-11-02 16:22:06 UTC; rebeccasteorts | ||
Author: Rebecca Steorts [aut, cre] | ||
Maintainer: Rebecca Steorts <beka@stat.duke.edu> | ||
Repository: CRAN | ||
Date/Publication: 2020-11-06 17:00:02 UTC |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
395f2eae2242b16010635c0159710d32 *DESCRIPTION | ||
b308deeacf7be1a89891ae2523b10aed *NAMESPACE | ||
883a5648af177469263685d621b6968b *R/blocking-evaluations.R | ||
54acb8a8a2d3db0d8846427f6096e22d *R/minhash_v2.R | ||
10eb3c9ee8f70eb94b3e60164268dc45 *R/tlsh.R | ||
b5b601a33c1238f64d7521e55fa5a4ca *build/vignette.rds | ||
4c8da6ba82b58b79ae31cb933f833730 *inst/doc/tlsh.R | ||
043c61554a7b1f24e72a3b813bf9cef4 *inst/doc/tlsh.Rmd | ||
7c94d16e855c117546c0893ddba9edf5 *inst/doc/tlsh.html | ||
239e26c6ffab4f19c228f2b4965ef831 *man/block.ids.from.blocking.Rd | ||
68b865ead685190f24280d8bf17d4378 *man/block_setup_v2.Rd | ||
431dfd43b092d3359adf85ffcb837de9 *man/compare_buckets.Rd | ||
cf7770443b48e38aa67e4551b58ce8c1 *man/confusion.from.blocking.Rd | ||
eed4ab096a637c80534def65b5e11a42 *man/eval.blocksetup.Rd | ||
b634e13566eabb03db9f007c03806fe3 *man/extract_pairs_from_band.Rd | ||
4154bc7a1e4f546ac23c29bda5995a67 *man/hash_signature.Rd | ||
8d6a8176be4b368f9eb7bb39aacd90e8 *man/minhash_v2.Rd | ||
27ebdba13d154cc2051cbb266b2282ee *man/my_hash.Rd | ||
2be0caecb0d785afb805c7ee861b4c42 *man/primest.Rd | ||
b28b796a274f35ee917457e8727d3b49 *man/reduction.ratio.Rd | ||
87093270eb12fdf33bffe80d6bb5c3ba *man/reduction.ratio.from.blocking.Rd | ||
88bc1a83dda8e41d19bc2f11a4055f3b *man/rhash_funcs.Rd | ||
eeb46ff58a2f61eed834eec73ada48f8 *man/shingled_record_to_index_vec.Rd | ||
71ce81206422e086585828a257af8ab3 *man/shingles.Rd | ||
da525e154d909a4b9ec3184d02867e5f *vignettes/blocks_members.Rdata | ||
91073e85e36c66cafac6f0b6bb54e1c0 *vignettes/candidate_pairs_graph.Rdata | ||
043c61554a7b1f24e72a3b813bf9cef4 *vignettes/tlsh.Rmd |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
# Generated by roxygen2: do not edit by hand | ||
|
||
export(block.ids.from.blocking) | ||
export(block_setup_v2) | ||
export(compare_buckets) | ||
export(confusion.from.blocking) | ||
export(eval.blocksetup) | ||
export(extract_pairs_from_band) | ||
export(hash_signature) | ||
export(minhash_v2) | ||
export(my_hash) | ||
export(primest) | ||
export(reduction.ratio) | ||
export(reduction.ratio.from.blocking) | ||
export(rhash_funcs) | ||
export(shingled_record_to_index_vec) | ||
export(shingles) | ||
import(bit64) | ||
import(blink) | ||
import(igraph) | ||
import(plyr) | ||
import(utils) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,99 @@ | ||
#' Perform evaluations (recall, precision, ...) for blocking.
#'
#' Every unordered pair of records is classified by whether the two records
#' fall in the same block and whether they share the same true identifier.
#' The resulting 2 x 2 confusion matrix is summarised into pairwise metrics.
#'
#' @import blink
#' @param blocking A list of the blocks; each element holds the indices of the
#'   records assigned to that block
#' @param true_ids The true identifiers for comparisons, indexed by record
#' @param recall.only Flag that when true only returns the recall, otherwise
#'   returns many evaluation metrics in a list
#' @return If \code{recall.only} is \code{TRUE}, the recall. Otherwise a list
#'   whose first (unnamed) element is the confusion matrix, followed by the
#'   named elements \code{recall}, \code{precision}, \code{fpr}, \code{fnr},
#'   \code{accuracy} and \code{specificity}.
#' @export
#' @examples
#' r.set <- RLdata500[1:250,c(-2)]
#' tlsh.blocks <- block_setup_v2(r.set, b=22, save_signature=FALSE, k=2)
#' confusion.from.blocking(tlsh.blocks, identity.RLdata500, recall.only=TRUE)
confusion.from.blocking <- function(blocking, true_ids, recall.only = FALSE) {
  # Convert the list of blocks into a vector of block labels, one per record.
  nn <- sum(lengths(blocking))
  if (nn < 2) {
    stop("at least two blocked records are needed to form record pairs",
         call. = FALSE)
  }
  block.ids <- rep(NA_integer_, nn)
  for (ii in seq_along(blocking)) {
    block.ids[blocking[[ii]]] <- ii
  }

  # Enumerate every unordered pair of records (one pair per column) and check,
  # for each pair, whether the block labels and the true ids agree.
  candidate.pairs <- combn(length(block.ids), 2)
  first <- candidate.pairs[1, ]
  second <- candidate.pairs[2, ]
  same.block <- block.ids[first] == block.ids[second]
  same.truth <- true_ids[first] == true_ids[second]

  # Pin both levels so the table is always 2 x 2, even when one outcome never
  # occurs (e.g. no pair shares a block); otherwise the [2, ] / [, 2] lookups
  # below would fail with "subscript out of bounds".
  outcomes <- c(FALSE, TRUE)
  confusion <- table(factor(same.block, levels = outcomes),
                     factor(same.truth, levels = outcomes),
                     dnn = c("same block?", "actually same?"))

  # In the confusion matrix, rows refer to the blocks and columns to the truth.
  false.positives <- confusion[2, 1]
  false.negatives <- confusion[1, 2]
  true.positives <- confusion[2, 2]
  true.negatives <- confusion[1, 1]
  recall <- true.positives / (false.negatives + true.positives)

  if (recall.only) {
    return(recall)
  }

  total.pairs <- true.positives + true.negatives +
    false.negatives + false.positives
  list(confusion,
       recall = recall,
       precision = true.positives / (true.positives + false.positives),
       fpr = false.positives / (false.positives + true.negatives),
       fnr = false.negatives / (false.negatives + true.positives),
       accuracy = (true.positives + true.negatives) / total.pairs,
       specificity = true.negatives / (true.negatives + false.positives))
}
|
||
#' Returns the block ids associated with a blocking method.
#'
#' @import blink
#' @param blocking A list of the blocks; each element holds the indices of the
#'   records assigned to that block.
#' @return An integer vector with one entry per record, giving the position in
#'   \code{blocking} of the block that contains the record. An empty
#'   \code{blocking} yields an empty vector.
#' @export
#' @examples
#' tlsh.blocks <- block_setup_v2(r.set = RLdata500[1:250,c(-2,-4)], b=10, save_signature=FALSE, k=1)
#' block.ids.from.blocking(tlsh.blocks)
block.ids.from.blocking <- function(blocking) {
  # Total number of records across all blocks.
  nn <- sum(lengths(blocking))
  block.ids <- rep(NA_integer_, nn)
  # seq_along() (rather than 1:length()) keeps an empty blocking from
  # iterating over c(1, 0).
  for (ii in seq_along(blocking)) {
    block.ids[blocking[[ii]]] <- ii
  }
  block.ids
}
|
||
#' Returns the reduction ratio associated with a blocking method
#'
#' The reduction ratio is one minus the share of all record pairs that remain
#' candidate pairs, i.e. pairs whose two records carry the same block label.
#'
#' @import blink
#' @import utils
#' @param block.labels A vector of block labels, one per record.
#' @return The reduction ratio
#' @export
#' @examples
#' tlsh.blocks <- block_setup_v2(r.set = RLdata500[1:50,c(-2)], b=22, save_signature=FALSE, k=2)
#' block.ids <- block.ids.from.blocking(tlsh.blocks)
#' reduction.ratio(block.ids)
reduction.ratio <- function(block.labels) {
  # Number of records carrying each distinct label.
  block.sizes <- table(block.labels)
  # Pairs that still have to be compared after blocking.
  pairs.within.blocks <- sum(choose(block.sizes, 2))
  # Pairs that would be compared without any blocking.
  pairs.overall <- choose(length(block.labels), 2)
  1 - pairs.within.blocks / pairs.overall
}
|
||
#' Returns the reduction ratio associated with a blocking method
#'
#' Convenience wrapper: converts the blocks into per-record block labels with
#' \code{block.ids.from.blocking} and passes them to \code{reduction.ratio}.
#'
#' @import blink
#' @param blocking The actual blocks
#' @return The reduction ratio
#' @export
#' @examples
#' tlsh.blocks <- block_setup_v2(r.set = RLdata500[1:50,c(-2,-4)], b=10, save_signature=FALSE, k=1)
#' reduction.ratio.from.blocking(tlsh.blocks)
reduction.ratio.from.blocking <- function(blocking) {
  labels <- block.ids.from.blocking(blocking)
  reduction.ratio(labels)
}
Oops, something went wrong.