Skip to content

Commit

Permalink
version 0.1.0
Browse files Browse the repository at this point in the history
  • Loading branch information
resteorts authored and cran-robot committed Nov 6, 2020
0 parents commit 42beae1
Show file tree
Hide file tree
Showing 28 changed files with 1,704 additions and 0 deletions.
21 changes: 21 additions & 0 deletions DESCRIPTION
@@ -0,0 +1,21 @@
Package: tlsh
Type: Package
Title: Transitive Locality-Sensitive Hashing (LSH) for Record Linkage
Version: 0.1.0
Authors@R: person("Rebecca", "Steorts", email = "beka@stat.duke.edu",
role = c("aut", "cre"))
Depends: R (>= 3.5.0), blink, stats, utils, plyr, igraph, bit64
Imports:
Suggests: knitr, ggplot2, rmarkdown
VignetteBuilder: knitr
Description: An implementation of the blocking algorithm transitive locality-sensitive hashing (TLSH) of Steorts, Ventura, Sadinle, and Fienberg (2014) <DOI:10.1007/978-3-319-11257-2_20>, which is a k-means variant of locality-sensitive hashing. The method is illustrated with examples and a vignette.
Encoding: UTF-8
LazyData: true
License: GPL-3
RoxygenNote: 7.1.1.9000
NeedsCompilation: no
Packaged: 2020-11-02 16:22:06 UTC; rebeccasteorts
Author: Rebecca Steorts [aut, cre]
Maintainer: Rebecca Steorts <beka@stat.duke.edu>
Repository: CRAN
Date/Publication: 2020-11-06 17:00:02 UTC
27 changes: 27 additions & 0 deletions MD5
@@ -0,0 +1,27 @@
395f2eae2242b16010635c0159710d32 *DESCRIPTION
b308deeacf7be1a89891ae2523b10aed *NAMESPACE
883a5648af177469263685d621b6968b *R/blocking-evaluations.R
54acb8a8a2d3db0d8846427f6096e22d *R/minhash_v2.R
10eb3c9ee8f70eb94b3e60164268dc45 *R/tlsh.R
b5b601a33c1238f64d7521e55fa5a4ca *build/vignette.rds
4c8da6ba82b58b79ae31cb933f833730 *inst/doc/tlsh.R
043c61554a7b1f24e72a3b813bf9cef4 *inst/doc/tlsh.Rmd
7c94d16e855c117546c0893ddba9edf5 *inst/doc/tlsh.html
239e26c6ffab4f19c228f2b4965ef831 *man/block.ids.from.blocking.Rd
68b865ead685190f24280d8bf17d4378 *man/block_setup_v2.Rd
431dfd43b092d3359adf85ffcb837de9 *man/compare_buckets.Rd
cf7770443b48e38aa67e4551b58ce8c1 *man/confusion.from.blocking.Rd
eed4ab096a637c80534def65b5e11a42 *man/eval.blocksetup.Rd
b634e13566eabb03db9f007c03806fe3 *man/extract_pairs_from_band.Rd
4154bc7a1e4f546ac23c29bda5995a67 *man/hash_signature.Rd
8d6a8176be4b368f9eb7bb39aacd90e8 *man/minhash_v2.Rd
27ebdba13d154cc2051cbb266b2282ee *man/my_hash.Rd
2be0caecb0d785afb805c7ee861b4c42 *man/primest.Rd
b28b796a274f35ee917457e8727d3b49 *man/reduction.ratio.Rd
87093270eb12fdf33bffe80d6bb5c3ba *man/reduction.ratio.from.blocking.Rd
88bc1a83dda8e41d19bc2f11a4055f3b *man/rhash_funcs.Rd
eeb46ff58a2f61eed834eec73ada48f8 *man/shingled_record_to_index_vec.Rd
71ce81206422e086585828a257af8ab3 *man/shingles.Rd
da525e154d909a4b9ec3184d02867e5f *vignettes/blocks_members.Rdata
91073e85e36c66cafac6f0b6bb54e1c0 *vignettes/candidate_pairs_graph.Rdata
043c61554a7b1f24e72a3b813bf9cef4 *vignettes/tlsh.Rmd
22 changes: 22 additions & 0 deletions NAMESPACE
@@ -0,0 +1,22 @@
# Generated by roxygen2: do not edit by hand

export(block.ids.from.blocking)
export(block_setup_v2)
export(compare_buckets)
export(confusion.from.blocking)
export(eval.blocksetup)
export(extract_pairs_from_band)
export(hash_signature)
export(minhash_v2)
export(my_hash)
export(primest)
export(reduction.ratio)
export(reduction.ratio.from.blocking)
export(rhash_funcs)
export(shingled_record_to_index_vec)
export(shingles)
import(bit64)
import(blink)
import(igraph)
import(plyr)
import(utils)
99 changes: 99 additions & 0 deletions R/blocking-evaluations.R
@@ -0,0 +1,99 @@
#' Perform evaluations (recall) for blocking.
#'
#' Converts the blocking into one block label per record, enumerates every
#' unordered pair of records and cross-tabulates "placed in the same block"
#' against "share the same true identifier".
#'
#' @import blink
#' @param blocking A list of the blocks; each element holds the record
#' indices assigned to that block
#' @param true_ids The true identifiers for comparisons, indexed by record
#' @param recall.only Flag that when true only returns the recall, otherwise
#' returns many evaluation metrics in a list
#' @return If \code{recall.only} is \code{TRUE}, the recall as a single number.
#' Otherwise a list whose first (unnamed) element is the 2 x 2 confusion
#' table (rows: same block?, columns: actually same?), followed by
#' \code{recall}, \code{precision}, \code{fpr}, \code{fnr}, \code{accuracy}
#' and \code{specificity}. A metric whose denominator is zero is \code{NaN}.
#' @export
#' @examples
#' r.set <- RLdata500[1:250,c(-2)]
#' tlsh.blocks <- block_setup_v2(r.set, b=22, save_signature=FALSE, k=2)
#' confusion.from.blocking(tlsh.blocks, identity.RLdata500, recall.only=TRUE)

confusion.from.blocking <- function(blocking, true_ids, recall.only=FALSE) {
  # convert blocking into a vector of labels (one entry per record)
  nn <- sum(lengths(blocking))
  block.ids <- rep(NA_integer_, nn)
  for (ii in seq_along(blocking)) block.ids[blocking[[ii]]] <- ii

  if (length(block.ids) < 2) {
    stop("at least two records are needed to form candidate pairs",
         call. = FALSE)
  }

  # For each pair of records (one pair per column), check whether they are in
  # the same block and whether their true ids match
  candidate.pairs <- combn(length(block.ids), 2)
  first <- candidate.pairs[1, ]
  second <- candidate.pairs[2, ]
  same.block <- block.ids[first] == block.ids[second]
  same.truth <- true_ids[first] == true_ids[second]

  # table same-block vs. same-truth. Both margins are pinned to the levels
  # FALSE/TRUE so the table is always 2 x 2, even when one outcome never
  # occurs (e.g. all records share a block, or there are no true matches);
  # otherwise the fixed-position lookups below would be out of bounds.
  outcomes <- c(FALSE, TRUE)
  confusion <- table(factor(same.block, levels = outcomes),
                     factor(same.truth, levels = outcomes),
                     dnn = c("same block?", "actually same?"))
  # In the confusion matrix, rows refer to the blocks and columns refer to
  # the truth

  false.positives <- confusion[2, 1]
  false.negatives <- confusion[1, 2]
  true.positives <- confusion[2, 2]
  true.negatives <- confusion[1, 1]
  recall <- true.positives / (false.negatives + true.positives)

  if (recall.only) {
    return(recall)
  }

  total.pairs <- true.positives + true.negatives +
    false.negatives + false.positives
  list(confusion,
       recall = recall,
       precision = true.positives / (true.positives + false.positives),
       fpr = false.positives / (false.positives + true.negatives),
       fnr = false.negatives / (false.negatives + true.positives),
       accuracy = (true.positives + true.negatives) / total.pairs,
       specificity = true.negatives / (true.negatives + false.positives))
}

#' Returns the block ids associated with a blocking method.
#'
#' Converts a list of blocks into a vector with one block id per record:
#' every record index listed in the \code{ii}-th block receives the id
#' \code{ii}.
#'
#' @import blink
#' @param blocking A list of the blocks; each element holds the record
#' indices assigned to that block.
#' @return An integer vector of block ids, one per record. Its length is the
#' total number of indices across all blocks, so an empty \code{blocking}
#' yields a zero-length vector.
#' @export
#' @examples
#' tlsh.blocks <- block_setup_v2(r.set = RLdata500[1:250,c(-2,-4)], b=10, save_signature=FALSE, k=1)
#' block.ids.from.blocking(tlsh.blocks)

block.ids.from.blocking <- function(blocking) {
  # lengths() always returns an integer vector, so an empty list sums to 0
  # (sapply() would return a list there, which sum() rejects)
  nn <- sum(lengths(blocking))
  block.ids <- rep(NA_integer_, nn)
  # seq_along() iterates zero times for an empty list, unlike 1:length()
  for (ii in seq_along(blocking)) block.ids[blocking[[ii]]] <- ii
  block.ids
}

#' Computes the reduction ratio from a vector of block labels
#'
#' The reduction ratio is one minus the number of record pairs that share a
#' block label divided by the number of all possible record pairs.
#'
#' @import blink
#' @import utils
#' @param block.labels The block label of each record, e.g. the output of
#' \code{block.ids.from.blocking}.
#' @return The reduction ratio
#' @export
#' @examples
#' tlsh.blocks <- block_setup_v2(r.set = RLdata500[1:50,c(-2)], b=22, save_signature=FALSE, k=2)
#' block.ids <- block.ids.from.blocking(tlsh.blocks)
#' reduction.ratio(block.ids)

reduction.ratio <- function(block.labels) {
  # pairs that remain to be compared: all pairs inside each block
  block.sizes <- table(block.labels)
  within.block.pairs <- sum(choose(block.sizes, 2))
  # pairs that would be compared without any blocking
  all.pairs <- choose(length(block.labels), 2)
  1 - within.block.pairs / all.pairs
}

#' Computes the reduction ratio directly from a list of blocks
#'
#' Convenience wrapper: converts the blocks to per-record block ids with
#' \code{block.ids.from.blocking} and passes them to \code{reduction.ratio}.
#'
#' @import blink
#' @param blocking The actual blocks
#' @return The reduction ratio
#' @export
#' @examples
#' tlsh.blocks <- block_setup_v2(r.set = RLdata500[1:50,c(-2,-4)], b=10, save_signature=FALSE, k=1)
#' reduction.ratio.from.blocking(tlsh.blocks)
reduction.ratio.from.blocking <- function(blocking) {
  block.labels <- block.ids.from.blocking(blocking)
  reduction.ratio(block.labels)
}

0 comments on commit 42beae1

Please sign in to comment.