From 42beae160f662fa6f4c67735f55ee13debed7769 Mon Sep 17 00:00:00 2001 From: Rebecca Steorts Date: Fri, 6 Nov 2020 16:00:02 +0000 Subject: [PATCH] version 0.1.0 --- DESCRIPTION | 21 ++ MD5 | 27 ++ NAMESPACE | 22 ++ R/blocking-evaluations.R | 99 ++++++ R/minhash_v2.R | 380 +++++++++++++++++++++ R/tlsh.R | 115 +++++++ build/vignette.rds | Bin 0 -> 188 bytes inst/doc/tlsh.R | 19 ++ inst/doc/tlsh.Rmd | 86 +++++ inst/doc/tlsh.html | 464 ++++++++++++++++++++++++++ man/block.ids.from.blocking.Rd | 21 ++ man/block_setup_v2.Rd | 28 ++ man/compare_buckets.Rd | 28 ++ man/confusion.from.blocking.Rd | 27 ++ man/eval.blocksetup.Rd | 27 ++ man/extract_pairs_from_band.Rd | 27 ++ man/hash_signature.Rd | 29 ++ man/minhash_v2.Rd | 31 ++ man/my_hash.Rd | 27 ++ man/primest.Rd | 23 ++ man/reduction.ratio.Rd | 22 ++ man/reduction.ratio.from.blocking.Rd | 21 ++ man/rhash_funcs.Rd | 27 ++ man/shingled_record_to_index_vec.Rd | 24 ++ man/shingles.Rd | 23 ++ vignettes/blocks_members.Rdata | Bin 0 -> 946 bytes vignettes/candidate_pairs_graph.Rdata | Bin 0 -> 12763 bytes vignettes/tlsh.Rmd | 86 +++++ 28 files changed, 1704 insertions(+) create mode 100644 DESCRIPTION create mode 100644 MD5 create mode 100644 NAMESPACE create mode 100644 R/blocking-evaluations.R create mode 100644 R/minhash_v2.R create mode 100644 R/tlsh.R create mode 100644 build/vignette.rds create mode 100644 inst/doc/tlsh.R create mode 100644 inst/doc/tlsh.Rmd create mode 100644 inst/doc/tlsh.html create mode 100644 man/block.ids.from.blocking.Rd create mode 100644 man/block_setup_v2.Rd create mode 100644 man/compare_buckets.Rd create mode 100644 man/confusion.from.blocking.Rd create mode 100644 man/eval.blocksetup.Rd create mode 100644 man/extract_pairs_from_band.Rd create mode 100644 man/hash_signature.Rd create mode 100644 man/minhash_v2.Rd create mode 100644 man/my_hash.Rd create mode 100644 man/primest.Rd create mode 100644 man/reduction.ratio.Rd create mode 100644 man/reduction.ratio.from.blocking.Rd create mode 100644 
man/rhash_funcs.Rd create mode 100644 man/shingled_record_to_index_vec.Rd create mode 100644 man/shingles.Rd create mode 100644 vignettes/blocks_members.Rdata create mode 100644 vignettes/candidate_pairs_graph.Rdata create mode 100644 vignettes/tlsh.Rmd diff --git a/DESCRIPTION b/DESCRIPTION new file mode 100644 index 0000000..8f7a986 --- /dev/null +++ b/DESCRIPTION @@ -0,0 +1,21 @@ +Package: tlsh +Type: Package +Title: Transitive Locality-Sensitive Hashing (LSH) for Record Linkage +Version: 0.1.0 +Authors@R: person("Rebecca", "Steorts", email = "beka@stat.duke.edu", + role = c("aut", "cre")) +Depends: R (>= 3.5.0), blink, stats, utils, plyr, igraph, bit64 +Imports: +Suggests: knitr, ggplot2, rmarkdown +VignetteBuilder: knitr +Description: An implementation of the blocking algorithm transitive locality-sensitive hashing (TLSH) in Steorts, Ventura, Sadinle, Fienberg (2014) , which is a k-means variant of locality sensitive hashing. The method is illustrated with examples and a vignette. +Encoding: UTF-8 +LazyData: true +License: GPL-3 +RoxygenNote: 7.1.1.9000 +NeedsCompilation: no +Packaged: 2020-11-02 16:22:06 UTC; rebeccasteorts +Author: Rebecca Steorts [aut, cre] +Maintainer: Rebecca Steorts +Repository: CRAN +Date/Publication: 2020-11-06 17:00:02 UTC diff --git a/MD5 b/MD5 new file mode 100644 index 0000000..5d0b21c --- /dev/null +++ b/MD5 @@ -0,0 +1,27 @@ +395f2eae2242b16010635c0159710d32 *DESCRIPTION +b308deeacf7be1a89891ae2523b10aed *NAMESPACE +883a5648af177469263685d621b6968b *R/blocking-evaluations.R +54acb8a8a2d3db0d8846427f6096e22d *R/minhash_v2.R +10eb3c9ee8f70eb94b3e60164268dc45 *R/tlsh.R +b5b601a33c1238f64d7521e55fa5a4ca *build/vignette.rds +4c8da6ba82b58b79ae31cb933f833730 *inst/doc/tlsh.R +043c61554a7b1f24e72a3b813bf9cef4 *inst/doc/tlsh.Rmd +7c94d16e855c117546c0893ddba9edf5 *inst/doc/tlsh.html +239e26c6ffab4f19c228f2b4965ef831 *man/block.ids.from.blocking.Rd +68b865ead685190f24280d8bf17d4378 *man/block_setup_v2.Rd +431dfd43b092d3359adf85ffcb837de9 
*man/compare_buckets.Rd +cf7770443b48e38aa67e4551b58ce8c1 *man/confusion.from.blocking.Rd +eed4ab096a637c80534def65b5e11a42 *man/eval.blocksetup.Rd +b634e13566eabb03db9f007c03806fe3 *man/extract_pairs_from_band.Rd +4154bc7a1e4f546ac23c29bda5995a67 *man/hash_signature.Rd +8d6a8176be4b368f9eb7bb39aacd90e8 *man/minhash_v2.Rd +27ebdba13d154cc2051cbb266b2282ee *man/my_hash.Rd +2be0caecb0d785afb805c7ee861b4c42 *man/primest.Rd +b28b796a274f35ee917457e8727d3b49 *man/reduction.ratio.Rd +87093270eb12fdf33bffe80d6bb5c3ba *man/reduction.ratio.from.blocking.Rd +88bc1a83dda8e41d19bc2f11a4055f3b *man/rhash_funcs.Rd +eeb46ff58a2f61eed834eec73ada48f8 *man/shingled_record_to_index_vec.Rd +71ce81206422e086585828a257af8ab3 *man/shingles.Rd +da525e154d909a4b9ec3184d02867e5f *vignettes/blocks_members.Rdata +91073e85e36c66cafac6f0b6bb54e1c0 *vignettes/candidate_pairs_graph.Rdata +043c61554a7b1f24e72a3b813bf9cef4 *vignettes/tlsh.Rmd diff --git a/NAMESPACE b/NAMESPACE new file mode 100644 index 0000000..ba3292d --- /dev/null +++ b/NAMESPACE @@ -0,0 +1,22 @@ +# Generated by roxygen2: do not edit by hand + +export(block.ids.from.blocking) +export(block_setup_v2) +export(compare_buckets) +export(confusion.from.blocking) +export(eval.blocksetup) +export(extract_pairs_from_band) +export(hash_signature) +export(minhash_v2) +export(my_hash) +export(primest) +export(reduction.ratio) +export(reduction.ratio.from.blocking) +export(rhash_funcs) +export(shingled_record_to_index_vec) +export(shingles) +import(bit64) +import(blink) +import(igraph) +import(plyr) +import(utils) diff --git a/R/blocking-evaluations.R b/R/blocking-evaluations.R new file mode 100644 index 0000000..04cb2dd --- /dev/null +++ b/R/blocking-evaluations.R @@ -0,0 +1,99 @@ +#' Perform evaluations (recall) for blocking. 
#'
#' @import blink
#' @param blocking A list of the blocks; element \code{i} holds the record
#' indices assigned to block \code{i}
#' @param true_ids The true identifiers for comparisons, indexed by record
#' @param recall.only Flag that when true only returns the recall, otherwise
#' returns many evaluation metrics in a list
#' @return If \code{recall.only} is \code{TRUE}, the recall as a single number.
#' Otherwise a list whose first (unnamed) element is the 2 x 2 confusion table
#' ("same block?" by "actually same?"), followed by \code{recall},
#' \code{precision}, \code{fpr}, \code{fnr}, \code{accuracy} and
#' \code{specificity}
#' @export
#' @examples
#' r.set <- RLdata500[1:250,c(-2)]
#' tlsh.blocks <- block_setup_v2(r.set, b=22, save_signature=FALSE, k=2)
#' confusion.from.blocking(tlsh.blocks, identity.RLdata500, recall.only=TRUE)

confusion.from.blocking <- function(blocking, true_ids, recall.only = FALSE) {
  # Convert the list of blocks into one block label per record.
  block.ids <- block.ids.from.blocking(blocking)
  n.records <- length(block.ids)
  if (n.records < 2L) {
    stop("at least two records are needed to form candidate pairs",
         call. = FALSE)
  }

  # Enumerate every unordered pair of records exactly once; row 1 and row 2
  # of the matrix hold the two members of each pair.
  candidate.pairs <- combn(n.records, 2)
  first <- candidate.pairs[1, ]
  second <- candidate.pairs[2, ]
  same.block <- block.ids[first] == block.ids[second]
  same.truth <- true_ids[first] == true_ids[second]

  # Pin both margins to the levels FALSE/TRUE so the table is always 2 x 2.
  # Tabulating the raw logical vectors drops a level that never occurs (for
  # example when no pair shares a block), and the positional lookups below
  # would then fail with "subscript out of bounds".
  outcomes <- c(FALSE, TRUE)
  confusion <- table(factor(same.block, levels = outcomes),
                     factor(same.truth, levels = outcomes),
                     dnn = c("same block?", "actually same?"))

  # Rows refer to the blocks, columns refer to the truth.
  true.negatives <- confusion[1, 1]
  false.negatives <- confusion[1, 2]
  false.positives <- confusion[2, 1]
  true.positives <- confusion[2, 2]
  recall <- true.positives / (false.negatives + true.positives)

  if (recall.only) {
    return(recall)
  }

  n.pairs <- true.positives + true.negatives + false.negatives + false.positives
  list(confusion,
       recall = recall,
       precision = true.positives / (true.positives + false.positives),
       fpr = false.positives / (false.positives + true.negatives),
       fnr = false.negatives / (false.negatives + true.positives),
       accuracy = (true.positives + true.negatives) / n.pairs,
       specificity = true.negatives / (true.negatives + false.positives))
}

#' Returns the block ids associated with a blocking method.
#'
#' @import blink
#' @param blocking A list of the blocks; element \code{i} holds the record
#' indices assigned to block \code{i}.
#' @return An integer vector with one entry per record giving the index of the
#' block that contains it (\code{NA} for a position no block refers to). An
#' empty \code{blocking} yields an empty vector.
#' @export
#' @examples
#' tlsh.blocks <- block_setup_v2(r.set = RLdata500[1:250,c(-2,-4)], b=10, save_signature=FALSE, k=1)
#' block.ids.from.blocking(tlsh.blocks)

block.ids.from.blocking <- function(blocking) {
  # Total number of records = sum of the block sizes.
  nn <- sum(lengths(blocking))
  block.ids <- rep(NA_integer_, nn)
  # seq_along() keeps the loop from running over c(1, 0) on an empty list.
  for (ii in seq_along(blocking)) {
    block.ids[blocking[[ii]]] <- ii
  }
  block.ids
}

#' Returns the reduction ratio associated with a blocking method
#'
#' @import blink
#' @import utils
#' @param block.labels A vector holding one block label per record.
#' @return The reduction ratio: one minus the share of record pairs that fall
#' inside a common block (\code{NaN} when fewer than two labels are given)
#' @export
#' @examples
#' tlsh.blocks <- block_setup_v2(r.set = RLdata500[1:50,c(-2)], b=22, save_signature=FALSE, k=2)
#' block.ids <- block.ids.from.blocking(tlsh.blocks)
#' reduction.ratio(block.ids)

reduction.ratio <- function(block.labels) {
  # Pairs that still have to be compared: all pairs sharing a block label.
  pairs.within.blocks <- sum(choose(table(block.labels), 2))
  # Pairs that would be compared without any blocking.
  pairs.overall <- choose(length(block.labels), 2)
  1 - pairs.within.blocks / pairs.overall
}

#' Returns the reduction ratio associated with a blocking method
#'
#' @import blink
#' @param blocking The actual blocks
#' @return The reduction ratio
#' @export
#' @examples
#' tlsh.blocks <- block_setup_v2(r.set = RLdata500[1:50,c(-2,-4)], b=10, save_signature=FALSE, k=1)
#' reduction.ratio.from.blocking(tlsh.blocks)
reduction.ratio.from.blocking <- function(blocking) {
  # Turn the list of blocks into per-record labels, then reuse reduction.ratio().
  reduction.ratio(block.ids.from.blocking(blocking))
}

# ---- patch boundary: the definitions below belong to R/minhash_v2.R ----
# diff --git a/R/minhash_v2.R b/R/minhash_v2.R
# new file mode 100644
# index 0000000..fa2ac53
# --- /dev/null
# +++ b/R/minhash_v2.R
# @@ -0,0 +1,380 @@

#' Function to shingle (token or gram) a string into its k components
#'
#' @param record String or record; the fields of a record are converted to
#' character one by one and joined with single spaces before shingling
#' @param k Parameter k, the number of characters in each shingle (token or
#' gram); a single number that is at least 1
#' @return A list of the overlapping length-\code{k} substrings of the joined
#' string, in order of their start position. A string shorter than \code{k}
#' yields a single shingle holding the whole string.
#' @export
#' @examples
#' shingles("Alexander",2)
#' shingles("Alexander Smith", 2)

shingles <- function(record, k) {
  stopifnot(is.numeric(k), length(k) == 1L, !is.na(k), k >= 1)

  # factors only convert to characters properly elementwise,
  # not by applying as.character() to a whole row of a data
  # frame
  char_record <- lapply(record, as.character)
  string <- paste(char_record, collapse = " ")

  n.chars <- nchar(string)
  if (n.chars < k) {
    # seq(1, n.chars - k + 1) would count downwards here and emit the whole
    # string several times; return it once instead (same set of tokens).
    return(list(string))
  }

  starts <- seq_len(n.chars - k + 1)
  as.list(substring(string, starts, starts + k - 1))
}

#' Function to convert to tell what index the shingle corresponds to in the record
#'
#' @param shingled_record Shingled record
#' @param universal_set Universal set of all shingles
#' @return the index regarding where the shingle falls in the record
#'
@export +#' @examples +#' shingles("Alexander",2) +#' shingles("Alexander Smith", 2) +#' shingled_record_to_index_vec(shingles("Alexander",2), unique(shingles("Alexander Smith", 2))) + +shingled_record_to_index_vec <- function(shingled_record, universal_set) { + unique_tokens_in_record <- unique(shingled_record) + token_indices <- match(unique_tokens_in_record, universal_set) + return(token_indices) + # TODO: Check if next line ever needed? + #return(1:length(universal_set) %in% token_indices) +} + +#' Function to create a matrix of minhashed signatures +#' +#' @import blink +#' @import plyr +#' @param shingled_records Shingled records +#' @param p Number of permutations to be applied to the hash function +#' @param do_one_hash_and_record Combination of one hash and one record +#' @return Computes an integer-valued matrix of minhash signatures with one row per permutation and one column per record +#' @export +#' @examples +#' head(data <- RLdata500[-c(2,4)]) +#' minidata <- data[1:2,] +#' head(all_the_shingles <- apply(minidata,1,shingles,k=8)) +#' head(minhash.minidata <- minhash_v2(all_the_shingles, p=10)) + +minhash_v2 <- function(shingled_records, p, do_one_hash_and_record=do_one_hash_and_record) { + n.records <- length(shingled_records) + + # Figure out the universal set of all tokens + print("Creating the universal set of tokens") + print(system.time(universal_set_tokens <- unique(do.call(c, shingled_records)))[3]) + n.shingles <- length(universal_set_tokens) + print("Number of tokens in universal set") + print(n.shingles) + + # Generate a vector of p random hash functions + print("Generating a vector of random hash functions") + print(system.time(vector_of_hash_funcs <- rhash_funcs(n=p, size=n.shingles, vector.valued=FALSE))[3]) + + # prepare a function to do the combination of one hash and one record + # Presumes: rec_col is a vector saying which shingles (from the universal set) # are present in the shingled record + do_one_hash_and_record <- function(h, 
rec_col) { + # if (!timing) { + # updated_v <- vector_of_hash_funcs[[h]](rec_col) + # } else { + applying_hash <- (updated_v <- vector_of_hash_funcs[[h]](rec_col))[3] + # } + # if (!timing) { + # min_value <- min(updated_v) + # } else { + taking_min <- (min_value <- min(updated_v))[3] + # } + # if (timing) { + # print("Applying hash function") + # print(applying_hash) + # print("Getting values") + # print(getting_values) + # print("Taking the minimum") + # print(taking_min) + #} + return(min_value) + } + # Create a function to apply all the hash functions to one record + # This function must turn the shingled record into an indicator vector + # then apply all the functions + # then discard the indicator vector + multiple_hash_one_record <- function(record) { + index_vec <- shingled_record_to_index_vec(record, universal_set_tokens) + multi_hash <- sapply(1:p, do_one_hash_and_record, rec_col=index_vec) + return(multi_hash) + } + # Timing applying all the hash functions at once to one record + print("Creating index vector and applying hash functions to first record") + print(system.time(multiple_hash_one_record(shingled_records[[1]]))) + # Apply that multi-hash-function to all records + signatures <- sapply(shingled_records, multiple_hash_one_record) + # Return the matrix of minhash signatures + return(signatures) +} + +#' Function to generate all primes larger than an integer n1 (lower limit) and less than any other integer n2 (upper limit) +#' +#' @param n1 An integer taken to be 1 as the default +#' @param n2 Any integer n2 +#' @return Generates all prime numbers with the above constraints +#' @export +#' @examples +#' primest(1, 5) +#' primest(1, 17) +primest <- function(n1=1, n2){ + p <- (n1+1):n2 + i <- 1 + while (p[i] <= sqrt(n2)) { + p <- p[p %% p[i] != 0 | p==p[i]] + i <- i+1 + } + p +} + +#' Function to generate a vector of random hash functions (or optionally one vector-valued function) +#' +#' @param n Number of random hash functions +#' @param size Range 
of each size +#' @param vector.valued Flag for outputing vector of functions or vector-valued function +#' @param perfect Flag for whether a perfect permutation should be done, or just a hash function +#' @return Vector of n hash functions or a function which will take a number and return a vector of n different hashes of it +#' @export +#' @examples +#' rhash_funcs(1, 1, vector.valued=FALSE, perfect=FALSE) +#' rhash_funcs(5, 1, vector.valued=FALSE, perfect=FALSE) + +# TODO: replace this with digest +rhash_funcs <- function(n, size, vector.valued, perfect=FALSE) { + # Determine a suitable prime greater than size and =< 2*size + candidate_primes <- primest(size,2*size) + # Take the first suitable prime for simplicity's sake + the_prime <- candidate_primes[1] + # Create a single random hash function and return it + if (!perfect) { + # Make up a random hash function by modulo arithmetic + make_one_hash_func <- function() { + # Generate a function of the form ((ax+b) mod the_prime ) mod size + # a,b < the_prime, a non-zero + a <- as.integer64(sample(1:(the_prime-1),size=1)) + b <- as.integer64(sample(0:(the_prime-1),size=1)) + # Cast to a 64-bit integer + the_prime <- as.integer64(the_prime) + size <- as.integer64(size) + hash_func <- function(x) { + x <- as.integer64(x) + h <- ((a*x+b) %% the_prime) %% size + return(as.integer(h)) + } + return(hash_func) + } + } else { + # Make a perfect hash function that permutes the whole domain + make_one_hash_func <- function() { + perm <- sample(size) + hash_func <- function(x) { perm[x] } + return(hash_func) + } + } + # Make a list of n random hash functions + hash_func_list <- replicate(n, make_one_hash_func()) + if (vector.valued) { + # Create a function which takes a number and returns a vector, + # each component a different hash function's evaluation + # TODO: replace iteration with something more vectorized + # want many functions with one input, not many inputs + # to one function + vector_hash_func <- function(x) { + h 
<- vector(length=n) + for (i in 1:length(h)) { + h[i] <- (hash_func_list[[i]])(x) + } + return(h) + } + # return the vector-valued hash function + return(vector_hash_func) + } else { + return(hash_func_list) + } +} + + +#' Function to take a signature matrix M composed of b bands and r rows and return +#' a bucket for each band for each record +#' +#' @param signature Signature matrix M composed of b bands and r rows +#' @param b Number of bands +#' @return Bucket for each band for each record +#' @export +#' @examples +#' head(data <- RLdata500[-c(2,4)]) +#' minidata <- data[1:2,] +#' head(all_the_shingles <- apply(minidata,1,shingles,k=8)) +#' head(minhash.minidata <- minhash_v2(all_the_shingles, p=10)) +#' hash_signature(minhash.minidata, b=2) +#' hash_signature(minhash.minidata, b=5) + +#assumes that signature matrix has been computed + #take signature matrix M into b bands of r rows + #returns the bucket for each band for each record +hash_signature <- function(signature,b){ + # need to divide signature into b bands of rows + # so r = nrow(signature)/b, rounded down + r = floor(nrow(signature)/b) + extract_band <- function(i) { signature[((i-1)*r+1):(i*r),] } + bands <- lapply(1:b,extract_band) + # for each band, hash each portion of its columns to a hash table with k buckets + #make k as large as possible + band_hash <- sapply(bands,my_hash) + # sapply builds up its output columnwise, so transpose to keep records as columns + return(t(band_hash)) +} + +#' Function that applies a hash function to each column of the band from the +#' signature matrix +#' import bit64 +#' +#' @import bit64 +#' @param a_band Band from the signature matrix M +#' @return a 64 bit integer +#' @export +#' @examples +#' band1 <- c(2,1,2,1,2) +#' band2 <- c(4,5,2,1,9) +#' combined_band <- rbind(band1,band2) +#' my_hash(combined_band) + +my_hash <- function(a_band) { + hash64 <- function(x) { + # if x is a vector, concatenate all its digits into one long number + # use paste() to 
concatenate, then as.integer64 to turn into a big number + y <- as.integer64(paste(x,sep="",collapse="")) + return(hashfun(y,hashbits=64)) + } + return(apply(a_band,2,hash64)) +} + + +#' Function that extracts pairs of records from a band in the signature matrix M +#' import bit64 +#' +#' @param a_band Band of the signature matrix M +#' @return The edgelist of record pairs that are connected +#' @export +#' @examples +#' band1 <- c(2,1,2,1,2) +#' extract_pairs_from_band(band1) +#' band2 <- c(6,7,8,9,6) +#' extract_pairs_from_band(band2) +#' band.12 <- rbind(band1, band2) +#' apply(band.12,1,extract_pairs_from_band) + +extract_pairs_from_band <- function(a_band) { + # Each record has been mapped to some bucket within this band + # We now want to note down which pairs of records got mapped to the _same_ + # bucket in this band (not caring about whether they got put in the same + # bucket in other bands) + record_pairs_in_bucket <- function(a_bucket) { + # print(paste("In bucket",a_bucket)) + records_in_the_bucket <- which(a_band==a_bucket) + # print(paste(length(records_in_the_bucket),"records")) + if (length(records_in_the_bucket) > 1) { + recpairs <- as.matrix(combn(records_in_the_bucket,m=2)) + } else { + recpairs <- matrix(records_in_the_bucket,nrow=2,ncol=1) + } + return(recpairs) + } + # Which buckets did we actually see in this band? 
+ observed_buckets <- unique(a_band) + # Extract common pairs of records for each observed bucket + edgelist <- lapply(observed_buckets, record_pairs_in_bucket) + # We'll get back a list so bind them together columnwise + edgelist <- do.call(cbind,edgelist) + rownames(edgelist) <- c("rec1","rec2") + # We want the transpose + return(t(edgelist)) +} + +#' Function that creates a similarity graph and divides it into communities (or blocks) for entity resolution +#' +#' @import igraph +#' @import plyr +#' @param hashed_signatures The hashed signatures +#' @param max_bucket_size The largest block size allowed by user +#' @return max_bucket_size The largest bucket size (or block size) that one +#' can handle +#' @export +#' @examples +#' head(data <- RLdata500[-c(2,4)]) +#' minidata <- data[1:2,] +#' head(all_the_shingles <- apply(minidata,1,shingles,k=8)) +#' head(minhash.minidata <- minhash_v2(all_the_shingles, p=10)) +#' hashed_signature <- hash_signature(minhash.minidata, b=5) +#' compare_buckets(hashed_signature, max_bucket_size=200) + +# Create a similarity graph and divide it into communities, as blocks for record linkage +# Inputs: minhashed signature matrix, maximum block size +# Presumes: the signature matrix has been created by minhashing (or something like + # it), so 2 records matching in some row indicates non-trivial similarity +compare_buckets <- function(hashed_signatures, max_bucket_size=1000) { + # Create blocks from the buckets the bands were mapped to + # General idea: each record gets put in multiple buckets from the multiple minhashes + # Two records are a "candidate pair" if they get mapped to the same bucket + # by some minhash or other + # Form a graph, with records as nodes, and edges between candidate pairs + # Divide the graph into dense sub-graphs (communities), subject to a maximum + # size limit + + # Each row of hashed_signatures represents the bucket-mapping of + # the records for a different minhash permutation + # Apply 
extract_pairs_from_band to each row, and then combine the resulting + # matrices of candidate-pair records into one big edgelist + + # TODO: Try using plyr rather than apply + do.call to see about speed + print("Creating edgelist") + edgelisting <- system.time(edgelist <- as.matrix(do.call(rbind,apply(hashed_signatures,1,extract_pairs_from_band))), gcFirst=FALSE) + print(edgelisting) + print(dim(edgelist)) + # Actually build the graph + # edgelist contains only edges in one direction, so we need to tell igraph + # that edges are directionless + print("Building graph from edgelist") + graphing <- system.time(candidate_pairs_graph <- graph.edgelist(edgelist, directed=FALSE), gcFirst=FALSE) # Actually build the graph + # edgelist isn't needed any more and can be quite big, so remove it from memory + rm(edgelist) + print(graphing) + # Remove multiple and self edges, if they exist + candidate_pairs_graph <- simplify(candidate_pairs_graph) + + # Try dividing the graph into communities. Use a hierarchical community method + # so that if the initial cut has communities which are too big, we can go further down + # until they are small enough to work with. 
+ print("Dividing graph into communities initially") + communitying <- system.time(initial_community <- fastgreedy.community(candidate_pairs_graph), gcFirst=FALSE) + print(communitying) + #save(candidate_pairs_graph,file = "candidate_pairs_graph.Rdata") + + + # The graph has served its purpose and should go away + rm(candidate_pairs_graph) + + # Sub-divide communities if too big + max_comm_size <- max(sizes(initial_community)) + comm_number <- length(initial_community) + comm_membership <- membership(initial_community) + print("Subdividng communities") + subdividing <- system.time( + while(max_comm_size > max_bucket_size) { + comm_number <- comm_number+1 + comm_membership <- cutat(initial_community, no=comm_number) + max_comm_size <- max(table(comm_membership)) + } + ,gcFirst=FALSE) + print(subdividing) + blocks_members <- comm_membership + num_blocks <- comm_number + #save(blocks_members, file="blocks_members.Rdata") + + # Now create a list, saying which records are in which block + records_per_block <- function(b) { which(blocks_members == b)} + blocks <- lapply(1:num_blocks,records_per_block) + return(blocks) +} diff --git a/R/tlsh.R b/R/tlsh.R new file mode 100644 index 0000000..f174ad9 --- /dev/null +++ b/R/tlsh.R @@ -0,0 +1,115 @@ +# This is the one of the main blocking methods in Steorts, Ventura, Sadinle, +# Fienberg (2014), Privacy in Statistical Databases. +# If you use this code, please cite Steorts, R., Ventura, S., Sadinle, M., and +# Fienberg, S. (2014). "Blocking Comparisons for Record Linkage." Privacy +# in Statistical Databases (Lecture Notes in Computer Science 8744), ed. J +# Domingo-Ferrer, Springer, 252-268, doi:10.1007/978-3-319-11257-220. + +# tlsh Copyright 2018 Rebecca C. Steorts (beka@stat.duke.edu) + +# tlsh is free software: you can redistribute it and/or modify it +# under the terms of the Creative Commons license, either version 3 of the +# license, or (at your option) any later version. 
+ +# tlsh is distributed in the hope it will be useful, but without ANY WARRANTY; +# without even the implied warranty of merchantability or fitness for a particular +# purpose. Specifically, you may share the software in any medium or format and +# you may adapt the software. Credit must be given when either of these are +# given to indicate if and what changes were made. The software may not be +# used for noncommerical purposes. If you are interested in using the software +# for commercial purposes, please contact the author above. +########################################################################################################### + + +#Begin working example +# TODO: make sure the blocks are saved. + +#library(plyr) +#library(digest) +#library(RecordLinkage) +#data(RLdata500) +#minidata <- RLdata500[-c(2,4)] +#The command +#rl_data_500_b26 <- adply(1:5, .margins=1, .fun = eval.blocksetup, dat=minidata, b=26, .expand=F,key=identity.RLdata500) +#plot(1:5, rl_data_500_b26[,2],xlab="k",ylab="Recall") +#plot(1:5, rl_data_500_b26[,3],xlab="k",ylab="Elaped Time") + + +# will loop through shingles 1:5 and save the recall and the runtime. We should also +# save the precision and reduction ratio as well. + +#rl_data_500_b22_30 <- adply(2:8, .margins=1, .fun = eval.blocksetup, dat = #RLdata500, b=22, .expand=F,key=identity.RLdata500) +#save(rl_data_500_b22_30, file="rl_data_500_b22_10.Rdata") + +# plot(2:8, rl_data_500_b22_30[,2],xlab="k",ylab="Recall",ylim=c(0,0.95),type="b") +# points(2:8, rl_data_500_b22_30[,2], xlab="k",ylab="Recall",ylim=c(0,0.1),pch=2,type="b") +# points(2:8, rl_data_500_b22_50[,2], xlab="k",ylab="Recall",ylim=c(0,0.1),pch=3,type="b") +# legend("bottomright", legend= c("10%", "30%","50%"), pch=c(1,2,3)) + +#End working example + +# ATTN: There are additional functions below that will allow TLSH +# to be integrated into random forests with a mapping function for +# parallezation. 
+ + +#' Function to evaluate the blocking step +#' +#' import blink +#' @param dat Data set +#' @param b Number of buckets +#' @param k Parameter k, which is the number of shingle, tokens, or grams to break the string into +#' @param key Unique identifier +#' @return Recall and runtime +#' @export +#' @examples +#' r.set <- RLdata500[1:50,c(-2)] +#' eval.blocksetup(r.set, k=2, b=22, key=identity.RLdata500) + +eval.blocksetup <- function(dat, k=5, b=21, key){ + #runtime <- as.numeric((mapping <- block_setup_v2(dat, b=b, k=k))[3] ) + mapping <- block_setup_v2(dat, b=b, k=k) + recall<- confusion.from.blocking (blocking=mapping,true_ids=key,recall.only=TRUE)[[1]] + return(data.frame(recall)) +} + + + +#' Function that divides all records into bins using locality sensitive hashing and using TLSH (based upon community detection technique) +#' +#' import blink +#' @param r.set Record set (shingled records) +#' @param b Band +#' @param save_signature Flag of whether or not to save the signature +#' @param k Shingle size +#' @return List of blocks where a particular index is the record id in the original +#' data set +#' @export +#' @examples +#' r.set <- RLdata500[1:3,c(-2)] +#' block_setup_v2(r.set = RLdata500[1:3,c(-2)], b=22, save_signature=FALSE, k=2) + +block_setup_v2 <- function(r.set, b=22, save_signature=FALSE, k=5) { + # for each record r in r.set + # calculate the hash function of the record r, say h + # store r under h in the hash map + # return hash map from hash values to sets of records + + # Convert each record (= row of r.set) to k-token shingles + shingled_records <- apply(r.set,1,shingles,k=k) + # Create the matrix of minhashed signatures, using p random permutations + # ATTN: Put this in parallel and test that it works + + minhash_time <- system.time(minhashed_records <- minhash_v2(shingled_records,p=100),gcFirst=FALSE) + print(minhash_time) + if(save_signature) { + timestamp <- format(Sys.time(), "%Y_%m_%d_%H_%M_%S") + save(minhashed_records, 
file=paste("minhashed_signature", timestamp)) + } + + # Get rid of the shingled records as they've served their purpose + rm(shingled_records) + + # Calculate signatures, put into buckets, make the graph, return blocks + return(compare_buckets(hash_signature(minhashed_records,b=b))) +} diff --git a/build/vignette.rds b/build/vignette.rds new file mode 100644 index 0000000000000000000000000000000000000000..1ae48dc855b8c744718fe07a08ca7ca47f2c7cd7 GIT binary patch literal 188 zcmV;t07L&DiwFP!000001B>8dU|?WkU;$z#W+0PU7)Y=Iu>cS=0>wFjG)GBJafV({ zZVH+>3rHMIj1#Ojqa-&6O@a+#2+%fRuqZ?pWC|0KD%QNj+|*(;t?VwT1*v%{AmM-5 z^)voQcd~bCWqE!POb?10Hkk9>GILU4_J(AZz&XqTE^bgJV-UIIf=#S9_30&EJ@T$D+1~Vxf=kY(T_cy0RRAJR7d{+ literal 0 HcmV?d00001 diff --git a/inst/doc/tlsh.R b/inst/doc/tlsh.R new file mode 100644 index 0000000..94104df --- /dev/null +++ b/inst/doc/tlsh.R @@ -0,0 +1,19 @@ +## ---- echo=TRUE, message=FALSE, knitr::opts_chunk$set(cache=TRUE)------------- +library(blink) +library(plyr) +library(tlsh) +data(RLdata500) +head(RLdata500) +data.500 <- RLdata500[-c(2,4)] +head(data.500) + +## ----------------------------------------------------------------------------- + blocks <- block_setup_v2(RLdata500, b=22, k=2) + summary(blocks) + +## ----------------------------------------------------------------------------- +eval.blocksetup(RLdata500, b=26, key=identity.RLdata500) + +## ----------------------------------------------------------------------------- +(rr <- reduction.ratio.from.blocking(blocks)) + diff --git a/inst/doc/tlsh.Rmd b/inst/doc/tlsh.Rmd new file mode 100644 index 0000000..f265a6b --- /dev/null +++ b/inst/doc/tlsh.Rmd @@ -0,0 +1,86 @@ +--- +title: "tlsh" +author: "Rebecca C. Steorts" +date: "`r Sys.Date()`" +output: + rmarkdown::html_vignette: + fig_caption: yes +vignette: > + %\VignetteIndexEntry{tlsh} + %\VignetteEngine{knitr::rmarkdown} + %\usepackage[utf8]{inputenc} +--- +We present a small example from Steorts, R., Ventura, S., Sadinle, M., and Fienberg, S. (2014). 
"Blocking Comparisons for Record Linkage." Privacy in Statistical Databases (Lecture Notes in Computer Science 8744), ed. J Domingo-Ferrer, Springer, 252-268, \doi{10.1007/978-3-319-11257-2_20}. We will be using the blink package in R and the RLdata500 data set, which was previously available in the Record Linkage package (but has been deprecated). Here, we illustrate transitive LSH. + +In a record linkage task one wants to remove duplicate +entries from multiple databases. However, before performing this task, one needs to perform a means of dimension reduction so that the record linkage task is computationally scalable. + +Using the TLSH algorithm, we illustrate an example of using this package using a German dataset comprised of first and last name and full date of birth. + +Our goals include + +- Presenting the RLdata500 dataset with summary information. +- Illustrating how we can format the RLdata500 dataset to work with the klsh +- Running TLSH on the RLdata500 data set to create blocks +- Explaining the tuning parameters of TLSH and how to choose these in practice with evaluation metrics. +- Sample output and visualizations + +## Understanding the RLdata500 dataset + +The RLdata500 dataset exists already in the blink package in R. We review this data set for the user. + +The RLdata500 data consists of 500 records with 10 percent duplication. Thus, there are 450 unique individuals. There is full information on each record containing first name, last name, and full date of birth. + +We first load the blink package and load the RLdata500 data set. We also, provide the first few lines of the data. We also remove missing values (they are all missing in this data set). 
+ +```{r, echo=TRUE, message=FALSE, knitr::opts_chunk$set(cache=TRUE)} +library(blink) +library(plyr) +library(tlsh) +data(RLdata500) +head(RLdata500) +data.500 <- RLdata500[-c(2,4)] +head(data.500) +``` + +## TLSH applied to RLdata500 + + + +We now explain how to run TLSH on the RLdata500 data set, piece by piece. + +1. We first must creat a universal set of tokens. +2. We then number find the number of tokens in the universal set. +3. Then we must generate a vector of random hash functions. +4. Next, we must creating an index vector and apply the hash functions to each record +5. Then we build an edgelist, divide the graph into communities initially, sub-divide the communities more if needed +6. Finally, we have our blocks. +7. Then we can compute the dimension reduction and the recall. + +The function that find the blocks is called **block_setup_v2. + +```{r} + blocks <- block_setup_v2(RLdata500, b=22, k=2) + summary(blocks) +``` + +where b is the number of **buckets** and k is the **shingle size**. + +Observe that the blocks are roughly about the same size, however, this does not have to be the case. + + +The function that allows us to find the recall is **eval.blocksetup**. + +```{r} +eval.blocksetup(RLdata500, b=26, key=identity.RLdata500) +``` + +The function that allows us to find the reduction ratio is **reduction.ratio.from.blocking**. + +```{r} +(rr <- reduction.ratio.from.blocking(blocks)) +``` + +To summarize, we have reduced the entire space by roughly 66 percent and the recall is 0.90, which means we are only splitting records across blocks 10 percent of the time. + + diff --git a/inst/doc/tlsh.html b/inst/doc/tlsh.html new file mode 100644 index 0000000..3c06daa --- /dev/null +++ b/inst/doc/tlsh.html @@ -0,0 +1,464 @@ + + + + + + + + + + + + + + + + +tlsh + + + + + + + + + + + + + + + + + + + + + + +

tlsh

+

Rebecca C. Steorts

+

2020-11-02

+ + + +

We present a small example from Steorts, R., Ventura, S., Sadinle, M., and Fienberg, S. (2014). “Blocking Comparisons for Record Linkage.” Privacy in Statistical Databases (Lecture Notes in Computer Science 8744), ed. J Domingo-Ferrer, Springer, 252-268, doi:10.1007/978-3-319-11257-2_20. We will be using the blink package in R and the RLdata500 data set, which was previously available in the Record Linkage package (but has been deprecated). Here, we illustrate transitive LSH.

+

In a record linkage task one wants to remove duplicate entries from multiple databases. However, before performing this task, one needs to apply some form of dimension reduction so that the record linkage task is computationally scalable.

+

Using the TLSH algorithm, we illustrate how to use this package on a German dataset comprising first and last names and full dates of birth.

+

Our goals include

+
    +
  • Presenting the RLdata500 dataset with summary information.
  • +
  • Illustrating how we can format the RLdata500 dataset to work with the tlsh package
  • +
  • Running TLSH on the RLdata500 data set to create blocks
  • +
  • Explaining the tuning parameters of TLSH and how to choose these in practice with evaluation metrics.
  • +
  • Sample output and visualizations
  • +
+
+

Understanding the RLdata500 dataset

+

The RLdata500 dataset exists already in the blink package in R. We review this data set for the user.

+

The RLdata500 data consists of 500 records with 10 percent duplication. Thus, there are 450 unique individuals. There is full information on each record containing first name, last name, and full date of birth.

+

We first load the blink package and the RLdata500 data set. We also provide the first few lines of the data. We then remove the columns fname_c2 and lname_c2 because of missing values (they are all missing in this data set).

+
library(blink)
+library(plyr)
+library(tlsh)
+data(RLdata500)
+head(RLdata500)
+
##   fname_c1 fname_c2 lname_c1 lname_c2   by bm bd
+## 1  CARSTEN     <NA>    MEIER     <NA> 1949  7 22
+## 2     GERD     <NA>    BAUER     <NA> 1968  7 27
+## 3   ROBERT     <NA> HARTMANN     <NA> 1930  4 30
+## 4   STEFAN     <NA>    WOLFF     <NA> 1957  9  2
+## 5     RALF     <NA>  KRUEGER     <NA> 1966  1 13
+## 6  JUERGEN     <NA>   FRANKE     <NA> 1929  7  4
+
data.500 <- RLdata500[-c(2,4)]
+head(data.500)
+
##   fname_c1 lname_c1   by bm bd
+## 1  CARSTEN    MEIER 1949  7 22
+## 2     GERD    BAUER 1968  7 27
+## 3   ROBERT HARTMANN 1930  4 30
+## 4   STEFAN    WOLFF 1957  9  2
+## 5     RALF  KRUEGER 1966  1 13
+## 6  JUERGEN   FRANKE 1929  7  4
+
+
+

TLSH applied to RLdata500

+

We now explain how to run TLSH on the RLdata500 data set, piece by piece.

+
    +
  1. We first must create a universal set of tokens.
  2. +
  3. We then find the number of tokens in the universal set.
  4. +
  5. Then we must generate a vector of random hash functions.
  6. +
  7. Next, we must create an index vector and apply the hash functions to each record.
  8. +
  9. Then we build an edgelist, divide the graph into communities initially, sub-divide the communities more if needed
  10. +
  11. Finally, we have our blocks.
  12. +
  13. Then we can compute the dimension reduction and the recall.
  14. +
+

The function that finds the blocks is called block_setup_v2.

+
 blocks <- block_setup_v2(RLdata500, b=22, k=2)
+
## [1] "Creating the universal set of tokens"
+## elapsed 
+##   0.005 
+## [1] "Number of tokens in universal set"
+## [1] 404
+## [1] "Generating a vector of random hash functions"
+## elapsed 
+##   0.003 
+## [1] "Creating index vector and applying hash functions to first record"
+##    user  system elapsed 
+##   0.006   0.000   0.006 
+##    user  system elapsed 
+##   3.205   0.021   3.234 
+## [1] "Creating edgelist"
+##    user  system elapsed 
+##   0.207   0.007   0.214 
+## [1] 23146     2
+## [1] "Building graph from edgelist"
+##    user  system elapsed 
+##   0.001   0.000   0.002 
+## [1] "Dividing graph into communities initially"
+##    user  system elapsed 
+##   0.017   0.000   0.017 
+## [1] "Subdividng communities"
+##    user  system elapsed 
+##       0       0       0
+
 summary(blocks)
+
##      Length Class  Mode   
+## [1,]  48    -none- numeric
+## [2,]  62    -none- numeric
+## [3,] 141    -none- numeric
+## [4,] 249    -none- numeric
+

where b is the number of buckets and k is the shingle size.

+

Observe that the blocks are roughly the same size; however, this does not have to be the case.

+

The function that allows us to find the recall is eval.blocksetup.

+
eval.blocksetup(RLdata500, b=26, key=identity.RLdata500)
+
## [1] "Creating the universal set of tokens"
+## elapsed 
+##   0.004 
+## [1] "Number of tokens in universal set"
+## [1] 2516
+## [1] "Generating a vector of random hash functions"
+## elapsed 
+##   0.003 
+## [1] "Creating index vector and applying hash functions to first record"
+##    user  system elapsed 
+##   0.006   0.000   0.006 
+##    user  system elapsed 
+##   3.298   0.020   3.328 
+## [1] "Creating edgelist"
+##    user  system elapsed 
+##   0.260   0.005   0.269 
+## [1] 13434     2
+## [1] "Building graph from edgelist"
+##    user  system elapsed 
+##   0.001   0.001   0.002 
+## [1] "Dividing graph into communities initially"
+##    user  system elapsed 
+##   0.003   0.000   0.003 
+## [1] "Subdividng communities"
+##    user  system elapsed 
+##       0       0       0
+
##   recall
+## 1   0.86
+

The function that allows us to find the reduction ratio is reduction.ratio.from.blocking.

+
(rr <- reduction.ratio.from.blocking(blocks)) 
+
## [1] 0.6491784
+

To summarize, in the run shown above we have reduced the entire space by roughly 65 percent and the recall is 0.86, which means we are only splitting records across blocks about 14 percent of the time.

+
+ + + + + + + + + + + diff --git a/man/block.ids.from.blocking.Rd b/man/block.ids.from.blocking.Rd new file mode 100644 index 0000000..1b1166d --- /dev/null +++ b/man/block.ids.from.blocking.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/blocking-evaluations.R +\name{block.ids.from.blocking} +\alias{block.ids.from.blocking} +\title{Returns the block ids associated with a blocking method.} +\usage{ +block.ids.from.blocking(blocking) +} +\arguments{ +\item{blocking}{A list of the blocks.} +} +\value{ +A list of the blocks ids that corresponds to each block +} +\description{ +Returns the block ids associated with a blocking method. +} +\examples{ +tlsh.blocks <- block_setup_v2(r.set = RLdata500[1:250,c(-2,-4)], b=10, save_signature=FALSE, k=1) +block.ids.from.blocking(tlsh.blocks) +} diff --git a/man/block_setup_v2.Rd b/man/block_setup_v2.Rd new file mode 100644 index 0000000..3f354b9 --- /dev/null +++ b/man/block_setup_v2.Rd @@ -0,0 +1,28 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/tlsh.R +\name{block_setup_v2} +\alias{block_setup_v2} +\title{Function that divides all records into bins using locality sensitive hashing and using TLSH (based upon community detection technique)} +\usage{ +block_setup_v2(r.set, b = 22, save_signature = FALSE, k = 5) +} +\arguments{ +\item{r.set}{Record set (shingled records)} + +\item{b}{Band} + +\item{save_signature}{Flag of whether or not to save the signature} + +\item{k}{Shingle size} +} +\value{ +List of blocks where a particular index is the record id in the original +data set +} +\description{ +import blink +} +\examples{ +r.set <- RLdata500[1:3,c(-2)] +block_setup_v2(r.set = RLdata500[1:3,c(-2)], b=22, save_signature=FALSE, k=2) +} diff --git a/man/compare_buckets.Rd b/man/compare_buckets.Rd new file mode 100644 index 0000000..7ca604b --- /dev/null +++ b/man/compare_buckets.Rd @@ -0,0 +1,28 @@ +% Generated by roxygen2: do not edit by hand 
+% Please edit documentation in R/minhash_v2.R +\name{compare_buckets} +\alias{compare_buckets} +\title{Function that creates a similarity graph and divides it into communities (or blocks) for entity resolution} +\usage{ +compare_buckets(hashed_signatures, max_bucket_size = 1000) +} +\arguments{ +\item{hashed_signatures}{The hashed signatures} + +\item{max_bucket_size}{The largest block size allowed by user} +} +\value{ +max_bucket_size The largest bucket size (or block size) that one +can handle +} +\description{ +Function that creates a similarity graph and divides it into communities (or blocks) for entity resolution +} +\examples{ +head(data <- RLdata500[-c(2,4)]) +minidata <- data[1:2,] +head(all_the_shingles <- apply(minidata,1,shingles,k=8)) +head(minhash.minidata <- minhash_v2(all_the_shingles, p=10)) +hashed_signature <- hash_signature(minhash.minidata, b=5) +compare_buckets(hashed_signature, max_bucket_size=200) +} diff --git a/man/confusion.from.blocking.Rd b/man/confusion.from.blocking.Rd new file mode 100644 index 0000000..5e6441a --- /dev/null +++ b/man/confusion.from.blocking.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/blocking-evaluations.R +\name{confusion.from.blocking} +\alias{confusion.from.blocking} +\title{Perform evaluations (recall) for blocking.} +\usage{ +confusion.from.blocking(blocking, true_ids, recall.only = FALSE) +} +\arguments{ +\item{blocking}{A list of the blocks} + +\item{true_ids}{The true identifiers for comparisons} + +\item{recall.only}{Flag that when true only prints the recall, otherwise +prints many evaluation metrics in a list} +} +\value{ +A vector of that returns the recall and the precision +} +\description{ +Perform evaluations (recall) for blocking. 
+} +\examples{ +r.set <- RLdata500[1:250,c(-2)] +tlsh.blocks <- block_setup_v2(r.set, b=22, save_signature=FALSE, k=2) +confusion.from.blocking(tlsh.blocks, identity.RLdata500, recall.only=TRUE) +} diff --git a/man/eval.blocksetup.Rd b/man/eval.blocksetup.Rd new file mode 100644 index 0000000..28a5326 --- /dev/null +++ b/man/eval.blocksetup.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/tlsh.R +\name{eval.blocksetup} +\alias{eval.blocksetup} +\title{Function to evaluate the blocking step} +\usage{ +eval.blocksetup(dat, k = 5, b = 21, key) +} +\arguments{ +\item{dat}{Data set} + +\item{k}{Parameter k, which is the number of shingle, tokens, or grams to break the string into} + +\item{b}{Number of buckets} + +\item{key}{Unique identifier} +} +\value{ +Recall and runtime +} +\description{ +import blink +} +\examples{ +r.set <- RLdata500[1:50,c(-2)] +eval.blocksetup(r.set, k=2, b=22, key=identity.RLdata500) +} diff --git a/man/extract_pairs_from_band.Rd b/man/extract_pairs_from_band.Rd new file mode 100644 index 0000000..cd0f9d6 --- /dev/null +++ b/man/extract_pairs_from_band.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/minhash_v2.R +\name{extract_pairs_from_band} +\alias{extract_pairs_from_band} +\title{Function that extracts pairs of records from a band in the signature matrix M +import bit64} +\usage{ +extract_pairs_from_band(a_band) +} +\arguments{ +\item{a_band}{Band of the signature matrix M} +} +\value{ +The edgelist of record pairs that are connected +} +\description{ +Function that extracts pairs of records from a band in the signature matrix M +import bit64 +} +\examples{ +band1 <- c(2,1,2,1,2) +extract_pairs_from_band(band1) +band2 <- c(6,7,8,9,6) +extract_pairs_from_band(band2) +band.12 <- rbind(band1, band2) +apply(band.12,1,extract_pairs_from_band) +} diff --git a/man/hash_signature.Rd b/man/hash_signature.Rd new file mode 100644 index 
0000000..0385540 --- /dev/null +++ b/man/hash_signature.Rd @@ -0,0 +1,29 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/minhash_v2.R +\name{hash_signature} +\alias{hash_signature} +\title{Function to take a signature matrix M composed of b bands and r rows and return +a bucket for each band for each record} +\usage{ +hash_signature(signature, b) +} +\arguments{ +\item{signature}{Signature matrix M composed of b bands and r rows} + +\item{b}{Number of bands} +} +\value{ +Bucket for each band for each record +} +\description{ +Function to take a signature matrix M composed of b bands and r rows and return +a bucket for each band for each record +} +\examples{ +head(data <- RLdata500[-c(2,4)]) +minidata <- data[1:2,] +head(all_the_shingles <- apply(minidata,1,shingles,k=8)) +head(minhash.minidata <- minhash_v2(all_the_shingles, p=10)) +hash_signature(minhash.minidata, b=2) +hash_signature(minhash.minidata, b=5) +} diff --git a/man/minhash_v2.Rd b/man/minhash_v2.Rd new file mode 100644 index 0000000..a41e0ef --- /dev/null +++ b/man/minhash_v2.Rd @@ -0,0 +1,31 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/minhash_v2.R +\name{minhash_v2} +\alias{minhash_v2} +\title{Function to create a matrix of minhashed signatures} +\usage{ +minhash_v2( + shingled_records, + p, + do_one_hash_and_record = do_one_hash_and_record +) +} +\arguments{ +\item{shingled_records}{Shingled records} + +\item{p}{Number of permutations to be applied to the hash function} + +\item{do_one_hash_and_record}{Combination of one hash and one record} +} +\value{ +Computes an integer-valued matrix of minhash signatures with one row per permutation and one column per record +} +\description{ +Function to create a matrix of minhashed signatures +} +\examples{ +head(data <- RLdata500[-c(2,4)]) +minidata <- data[1:2,] +head(all_the_shingles <- apply(minidata,1,shingles,k=8)) +head(minhash.minidata <- minhash_v2(all_the_shingles, p=10)) +} 
diff --git a/man/my_hash.Rd b/man/my_hash.Rd new file mode 100644 index 0000000..c286847 --- /dev/null +++ b/man/my_hash.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/minhash_v2.R +\name{my_hash} +\alias{my_hash} +\title{Function that applies a hash function to each column of the band from the +signature matrix +import bit64} +\usage{ +my_hash(a_band) +} +\arguments{ +\item{a_band}{Band from the signature matrix M} +} +\value{ +a 64 bit integer +} +\description{ +Function that applies a hash function to each column of the band from the +signature matrix +import bit64 +} +\examples{ +band1 <- c(2,1,2,1,2) +band2 <- c(4,5,2,1,9) +combined_band <- rbind(band1,band2) +my_hash(combined_band) +} diff --git a/man/primest.Rd b/man/primest.Rd new file mode 100644 index 0000000..64f99be --- /dev/null +++ b/man/primest.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/minhash_v2.R +\name{primest} +\alias{primest} +\title{Function to generate all primes larger than an integer n1 (lower limit) and less than any other integer n2 (upper limit)} +\usage{ +primest(n1 = 1, n2) +} +\arguments{ +\item{n1}{An integer taken to be 1 as the default} + +\item{n2}{Any integer n2} +} +\value{ +Generates all prime numbers with the above constraints +} +\description{ +Function to generate all primes larger than an integer n1 (lower limit) and less than any other integer n2 (upper limit) +} +\examples{ +primest(1, 5) +primest(1, 17) +} diff --git a/man/reduction.ratio.Rd b/man/reduction.ratio.Rd new file mode 100644 index 0000000..fc68218 --- /dev/null +++ b/man/reduction.ratio.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/blocking-evaluations.R +\name{reduction.ratio} +\alias{reduction.ratio} +\title{Returns the reduction ratio associated with a blocking method} +\usage{ +reduction.ratio(block.labels) +} +\arguments{ 
+\item{block.labels}{A list of the block labels.} +} +\value{ +The reduction ratio +} +\description{ +Returns the reduction ratio associated with a blocking method +} +\examples{ +tlsh.blocks <- block_setup_v2(r.set = RLdata500[1:50,c(-2)], b=22, save_signature=FALSE, k=2) +block.ids <- block.ids.from.blocking(tlsh.blocks) +reduction.ratio(block.ids) +} diff --git a/man/reduction.ratio.from.blocking.Rd b/man/reduction.ratio.from.blocking.Rd new file mode 100644 index 0000000..6a6a51b --- /dev/null +++ b/man/reduction.ratio.from.blocking.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/blocking-evaluations.R +\name{reduction.ratio.from.blocking} +\alias{reduction.ratio.from.blocking} +\title{Returns the reduction ratio associated with a blocking method} +\usage{ +reduction.ratio.from.blocking(blocking) +} +\arguments{ +\item{blocking}{The actual blocks} +} +\value{ +The reduction ratio +} +\description{ +Returns the reduction ratio associated with a blocking method +} +\examples{ +tlsh.blocks <- block_setup_v2(r.set = RLdata500[1:50,c(-2,-4)], b=10, save_signature=FALSE, k=1) +reduction.ratio.from.blocking(tlsh.blocks) +} diff --git a/man/rhash_funcs.Rd b/man/rhash_funcs.Rd new file mode 100644 index 0000000..e47622c --- /dev/null +++ b/man/rhash_funcs.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/minhash_v2.R +\name{rhash_funcs} +\alias{rhash_funcs} +\title{Function to generate a vector of random hash functions (or optionally one vector-valued function)} +\usage{ +rhash_funcs(n, size, vector.valued, perfect = FALSE) +} +\arguments{ +\item{n}{Number of random hash functions} + +\item{size}{Range of each size} + +\item{vector.valued}{Flag for outputting vector of functions or vector-valued function} + +\item{perfect}{Flag for whether a perfect permutation should be done, or just a hash function} +} +\value{ +Vector of n hash functions or a function which will take 
a number and return a vector of n different hashes of it +} +\description{ +Function to generate a vector of random hash functions (or optionally one vector-valued function) +} +\examples{ +rhash_funcs(1, 1, vector.valued=FALSE, perfect=FALSE) +rhash_funcs(5, 1, vector.valued=FALSE, perfect=FALSE) +} diff --git a/man/shingled_record_to_index_vec.Rd b/man/shingled_record_to_index_vec.Rd new file mode 100644 index 0000000..8fc0dbb --- /dev/null +++ b/man/shingled_record_to_index_vec.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/minhash_v2.R +\name{shingled_record_to_index_vec} +\alias{shingled_record_to_index_vec} +\title{Function to convert to tell what index the shingle corresponds to in the record} +\usage{ +shingled_record_to_index_vec(shingled_record, universal_set) +} +\arguments{ +\item{shingled_record}{Shingled record} + +\item{universal_set}{Universal set of all shingles} +} +\value{ +the index regarding where the shingle falls in the record +} +\description{ +Function to convert to tell what index the shingle corresponds to in the record +} +\examples{ +shingles("Alexander",2) +shingles("Alexander Smith", 2) +shingled_record_to_index_vec(shingles("Alexander",2), unique(shingles("Alexander Smith", 2))) +} diff --git a/man/shingles.Rd b/man/shingles.Rd new file mode 100644 index 0000000..80a2409 --- /dev/null +++ b/man/shingles.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/minhash_v2.R +\name{shingles} +\alias{shingles} +\title{Function to shingle (token or gram) a string into its k components} +\usage{ +shingles(record, k) +} +\arguments{ +\item{record}{String or record} + +\item{k}{Parameter k, which is the number of shingle, tokens, or grams to break the string into} +} +\value{ +Computes the shingled (tokened or grammed) version of a string +} +\description{ +Function to shingle (token or gram) a string into its k components +} +\examples{ 
+shingles("Alexander",2) +shingles("Alexander Smith", 2) +} diff --git a/vignettes/blocks_members.Rdata b/vignettes/blocks_members.Rdata new file mode 100644 index 0000000000000000000000000000000000000000..45416a37dec9324926bd2b155a4e079507cb0cad GIT binary patch literal 946 zcmV;j15NxNiwFP!0000019ey1P82~9ZMh`M1K9^2c<^CDh%pAq%0(A1(A;k#u+q7kBzALvivQ|43n1Rs1hj`led>%f|rL-lmkIaO2DJzHOCPY<<+oOAtd!1cNQ z!QxLh*leu5tT{K(R~V0q2TuwdKEL{Q@%5&AeR+L;`QxVW7%m?E@=x8rfBqG)0{jSk z0i5G{h`0mW0e7&TzPcjs zyv*w@a7y^JALu_%9l`SKsN(uAMqDpGv9sitOOfxTlP7LH}0%n z)`9(~^`?II8J_#P*i&LN-@3Q!@2swJPT5y{ztlyY)ItAxuzvO2#^e43cp!NlK-U4{ zRJfkod>E%8x_q$v#_aQwhwrBWzR@208rWBLHZJ)Z(6tq8oTjt-9gua}1hOto=x7F; zUo*FHIY%48>d^C$yzAK4A}?(A8|RGv%#V6GFD-BRs6+dKHtJs1(TN}VMrB_sSpDoz z&RHUSn(rIoW8GT0^(VjTUkujYi9X(2^g9IdeehnH7Co!tSNi*hc~lqcrhWYZ$i8QO zhrz~q4`dxV&&-o~GcN0MnA>%|=jcB!eib0=umQtzMk?oFN#ap&MNseI*T5>{ zA}{vAtg}3AtXBuf^Sa}0T*l*fK?lhDmpVGR)ptZhydL-By*^+akH9k}>pE{o z;C0UOXkQ(REp=u*#WEj$pKvZF1R3u*D)C$5ulJgszwB?GmwG-C`JF&M_B;FbH~{mA zNe{l{VLfG^l85h~b>sODLH@*v9selbGyClX$oU{2_c34Qb&^{>x?k=)e=2%Uz4f0I z%1Icdp)pX=-&4Q}FX#t#bV~qRwTS zTfQ0b<9CIgtLdHf%OtNXiI_+}%#Z!RdsBbcInk-#4~)z6m+>+WOf7GZbFYk_Ay1xX zdVgoP5w&mST)8&~S#I_6ywZD_=bzql)S2B`KkbKk%P+s;2QRL&o8rg2tNgb(gx~a^ UzkDyO-+%xA0h5z_8W|7(03Le$F#rGn literal 0 HcmV?d00001 diff --git a/vignettes/candidate_pairs_graph.Rdata b/vignettes/candidate_pairs_graph.Rdata new file mode 100644 index 0000000000000000000000000000000000000000..15d888a8160a50afb11af0c9f6842b8626839c97 GIT binary patch literal 12763 zcmaKz30P8F+xP2~Svin()GU?K%F2Pv%2`q~?Uqf=Xu`meV+HduJ7ZzZua8d_g?p2>)IEr`}bc! 
zv(~Qo=WEqKk>~vY%A@jAgp9e)=PLv&&L{X@J+S7(#U1(mjc$VDIemul5YBY5#plOf z#~L4XW*#|j^ytq;-z0isjK3!=X8N*4!14=ko%3y zmX|BIU;Va%jOI>kM7)q|b}P#~Xf;9e0+A|@$OSegp!6Kod&{j^bMDDw`m^@clQM{CoO=U_ZfK>P@*opa#Bh~#)_ljnR zNS}pU1d3cH!J;7ns~$b-^Gt{SmR`@^C_Ia+!qbt^`UGUp&sANH=dLcQQv#pMO{Mv& z$lmM%{dis3d(7>MtAjtj&2fPjo^BO zF|AeAoa|0Qo|-2${*>zt#}xCRDfpfms(#b@AhvjR4sa$lwVBp)O&SZLU5BG`gs4Cg zU=cDnE8Zg*`zapjN8`cboMvD*UaD{sh(~Wa!j@49Y5^Vd!!B7B%i!D!Y5x*($eywY z4WY1;Jy3Y*1=XR_;_7`_ZFSI}pK|)}-hSMJ7UL;%pAlqJF z)DaYxj8f(l`I4G%7IZc@O6De2kvQ>*5>oR~DZy86Ae4>^l9KvaU1yds-J&{S_ad5O&t4We zK(5jsv3|sf9p8B-W8CN8&_(H7RW5?Q$<%mQV?6B2NPGq};$#k!E*%oshhgD`-QStb z3ws6hw=$P3oXxmYX7+>-o&>EDRSwz$3PR2WJ|*?v0qpE>FdOqF4ujFhPA<(Rlt~;G(Sp{~VG76W z*0(;S33CY(&D0zj_iGRh-jLwwe!|dk?t(A(ExotcB$=>0CX`(u&|)0o`(uijNlvi7 z2wEyL7sX}Qh=L5E_Sr4R?YV+0BmS)rr ze9#~3jQCl!a6Lzr8KIZhZllP97 zL&?zjp5oyp99}vqPl66>N||VmlVI$8$cS*B_89Q7=Or_wBpaZ=0%|TDDaH47%#uXv!fAc)JDcZMcn*klk6{33b{oue8om zc>bx#ab9l2Migr*Uf^=+ig?29uZ)CyAkW{u6R*EXUO$+kZvsUAdHu>FVi`pIc&z$} zW@{k>NVauP-IfR5U(vxy*A>`CMIXpVuA75u6{O{L_!jeq9>x*Jb`(13b-8vw05z{~ z3}LfLX5+v&i)c&{3et>oB{W}xUFP1CblWZ}H?8<;?glj5HmLYl zm~ZjRdszk^*hvGm{_c7=RMWeM7%DUV=j0umWV?Lm?1#?vnj3o|%eCHxPKK`7TrD)u zFl7h_-+&$Q98xVv%lGp!#vFe)!FHQ@i14F}7O(2S{a?f3 zzou*qx68aM=k!A!&V8wbn3m*o@su;wEBj8LzrJvz#m#M;Efy(!Kc^56X$E>5=QO;m zdq*U+u>0quAeXtoX_J68ncIW|$pKGyjN%~!6`v8GNhZ)}`*+?j0`sXk0j)nS>c z1r%<=+11ihs4*d{dg&AX+4n9*@)FdGhC@`s0HWtMPCl+RBXGH6;!;cG`L2qf(JW*-AkW4>w=p(c-a~b%ZbGjM!`VH3B|U-Im=J06(mvPcojZxQ znWvG`Pnh2&ZYsud5?6sGl7sv#@>sq2QlkJkWg|P%vRxASpg*cGxoGd{X z5-Qnz^aQbuTSN~BORTE~QH@MfdJ`G(*mfM%#Py3+$(j?9ULfcq62^7Vmgs8Vzw)Ke z61q(867+tN$h=M`tm={yXSqx08tD6|Um9~SOA)#(Ex$$$1)dC@txx{E?E+^U^b(sNGJ|^OgKTCmkpz;F!S0o58Z|K=7>tD8|j^d zbRi$;i0Wp_#-*g#fT^Tkh5~yC&#?;lih9^GB^`=!JbqtScM8Vo$(cy<#tQXL1c@93 z_L|>m6E@=67391xM9QkbRU!fjiRVCQ4es4N zaP~&ZL{*^43;Iy3#mx8bwFG+5SR>sMu!l@#%5pHr!Nc8v0j;ZVmK|BFZ|69dR-6$n z8HjlEn$Eg3Gl-u{I^#%UstI)B8{At9gsEB%t%-M4(dIQV#3c~juec@QFWGkYDYn>& zSIwRB23=f1iF<(i;;lH~C`odvDAbnrF55s*{&hd+hQXdX!}n2HiECni&&CZZdVW4= 
z9Pe2-r5k1VPVSafbD^a(O2X1tGzWc*4d8{$boIX|{3X&p0iXU@>$dzgws8=yx`$wR zf%KhveHJ+5UKZiv^kH#!RV_-Z%#h+$ljT2BQH;srP0y4Oc&#ngf&;TaX%U|)P>*lR zkblCaS)hl1ezgQk6*Er^Z++bucQun(EkDc7oVl@Om?@*9(vsfF>%E3exH79@6Xv=jrM)Nw5|dl=tx{*j?G)(Ozo0zOxr)$e&Q zoDXU3?i=u|;Wkv(97}$)bU60tSDd*#Rd8_-#gN>II{`@6$Ii_Hq41Fcd_(hBu!4Tg z04=IS7BN|!e8M#Qor_ceQ{2H&m%#P?h=B#n(1-5tVGG+ml7TXE&s}K!6w!?DbFm}# z0^9CJG`X>kopd8LuDK?pl@yg~9bf#5=tk;W!>uBEpM_^G{gpv)x|+H4?#A4WT5PKD zlT{ab%C8?DrTo!8*)3?)h}+sl=p&eS-QY$UCUiH2(RZ;?1P#Izf!(d|RLwrDJIESPX#mA#6aXef5Ne%ro2 zJXLN-8-I?U@jvtq*;t9pVkZ=^&yJ#>NC>ip*y|IZ$*GAfwfJV*08LgHM-9hZ zgJ;xdam?rGueF4`fz%=|$xfdtAP{)32@XFWwHu7dV#Z&skh`F&P5YN_$sH%e+EUu$ z$Pj`uZoy@qqj=ha#uxa=ES~+S-&U&3TiTyK?GspGP(X%P#uu=!qpREo?^=%|0&rg0 zBf$y_QI(wMMN^|7eK7^n&`dTK7snA^AiBa?z@#4^cYq($Cy&N*?jN2R@n3BK{|02l zA$!KXX6Pl!t@VB+!66Yao7NBeRQfUTWx9?ayo%tL^9owQARZ{pjIH6W5jItZs9L|`vn70=hy-X%a$J&p z8nD|55(|Vldi>SQS*~F%7RTG_K-e0PX;9Ol$r*2xTX0tj(q?qd+Cpml!fM}M29kYJ zX0r)Ho}E<%ySsG#!8W z50V1}G9u*nCZ>i2rf}IRCx8j+bFSMI)wk_u>K8ip-Mx5xu|$9NFQUA5$0(&|J{Wsk>%nV5etUEDt}N|^dS_SxUK?4&E{ zJRT-}{vW(E$vatUfjd|NHEqe!E^5K=61Oh=i=+}nSyG8#S<-j&F8400jO>TJBDC#Pf>Y}D*IOXuty>oa!APEn7>EqeFlDDNvQ?t9F zcJ^KN5VMIi$}=#CUZS1y>c7ppD>SeGW3_yJndLHgbh)R_HwaVF z^VT;Aa6Ushy1bob9~(;DCg?gvd+j=++JNo}nd+`30A)QT!3a#F@*U2qAaW2A@dQ2^6byvXfC>s8y(x zp4Y*M@j$0ja*rT4MTJC7@Eh+*3ATfbdD7Dnj2QDM$?3qn>Z6d83W)dYRxAggu*SwC0^<$Z3N9{jG@ zL)e9Q3HZx{zzm1$3nj|X4H6iB7yK`|bMa9{ytrc;aCks$cBq$CCazkBgEF3sH_6+} z3*%`^N(Cm>xLn$4;JN1Zgi{izJX!)x+6zM9b|6mmVP-CnTYwG!P_T}%N^ZwL zSMYA}p7Zd$99|x;p$+(kMeRcEMXg1xUfvHXc&B-WkZJ6K1|NZ0Lq2Tymq;7A@I^5T zK2jLi)|}MP-FT&VvQk`R^1*H2(!QmSmtHO&Y!Dw?`e>c_`u4$MlS{u)Eel`QfB&g5 z@N=DGSU}17q>^h%+iE|Ws1R1`EGF-d4lpOvs>aRYK8`R%`X&`>tQXf7V}3n|KlK2e z7wJFOBz|lY7Bw%18L{>w2GRo;{cqE{hIW*Z_2YLYmQO(CrlCHtcb z9rB1L56ahoYsc4`iH)L$L)sK4EX0Nw*w2&4_jan!9lV#YLLVwy&AN@)0468eXrkBY+U3Ui zp;V)u)E=|>!=`wXiMGSlx^plebShy(ruN^pnfC=V@tylaYM7ART;UqREz*KCl_8=v z=SQy2cAkOB#Py~!p`LUbM;-16LKLkY$qXa~!!XgU+l1k5(QI?fjY|Uxgul<;3v*}q 
zblbe1Xfw;8Kb7#+GT{l9!SL;zbpSU(r-NYHo3lUOr$E^n@>N^)I zE(2Rw?f7!~^}^h^Ooitc%syZ25bo{JG^8|;LEl)AR+=Uu@GndgdSPdl+%=p*H@>yZ zUe!DUT9J`8v7(>;7yfc#3=?!g(?i1YJ1;jL~#pnei%JN zcthU%u|XE}z-Mqr+2}21NC0)Q1pBl36|yU8&?WxEVhJpy)zjp6)EBYiP5XREQTC@O z%ki?L`=&oMi_|H)i%%L zT+! zt(s3M^*8$%Z^Ze;ByI+L5%)(vgtc;cxZaC&)iOSEB#Y`5m#wK*GPQAdZhWp&JR!+Rj3pGEgfHujG1 zRm|Ev60V_}9@75P;p_t0h*=ggD)o1fY;{0>bm-Pb%)M91UD`;r;6UVOM?F1&+ko~v z3rji6ZZ+ZA!zFzOQ{ioEgZPQib)rp}Am{mlz1Y3c zKGK%4jghW7Nz{?ea5rY}6-(2YY4|e>5bZ+wBZe;JzN)s27kM2vh@B|4BFoa6&$VsB zZVyN_bCRH^zhP-2^0OvUaUM>{o#ligt}QNyLP73R_nln@z}?t= z3TN?X4Z7)uYv>kHX}L77Ccmcl#)kS0tQRk&&kvMQ#|kstZMMx^==3%>D4;@Oxok$m z0Z!D5%eb@l*X+)Z8SB2+@9w0>JVS*wco=_NwCIR>mlyHEX$lql<0*UKvTpUz=jna1 zmjHKC%Oi5t&B+$6fREE}+^0(m8^1}ZmwkiWah(2M|3s&j-bbyYJ?!#X)Mr}ieDjkG zSp9C2nStr+9VCK%^<(cL_W1PMzB6Mmi%kpUevK)fH6bs;e(a={MS3E!ra@Zw*h0`; z3>(9f2@dq$w3aa*`YS8Gfj@82)N^BIdB8xIv3)v+ih4ol2>jGY9VpXk?IjkM6jJA+ zhr{XYE;Wl`SdXZLJ-~Gf%AS0KptlyoaINpmKFhxpM4-|WW9_GZ4SXGb8uaW2Exw%l zUqXd*58-;N%-^~xBJ}b`3*E2N?_0->ZC{s0>s&UpfGvT)-h_`m>YcJkeJETax;yG~ z_n*bV+5bzN$ag!^>^{uce|FlNwcEmP;OiXttYftkCt>nYBhqqmSexi)YPJE1besE0@gBa36o4f2k3s7yfa!)&S!Z@M|*u1Iw2TGa@yD zB?cB~TzAjwaA{d;z?vzsbuR%DDy>fXl&G)!hu>6ptZg3kK8Y4+f&UL7;>AuP!XVQi zwy%g9`C+v6PeSg7E|Oo8fM_uqZ!ERq#)~UOPx+I1ql>d6W#(4IoN1E6=CKRxva&gv{U6p7P$JN2I=9J@0Xu8QZE;Nza({p z7GtLa074y6mye3a%u6Xto2m3F_rx#4FJMp!b{tWC(GcJNad9!IoIY-X3L1!|-yEZm z5$cqQ{XI4EW86QC+5A~de8Bt`#;>|k{PROT{N$wms7yZb&AVDxv<7>EJ6`cHqu>Va zvlJAyPDoE}x%oiu0Xw=y&~}}KlD{(RWp!AbiuWf{3j?#w4d$0~79JR9S120THWl#r zf2j^9&Q1xQ4np$#YNVJ?gXZHGhO(BtQS=K9qOU|z>YH1`AfAf^MxyYiu!Awb!1-82 z1BUZ9umG1pj^2VFUXBgG^8MjclqQt$97`j!npER;IKSt{6+=y$6B9nKL1u6a4dta14;1R zt(A`0<##j=O;p0Eg>Nr-?$mK%citIHII3gAq_kaxj>1&=z|U)W8wrdDt6%MC-*du; zEYIGWr1=+CnhAx;?XS!SsZ?RMTI*v) zthw6n2>hmS>Z$}dSeO(>X{KcfR z`*N;cLC50nQr6q-dnj90UPzs~l`+N5rl&N-Dl0JfNRC=H)yeA6+jC~p>sKt>areF! 
z+SqQWSwWDQqt?0oL!8`&->b2yR-f&EmVa2->4%#<`9#FRdq^T!RX7Y zPx6qb769ce{j=H8&0`j2GHJf)V@qr8NwLwZ2wkD62gz-7 zP#rzROQnj?GMCu@bG_hqM!c0bgkogIo!Hub#k+@3--2_G|NPw`@kVO*1XquV?*RFNl7^39+5| zt@(7DbNlw89r`JG>%BuQ6V#(Dw+oHG#2uZ}W*mFGNpsVDigaC=pVs3eWr14qXA+}P z!45_nV>6@lNA^CN%rLmfZ6;^oi>GDx9#)JnC(MM z+MF8ywwOb|jC(uFjUxY5+{aB>S_8ym2!hNO^|AlIQn2}#m%($%0l<9*HmRadv+ovy z*M<%8@nABBCRk->KH5a{D7Tt>2IUFz z%DI-KE@>jBsHdUNkWS@k&-^Gt*uw^BBo{~CCrkxPO;B2C?FWQDTZ!Y^=PaSQqVG{9 zOM1Amf6D|{akN&$T_3a5__(+!_eiG-xKw>NA>7W0(k9W>!!x2b*}+<3fn8=Q0moJkz$m{CNM?lZd8jd%HQ?Dl{kYhrwxt~^-_3TGO@ z?By}bFSq{R8i9Ih$^X&_w0@<;ZLK+697sxIY*crjNTM%jX88Be*fu_j@Tmz%t&goh zEd$;I|7o|r!*cVlGuW3)W^CJx)Kkc7PgFX#Hvd~5&_Et;dHgSNAZ%#Wpt;_mBS0Y& zI!JBK_6#~6__QUzPZlu*1TNm&`md$%PK42L064&&lZWg-8jc@+w)A52c6MPNw7$e7xgEy9sEqrs7*!#Z7CKlUu3GY0oL++|p*BSV^V zVrMK1?<67xH#5>L$8IHFjKyWRM+Jos!3C=2PfyaXopbF-dNTh(5F)7_=zJ#cJg~Xk z?EvZd^n>G&OExYY$jy&Gq8fv*^uK+wW=C(pg8tU>BmbiRdXK+Fre>`CmJ^X+ux%8Nbn+Ddh*eu)w11?rJ);gZY$~!kRwq!POo*g?eKpV*RfzCQDePU&noEK zpf~>`(xH50pL%()TZjRS#m_ln1L5GA;gaHkn;se34)eaTQ)oQ%Ibom_7!??cxui!iCkgJ;y#@2FAcHa!ksIZ!QnCQ2x z(cyY@i}jrQ7L&Q7Ho@KQX&264_YJIS(HXRyt+3W_oCeghuD6tA=1cc*tj2B=3#awH zXrYpQeW#z+?C#wV1>YivfA8!?EJFr6k8HjMn4X$pp|*WF1$%A1`@6On6SN}64qtQA zzTtqKzcsKr?nOt{^oY-GTauTTd$HhWa$T4&_IwHbN^1}DZD3zi{EJfC&bX;^CDo^0 z4;_of1=-Htc^i@Nl;Z+(2&?N6y#aKJB{zh+E$5WL{?#1Bz*Oo=>5^YNzxh7^e*d-M z%|cx}U!i@31^?OHAYAGs&AMr&jNT<>LHV0>CC8P~kLL*K98GWs!iQ7;`EAh3&S53$Kh)*^1l5(9{ebr{ z`wlN%d35cHpXq+=JY(Lg!ATj~bMvb7c^u@t`k0wwD;1k_V<@h!q?#(W(#c)kwt z-|Rk{?=l4316>dq$>5<9lMYOYsnT>iFjK9y+~)e1qIUUSA?uqFs3nXktn0a<6aX#^ z4I=4P%yvu(yB%`|pAH%YZWOu_(vwjyT?px*hw3e`=aQ18pBu`#xpD}CT_rTWu|oJa zzRx#a_s6ND1^j5bb@c9I><4w+DeTN2PTi(3Vt80dk3vmkP)|jy$SxRdrSfDrWsm zd(ZY@QK8FQL7C)+F-Zl46sdrIqLO(RMXZ=Qg*~W|k)#3=%1O@^ZW1*KbQ;E|_bNQl z&WbF)E>D{e?+xftc*H9|-bC_&XY$)}-LUZ~;GYD(+hdixLU1V)nk`qQA5{owI$>5* z*s{;h6%P+(%iSe55@^_@($S}M1ZB(p`o4TqcmT@#%9<7RBmKQ7B|xEERq=-+6pQB_`qRd$$S^!4xNhF-;la+=DR&lLmp`h38Hs+@}Yt;!VP%Aho( 
z2P(=SPi3~(tHI37sTT+CVP9k2v#U>cq^CVKvE7SduZz<9;_!vI_~crWmpP`FxcDHb z`r-rj=O?IQi=?-=8xmX_5?p+?_*Lp4Dz`w_zFIuiW}4!bguk!vm&J{FX`NNHsN8Bv z`8@4{syZ(#L@~!sGskxX=t#}UJH5{r+RL*%-hJ_l4facVT-bP!5>~~#aB<<>9Ax}j zw2h8T8&E}d#YPAS!q{GG4OiRSk`FIHv)-P2pu4S?@Fo2B#T7PA;&ojr zz&44LANgOKvTAsmKlcUa{mo=ey!R&bgwzEU3)djFYnnEz&YKiAo?Fn;tJ9Tz*tVUc z5wsG`5UF->@lksiYxPt65#q%#KQYHj=sc{4NmObj@ed{K^lY%*$0Ec;HhED0wsND_ zTC6o^&v)&<>~?%%lyXa!zdB$gnAX+AE`6y`MbHNpCh*%Zj-TWkXc_I<&@Mwd5IwEp7MBMiV@gSPv z(}z6d?a|ty*m(v1q!K6J^Dxgd9a`PY$ zeb6Z|FE@Hsc2<-*B~yNv`G&lQ5~wz&Mad-J;aC)%n6y+DK%5g!Hm2zuQ#)`S3ts9M}M+1sqtIaH8d z=-Z)JWG7j9;h#`<7E5g^w>h{hQ1rt57AV=5!YUjhP3We@9vFcpDA;yzz5MKC%Lg%Tb_4O+(e9Xq zS3_#!H%C+OJ4b!QD3y1}m$Ir)%j=8tg=bOUwnXDqeV#p(>&Pl?R#vQo80J`J6BtGb z`x#YfeCvds;P!?zw00S`)4{oLhs=SYt34^vCWe08WYB!Tr17zKYd|fr8BC~dV6?tlze8U(+7rKXsfQ`t9cTNlVbO1C&_zXXR&)zunI}0 z2_X9Q*&2{`b$#Xn@~(IUmN{t}N>&Vu5X + %\VignetteIndexEntry{tlsh} + %\VignetteEngine{knitr::rmarkdown} + %\usepackage[utf8]{inputenc} +--- +We present a small example from Steorts, R., Ventura, S., Sadinle, M., and Fienberg, S. (2014). "Blocking Comparisons for Record Linkage." Privacy in Statistical Databases (Lecture Notes in Computer Science 8744), ed. J Domingo-Ferrer, Springer, 252-268, \doi{10.1007/978-3-319-11257-2_20}. We will be using the blink package in R and the RLdata500 data set, which was previously available in the Record Linkage package (but has been deprecated). Here, we illustrate transitive LSH. + +In a record linkage task one wants to remove duplicate +entries from multiple databases. However, before performing this task, one needs to perform a means of dimension reduction so that the record linkage task is computationally scalable. + +Using the TLSH algorithm, we illustrate an example of using this package using a German dataset comprised of first and last name and full date of birth. + +Our goals include + +- Presenting the RLdata500 dataset with summary information. 
+- Illustrating how we can format the RLdata500 dataset to work with TLSH. +- Running TLSH on the RLdata500 data set to create blocks. +- Explaining the tuning parameters of TLSH and how to choose these in practice with evaluation metrics. +- Sample output and visualizations. + +## Understanding the RLdata500 dataset + +The RLdata500 dataset exists already in the blink package in R. We review this data set for the user. + +The RLdata500 data consists of 500 records with 10 percent duplication. Thus, there are 450 unique individuals. There is full information on each record containing first name, last name, and full date of birth. + +We first load the blink package and load the RLdata500 data set. We also provide the first few lines of the data. We also remove the columns with missing values (these columns are entirely missing in this data set). + +```{r, echo=TRUE, message=FALSE, knitr::opts_chunk$set(cache=TRUE)} +library(blink) +library(plyr) +library(tlsh) +data(RLdata500) +head(RLdata500) +data.500 <- RLdata500[-c(2,4)] +head(data.500) +``` + +## TLSH applied to RLdata500 + + + +We now explain how to run TLSH on the RLdata500 data set, piece by piece. + +1. We first must create a universal set of tokens. +2. We then find the number of tokens in the universal set. +3. Then we must generate a vector of random hash functions. +4. Next, we must create an index vector and apply the hash functions to each record. +5. Then we build an edgelist, divide the graph into communities initially, and sub-divide the communities further if needed. +6. Finally, we have our blocks. +7. Then we can compute the dimension reduction and the recall. + +The function that finds the blocks is called **block_setup_v2**. + +```{r} + blocks <- block_setup_v2(RLdata500, b=22, k=2) + summary(blocks) +``` + +where b is the number of **buckets** and k is the **shingle size**. + +Observe that the blocks are roughly the same size; however, this does not have to be the case. 
+ + +The function that allows us to find the recall is **eval.blocksetup**. + +```{r} +eval.blocksetup(RLdata500, b=26, key=identity.RLdata500) +``` + +The function that allows us to find the reduction ratio is **reduction.ratio.from.blocking**. + +```{r} +(rr <- reduction.ratio.from.blocking(blocks)) +``` + +To summarize, we have reduced the entire space by roughly 66 percent and the recall is 0.90, which means we are only splitting records across blocks 10 percent of the time. + +