From 42beae160f662fa6f4c67735f55ee13debed7769 Mon Sep 17 00:00:00 2001
From: Rebecca Steorts <beka@stat.duke.edu>
Date: Fri, 6 Nov 2020 16:00:02 +0000
Subject: [PATCH] version 0.1.0

---
 DESCRIPTION                           |  21 ++
 MD5                                   |  27 ++
 NAMESPACE                             |  22 ++
 R/blocking-evaluations.R              |  99 ++++++
 R/minhash_v2.R                        | 380 +++++++++++++++++++++
 R/tlsh.R                              | 115 +++++++
 build/vignette.rds                    | Bin 0 -> 188 bytes
 inst/doc/tlsh.R                       |  19 ++
 inst/doc/tlsh.Rmd                     |  86 +++++
 inst/doc/tlsh.html                    | 464 ++++++++++++++++++++++++++
 man/block.ids.from.blocking.Rd        |  21 ++
 man/block_setup_v2.Rd                 |  28 ++
 man/compare_buckets.Rd                |  28 ++
 man/confusion.from.blocking.Rd        |  27 ++
 man/eval.blocksetup.Rd                |  27 ++
 man/extract_pairs_from_band.Rd        |  27 ++
 man/hash_signature.Rd                 |  29 ++
 man/minhash_v2.Rd                     |  31 ++
 man/my_hash.Rd                        |  27 ++
 man/primest.Rd                        |  23 ++
 man/reduction.ratio.Rd                |  22 ++
 man/reduction.ratio.from.blocking.Rd  |  21 ++
 man/rhash_funcs.Rd                    |  27 ++
 man/shingled_record_to_index_vec.Rd   |  24 ++
 man/shingles.Rd                       |  23 ++
 vignettes/blocks_members.Rdata        | Bin 0 -> 946 bytes
 vignettes/candidate_pairs_graph.Rdata | Bin 0 -> 12763 bytes
 vignettes/tlsh.Rmd                    |  86 +++++
 28 files changed, 1704 insertions(+)
 create mode 100644 DESCRIPTION
 create mode 100644 MD5
 create mode 100644 NAMESPACE
 create mode 100644 R/blocking-evaluations.R
 create mode 100644 R/minhash_v2.R
 create mode 100644 R/tlsh.R
 create mode 100644 build/vignette.rds
 create mode 100644 inst/doc/tlsh.R
 create mode 100644 inst/doc/tlsh.Rmd
 create mode 100644 inst/doc/tlsh.html
 create mode 100644 man/block.ids.from.blocking.Rd
 create mode 100644 man/block_setup_v2.Rd
 create mode 100644 man/compare_buckets.Rd
 create mode 100644 man/confusion.from.blocking.Rd
 create mode 100644 man/eval.blocksetup.Rd
 create mode 100644 man/extract_pairs_from_band.Rd
 create mode 100644 man/hash_signature.Rd
 create mode 100644 man/minhash_v2.Rd
 create mode 100644 man/my_hash.Rd
 create mode 100644 man/primest.Rd
 create mode 100644 man/reduction.ratio.Rd
 create mode 100644 man/reduction.ratio.from.blocking.Rd
 create mode 100644 man/rhash_funcs.Rd
 create mode 100644 man/shingled_record_to_index_vec.Rd
 create mode 100644 man/shingles.Rd
 create mode 100644 vignettes/blocks_members.Rdata
 create mode 100644 vignettes/candidate_pairs_graph.Rdata
 create mode 100644 vignettes/tlsh.Rmd

diff --git a/DESCRIPTION b/DESCRIPTION
new file mode 100644
index 0000000..8f7a986
--- /dev/null
+++ b/DESCRIPTION
@@ -0,0 +1,21 @@
+Package: tlsh
+Type: Package
+Title: Transitive Locality-Sensitive Hashing (LSH) for Record Linkage
+Version: 0.1.0
+Authors@R: person("Rebecca", "Steorts", email = "beka@stat.duke.edu",
+  role = c("aut", "cre"))
+Depends: R (>= 3.5.0), blink, stats, utils, plyr, igraph, bit64
+Imports:
+Suggests: knitr, ggplot2, rmarkdown
+VignetteBuilder: knitr
+Description: An implementation of the blocking algorithm transitive locality-sensitive hashing (TLSH) in Steorts, Ventura, Sadinle, Fienberg (2014) <DOI:10.1007/978-3-319-11257-2_20>, which is a k-means variant of locality sensitive hashing. The method is illustrated with examples and a vignette. 
+Encoding: UTF-8
+LazyData: true
+License: GPL-3
+RoxygenNote: 7.1.1.9000
+NeedsCompilation: no
+Packaged: 2020-11-02 16:22:06 UTC; rebeccasteorts
+Author: Rebecca Steorts [aut, cre]
+Maintainer: Rebecca Steorts <beka@stat.duke.edu>
+Repository: CRAN
+Date/Publication: 2020-11-06 17:00:02 UTC
diff --git a/MD5 b/MD5
new file mode 100644
index 0000000..5d0b21c
--- /dev/null
+++ b/MD5
@@ -0,0 +1,27 @@
+395f2eae2242b16010635c0159710d32 *DESCRIPTION
+b308deeacf7be1a89891ae2523b10aed *NAMESPACE
+883a5648af177469263685d621b6968b *R/blocking-evaluations.R
+54acb8a8a2d3db0d8846427f6096e22d *R/minhash_v2.R
+10eb3c9ee8f70eb94b3e60164268dc45 *R/tlsh.R
+b5b601a33c1238f64d7521e55fa5a4ca *build/vignette.rds
+4c8da6ba82b58b79ae31cb933f833730 *inst/doc/tlsh.R
+043c61554a7b1f24e72a3b813bf9cef4 *inst/doc/tlsh.Rmd
+7c94d16e855c117546c0893ddba9edf5 *inst/doc/tlsh.html
+239e26c6ffab4f19c228f2b4965ef831 *man/block.ids.from.blocking.Rd
+68b865ead685190f24280d8bf17d4378 *man/block_setup_v2.Rd
+431dfd43b092d3359adf85ffcb837de9 *man/compare_buckets.Rd
+cf7770443b48e38aa67e4551b58ce8c1 *man/confusion.from.blocking.Rd
+eed4ab096a637c80534def65b5e11a42 *man/eval.blocksetup.Rd
+b634e13566eabb03db9f007c03806fe3 *man/extract_pairs_from_band.Rd
+4154bc7a1e4f546ac23c29bda5995a67 *man/hash_signature.Rd
+8d6a8176be4b368f9eb7bb39aacd90e8 *man/minhash_v2.Rd
+27ebdba13d154cc2051cbb266b2282ee *man/my_hash.Rd
+2be0caecb0d785afb805c7ee861b4c42 *man/primest.Rd
+b28b796a274f35ee917457e8727d3b49 *man/reduction.ratio.Rd
+87093270eb12fdf33bffe80d6bb5c3ba *man/reduction.ratio.from.blocking.Rd
+88bc1a83dda8e41d19bc2f11a4055f3b *man/rhash_funcs.Rd
+eeb46ff58a2f61eed834eec73ada48f8 *man/shingled_record_to_index_vec.Rd
+71ce81206422e086585828a257af8ab3 *man/shingles.Rd
+da525e154d909a4b9ec3184d02867e5f *vignettes/blocks_members.Rdata
+91073e85e36c66cafac6f0b6bb54e1c0 *vignettes/candidate_pairs_graph.Rdata
+043c61554a7b1f24e72a3b813bf9cef4 *vignettes/tlsh.Rmd
diff --git a/NAMESPACE b/NAMESPACE
new file mode 100644
index 0000000..ba3292d
--- /dev/null
+++ b/NAMESPACE
@@ -0,0 +1,22 @@
+# Generated by roxygen2: do not edit by hand
+
+export(block.ids.from.blocking)
+export(block_setup_v2)
+export(compare_buckets)
+export(confusion.from.blocking)
+export(eval.blocksetup)
+export(extract_pairs_from_band)
+export(hash_signature)
+export(minhash_v2)
+export(my_hash)
+export(primest)
+export(reduction.ratio)
+export(reduction.ratio.from.blocking)
+export(rhash_funcs)
+export(shingled_record_to_index_vec)
+export(shingles)
+import(bit64)
+import(blink)
+import(igraph)
+import(plyr)
+import(utils)
diff --git a/R/blocking-evaluations.R b/R/blocking-evaluations.R
new file mode 100644
index 0000000..04cb2dd
--- /dev/null
+++ b/R/blocking-evaluations.R
@@ -0,0 +1,99 @@
+#' Evaluate a blocking against the true record identities.
+#'
+#' Classifies every pair of records by whether the blocking puts the pair in
+#' the same block and whether the pair truly matches, then derives metrics.
+#'
+#' @import blink
+#' @param blocking A list of blocks, each a vector of record indices
+#' @param true_ids The true identifiers, indexed by record
+#' @param recall.only If TRUE, return only the recall
+#' @return The recall when \code{recall.only} is TRUE; otherwise a list holding
+#' the confusion table (first, unnamed element) followed by recall, precision,
+#' fpr, fnr, accuracy and specificity
+#' @export
+#' @examples
+#' r.set <- RLdata500[1:250,c(-2)]
+#' tlsh.blocks <- block_setup_v2(r.set, b=22, save_signature=FALSE, k=2)
+#' confusion.from.blocking(tlsh.blocks, identity.RLdata500, recall.only=TRUE)
+
+confusion.from.blocking <- function(blocking, true_ids, recall.only=FALSE) {
+  # Convert the blocking into a vector of block labels, one per record
+  block.ids <- rep(NA_integer_, sum(lengths(blocking)))
+  for (ii in seq_along(blocking)) block.ids[blocking[[ii]]] <- ii
+  if (length(block.ids) < 2) {
+    stop("at least two records are needed to form record pairs", call. = FALSE)
+  }
+
+  # Each column of candidate.pairs holds the indices of one pair of records
+  candidate.pairs <- combn(length(block.ids), 2)
+  same.block <- block.ids[candidate.pairs[1, ]] == block.ids[candidate.pairs[2, ]]
+  same.truth <- true_ids[candidate.pairs[1, ]] == true_ids[candidate.pairs[2, ]]
+
+  # Tabulate same-block (rows) against same-truth (columns). Fixing the levels
+  # keeps the table 2 x 2 even when an outcome never occurs (e.g. no pair
+  # shares a block), so the cell lookups below cannot go out of bounds.
+  outcomes <- c(FALSE, TRUE)
+  confusion <- table(factor(same.block, levels = outcomes),
+                     factor(same.truth, levels = outcomes),
+                     dnn = c("same block?", "actually same?"))
+
+  false.positives <- confusion[2, 1]
+  false.negatives <- confusion[1, 2]
+  true.positives <- confusion[2, 2]
+  true.negatives <- confusion[1, 1]
+  recall <- true.positives / (false.negatives + true.positives)
+  if (recall.only) return(recall)
+
+  n.pairs <- true.positives + true.negatives + false.negatives + false.positives
+  list(confusion,
+       recall = recall,
+       precision = true.positives / (true.positives + false.positives),
+       fpr = false.positives / (false.positives + true.negatives),
+       fnr = false.negatives / (false.negatives + true.positives),
+       accuracy = (true.positives + true.negatives) / n.pairs,
+       specificity = true.negatives / (true.negatives + false.positives))
+}
+
+#' Returns the block ids associated with a blocking method.
+#'
+#' @import blink
+#' @param blocking A list of blocks, each a vector of record indices.
+#' @return A vector giving, for each record, the index of the block containing
+#' it; an empty blocking yields an empty vector
+#' @export
+#' @examples
+#' tlsh.blocks <- block_setup_v2(r.set = RLdata500[1:250,c(-2,-4)], b=10, save_signature=FALSE, k=1)
+#' block.ids.from.blocking(tlsh.blocks)
+block.ids.from.blocking <- function(blocking) {
+  # lengths() and seq_along() also behave for an empty list of blocks
+  block.ids <- rep(NA_integer_, sum(lengths(blocking)))
+  for (ii in seq_along(blocking)) block.ids[blocking[[ii]]] <- ii
+  block.ids
+}
+
+#' Reduction ratio of a vector of block labels
+#' @import blink utils
+#' @param block.labels A vector of block labels, one per record
+#' @return One minus the share of all record pairs that fall within a block
+#' @export
+#' @examples
+#' tlsh.blocks <- block_setup_v2(r.set = RLdata500[1:50,c(-2)], b=22, save_signature=FALSE, k=2)
+#' block.ids <- block.ids.from.blocking(tlsh.blocks)
+#' reduction.ratio(block.ids)
+reduction.ratio <- function(block.labels) {
+  pairs.within.blocks <- sum(choose(table(block.labels), 2))
+  1 - pairs.within.blocks / choose(length(block.labels), 2)
+}
+
+#' Reduction ratio computed directly from a list of blocks
+#' @import blink
+#' @param blocking The actual blocks
+#' @return The reduction ratio
+#' @export
+#' @examples
+#' tlsh.blocks <- block_setup_v2(r.set = RLdata500[1:50,c(-2,-4)], b=10, save_signature=FALSE, k=1)
+#' reduction.ratio.from.blocking(tlsh.blocks)
+reduction.ratio.from.blocking <- function(blocking) {
+  block.labels <- block.ids.from.blocking(blocking)
+  reduction.ratio(block.labels)
+}
diff --git a/R/minhash_v2.R b/R/minhash_v2.R
new file mode 100644
index 0000000..fa2ac53
--- /dev/null
+++ b/R/minhash_v2.R
@@ -0,0 +1,380 @@
+#' Function to shingle (token or gram) a string into its k components
+#'
+#' @param record String or record; a multi-field record is first collapsed into
+#' one string with its fields separated by single spaces
+#' @param k Number of characters in each shingle (token or gram)
+#' @return A list of all overlapping length-k substrings of the string, in
+#' order of position; a string shorter than k is returned as its only shingle
+#' @export
+#' @examples
+#' shingles("Alexander",2)
+#' shingles("Alexander Smith", 2)
+shingles <- function(record,k){
+  # factors only convert to characters properly elementwise, not by applying
+  # as.character() to a whole row of a data frame
+  string <- paste(lapply(record, as.character), collapse=" ")
+  n_tokens <- nchar(string) - k + 1
+  # seq(1, n) would count downwards for n < 1 and emit duplicate, truncated
+  # tokens, so short strings are handled separately
+  if (n_tokens < 1) return(list(string))
+  starts <- seq_len(n_tokens)
+  as.list(substring(string, starts, starts + k - 1))
+}
+
+#' Function to look up where each distinct shingle of a record sits in the
+#' universal set
+#'
+#' @param shingled_record Shingled record
+#' @param universal_set Universal set of all shingles
+#' @return For each distinct shingle of the record, in order of first
+#' appearance, its position in the universal set
+#' @export
+#' @examples
+#' shingles("Alexander",2)
+#' shingles("Alexander Smith", 2)
+#' shingled_record_to_index_vec(shingles("Alexander",2), unique(shingles("Alexander Smith", 2)))
+shingled_record_to_index_vec <- function(shingled_record, universal_set) {
+  # TODO: Check if the indicator form is ever needed?
+  #   1:length(universal_set) %in% token_indices
+  distinct_tokens <- unique(shingled_record)
+  match(distinct_tokens, universal_set)
+}
+
+#' Function to create a matrix of minhashed signatures
+#'
+#' The signature of a record holds, for each of p random hash functions, the
+#' smallest hash value among the record's shingles. Progress and timing
+#' information is printed along the way.
+#'
+#' @import blink
+#' @import plyr
+#' @param shingled_records Shingled records
+#' @param p Number of permutations to be applied to the hash function
+#' @param do_one_hash_and_record Ignored; retained only so that existing calls
+#' that supply it keep working
+#' @return Computes an integer-valued matrix of minhash signatures with one row per permutation and one column per record
+#' @export
+#' @examples
+#' head(data <- RLdata500[-c(2,4)])
+#' minidata <- data[1:2,]
+#' head(all_the_shingles <- apply(minidata,1,shingles,k=8))
+#' head(minhash.minidata <- minhash_v2(all_the_shingles, p=10))
+
+minhash_v2 <- function(shingled_records, p, do_one_hash_and_record=NULL) {
+  if (length(shingled_records) < 1) {
+    stop("shingled_records must contain at least one record", call. = FALSE)
+  }
+  if (p < 1) {
+    stop("p must be at least 1", call. = FALSE)
+  }
+
+  # Figure out the universal set of all tokens
+  print("Creating the universal set of tokens")
+  print(system.time(universal_set_tokens <- unique(do.call(c, shingled_records)))[3])
+  n.shingles <- length(universal_set_tokens)
+  print("Number of tokens in universal set")
+  print(n.shingles)
+
+  # Generate a vector of p random hash functions
+  print("Generating a vector of random hash functions")
+  print(system.time(vector_of_hash_funcs <- rhash_funcs(n=p, size=n.shingles, vector.valued=FALSE))[3])
+
+  # Minhash of one record under hash function number h: hash the index of
+  # every shingle in the record and keep the smallest hash value
+  # Presumes: rec_col is a vector saying which shingles (from the universal
+  # set) are present in the shingled record
+  min_hash_of_record <- function(h, rec_col) {
+    hashed_indices <- vector_of_hash_funcs[[h]](rec_col)
+    min(hashed_indices)
+  }
+
+  # Apply all the hash functions to one record: turn the shingled record into
+  # an index vector, apply every hash function, then discard the index vector
+  multiple_hash_one_record <- function(record) {
+    index_vec <- shingled_record_to_index_vec(record, universal_set_tokens)
+    sapply(seq_len(p), min_hash_of_record, rec_col=index_vec)
+  }
+
+  # Timing applying all the hash functions at once to one record
+  print("Creating index vector and applying hash functions to first record")
+  print(system.time(multiple_hash_one_record(shingled_records[[1]])))
+
+  # Apply that multi-hash-function to all records
+  signatures <- sapply(shingled_records, multiple_hash_one_record)
+
+  # sapply() collapses to a plain vector when p == 1; restore the documented
+  # one-row-per-hash-function shape so callers can rely on nrow()/ncol()
+  if (!is.matrix(signatures)) {
+    signatures <- matrix(signatures, nrow=p, dimnames=list(NULL, names(signatures)))
+  }
+  signatures
+}
+
+#' Function to generate all primes larger than an integer n1 (lower limit) and no larger than an integer n2 (upper limit)
+#'
+#' @param n1 An integer taken to be 1 as the default
+#' @param n2 Any integer n2
+#' @return Integer vector of all primes p with n1 < p <= n2 (possibly empty)
+#' @export
+#' @examples
+#' primest(1, 5)
+#' primest(1, 17)
+primest <- function(n1=1, n2){
+  if (n2 < 2 || n2 <= n1) return(integer(0))
+  # Sieve from 2, not n1 + 1: the prime factors of a composite may be <= n1
+  is_prime <- c(FALSE, rep(TRUE, n2 - 1))
+  for (d in seq_len(floor(sqrt(n2)))[-1]) {
+    if (is_prime[d]) is_prime[seq(d * d, n2, by = d)] <- FALSE
+  }
+  which(is_prime & seq_along(is_prime) > n1)
+}
+
+#' Function to generate a vector of random hash functions (or optionally one vector-valued function)
+#'
+#' @param n Number of random hash functions
+#' @param size Number of distinct values to be hashed; hash values are taken modulo size (or are a permutation of 1..size when perfect)
+#' @param vector.valued Flag for outputing vector of functions or vector-valued function
+#' @param perfect Flag for whether a perfect permutation should be done, or just a hash function
+#' @return Vector of n hash functions or a function which will take a number and return a vector of n different hashes of it
+#' @export
+#' @examples
+#' rhash_funcs(1, 1, vector.valued=FALSE, perfect=FALSE)
+#' rhash_funcs(5, 1, vector.valued=FALSE, perfect=FALSE)
+
+# TODO: replace this with digest
+rhash_funcs <- function(n, size, vector.valued, perfect=FALSE) {
+  if (size < 1) {
+    stop("size must be at least 1", call. = FALSE)
+  }
+  # Determine a suitable prime greater than size and <= 2*size
+  candidate_primes <- primest(size,2*size)
+  # Take the first suitable prime for simplicity's sake
+  the_prime <- candidate_primes[1]
+
+  if (!perfect) {
+    # Make up a random hash function by modulo arithmetic
+    make_one_hash_func <- function() {
+      # Generate a function of the form ((ax+b) mod the_prime ) mod size
+      # a,b < the_prime, a non-zero
+      a <- as.integer64(sample(1:(the_prime-1),size=1))
+      b <- as.integer64(sample(0:(the_prime-1),size=1))
+      # Cast to a 64-bit integer
+      the_prime <- as.integer64(the_prime)
+      size <- as.integer64(size)
+      function(x) {
+        x <- as.integer64(x)
+        as.integer(((a*x+b) %% the_prime) %% size)
+      }
+    }
+  } else {
+    # Make a perfect hash function that permutes the whole domain
+    make_one_hash_func <- function() {
+      perm <- sample(size)
+      function(x) { perm[x] }
+    }
+  }
+
+  # Make a list of n random hash functions; simplify=FALSE guarantees a list
+  # for every n, including n = 0
+  hash_func_list <- replicate(n, make_one_hash_func(), simplify=FALSE)
+  if (!vector.valued) {
+    return(hash_func_list)
+  }
+  # Create a function which takes a number and returns a vector,
+  # each component a different hash function's evaluation
+  # TODO: replace iteration with something more vectorized
+  # want many functions with one input, not many inputs
+  # to one function
+  function(x) {
+    h <- integer(n)
+    # seq_len(n) is empty when n = 0, unlike 1:length(h)
+    for (i in seq_len(n)) {
+      h[i] <- (hash_func_list[[i]])(x)
+    }
+    h
+  }
+}
+
+
+#' Function to take a signature matrix M composed of b bands and r rows and return
+#' a bucket for each band for each record
+#'
+#' @param signature Signature matrix M composed of b bands and r rows
+#' @param b Number of bands
+#' @return Matrix of buckets with one row per band and one column per record
+#' @export
+#' @examples
+#' head(data <- RLdata500[-c(2,4)])
+#' minidata <- data[1:2,]
+#' head(all_the_shingles <- apply(minidata,1,shingles,k=8))
+#' head(minhash.minidata <- minhash_v2(all_the_shingles, p=10))
+#' hash_signature(minhash.minidata, b=2)
+#' hash_signature(minhash.minidata, b=5)
+
+hash_signature <- function(signature,b){
+  # need to divide signature into b bands of rows
+  # so r = nrow(signature)/b, rounded down
+  r <- floor(nrow(signature)/b)
+  if (r < 1) {
+    stop("b must not exceed the number of rows of the signature matrix", call. = FALSE)
+  }
+  # drop=FALSE keeps each band a matrix even when it has a single row (r == 1)
+  # or the signature has a single column, as my_hash() works columnwise
+  extract_band <- function(i) { signature[((i-1)*r+1):(i*r), , drop=FALSE] }
+  bands <- lapply(seq_len(b), extract_band)
+  # hash each column of each band; binding the per-band results as rows gives
+  # one row per band and one column per record, whatever the number of records
+  do.call(rbind, lapply(bands, my_hash))
+}
+
+#' Function that applies a hash function to each column of the band from the
+#' signature matrix
+#'
+#' @import bit64
+#' @param a_band Band from the signature matrix M
+#' @return One hash value per column of the band
+#' @export
+#' @examples
+#' band1 <- c(2,1,2,1,2)
+#' band2 <- c(4,5,2,1,9)
+#' combined_band <- rbind(band1,band2)
+#' my_hash(combined_band)
+my_hash <- function(a_band) {
+  # Hash one column: its entries are pasted together, without separators, into
+  # a single string of digits which is read as a 64-bit integer and hashed
+  # NOTE(review): a pasted string too long for as.integer64() presumably
+  # becomes NA - confirm how such columns should be bucketed
+  hash_column <- function(x) {
+    digits <- paste(x, collapse="")
+    hashfun(as.integer64(digits), hashbits=64)
+  }
+  apply(a_band, 2, hash_column)
+}
+
+
+#' Function that extracts pairs of records from a band in the signature matrix M
+#'
+#' Records that share a bucket within the band form candidate pairs. A record
+#' that is alone in its bucket is paired with itself, so every record of the
+#' band appears in the result.
+#'
+#' @param a_band Band of the signature matrix M
+#' @return The edgelist of record pairs that are connected: a matrix with
+#' columns rec1 and rec2 and one row per pair
+#' @export
+#' @examples
+#' band1 <- c(2,1,2,1,2)
+#' extract_pairs_from_band(band1)
+#' band2 <- c(6,7,8,9,6)
+#' extract_pairs_from_band(band2)
+#' band.12 <- rbind(band1, band2)
+#' apply(band.12,1,extract_pairs_from_band)
+
+extract_pairs_from_band <- function(a_band) {
+  # Every record has been assigned a bucket within this band. Collect the
+  # pairs of records sharing a bucket here, irrespective of how they were
+  # bucketed in any other band.
+  pairs_sharing <- function(bucket) {
+    members <- which(a_band == bucket)
+    if (length(members) <= 1) {
+      # A bucket with one record yields that record paired with itself
+      return(matrix(members, nrow=2, ncol=1))
+    }
+    as.matrix(combn(members, m=2))
+  }
+
+  # Visit the buckets in order of first appearance; each contributes a
+  # two-row matrix with one column per pair
+  per_bucket <- lapply(unique(a_band), pairs_sharing)
+  # Join the per-bucket matrices columnwise, label the rows, and return the
+  # transpose so that each row is one pair
+  edges <- do.call(cbind, per_bucket)
+  rownames(edges) <- c("rec1", "rec2")
+  t(edges)
+}
+
+#' Function that creates a similarity graph and divides it into communities (or blocks) for entity resolution
+#'
+#' @import igraph
+#' @import plyr
+#' @param hashed_signatures The hashed signatures
+#' @param max_bucket_size The largest block size allowed by user
+#' @return A list of blocks; each block is the vector of indices of the records
+#' assigned to it
+#' @export
+#' @examples
+#' head(data <- RLdata500[-c(2,4)])
+#' minidata <- data[1:2,]
+#' head(all_the_shingles <- apply(minidata,1,shingles,k=8))
+#' head(minhash.minidata <- minhash_v2(all_the_shingles, p=10))
+#' hashed_signature <- hash_signature(minhash.minidata, b=5)
+#' compare_buckets(hashed_signature, max_bucket_size=200)
+
+# Create a similarity graph and divide it into communities, as blocks for record linkage
+# Inputs: minhashed signature matrix, maximum block size
+# Presumes: the signature matrix has been created by minhashing (or something like
+  # it), so 2 records matching in some row indicates non-trivial similarity
+compare_buckets <- function(hashed_signatures, max_bucket_size=1000) {
+  if (max_bucket_size < 1) {
+    stop("max_bucket_size must be at least 1", call. = FALSE)
+  }
+  # Create blocks from the buckets the bands were mapped to
+  # General idea: each record gets put in multiple buckets from the multiple minhashes
+  # Two records are a "candidate pair" if they get mapped to the same bucket
+  # by some minhash or other
+  # Form a graph, with records as nodes, and edges between candidate pairs
+  # Divide the graph into dense sub-graphs (communities), subject to a maximum
+  # size limit
+
+  # Each row of hashed_signatures represents the bucket-mapping of
+  # the records for a different band of the signature
+  # Extract the candidate pairs of each row, and then combine the resulting
+  # matrices into one big edgelist. lapply() is used rather than apply()
+  # because apply() simplifies equally-sized results into a single matrix,
+  # which do.call(rbind, ...) cannot take
+  print("Creating edgelist")
+  pairs_in_band <- function(i) extract_pairs_from_band(hashed_signatures[i, ])
+  band_rows <- seq_len(nrow(hashed_signatures))
+  edgelisting <- system.time(edgelist <- as.matrix(do.call(rbind, lapply(band_rows, pairs_in_band))), gcFirst=FALSE)
+  print(edgelisting)
+  print(dim(edgelist))
+
+  # Actually build the graph
+  # edgelist contains only edges in one direction, so we need to tell igraph
+  # that edges are directionless
+  print("Building graph from edgelist")
+  graphing <- system.time(candidate_pairs_graph <- graph.edgelist(edgelist, directed=FALSE), gcFirst=FALSE)
+  # edgelist isn't needed any more and can be quite big, so remove it from memory
+  rm(edgelist)
+  print(graphing)
+  # Remove multiple and self edges, if they exist
+  candidate_pairs_graph <- simplify(candidate_pairs_graph)
+
+  # Try dividing the graph into communities. Use a hierarchical community method
+  # so that if the initial cut has communities which are too big, we can go further down
+  # until they are small enough to work with.
+  print("Dividing graph into communities initially")
+  communitying <- system.time(initial_community <- fastgreedy.community(candidate_pairs_graph), gcFirst=FALSE)
+  print(communitying)
+
+  # The graph has served its purpose and should go away
+  rm(candidate_pairs_graph)
+
+  # Sub-divide communities if too big
+  max_comm_size <- max(sizes(initial_community))
+  comm_number <- length(initial_community)
+  comm_membership <- membership(initial_community)
+  print("Subdividng communities")
+  subdividing <- system.time(
+  while(max_comm_size > max_bucket_size) {
+    comm_number <- comm_number+1
+    comm_membership <- cutat(initial_community, no=comm_number)
+    max_comm_size <- max(table(comm_membership))
+  }
+  ,gcFirst=FALSE)
+  print(subdividing)
+
+  # Now create a list, saying which records are in which block
+  records_per_block <- function(block) { which(comm_membership == block) }
+  lapply(seq_len(comm_number), records_per_block)
+}
diff --git a/R/tlsh.R b/R/tlsh.R
new file mode 100644
index 0000000..f174ad9
--- /dev/null
+++ b/R/tlsh.R
@@ -0,0 +1,115 @@
+# This is one of the main blocking methods in Steorts, Ventura, Sadinle,
+# Fienberg (2014), Privacy in Statistical Databases.
+# If you use this code, please cite Steorts, R., Ventura, S., Sadinle, M., and
+# Fienberg, S. (2014). "Blocking Comparisons for Record Linkage." Privacy
+# in Statistical Databases (Lecture Notes in Computer Science 8744), ed. J
+# Domingo-Ferrer, Springer, 252-268, doi:10.1007/978-3-319-11257-2_20.
+
+# tlsh Copyright 2018 Rebecca C. Steorts (beka@stat.duke.edu)
+
+# tlsh is free software: you can redistribute it and/or modify it
+# under the terms of the Creative Commons license, either version 3 of the
+# license, or (at your option) any later version.
+
+# tlsh is distributed in the hope it will be useful, but without ANY WARRANTY;
+# without even the implied warranty of merchantability or fitness for a particular
+# purpose. Specifically, you may share the software in any medium or format and
+# you may adapt the software. Credit must be given when either of these are
+# given to indicate if and what changes were made. The software may not be
+# used for noncommerical purposes. If you are interested in using the software
+#  for commercial purposes, please contact the author above.
+###########################################################################################################
+
+
+#Begin working example
+# TODO: make sure the blocks are saved.
+
+#library(plyr)
+#library(digest)
+#library(RecordLinkage)
+#data(RLdata500)
+#minidata <- RLdata500[-c(2,4)]
+#The command
+#rl_data_500_b26 <- adply(1:5, .margins=1, .fun = eval.blocksetup,  dat=minidata, b=26, .expand=F,key=identity.RLdata500)
+#plot(1:5, rl_data_500_b26[,2],xlab="k",ylab="Recall")
+#plot(1:5, rl_data_500_b26[,3],xlab="k",ylab="Elaped Time")
+
+
+# will loop through shingles 1:5 and save the recall and the runtime. We should also
+# save the precision and reduction ratio as well.
+
+#rl_data_500_b22_30 <- adply(2:8, .margins=1, .fun = eval.blocksetup, dat = #RLdata500, b=22, .expand=F,key=identity.RLdata500)
+#save(rl_data_500_b22_30, file="rl_data_500_b22_10.Rdata")
+
+# plot(2:8, rl_data_500_b22_30[,2],xlab="k",ylab="Recall",ylim=c(0,0.95),type="b")
+# points(2:8, rl_data_500_b22_30[,2], xlab="k",ylab="Recall",ylim=c(0,0.1),pch=2,type="b")
+# points(2:8, rl_data_500_b22_50[,2], xlab="k",ylab="Recall",ylim=c(0,0.1),pch=3,type="b")
+# legend("bottomright", legend= c("10%", "30%","50%"), pch=c(1,2,3))
+
+#End working example
+
+# ATTN: There are additional functions below that will allow TLSH
+# to be integrated into random forests with a mapping function for
+# parallelization.
+
+
+#' Function to evaluate the blocking step
+#'
+#' @import blink
+#' @param dat Data set
+#' @param b Number of bands, passed on to block_setup_v2
+#' @param k Parameter k, the shingle size passed on to block_setup_v2
+#' @param key Unique identifier
+#' @return A one-column data frame holding the recall of the blocking
+#' @export
+#' @examples
+#' r.set <- RLdata500[1:50,c(-2)]
+#' eval.blocksetup(r.set, k=2, b=22, key=identity.RLdata500)
+eval.blocksetup <- function(dat, k=5, b=21, key){
+  # Block the records, then score the blocks against the true identifiers
+  blocks <- block_setup_v2(dat, b=b, k=k)
+  scores <- confusion.from.blocking(blocking=blocks, true_ids=key, recall.only=TRUE)
+  recall <- scores[[1]]
+  data.frame(recall)
+}
+
+
+
+#' Function that divides all records into bins using locality sensitive hashing and using TLSH (based upon community detection technique)
+#'
+#' @import blink
+#' @param r.set Record set: a data frame or matrix with one record per row
+#' @param b Number of bands the minhash signature is divided into (at most p)
+#' @param save_signature Flag of whether or not to save the signature
+#' @param k Shingle size
+#' @param p Number of random hash functions used for the minhash signature
+#' @param max_bucket_size Largest block size allowed, passed to compare_buckets
+#' @return List of blocks where a particular index is the record id in the original
+#' data set
+#' @export
+#' @examples
+#' r.set <- RLdata500[1:3,c(-2)]
+#' block_setup_v2(r.set = RLdata500[1:3,c(-2)], b=22, save_signature=FALSE, k=2)
+
+block_setup_v2 <- function(r.set, b=22, save_signature=FALSE, k=5, p=100, max_bucket_size=1000) {
+  if (b > p) {
+    stop("b (number of bands) must not exceed p (number of hash functions)", call. = FALSE)
+  }
+  # Convert each record (= row of r.set) to k-token shingles
+  shingled_records <- apply(r.set,1,shingles,k=k)
+  # Create the matrix of minhashed signatures, using p random hash functions
+  # ATTN: Put this in parallel and test that it works
+  minhash_time <- system.time(minhashed_records <- minhash_v2(shingled_records,p=p),gcFirst=FALSE)
+  print(minhash_time)
+  if(save_signature) {
+    # The signature is written to the working directory under a timestamped name
+    timestamp <- format(Sys.time(), "%Y_%m_%d_%H_%M_%S")
+    save(minhashed_records, file=paste("minhashed_signature", timestamp))
+  }
+
+  # Get rid of the shingled records as they've served their purpose
+  rm(shingled_records)
+
+  # Calculate signatures, put into buckets, make the graph, return blocks
+  compare_buckets(hash_signature(minhashed_records,b=b), max_bucket_size=max_bucket_size)
+}
diff --git a/build/vignette.rds b/build/vignette.rds
new file mode 100644
index 0000000000000000000000000000000000000000..1ae48dc855b8c744718fe07a08ca7ca47f2c7cd7
GIT binary patch
literal 188
zcmV;t07L&DiwFP!000001B>8dU|?WkU;$z#W+0PU7)Y=Iu>cS=0>wFjG)GBJafV({
zZVH+>3rHMIj1#Ojqa-&6O@a+#2+%fRuqZ?pWC|0KD%QNj+|*(;t?VwT1*v%{AmM-5
z^)voQcd~bCWqE!POb?10Hkk9>GILU4_J(AZz&XqTE^bgJV-UI<Fw94BKW9;XxgN|9
qyr8gc022R!0Nta>If=#S9_30&EJ@T$D+1~Vxf=kY(T_cy0RRAJR7d{+

literal 0
HcmV?d00001

diff --git a/inst/doc/tlsh.R b/inst/doc/tlsh.R
new file mode 100644
index 0000000..94104df
--- /dev/null
+++ b/inst/doc/tlsh.R
@@ -0,0 +1,19 @@
+## ---- echo=TRUE, message=FALSE, knitr::opts_chunk$set(cache=TRUE)-------------
+library(blink)
+library(plyr)
+library(tlsh)
+data(RLdata500)
+head(RLdata500)
+data.500 <- RLdata500[-c(2,4)]
+head(data.500)
+
+## -----------------------------------------------------------------------------
+ blocks <- block_setup_v2(RLdata500, b=22, k=2)
+ summary(blocks)
+
+## -----------------------------------------------------------------------------
+eval.blocksetup(RLdata500, b=26, key=identity.RLdata500)
+
+## -----------------------------------------------------------------------------
+(rr <- reduction.ratio.from.blocking(blocks)) 
+
diff --git a/inst/doc/tlsh.Rmd b/inst/doc/tlsh.Rmd
new file mode 100644
index 0000000..f265a6b
--- /dev/null
+++ b/inst/doc/tlsh.Rmd
@@ -0,0 +1,86 @@
+---
+title: "tlsh"
+author: "Rebecca C. Steorts"
+date: "`r Sys.Date()`"
+output: 
+    rmarkdown::html_vignette:
+        fig_caption: yes
+vignette: >
+  %\VignetteIndexEntry{tlsh}
+  %\VignetteEngine{knitr::rmarkdown}
+  %\usepackage[utf8]{inputenc}
+---
+We present a small example from Steorts, R., Ventura, S., Sadinle, M., and Fienberg, S. (2014). "Blocking Comparisons for Record Linkage." Privacy in Statistical Databases (Lecture Notes in Computer Science 8744), ed. J Domingo-Ferrer, Springer, 252-268, <https://doi.org/10.1007/978-3-319-11257-2_20>. We will be using the blink package in R and the RLdata500 data set, which was previously available in the Record Linkage package (but has been deprecated). Here, we illustrate transitive locality sensitive hashing (TLSH). 
+
+In a record linkage task one wants to remove duplicate 
+entries from multiple databases. However, before performing this task, one first needs to apply some form of dimension reduction so that the record linkage task is computationally scalable. 
+ 
+We illustrate the TLSH algorithm implemented in this package on a German dataset consisting of first name, last name, and full date of birth. 
+
+Our goals include
+
+- Presenting the RLdata500 dataset with summary information.
+- Illustrating how we can format the RLdata500 dataset to work with TLSH
+- Running TLSH on the RLdata500 data set to create blocks
+- Explaining the tuning parameters of TLSH and how to choose these in practice with evaluation metrics.
+- Sample output and visualizations 
+
+## Understanding the RLdata500 dataset
+
+The RLdata500 dataset exists already in the blink package in R. We review this data set for the user. 
+
+The RLdata500 data consists of 500 records with 10 percent duplication. Thus, there are 450 unique individuals. There is full information on each record containing first name, last name, and full date of birth. 
+
+We first load the blink package and load the RLdata500 data set. We also provide the first few lines of the data. We then remove the two columns (columns 2 and 4) whose values are all missing in this data set. 
+
+```{r, echo=TRUE, message=FALSE, knitr::opts_chunk$set(cache=TRUE)}
+library(blink)
+library(plyr)
+library(tlsh)
+data(RLdata500)
+head(RLdata500)
+data.500 <- RLdata500[-c(2,4)]
+head(data.500)
+```
+
+## TLSH applied to RLdata500
+
+
+
+We now explain how to run TLSH on the RLdata500 data set, piece by piece. 
+
+1. We first must create a universal set of tokens.
+2. We then find the number of tokens in the universal set.
+3. Then we must generate a vector of random hash functions.
+4. Next, we must create an index vector and apply the hash functions to each record.
+5. Then we build an edgelist, divide the graph into communities initially, sub-divide the communities more if needed
+6. Finally, we have our blocks.
+7. Then we can compute the dimension reduction and the recall. 
+
+The function that finds the blocks is called **block_setup_v2**. 
+
+```{r} 
+ blocks <- block_setup_v2(RLdata500, b=22, k=2)
+ summary(blocks)
+```
+
+where b is the number of **bands** and k is the **shingle size**. 
+
+Observe that the blocks are roughly about the same size, however, this does not have to be the case.  
+
+
+The function that allows us to find the recall is **eval.blocksetup**.
+
+```{r}
+eval.blocksetup(RLdata500, b=26, key=identity.RLdata500)
+```
+
+The function that allows us to find the reduction ratio is **reduction.ratio.from.blocking**.
+
+```{r}
+(rr <- reduction.ratio.from.blocking(blocks)) 
+```
+
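+To summarize, we have reduced the entire space by roughly 66 percent and the recall is about 0.90, which means we are only splitting records across blocks about 10 percent of the time. Because the hash functions are generated at random, the exact values printed above vary somewhat from run to run. 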
+
+
diff --git a/inst/doc/tlsh.html b/inst/doc/tlsh.html
new file mode 100644
index 0000000..3c06daa
--- /dev/null
+++ b/inst/doc/tlsh.html
@@ -0,0 +1,464 @@
+<!DOCTYPE html>
+
+<html>
+
+<head>
+
+<meta charset="utf-8" />
+<meta name="generator" content="pandoc" />
+<meta http-equiv="X-UA-Compatible" content="IE=EDGE" />
+
+<meta name="viewport" content="width=device-width, initial-scale=1" />
+
+<meta name="author" content="Rebecca C. Steorts" />
+
+<meta name="date" content="2020-11-02" />
+
+<title>tlsh</title>
+
+<script>// Hide empty <a> tag within highlighted CodeBlock for screen reader accessibility (see https://github.com/jgm/pandoc/issues/6352#issuecomment-626106786) -->
+// v0.0.1
+// Written by JooYoung Seo (jooyoung@psu.edu) and Atsushi Yasumoto on June 1st, 2020.
+
+document.addEventListener('DOMContentLoaded', function() {
+  const codeList = document.getElementsByClassName("sourceCode");
+  for (var i = 0; i < codeList.length; i++) {
+    var linkList = codeList[i].getElementsByTagName('a');
+    for (var j = 0; j < linkList.length; j++) {
+      if (linkList[j].innerHTML === "") {
+        linkList[j].setAttribute('aria-hidden', 'true');
+      }
+    }
+  }
+});
+</script>
+
+
+<style type="text/css">code{white-space: pre;}</style>
+<style type="text/css" data-origin="pandoc">
+code.sourceCode > span { display: inline-block; line-height: 1.25; }
+code.sourceCode > span { color: inherit; text-decoration: inherit; }
+code.sourceCode > span:empty { height: 1.2em; }
+.sourceCode { overflow: visible; }
+code.sourceCode { white-space: pre; position: relative; }
+div.sourceCode { margin: 1em 0; }
+pre.sourceCode { margin: 0; }
+@media screen {
+div.sourceCode { overflow: auto; }
+}
+@media print {
+code.sourceCode { white-space: pre-wrap; }
+code.sourceCode > span { text-indent: -5em; padding-left: 5em; }
+}
+pre.numberSource code
+  { counter-reset: source-line 0; }
+pre.numberSource code > span
+  { position: relative; left: -4em; counter-increment: source-line; }
+pre.numberSource code > span > a:first-child::before
+  { content: counter(source-line);
+    position: relative; left: -1em; text-align: right; vertical-align: baseline;
+    border: none; display: inline-block;
+    -webkit-touch-callout: none; -webkit-user-select: none;
+    -khtml-user-select: none; -moz-user-select: none;
+    -ms-user-select: none; user-select: none;
+    padding: 0 4px; width: 4em;
+    color: #aaaaaa;
+  }
+pre.numberSource { margin-left: 3em; border-left: 1px solid #aaaaaa;  padding-left: 4px; }
+div.sourceCode
+  {   }
+@media screen {
+code.sourceCode > span > a:first-child::before { text-decoration: underline; }
+}
+code span.al { color: #ff0000; font-weight: bold; } /* Alert */
+code span.an { color: #60a0b0; font-weight: bold; font-style: italic; } /* Annotation */
+code span.at { color: #7d9029; } /* Attribute */
+code span.bn { color: #40a070; } /* BaseN */
+code span.bu { } /* BuiltIn */
+code span.cf { color: #007020; font-weight: bold; } /* ControlFlow */
+code span.ch { color: #4070a0; } /* Char */
+code span.cn { color: #880000; } /* Constant */
+code span.co { color: #60a0b0; font-style: italic; } /* Comment */
+code span.cv { color: #60a0b0; font-weight: bold; font-style: italic; } /* CommentVar */
+code span.do { color: #ba2121; font-style: italic; } /* Documentation */
+code span.dt { color: #902000; } /* DataType */
+code span.dv { color: #40a070; } /* DecVal */
+code span.er { color: #ff0000; font-weight: bold; } /* Error */
+code span.ex { } /* Extension */
+code span.fl { color: #40a070; } /* Float */
+code span.fu { color: #06287e; } /* Function */
+code span.im { } /* Import */
+code span.in { color: #60a0b0; font-weight: bold; font-style: italic; } /* Information */
+code span.kw { color: #007020; font-weight: bold; } /* Keyword */
+code span.op { color: #666666; } /* Operator */
+code span.ot { color: #007020; } /* Other */
+code span.pp { color: #bc7a00; } /* Preprocessor */
+code span.sc { color: #4070a0; } /* SpecialChar */
+code span.ss { color: #bb6688; } /* SpecialString */
+code span.st { color: #4070a0; } /* String */
+code span.va { color: #19177c; } /* Variable */
+code span.vs { color: #4070a0; } /* VerbatimString */
+code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warning */
+
+</style>
+<script>
+// apply pandoc div.sourceCode style to pre.sourceCode instead
+(function() {
+  var sheets = document.styleSheets;
+  for (var i = 0; i < sheets.length; i++) {
+    if (sheets[i].ownerNode.dataset["origin"] !== "pandoc") continue;
+    try { var rules = sheets[i].cssRules; } catch (e) { continue; }
+    for (var j = 0; j < rules.length; j++) {
+      var rule = rules[j];
+      // check if there is a div.sourceCode rule
+      if (rule.type !== rule.STYLE_RULE || rule.selectorText !== "div.sourceCode") continue;
+      var style = rule.style.cssText;
+      // check if color or background-color is set
+      if (rule.style.color === '' && rule.style.backgroundColor === '') continue;
+      // replace div.sourceCode by a pre.sourceCode rule
+      sheets[i].deleteRule(j);
+      sheets[i].insertRule('pre.sourceCode{' + style + '}', j);
+    }
+  }
+})();
+</script>
+
+
+
+<style type="text/css">body {
+background-color: #fff;
+margin: 1em auto;
+max-width: 700px;
+overflow: visible;
+padding-left: 2em;
+padding-right: 2em;
+font-family: "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;
+font-size: 14px;
+line-height: 1.35;
+}
+#TOC {
+clear: both;
+margin: 0 0 10px 10px;
+padding: 4px;
+width: 400px;
+border: 1px solid #CCCCCC;
+border-radius: 5px;
+background-color: #f6f6f6;
+font-size: 13px;
+line-height: 1.3;
+}
+#TOC .toctitle {
+font-weight: bold;
+font-size: 15px;
+margin-left: 5px;
+}
+#TOC ul {
+padding-left: 40px;
+margin-left: -1.5em;
+margin-top: 5px;
+margin-bottom: 5px;
+}
+#TOC ul ul {
+margin-left: -2em;
+}
+#TOC li {
+line-height: 16px;
+}
+table {
+margin: 1em auto;
+border-width: 1px;
+border-color: #DDDDDD;
+border-style: outset;
+border-collapse: collapse;
+}
+table th {
+border-width: 2px;
+padding: 5px;
+border-style: inset;
+}
+table td {
+border-width: 1px;
+border-style: inset;
+line-height: 18px;
+padding: 5px 5px;
+}
+table, table th, table td {
+border-left-style: none;
+border-right-style: none;
+}
+table thead, table tr.even {
+background-color: #f7f7f7;
+}
+p {
+margin: 0.5em 0;
+}
+blockquote {
+background-color: #f6f6f6;
+padding: 0.25em 0.75em;
+}
+hr {
+border-style: solid;
+border: none;
+border-top: 1px solid #777;
+margin: 28px 0;
+}
+dl {
+margin-left: 0;
+}
+dl dd {
+margin-bottom: 13px;
+margin-left: 13px;
+}
+dl dt {
+font-weight: bold;
+}
+ul {
+margin-top: 0;
+}
+ul li {
+list-style: circle outside;
+}
+ul ul {
+margin-bottom: 0;
+}
+pre, code {
+background-color: #f7f7f7;
+border-radius: 3px;
+color: #333;
+white-space: pre-wrap; 
+}
+pre {
+border-radius: 3px;
+margin: 5px 0px 10px 0px;
+padding: 10px;
+}
+pre:not([class]) {
+background-color: #f7f7f7;
+}
+code {
+font-family: Consolas, Monaco, 'Courier New', monospace;
+font-size: 85%;
+}
+p > code, li > code {
+padding: 2px 0px;
+}
+div.figure {
+text-align: center;
+}
+img {
+background-color: #FFFFFF;
+padding: 2px;
+border: 1px solid #DDDDDD;
+border-radius: 3px;
+border: 1px solid #CCCCCC;
+margin: 0 5px;
+}
+h1 {
+margin-top: 0;
+font-size: 35px;
+line-height: 40px;
+}
+h2 {
+border-bottom: 4px solid #f7f7f7;
+padding-top: 10px;
+padding-bottom: 2px;
+font-size: 145%;
+}
+h3 {
+border-bottom: 2px solid #f7f7f7;
+padding-top: 10px;
+font-size: 120%;
+}
+h4 {
+border-bottom: 1px solid #f7f7f7;
+margin-left: 8px;
+font-size: 105%;
+}
+h5, h6 {
+border-bottom: 1px solid #ccc;
+font-size: 105%;
+}
+a {
+color: #0033dd;
+text-decoration: none;
+}
+a:hover {
+color: #6666ff; }
+a:visited {
+color: #800080; }
+a:visited:hover {
+color: #BB00BB; }
+a[href^="http:"] {
+text-decoration: underline; }
+a[href^="https:"] {
+text-decoration: underline; }
+
+code > span.kw { color: #555; font-weight: bold; } 
+code > span.dt { color: #902000; } 
+code > span.dv { color: #40a070; } 
+code > span.bn { color: #d14; } 
+code > span.fl { color: #d14; } 
+code > span.ch { color: #d14; } 
+code > span.st { color: #d14; } 
+code > span.co { color: #888888; font-style: italic; } 
+code > span.ot { color: #007020; } 
+code > span.al { color: #ff0000; font-weight: bold; } 
+code > span.fu { color: #900; font-weight: bold; } 
+code > span.er { color: #a61717; background-color: #e3d2d2; } 
+</style>
+
+
+
+
+</head>
+
+<body>
+
+
+
+
+<h1 class="title toc-ignore">tlsh</h1>
+<h4 class="author">Rebecca C. Steorts</h4>
+<h4 class="date">2020-11-02</h4>
+
+
+
+<p>We present a small example from Steorts, R., Ventura, S., Sadinle, M., and Fienberg, S. (2014). “Blocking Comparisons for Record Linkage.” Privacy in Statistical Databases (Lecture Notes in Computer Science 8744), ed. J Domingo-Ferrer, Springer, 252-268, <a href="https://doi.org/10.1007/978-3-319-11257-2_20">doi:10.1007/978-3-319-11257-2_20</a>. We will be using the blink package in R and the RLdata500 data set, which was previously available in the Record Linkage package (but has been deprecated). Here, we illustrate transitive LSH.</p>
+<p>In a record linkage task one wants to remove duplicate entries from multiple databases. However, before performing this task, one needs to perform a means of dimension reduction so that the record linkage task is computationally scalable.</p>
+<p>Using the TLSH algorithm, we illustrate an example of using this package using a German dataset comprised of first and last name and full date of birth.</p>
+<p>Our goals include</p>
+<ul>
+<li>Presenting the RLdata500 dataset with summary information.</li>
+<li>Illustrating how we can format the RLdata500 dataset to work with TLSH</li>
+<li>Running TLSH on the RLdata500 data set to create blocks</li>
+<li>Explaining the tuning parameters of TLSH and how to choose these in practice with evaluation metrics.</li>
+<li>Sample output and visualizations</li>
+</ul>
+<div id="understanding-the-rldata500-dataset" class="section level2">
+<h2>Understanding the RLdata500 dataset</h2>
+<p>The RLdata500 dataset exists already in the blink package in R. We review this data set for the user.</p>
+<p>The RLdata500 data consists of 500 records with 10 percent duplication. Thus, there are 450 unique individuals. There is full information on each record containing first name, last name, and full date of birth.</p>
+<p>We first load the blink package and load the RLdata500 data set. We also provide the first few lines of the data. We then remove the two columns (columns 2 and 4) whose values are all missing in this data set.</p>
+<div class="sourceCode" id="cb1"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb1-1"><a href="#cb1-1"></a><span class="kw">library</span>(blink)</span>
+<span id="cb1-2"><a href="#cb1-2"></a><span class="kw">library</span>(plyr)</span>
+<span id="cb1-3"><a href="#cb1-3"></a><span class="kw">library</span>(tlsh)</span>
+<span id="cb1-4"><a href="#cb1-4"></a><span class="kw">data</span>(RLdata500)</span>
+<span id="cb1-5"><a href="#cb1-5"></a><span class="kw">head</span>(RLdata500)</span></code></pre></div>
+<pre><code>##   fname_c1 fname_c2 lname_c1 lname_c2   by bm bd
+## 1  CARSTEN     &lt;NA&gt;    MEIER     &lt;NA&gt; 1949  7 22
+## 2     GERD     &lt;NA&gt;    BAUER     &lt;NA&gt; 1968  7 27
+## 3   ROBERT     &lt;NA&gt; HARTMANN     &lt;NA&gt; 1930  4 30
+## 4   STEFAN     &lt;NA&gt;    WOLFF     &lt;NA&gt; 1957  9  2
+## 5     RALF     &lt;NA&gt;  KRUEGER     &lt;NA&gt; 1966  1 13
+## 6  JUERGEN     &lt;NA&gt;   FRANKE     &lt;NA&gt; 1929  7  4</code></pre>
+<div class="sourceCode" id="cb3"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb3-1"><a href="#cb3-1"></a>data<span class="fl">.500</span> &lt;-<span class="st"> </span>RLdata500[<span class="op">-</span><span class="kw">c</span>(<span class="dv">2</span>,<span class="dv">4</span>)]</span>
+<span id="cb3-2"><a href="#cb3-2"></a><span class="kw">head</span>(data<span class="fl">.500</span>)</span></code></pre></div>
+<pre><code>##   fname_c1 lname_c1   by bm bd
+## 1  CARSTEN    MEIER 1949  7 22
+## 2     GERD    BAUER 1968  7 27
+## 3   ROBERT HARTMANN 1930  4 30
+## 4   STEFAN    WOLFF 1957  9  2
+## 5     RALF  KRUEGER 1966  1 13
+## 6  JUERGEN   FRANKE 1929  7  4</code></pre>
+</div>
+<div id="tlsh-applied-to-rldata500" class="section level2">
+<h2>TLSH applied to RLdata500</h2>
+<p>We now explain how to run TLSH on the RLdata500 data set, piece by piece.</p>
+<ol style="list-style-type: decimal">
+<li>We first must create a universal set of tokens.</li>
+<li>We then find the number of tokens in the universal set.</li>
+<li>Then we must generate a vector of random hash functions.</li>
+<li>Next, we must create an index vector and apply the hash functions to each record.</li>
+<li>Then we build an edgelist, divide the graph into communities initially, sub-divide the communities more if needed</li>
+<li>Finally, we have our blocks.</li>
+<li>Then we can compute the dimension reduction and the recall.</li>
+</ol>
+<p>The function that finds the blocks is called <strong>block_setup_v2</strong>.</p>
+<div class="sourceCode" id="cb5"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb5-1"><a href="#cb5-1"></a> blocks &lt;-<span class="st"> </span><span class="kw">block_setup_v2</span>(RLdata500, <span class="dt">b=</span><span class="dv">22</span>, <span class="dt">k=</span><span class="dv">2</span>)</span></code></pre></div>
+<pre><code>## [1] &quot;Creating the universal set of tokens&quot;
+## elapsed 
+##   0.005 
+## [1] &quot;Number of tokens in universal set&quot;
+## [1] 404
+## [1] &quot;Generating a vector of random hash functions&quot;
+## elapsed 
+##   0.003 
+## [1] &quot;Creating index vector and applying hash functions to first record&quot;
+##    user  system elapsed 
+##   0.006   0.000   0.006 
+##    user  system elapsed 
+##   3.205   0.021   3.234 
+## [1] &quot;Creating edgelist&quot;
+##    user  system elapsed 
+##   0.207   0.007   0.214 
+## [1] 23146     2
+## [1] &quot;Building graph from edgelist&quot;
+##    user  system elapsed 
+##   0.001   0.000   0.002 
+## [1] &quot;Dividing graph into communities initially&quot;
+##    user  system elapsed 
+##   0.017   0.000   0.017 
+## [1] &quot;Subdividng communities&quot;
+##    user  system elapsed 
+##       0       0       0</code></pre>
+<div class="sourceCode" id="cb7"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb7-1"><a href="#cb7-1"></a> <span class="kw">summary</span>(blocks)</span></code></pre></div>
+<pre><code>##      Length Class  Mode   
+## [1,]  48    -none- numeric
+## [2,]  62    -none- numeric
+## [3,] 141    -none- numeric
+## [4,] 249    -none- numeric</code></pre>
+<p>where b is the number of <strong>bands</strong> and k is the <strong>shingle size</strong>.</p>
+<p>Observe that the blocks are roughly about the same size, however, this does not have to be the case.</p>
+<p>The function that allows us to find the recall is <strong>eval.blocksetup</strong>.</p>
+<div class="sourceCode" id="cb9"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb9-1"><a href="#cb9-1"></a><span class="kw">eval.blocksetup</span>(RLdata500, <span class="dt">b=</span><span class="dv">26</span>, <span class="dt">key=</span>identity.RLdata500)</span></code></pre></div>
+<pre><code>## [1] &quot;Creating the universal set of tokens&quot;
+## elapsed 
+##   0.004 
+## [1] &quot;Number of tokens in universal set&quot;
+## [1] 2516
+## [1] &quot;Generating a vector of random hash functions&quot;
+## elapsed 
+##   0.003 
+## [1] &quot;Creating index vector and applying hash functions to first record&quot;
+##    user  system elapsed 
+##   0.006   0.000   0.006 
+##    user  system elapsed 
+##   3.298   0.020   3.328 
+## [1] &quot;Creating edgelist&quot;
+##    user  system elapsed 
+##   0.260   0.005   0.269 
+## [1] 13434     2
+## [1] &quot;Building graph from edgelist&quot;
+##    user  system elapsed 
+##   0.001   0.001   0.002 
+## [1] &quot;Dividing graph into communities initially&quot;
+##    user  system elapsed 
+##   0.003   0.000   0.003 
+## [1] &quot;Subdividng communities&quot;
+##    user  system elapsed 
+##       0       0       0</code></pre>
+<pre><code>##   recall
+## 1   0.86</code></pre>
+<p>The function that allows us to find the reduction ratio is <strong>reduction.ratio.from.blocking</strong>.</p>
+<div class="sourceCode" id="cb12"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb12-1"><a href="#cb12-1"></a>(rr &lt;-<span class="st"> </span><span class="kw">reduction.ratio.from.blocking</span>(blocks)) </span></code></pre></div>
+<pre><code>## [1] 0.6491784</code></pre>
+<p>To summarize, we have reduced the entire space by roughly 65 percent and the recall is 0.86, which means we are only splitting records across blocks 14 percent of the time.</p>
+</div>
+
+
+
+<!-- code folding -->
+
+
+<!-- dynamically load mathjax for compatibility with self-contained -->
+<script>
+  (function () {
+    var script = document.createElement("script");
+    script.type = "text/javascript";
+    script.src  = "https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML";
+    document.getElementsByTagName("head")[0].appendChild(script);
+  })();
+</script>
+
+</body>
+</html>
diff --git a/man/block.ids.from.blocking.Rd b/man/block.ids.from.blocking.Rd
new file mode 100644
index 0000000..1b1166d
--- /dev/null
+++ b/man/block.ids.from.blocking.Rd
@@ -0,0 +1,21 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/blocking-evaluations.R
+\name{block.ids.from.blocking}
+\alias{block.ids.from.blocking}
+\title{Returns the block ids associated with a blocking method.}
+\usage{
+block.ids.from.blocking(blocking)
+}
+\arguments{
+\item{blocking}{A list of the blocks.}
+}
+\value{
+A list of the block ids that correspond to each block
+}
+\description{
+Returns the block ids associated with a blocking method.
+}
+\examples{
+tlsh.blocks <- block_setup_v2(r.set = RLdata500[1:250,c(-2,-4)], b=10, save_signature=FALSE, k=1)
+block.ids.from.blocking(tlsh.blocks)
+}
diff --git a/man/block_setup_v2.Rd b/man/block_setup_v2.Rd
new file mode 100644
index 0000000..3f354b9
--- /dev/null
+++ b/man/block_setup_v2.Rd
@@ -0,0 +1,28 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/tlsh.R
+\name{block_setup_v2}
+\alias{block_setup_v2}
+\title{Function that divides all records into bins using locality sensitive hashing and using TLSH (based upon community detection technique)}
+\usage{
+block_setup_v2(r.set, b = 22, save_signature = FALSE, k = 5)
+}
+\arguments{
+\item{r.set}{Record set (shingled records)}
+
+\item{b}{Number of bands}
+
+\item{save_signature}{Flag of whether or not to save the signature}
+
+\item{k}{Shingle size}
+}
+\value{
+List of blocks where a particular index is the record id in the original
+data set
+}
+\description{
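+Divides all records into bins (blocks) using locality sensitive hashing and TLSH (based upon a community detection technique).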
+}
+\examples{
+r.set <- RLdata500[1:3,c(-2)]
+block_setup_v2(r.set = RLdata500[1:3,c(-2)], b=22, save_signature=FALSE, k=2)
+}
diff --git a/man/compare_buckets.Rd b/man/compare_buckets.Rd
new file mode 100644
index 0000000..7ca604b
--- /dev/null
+++ b/man/compare_buckets.Rd
@@ -0,0 +1,28 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/minhash_v2.R
+\name{compare_buckets}
+\alias{compare_buckets}
+\title{Function that creates a similarity graph and divides it into communities (or blocks) for entity resolution}
+\usage{
+compare_buckets(hashed_signatures, max_bucket_size = 1000)
+}
+\arguments{
+\item{hashed_signatures}{The hashed signatures}
+
+\item{max_bucket_size}{The largest block size allowed by user}
+}
+\value{
+The list of blocks (communities of the similarity graph), each holding
+the ids of the records assigned to that block
+}
+\description{
+Function that creates a similarity graph and divides it into communities (or blocks) for entity resolution
+}
+\examples{
+head(data <- RLdata500[-c(2,4)])
+minidata <- data[1:2,]
+head(all_the_shingles <- apply(minidata,1,shingles,k=8))
+head(minhash.minidata <- minhash_v2(all_the_shingles, p=10))
+hashed_signature <- hash_signature(minhash.minidata, b=5)
+compare_buckets(hashed_signature, max_bucket_size=200)
+}
diff --git a/man/confusion.from.blocking.Rd b/man/confusion.from.blocking.Rd
new file mode 100644
index 0000000..5e6441a
--- /dev/null
+++ b/man/confusion.from.blocking.Rd
@@ -0,0 +1,27 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/blocking-evaluations.R
+\name{confusion.from.blocking}
+\alias{confusion.from.blocking}
+\title{Perform evaluations (recall) for blocking.}
+\usage{
+confusion.from.blocking(blocking, true_ids, recall.only = FALSE)
+}
+\arguments{
+\item{blocking}{A list of the blocks}
+
+\item{true_ids}{The true identifiers for comparisons}
+
+\item{recall.only}{Flag that when true only prints the recall, otherwise
+prints many evaluation metrics in a list}
+}
+\value{
+A vector containing the recall and the precision
+}
+\description{
+Perform evaluations (recall) for blocking.
+}
+\examples{
+r.set <- RLdata500[1:250,c(-2)]
+tlsh.blocks <- block_setup_v2(r.set, b=22, save_signature=FALSE, k=2)
+confusion.from.blocking(tlsh.blocks, identity.RLdata500, recall.only=TRUE)
+}
diff --git a/man/eval.blocksetup.Rd b/man/eval.blocksetup.Rd
new file mode 100644
index 0000000..28a5326
--- /dev/null
+++ b/man/eval.blocksetup.Rd
@@ -0,0 +1,27 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/tlsh.R
+\name{eval.blocksetup}
+\alias{eval.blocksetup}
+\title{Function to evaluate the blocking step}
+\usage{
+eval.blocksetup(dat, k = 5, b = 21, key)
+}
+\arguments{
+\item{dat}{Data set}
+
+\item{k}{Shingle size k, i.e. the length of the shingles (tokens, or k-grams) that each string is broken into}
+
+\item{b}{Number of buckets}
+
+\item{key}{Unique identifier}
+}
+\value{
+Recall and runtime
+}
+\description{
+Evaluates the blocking step, returning the recall and the runtime.
+}
+\examples{
+r.set <- RLdata500[1:50,c(-2)]
+eval.blocksetup(r.set, k=2, b=22, key=identity.RLdata500)
+}
diff --git a/man/extract_pairs_from_band.Rd b/man/extract_pairs_from_band.Rd
new file mode 100644
index 0000000..cd0f9d6
--- /dev/null
+++ b/man/extract_pairs_from_band.Rd
@@ -0,0 +1,27 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/minhash_v2.R
+\name{extract_pairs_from_band}
+\alias{extract_pairs_from_band}
+\title{Function that extracts pairs of records from a band in the
+signature matrix M}
+\usage{
+extract_pairs_from_band(a_band)
+}
+\arguments{
+\item{a_band}{Band of the signature matrix M}
+}
+\value{
+The edgelist of record pairs that are connected
+}
+\description{
+Function that extracts pairs of records from a band in the
+signature matrix M
+}
+\examples{
+band1 <- c(2,1,2,1,2)
+extract_pairs_from_band(band1)
+band2 <- c(6,7,8,9,6)
+extract_pairs_from_band(band2)
+band.12 <- rbind(band1, band2)
+apply(band.12,1,extract_pairs_from_band)
+}
diff --git a/man/hash_signature.Rd b/man/hash_signature.Rd
new file mode 100644
index 0000000..0385540
--- /dev/null
+++ b/man/hash_signature.Rd
@@ -0,0 +1,29 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/minhash_v2.R
+\name{hash_signature}
+\alias{hash_signature}
+\title{Function to take a signature matrix M composed of b bands and r rows and return
+a bucket for each band for each record}
+\usage{
+hash_signature(signature, b)
+}
+\arguments{
+\item{signature}{Signature matrix M composed of b bands and r rows}
+
+\item{b}{Number of bands}
+}
+\value{
+Bucket for each band for each record
+}
+\description{
+Function to take a signature matrix M composed of b bands and r rows and return
+a bucket for each band for each record
+}
+\examples{
+head(data <- RLdata500[-c(2,4)])
+minidata <- data[1:2,]
+head(all_the_shingles <- apply(minidata,1,shingles,k=8))
+head(minhash.minidata <- minhash_v2(all_the_shingles, p=10))
+hash_signature(minhash.minidata, b=2)
+hash_signature(minhash.minidata, b=5)
+}
diff --git a/man/minhash_v2.Rd b/man/minhash_v2.Rd
new file mode 100644
index 0000000..a41e0ef
--- /dev/null
+++ b/man/minhash_v2.Rd
@@ -0,0 +1,31 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/minhash_v2.R
+\name{minhash_v2}
+\alias{minhash_v2}
+\title{Function to create a matrix of minhashed signatures}
+\usage{
+minhash_v2(
+  shingled_records,
+  p,
+  do_one_hash_and_record = do_one_hash_and_record
+)
+}
+\arguments{
+\item{shingled_records}{Shingled records}
+
+\item{p}{Number of permutations to be applied to the hash function}
+
+\item{do_one_hash_and_record}{Combination of one hash and one record}
+}
+\value{
+Computes an integer-valued matrix of minhash signatures with one row per permutation and one column per record
+}
+\description{
+Function to create a matrix of minhashed signatures
+}
+\examples{
+head(data <- RLdata500[-c(2,4)])
+minidata <- data[1:2,]
+head(all_the_shingles <- apply(minidata,1,shingles,k=8))
+head(minhash.minidata <- minhash_v2(all_the_shingles, p=10))
+}
diff --git a/man/my_hash.Rd b/man/my_hash.Rd
new file mode 100644
index 0000000..c286847
--- /dev/null
+++ b/man/my_hash.Rd
@@ -0,0 +1,27 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/minhash_v2.R
+\name{my_hash}
+\alias{my_hash}
+\title{Function that applies a hash function
+to each column of the band from the
+signature matrix}
+\usage{
+my_hash(a_band)
+}
+\arguments{
+\item{a_band}{Band from the signature matrix M}
+}
+\value{
+a 64 bit integer
+}
+\description{
+Function that applies a hash function
+to each column of the band from the
+signature matrix
+}
+\examples{
+band1 <- c(2,1,2,1,2)
+band2 <- c(4,5,2,1,9)
+combined_band <- rbind(band1,band2)
+my_hash(combined_band)
+}
diff --git a/man/primest.Rd b/man/primest.Rd
new file mode 100644
index 0000000..64f99be
--- /dev/null
+++ b/man/primest.Rd
@@ -0,0 +1,23 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/minhash_v2.R
+\name{primest}
+\alias{primest}
+\title{Function to generate all primes larger than an integer n1 (lower limit) and less than any other integer n2 (upper limit)}
+\usage{
+primest(n1 = 1, n2)
+}
+\arguments{
+\item{n1}{An integer taken to be 1 as the default}
+
+\item{n2}{Any integer n2}
+}
+\value{
+Generates all prime numbers with the above constraints
+}
+\description{
+Function to generate all primes larger than an integer n1 (lower limit) and less than any other integer n2 (upper limit)
+}
+\examples{
+primest(1, 5)
+primest(1, 17)
+}
diff --git a/man/reduction.ratio.Rd b/man/reduction.ratio.Rd
new file mode 100644
index 0000000..fc68218
--- /dev/null
+++ b/man/reduction.ratio.Rd
@@ -0,0 +1,22 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/blocking-evaluations.R
+\name{reduction.ratio}
+\alias{reduction.ratio}
+\title{Returns the reduction ratio associated with a blocking method}
+\usage{
+reduction.ratio(block.labels)
+}
+\arguments{
+\item{block.labels}{A list of the blocks labels.}
+}
+\value{
+The reduction ratio
+}
+\description{
+Returns the reduction ratio associated with a blocking method
+}
+\examples{
+tlsh.blocks <- block_setup_v2(r.set = RLdata500[1:50,c(-2)], b=22, save_signature=FALSE, k=2)
+block.ids <- block.ids.from.blocking(tlsh.blocks)
+reduction.ratio(block.ids)
+}
diff --git a/man/reduction.ratio.from.blocking.Rd b/man/reduction.ratio.from.blocking.Rd
new file mode 100644
index 0000000..6a6a51b
--- /dev/null
+++ b/man/reduction.ratio.from.blocking.Rd
@@ -0,0 +1,21 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/blocking-evaluations.R
+\name{reduction.ratio.from.blocking}
+\alias{reduction.ratio.from.blocking}
+\title{Returns the reduction ratio associated with a blocking method}
+\usage{
+reduction.ratio.from.blocking(blocking)
+}
+\arguments{
+\item{blocking}{The actual blocks}
+}
+\value{
+The reduction ratio
+}
+\description{
+Returns the reduction ratio associated with a blocking method
+}
+\examples{
+tlsh.blocks <- block_setup_v2(r.set = RLdata500[1:50,c(-2,-4)], b=10, save_signature=FALSE, k=1)
+reduction.ratio.from.blocking(tlsh.blocks)
+}
diff --git a/man/rhash_funcs.Rd b/man/rhash_funcs.Rd
new file mode 100644
index 0000000..e47622c
--- /dev/null
+++ b/man/rhash_funcs.Rd
@@ -0,0 +1,27 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/minhash_v2.R
+\name{rhash_funcs}
+\alias{rhash_funcs}
+\title{Function to generate a vector of random hash functions (or optionally one vector-valued function)}
+\usage{
+rhash_funcs(n, size, vector.valued, perfect = FALSE)
+}
+\arguments{
+\item{n}{Number of random hash functions}
+
+\item{size}{Range of each size}
+
+\item{vector.valued}{Flag for outputting a vector of functions or a vector-valued function}
+
+\item{perfect}{Flag for whether a perfect permutation should be done, or just a hash function}
+}
+\value{
+Vector of n hash functions or a function which will take a number and return a vector of n different hashes of it
+}
+\description{
+Function to generate a vector of random hash functions (or optionally one vector-valued function)
+}
+\examples{
+rhash_funcs(1, 1, vector.valued=FALSE, perfect=FALSE)
+rhash_funcs(5, 1, vector.valued=FALSE, perfect=FALSE)
+}
diff --git a/man/shingled_record_to_index_vec.Rd b/man/shingled_record_to_index_vec.Rd
new file mode 100644
index 0000000..8fc0dbb
--- /dev/null
+++ b/man/shingled_record_to_index_vec.Rd
@@ -0,0 +1,24 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/minhash_v2.R
+\name{shingled_record_to_index_vec}
+\alias{shingled_record_to_index_vec}
+\title{Function to tell what index the shingle corresponds to in the record}
+\usage{
+shingled_record_to_index_vec(shingled_record, universal_set)
+}
+\arguments{
+\item{shingled_record}{Shingled record}
+
+\item{universal_set}{Universal set of all shingles}
+}
+\value{
+the index regarding where the shingle falls in the record
+}
+\description{
+Function to tell what index the shingle corresponds to in the record
+}
+\examples{
+shingles("Alexander",2)
+shingles("Alexander Smith", 2)
+shingled_record_to_index_vec(shingles("Alexander",2), unique(shingles("Alexander Smith", 2)))
+}
diff --git a/man/shingles.Rd b/man/shingles.Rd
new file mode 100644
index 0000000..80a2409
--- /dev/null
+++ b/man/shingles.Rd
@@ -0,0 +1,23 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/minhash_v2.R
+\name{shingles}
+\alias{shingles}
+\title{Function to shingle (token or gram) a string into its k components}
+\usage{
+shingles(record, k)
+}
+\arguments{
+\item{record}{String or record}
+
+\item{k}{Parameter k, which is the number of shingles, tokens, or grams to break the string into}
+}
+\value{
+Computes the shingled (tokened or grammed) version of a string
+}
+\description{
+Function to shingle (token or gram) a string into its k components
+}
+\examples{
+shingles("Alexander",2)
+shingles("Alexander Smith", 2)
+}
diff --git a/vignettes/blocks_members.Rdata b/vignettes/blocks_members.Rdata
new file mode 100644
index 0000000000000000000000000000000000000000..45416a37dec9324926bd2b155a4e079507cb0cad
GIT binary patch
literal 946
zcmV;j15NxNiwFP!0000019ey1P82~9ZMh`M1K9^2c<^CDh%pAq%0(A1(A;k#u<S0o
z+;#>+q7kBzALvivQ|43n1Rs1hj`led>%f|rL-lmkIaO2DJzHOCPY<<+oOAtd!1cNQ
z!QxLh*leu5tT{K(R~V0q2TuwdKEL{Q@%5&AeR+L;`QxVW7%m?E@=x8rfBqG)0{jSk
z0i5G{h`0mW0e7&Tz<wNY;it9zGvFw21o%nqVE1jq-V8SGD%N+!e;Mn`!Y?|?IE(U}
zc~yYB!P?uvJzy26b*wp?#|&^xo|n3<zEVdoU5WV9o|OEai_Q1G7bE%!;h7X3>Pcjs
zyv*w@a7y^JALu_%9l`SK<q<bT*E(<?=xBSx>sN(uAMqDpGv9sitOOfxTlP7LH}0%n
z)`9(~^`?II8J_#P*i&LN-@3Q!@2swJPT5y{ztlyY)ItAxuzvO2#^e43cp!NlK-U4{
zRJfkod>E%8x_q$v#_aQwhwrBWzR@208rWBLHZJ)Z(6tq8oTjt-9gua}1hOto=x7F;
zUo*FHIY%48>d^C$yzAK4A}?(A8|RGv%#V6GFD-BRs6+dKHtJs1(TN}VMrB_sSpDoz
z&RHUSn(rIoW8GT0^(VjTUkujYi9X(2^g9IdeehnH7Co!tSNi*hc~lqcrhWYZ$i8QO
zhrz~q4`dxV&&-o~GcN0MnA>%|=jcB!eib0=um<G&Y>QtzMk?oFN#ap&MNseI*T5>{
zA}{vAtg}3A<hO=>tXBuf^Sa}0T*l*fK?lhDmpVGR)ptZhydL-By*^+akH9k}>pE{o
z;C0UOXkQ(REp=u*#WEj$pKvZF1R3u*D)C$5ulJgszwB?GmwG-C`JF&M_B;FbH~{mA
zNe{l{VLfG^l85h~b>sODLH@*v9selbGyClX$oU{2_c34Qb&^{>x?k=)e=2%Uz4f0I
z<b0e$SCw|;E$eb7amg?9U+&j)ntsgdOmsQm3ZFyIS=8aXSg*<cuD3e6!N%!I9@Ins
z)at$WeWvre6s*m8;9SnizBGx*{Jrpxh@B!Io>%1Icdp)pX=-&4Q}FX#t#bV~qRwTS
zTfQ0b<9CIgtLdHf%OtNXiI_+}%#Z!RdsBbcInk-#4~)z6m+>+WOf7GZbFYk_Ay1xX
zdVgoP5w&mST)8&~S#I_6ywZD_=bzql)S2B`KkbKk%P+s;2QRL&o8rg2tNgb(gx~a^
UzkDyO-+%xA0h5z_8W|7(03Le$F#rGn

literal 0
HcmV?d00001

diff --git a/vignettes/candidate_pairs_graph.Rdata b/vignettes/candidate_pairs_graph.Rdata
new file mode 100644
index 0000000000000000000000000000000000000000..15d888a8160a50afb11af0c9f6842b8626839c97
GIT binary patch
literal 12763
zcmaKz30P8F+xP2~Svin()GU?K%F2Pv%2`q~?U<TPW~ro|GPA@X2Sg!rsw}lfi^_pK
znnRXoIi<i(%_u1`hcqW7B%BaMCV>qf=Xu`meV+HduJ7ZzZua8d_g?p2>)IEr`}bc!
zv(~Qo=WEqKk>~vY%A@jAgp9e)=PLv&&L{X@J+S7(#U1(mjc$VDIemul5YBY5#plOf
z#~L4XW*#|j^ytq;-z0isjK3!=X8N*4!<QGhhW`Dy^WU$mcs*AV3ver!j;J7Y{P$1%
zR))9cKCsae=|#!m-6$!gLctiR0LEqo#j}odbfaigEl2(X%_zbvsHP@p5*wdfzxbup
zfS#D9Sf1-*#pu#r2iPmxB3VBN1|Uu?E&GWJl@p#haq@Q=3O+kk3XwR<$>14=ko%3y
zmX|BIU;Va%jOI>kM7)q|b}P#~Xf;9e0+A|@$<u+C7Z0TBU_S|AmE5sL4Uk?7Y6+6}
z-&i~~<IfU>OSegp!6Kod&{j^bMDDw`m^@clQM{CoO=U_ZfK>P@*opa#Bh~#)_ljnR
zNS}pU1d3cH!J;7ns~$b-^Gt{SmR`@^C_Ia+!qbt^`UGUp&sANH=dLcQQv#pMO{Mv&
z$lmM%{dis3d(7>MtAjtj&2fPj<DPu31E={?vc6&+1?%Cc_!I}_6E){xBYvVO>o^BO
zF|AeAoa|0Qo|-2${*>zt#}xCRDfpfms(#b@AhvjR4sa$lwVBp)O&SZLU5BG`gs4Cg
zU=cDnE8Zg*`zapjN8`cboMvD*UaD{sh(~Wa!j@49Y5^Vd!!B7B%i!D!Y5x*($eywY
z4WY1;Jy3Y*1=XR_;_7<j0|^mEZ**!Tc>`_ZFSI}pK|)}-h<UJ!J)00f8&m|3h#s+}
zT^l(iW6Pwt6p2%L5wwCAjy5N|+0wX|8%YF0AO5l=xtNnvi%#Kq>SMJ7UL;%pAlqJF
z)DaYxj8f(l`I4G%7IZc@O6De2kvQ>*5>oR~DZy86Ae4>^l9K<x63l5r3g%`5Ha!t(
zST9~V0u50Z36=vS4iZ`5pCe10Wx&HeIfz^(x=(96IE*dgAZitU6MDn3k2yJ4xjP#P
zzBRLs(cyY!j_?ZEihwcXH_-^JiYuw@+%6dZ)vVe-i^@r~ksD(a2L+R&vz&LI=*jpz
zS<n(O!O2PQ1pQ*;nC=g<b-6yMFlP~JIp8(vsG^?v&`N`{*gD8?og05%d@#E;{;++e
z=q^P?<|G2Ip|56&s%YTBnbuq2nYqAHx(HVbu^X3Y$6>vaU1yds-J&{S_ad5O&t4We
zK(5jsv3|sf9p8B-W8CN8&_(H7RW5?Q$<%mQV?6B2NPGq};$#k!E*%oshhgD`-QStb
z3ws6hw=$P3oXxmYX7+>-o&>EDRSwz$3PR2WJ|<Q_u0QIRmn15O`O{G)QISBZiIjN_
z=W%m<hRELNh(7YL4eJnVK0F6A)>*?v0qpE>FdOqF4ujFhPA<(Rlt~;G(Sp{~VG76W
z*0(;S33CY(&D0zj_iGRh-jLwwe!|dk?t(A(ExotcB$=>0CX`(u&|)0o`(uijNlvi7
z2wEyL7sX}Qh=L5E_Sr4R?YV+0BmS)rr<L6KuZ^?wyz7pQ^d-hJa2ySiB?j8b<3)I>
ze9<AuAhkaR9o*;`1NcqhMT2{vP)r)4l8CtJCeg8Czt}PBKv5kR=d&jsdjO%RIi)x#
ziMn(gsAa~p4TVH#=Fm5v69py`C#RX_OIP8zJcD8TQ9L*%<vRgr67mE<#{z0(IJc+!
zpd<z_AGdiwez9@f0ZYDDgr3;KAsus!uVf#h&@TdPdQ<^D%bpJi^)u<e-7J2YJi=N&
zS4tcV4T9$~uLA{SC@(fkq$B)EJT<O-bds&OU}Q&}=HcumIZf0==QPu*fLYrmjHrch
z9uWSh`ReI_Fr>#~3<lZkZZ=2r39SI^CDU5jV@vOjqOY+Vgsn@qG)nS_v($kePs!mT
zFm!PdAsOFNL(`Z6zGApp2|=Pg$_W#Nat4L$r3iZi+Dv>jQCl!a6Lzr8KIZhZllP97
zL&?zjp5oyp99}vqPl66>N||VmlVI$8$cS*B_89Q7=Or_wBpaZ=0%|TDDaH47%#u<A
zDR&!Yxy^0tZIULec##2&7noDzhD2u=2u@5y&WZMV#DaiVvskws@DU^SCj8vmt{6^H
zaGop<n0$DVMxfoLzn&1i=cXd{t%_GG7Po#>Xv!fAc)JDcZMcn*klk6{33b{oue8om
zc>bx#ab9l2Migr*Uf^=+ig?29uZ)CyAkW{u6R*EXUO$+kZvsUAdHu>FVi`pIc&z$}
zW@{k>NVauP-IfR5U(vxy*A>`CMIXpVuA75u6{O{L_!jeq9>x*Jb`(13b-8vw05z{~
z3}LfLX<r#)O({M2LJ()D_Lzrs(*Cy}3Qr-v9IN~e91CifTV6WV_L=EBzI3+^d)($}
zrwAQm#G+S>5+v&i)<??-&dcTAbAGvrSl8s4oxe#9hEC^7)?}?yov7^8p7Ad!bU2dI
z*RiP1$v4S^*u`D{^<nzkiDKi(x>c&{3et>oB{W}xUFP1CblWZ}H?8<;?glj5HmLYl
zm~ZjRdszk^*hvGm{_c7=RMWeM7%DUV=j0umWV?Lm?1#?vnj3o|%eCHxPKK`7TrD)u
zFl7h_-+&$Q98xVv%lGp!#vFe)!FHQ@i14F}7O(2S{a<Zkv9j!<?6P<nDi&vJl>?f3
zzou*qx68aM=k!A!&V8wbn3m*o@su;wEBj8LzrJvz#m#M;Efy(!Kc^56X$E>5=QO;m
zdq*U+u>0quAeXtoX_J68ncIW|$pKGyjN%~!6`v8GN<T>hZ)}`*+?j0`sXk0j)nS>c
z1r%<=+11ihs4*d{dg&AX+4n9*@)FdGhC@`s0HWtMPCl+RBXGH6;!;cG`L<LYLZ8!e
z=PY$%8zV@?eH~>2qf(JW*-A<SZR=Kw_S$%Rpd+q|CL;6k>kW4>w=p(c-<bXWm`cUl
z%?=sOPc}$-+f_nf2|H)j$({9gjjj@H>a~b%ZbGjM!`VH3B|U-Im=J06(mvPcojZxQ
znWvG`Pnh2&ZYsud5?6sGl7sv#@>sq2QlkJkWg|P<O%-glix}kEufX;cEYD&+><tk1
z^gcyxyPV8n*sv=kD`nj;FZAKLN&L}zx7c!wNs-A6J-#@q_y&mYM)rTR%a+&*swo4I
z*ZB9S#mimNVjj9$^AD%PMdfsc_RJu*B7(wgmi$4{aBZZQ8G>%vRxASpg*cGxoGd{X
z5-Qnz^aQbuTSN~BORTE~QH@MfdJ`G(*mfM%#Py3+$(j?9ULfcq62^7Vmgs8Vzw)Ke
z61q(867+tN$h=M`tm={yXSqx08tD6|Um9~SO<P<@WW{7S#*36)q2H9InU1lAoL0O7
zj>A)#(Ex$$$1)dC@txx{E?E+^U^b(sNGJ|^OgKTCmkpz;F!S0o58Z|K=7>tD8|j^d
zbRi$;i0Wp_#-*g#fT^Tkh5~yC&#?;lih9^GB^`=!JbqtScM8Vo$(cy<#tQXL1c@93
z_L|>m6E@<NoSVgSVj;JdD@6eH#2O~!Lcc<7VN>=67391xM9QkbRU!fjiRVCQ4es4N
zaP~&ZL{*^43;Iy3#mx8bwFG+5SR>sMu!l@#%5pHr!Nc8v0j;ZVmK|BFZ|69dR-6$n
z8HjlEn$Eg3Gl-u{I^#%UstI)B8{At9gsEB%t%-M4(dIQV#3c~juec@QFWGkYDYn>&
zSIwRB23=f1iF<(i;;lH~C`odvDAbnrF55s*{&hd+hQXdX!}n2HiECni&&CZZdVW4=
z9Pe2-r5k1VPVSafbD^a(O2X1tGzWc*4d8{$boIX|{3X&p0iXU@>$dzgws8=yx`$wR
zf%KhveHJ+5UKZiv^kH#!RV_-Z%#h+$ljT2BQH;srP0y4Oc&#ngf&;TaX%U|)P>*lR
zkblCaS)hl1ezgQk6*Er^Z++bucQun(EkDc7oVl@Om?@*9(vsfF>%E3exH79@6X<hj
znO>v=jrM)Nw5|dl=tx<G0X{X#8%$NrBYNn9ytxY5K}p@>{*j?G)(Ozo0zOxr)$e&Q
zoDXU3?i=u|;Wkv(97}$)bU60tSDd*#Rd8_-#gN>II{`@6$Ii_Hq41Fcd_(hBu!4Tg
z04=IS7BN|!e8M#Qor_ceQ{2H&m%#P?h=B#n(1-5tVGG+ml7TXE&s}K!6w!?DbFm}#
z0^9CJG`X>kopd8LuDK?pl@yg~9bf#5=tk;W!>uBEpM_^G{gpv)x|+H4?#A4WT5PKD
zlT{ab%C8?DrTo!8*)3?)h}+sl=p&eS-QY$UCUiH2(RZ;?1<dbNp;=PpsyMMlqnmb!
zeaf-6h3YkDLm2ysIf<?PxHxmcdWv>P#Izf!(d|RLwrDJIESPX#mA#6aXef5Ne%ro2
zJXLN-8-I?U@jvtq*;t9pVkZ=^&yJ<ezU>#>NC>ip*y|IZ$*GAfwfJV*08LgHM-9hZ
zgJ;xdam?rGueF4`fz%=|$xfdtAP{)32@XFWwHu7dV#Z&skh`F&P5YN_$sH%e+EUu$
z$Pj`uZoy@qqj=ha#uxa=ES~+S-&U&3TiTyK?GspGP(X%P#uu=!qpREo?^=%|0&rg0
zBf$y_QI(wMMN^|7eK7^n&`dTK7snA^AiBa?z@#4^cYq($Cy&N*?jN2R@n3BK{|02l
zA$!KXX6Pl!t@VB+!66Yao7NBeRQfUTWx9?ayo%tL^9owQARZ{pjIH6W5jItZs<n;?
zbK<IJ^N@o+*hg`dp@HmoqmvHE7^lW}@OYf_T*O6p(rY{a<u$RY8lb;&Bw%#1!autY
z^4du`trr?onL&RlEZCRYIF7?^ia0ZX9wK)AF@71CU142G7yOP)kt`l6@28?eaTw&q
z_)N@vbP(CqcG*+FnRP%GjQU8h11qibNB%50w_U3#7$=;;j!s~D03!J~u%}76hD_gj
z0_;B&*B|~8X#TAWJH<V#I=0{xi0=s+e(Qy9+yhXfTI-QG>9L|`vn70=hy-X%a$J&p
z8nD|55(|Vldi>SQS*~F%7RTG_K-e0PX;9Ol$r*2xTX0tj(q?qd+Cpml!fM}M29kYJ
zX0r)H<BP--7}4`idwwWR(tl8+O=B)d?-X=Ol5Ze+_mOMwI!wh324Bw^qM<tlS&}7d
zTX^eSPAqJQUMAC_rL=5cp4!@9N<Z%?wHx#ucP=8fh;Gc#e<j`EhpBSw#$M<^-;lav
zM}FG9Oiji`Z^cxI!R?%r+`T|lrto^?=o(?X4Mw%~@9ax;Sz}|JIs=bhhDRB^#Yi${
z0k`B4K3X@);~l=29Fw1iYjuB@!E8#%ZcK)zKx|=<chF3#`Nx)X5>o}E<%ySsG#!8W
z50V1}G9u*nCZ>i2rf}IRCx8j+bFSM<HMswG5WnKB+-Y_~kY@vn$9#W*gs+={aeE(z
zk`fTt&-dfcPBs0o;E!A%yg7Qt;Lu3LkmlHdk#zY}&Z`Uay{4fCQQtLSZv&^&P0V#9
zK|)<gfDrg9chRNFUlu<COD<i;PM!M!1GXbJB%T)f2`cAc{K`36%W3dZy2T?f-8&!5
zE>I)wk_u>K8ip-Mx5xu|$9NFQUA5$0(&|J{Wsk>%nV5etUEDt}N|^dS_SxUK?4&E{
zJRT-}{vW(E$vatUfjd|NHEqe!E^5K=61Oh=i=+}nSyG8#S<-j&F8400jO>TJBD<qq
zx-G6<)atIBGlKMd2)K<^MAuDp>C#Pf>Y}D*IOXuty>oa!APEn7>EqeFlDDNvQ?t9F
zcJ^KN5VM<c`7ossW@dN8@H>Ii$}=#CUZS1y>c7ppD>SeGW3_yJndLHgbh)R_HwaVF
z^VT;Aa6Ushy1bob9~(;DCg?gvd+j=++JNo}nd+`30A)QT!3a#F@*U2<Is{YNAR=&z
zRtvQ|FgI)R9TG6rSBseXR0~bqKd0NGv_#9fOtP9Z+p{_mHb)3S7<vV{z$3G*bXv^k
zGBIjW3CBJ*K-p2i5MkzREsVAYes-#{7Upwis-YGJ51M+b?AmOunsCcqI!wQn9IGQi
zss-u@T{F|cXNFW1Vyd<lDlwg_##YWKTguQ(`Fd$dN4tEBP|ZL`Xgt1M=!#8G2H$Dv
zsz|#LEIf-oiFRp92YZmg2=XswPc&saCK!G*a46nMe|D@6F@REj3L3%@AY-~hSB3I)
zN-zR`ajLBr&wN!`y8C<8B?SDj!)?q-aD{|asCcQC>qAaW2A@dQ2^6byvXfC>s8y(x
zp4Y*M@j$0ja*rT4MTJC7@Eh+*3ATfbdD7Dnj2QDM$?3qn>Z6d83W)dYRxAggu*<VO
z1HlO8{5(#ola<Ny?zpBTFNcAtwn4AuTCwIcs?mijK(rQgg;WCcn^q3yx}Rk~&)&JD
zF{3q!Otq$9e$fQD+0fI7?Fe(9&9bNR8^cuyUyMKbWYAG3zi}_1IdMHI0(BU*7VMEc
z5{yjaz7dRIFz!ym&YW$KR~AR$`3^)`MbdM5A}Vm16Dxs1%jjpZB}77Ec8_xp7Mzw0
zOx6t;d~&DSQ|VLeL1eYu)o|ZU9X?_#vqcRYWWY)lyeb}_N0d|us8s4SJ&&G`?NyWm
zr{KjcxeN4O?6MoUz}wii)aBQMU3LdSUKj5#-aUy_;7)Ctw*F42gdz-26nh$^OV%#O
zEqgD!p<v(;BePV|*7OYvzU7_ZedfKN*~agip`F4#pTC$Vr;4Tk9N{td5$zE?ZxF|R
zjMOx|C#Q}~=?-$M=qE8QWM9x@N^1(cK;Xm0&JujA8jr$j;N6i>SwC0^<$Z3N9{jG@
zL)e9Q3HZx{zzm1$3nj|X4H6iB7yK`|bMa9{ytrc;aCks$cBq$CCazkBgEF3sH_6+}
z3*%`^N(Cm>xLn$4;JN1Zgi{izJX!)x+6zM9b|6mmVP-CnTY<C2<vgP0vT^R3m^*S^
znFWZ$=jrAHIKRi%|Dfj~{{iV$zz~KG2SA8Zrktwwi9$tu!hS|j->wG!P_T}%N^ZwL
zSMYA}p7Zd$99|x;p$+(kMeRcEMXg1xUfvHXc&B-WkZJ6K1|NZ0Lq2Tymq;7A@I^5T
zK2jLi)|}MP-FT&VvQk`R^1*H2(!QmSmtHO&Y!Dw?`e>c_`u4$MlS{u)Eel`QfB&g5
z@N=DGSU}17q>^h%+iE|Ws1R1`EGF-d4lpOvs>aRYK8`R%`X&`>tQXf7V}3n|KlK2e
z7wJFOBz|lY7Bw%18L{>w2GRo;{cqE{hIW*Z_2YLYm<d&p2W|D$<oD>QO(CrlCHtcb
z9rB1L56ahoYsc4`iH)L$L)sK4EX0Nw*w2&4_j<gWh~Et*W`A0AbGTQ2yT1C4RSN0$
z;M4Aq1z10BY1eAV?IL}|HgQg;|ErH`8>an!9lV#YLLVwy&AN@)0468eXrkBY+U3Ui
zp;V)u)E=|>!=`wXiMGSlx^plebShy(ruN^pnfC=V@tylaYM7ART;UqREz*KCl_8=v
z=SQy2cAkOB#Py~!p`LUbM;-16LKLkY$qXa~!!XgU+l1k5(QI?fjY|Uxgul<;3v*}q
zblbe1Xfw;8Kb7#+GT{l9!SL;zbpSU(r-NYHo3lU<v6j90sSHq@(N7s#r`XGZ$x8aG
z=~l8rBTx;vqI(i6w(s<9Uo^|W=ApdAd)#Ke%0DbX$c2%X+n3q|P6>Or$E^n@>N^)I
zE(2Rw?f7!~^}^h^Ooitc%syZ25bo{JG^8|;LEl)AR+=Uu@GndgdSPdl+%=p*H@>yZ
zUe!DUT9J`8v7(>;7yfc#<Ztt-eg0;$T|W@Ls+sFVhrzAN0yK1r@*KgyO_}u54=1CF
z6_HgG`j;dU?EBB(?6I%DLY9(DX{CjL4b+#Q!xX2v>3=?!g(?i1YJ1;jL~#pnei%JN
zcthU%u|XE}z-Mqr+2}21NC0)Q1pBl36|yU8&?WxEVhJpy)zjp6)EBYiP5XREQTC@O
z%ki?L`=&oMi_|H)i%%<I{M~aoi=!JomP7QLaFO|u4p}9uv@*AW8erztcgNb3Bki&*
z<$n=d9`uCvbYCmnD$3cIz3C*f1Y5a1&P8;Xl-`|!`5+F{)8BD@<NQwjuXmYg^`&>L
zT+!<TQ$IzOSbw_XC|F&xbwiaxhoT#GaVlO5k`c#l7P}kL%x~2j^H=5h-(NJ~s0YC>
zt(s3M^*8$%Z^Ze;ByI+L5%)(vgt<txW$}5Dg}I6AG3f1#lkFInVFQ^k$VEc`91aYn
zWKG;%Z(&jv*mj|>c;cxZaC&)iOSEB#Y`5m#wK*GPQAdZhWp&JR!+Rj3pGEgfHujG1
zRm|Ev60V_}9@75P;p_t0h*=ggD)o1fY;{0>bm-Pb%)M91UD`;r;6UVOM?F1&+ko~v
z3rji6ZZ+ZA!zFzOQ{ioEgZPQib)rp}Am{mlz1Y3c<l81MNN0lTGe+5Mc~{B+75HB7
z_hId+luP%S;mvD{k1HQ4P{Z)%lSp$V@0dnd`fM;+2lo-Xsl3CeaGx$7QMLGb*JP8>
zKGK%4jghW7Nz{?ea5rY}6-(2YY4|e>5bZ+wBZe;JzN)s27kM2vh@B|4BFoa6&$VsB
zZVyN_bCRH^zhP-2^0OvUaUM>{o#ligt}QNyLP73R_nl<HT0}E3dRm4L1l05e?t71`
z6oaSiek~Iej;9!JaozB1op?`7!7fMUi3jvz2K~lnmkNRRV1Oo$^}5%<mQ>n@z}?t=
z3TN?X4Z7)uYv>kHX}L77Ccmcl#)kS0tQRk&&kvMQ#|kstZMMx^==3%>D4;@Oxok$m
z0Z!D5%eb@l*X+)Z8SB2+@9w0>JVS*wco=_NwCIR>mlyHEX$lql<0*UKvTpUz=jna1
zmjHKC%Oi5t&B+$6fREE}+^0(m8^1}ZmwkiWah(2M|3s&j-bbyYJ?!#X)Mr}ieDjkG
zSp9C2nStr+9VCK%^<(cL_W1PMzB6Mmi%kpUevK)fH6bs;e(a={MS3E!ra@Zw*h0`;
z3>(9f2@dq$w3aa*`YS8Gfj@82)N^BIdB8xIv3)v+ih4ol2>jGY9VpXk?IjkM6jJA+
zhr{XYE;Wl`SdXZLJ-~Gf%AS0KptlyoaINpmKFhxpM4-|WW9_GZ4SXGb8uaW2Exw%l
zUqXd*58-;N%-^~xBJ}b`3*E2N?_0->ZC{s0>s&UpfGvT)-h_`m>YcJkeJETax;yG~
z_n*bV+5bzN$ag!^>^{uce|FlNwcEmP;OiXttYftkCt>n<N0e+nHqy41+7*!TVr0JR
zf5;Lf>YBhqqmSexi)YPJE1besE0@gBa36o4f2k3s7yfa!)&S!Z@M|*u1Iw2TGa@yD
zB?cB~TzAjwaA{d;z?vzsbuR%DDy>fXl&G)!hu>6ptZg3kK8Y4+f&UL7;>AuP!XVQi
zwy<ingLPrC_%{-k7-V65_WMIF<g$-e_aouJ$2~2az+JnNnLKOy5{70lC^2~CM5X00
zbCNK|L^X;wZ4=-Jcs>%g9`C+<i)>v6PeSg7E|Oo8fM_uqZ!ERq#)~UOPx+I1<Uv^k
zTK?L)H}h0C>ql>d6W#(4IoN1E6=CKRxva&gv{U6p7P$JN2I=9J@0Xu8QZE;Nza({p
z7GtLa074y6mye3a%u6Xto2m3F_rx#4FJMp!b{tWC(GcJNad9!IoIY-X3L1!|-yEZm
z5$cqQ{XI4EW86QC+5A~de8Bt`#;>|k{PROT{N$wms7yZb&AVDxv<7>EJ6`cHqu>Va
zvlJAyPDoE}x%oiu0Xw=y&~}}KlD{(RWp!AbiuWf{3j?#w4d$0~79JR9S120THWl#r
zf2j^9&Q1xQ4np$#YNVJ?gXZHGhO(BtQS=K9qOU|z>YH1`AfAf^MxyYiu!Awb!1-82
z1BUZ9umG1pj^2VFUXBgG<L&X*xF77LD?mMWX&xm?%DA7O$V0sb=aagZw5VUh@a&Px
z!Uj1@^nkuREUKE7>^8MjclqQt$97`j!npER;IKSt{6+=y$6B9nKL1u6a4dta14;1R
zt(A`0<##j=O;p0Eg>Nr-?$mK%citIHII3gAq_kaxj>1&=z|U)W8wrdDt6%MC-*du;
zEYIGWr1=+CnhAx<XGuSz=eRHSrO6FYD8qsDS9QCPvhv9B%W)>;?XS!SsZ?RMTI*v)
zthw6n2>hmS>Z<VI?Kx^bw;Ncgl=OzkM+cL7S$9EHU07Y?^*S}<>$}dSeO(>X{KcfR
z`*N;cLC50nQr6q-dnj90UPzs~l`+N5rl&N-Dl0JfNRC=H)yeA6+jC~p>sKt>areF!
z+SqQ<i5AGU>W<mi>SwWDQqt?0oL!8`&->b2yR-f&EmVa2->4%#<`9#FRdq^T!RX7Y
zPx6qb769c<KppnW&}k+7zJIyR=N@>e{j=H8&0`j2GHJf)V@qr8NwLwZ2wkD62gz-7
zP#rzROQnj?GMCu@bG_hqM!c0bgkogIo!Hub#k<f@!`Q60c_<|{&8uC@1JpXyOi$^@
zb*So6zqrkv!QtTBKk`bpSj24u@9z7c*I;$SvR#Xo#kX(&J)ZGDg@l~76{G2WluJ{P
z{UNaSeOW^}kzX#RUDA<W%**B3nmKD_`<E48J9njaEn?eAr}lmEjGH=H3u}Ics7NVk
zshZah4;xTtBnBSKYt_b5+@4&mt9s7b<^EyX?7tR(XK)(5<nGA6pH4PKoksUp32XS9
zEK*%JNb87^|A@ponYBAy1lF3G|A%N`eE-PDPTQ!eRZ@dy2a`j}HAgY4(q@fG4yC8c
z{A{k!{mJ!*aXQo^uj8@ag=ZI@cZT>+@3--2_G|NPw`@kVO*1XquV?*RFNl7^39+5|
zt@(7DbNlw89r`JG>%BuQ6V#(Dw+oHG#2uZ}W*mFGNpsVDigaC=pVs3eWr14qXA+}P
z!45_nV>6@lNA^CN%rLmfZ6;^oi>G<JvS3C&ho1&AFyERAbTr(bD~j3~*3m&V^q#0z
z41J|Ib^Hs$tTjG~ljNgFr7EP2nao1`jOv1Yb%Ym|yyV-lqS8uWKkeYFATO!+J)Bb9
z#FS=#kIHBK-p`!L{O&VB9IF4FTp=5OnfedNiHhm?p;}c5x1S?6@Iy>Dx9#)JnC(MM
z+MF8ywwOb|jC(uFjUxY5+{aB>S_8ym2!hNO^|AlIQn2}#m%($%0l<9*HmRadv+ovy
z*M<%8@nABBCRk<Dee)@~#i;L^Se{3)<m58<P|QnQOAy_w2H>->KH5a{D7Tt>2IUFz
z%DI-KE@>jBsHdUNkWS@k&-^Gt*uw^BBo{~CCrkxPO;B2C?FWQDTZ!Y^=PaSQqVG{9
zOM1Amf6D|{akN&$T_3a5__(+!_eiG-xKw>NA>7W0(k9W<S8XjEN4;TQ=y)%d55mj|
zvk77yK>>!!x2b*}+<3fn8=Q0moJkz$m{CNM?lZd8jd%HQ?Dl{kYhrwxt~^-_3TGO@
z?By}bFSq{R8i9Ih$^X&_w0@<;ZLK+697sxIY*crjNTM%jX88Be*fu_j@Tmz%t&goh
zEd$;I|7o|r!*cVlGuW3)W^CJx)Kkc7PgFX#Hvd~5&_Et;dHgSNAZ%#Wpt;_mBS0Y&
zI!JBK_6#~6__QUzPZlu*1<NvQbLsaZ75LPAzns@WvzHqM^q0mbt+zveiW}C(I6Gz+
z1n6F}u@~I>TNm&`md$%PK42L064&&lZWg-8jc@+w)A52c6MPNw7$e7xgEy<A{J_`U
zkb#jWn(fQL(%R)KU|2?IZoFKNlLzU6gxEkjBwM|mfpkoqKtDtG^K|!d<I6VdJL0E&
ztnUBCIN0y}a_fQDcNp7`BP)WsB8?@wMHSV!1EZNOy9A$=PlKLb@rOui`P=BAtIyVT
zoUA0AphQUSu0=2UXJw#!ke6*Xb*$dfcqHh!;u*%UBX|ZO*8(52t;}A__x9a+lCiy4
z-C=bVZQr|+P4KJVQ2tZ+!+lRTkJBRcGfE=KSI+%DcxLms%lIhj*^-XHJ7eeP**^s_
zHn9C;cY_ZX()7qi?Ik^isG}$dcu!(-T>9sEqrs7*!#Z7CKlUu3GY0oL++|p*BSV^V
zVrMK1?<67xH#5>L$8IHFjKyWRM+Jos!3C=2PfyaXopbF-dNTh(5F)7_=zJ#cJg~Xk
z?EvZd^n>G&OExYY$jy&Gq8fv*^uK+wW=C(pg8tU>BmbiRdXK+Fre>`CmJ^X+u<X)t
zrxI@&_>x%8Nbn+Ddh*eu)w11?rJ);gZY$~!kRwq!POo*g?eKpV*RfzCQDePU&noEK
zpf~>`(xH50pL%()TZjRS#m_ln<DneGhkZ{oD89Po@PT^UhZ*UK#$PEXvuZY3kL3nG
zQpbk1tnVnFf?0!}X;o#!;9VNCo2Pe4%(r|zBlpC(Ved}gJ3ex<tn)c}ttV4_SFShT
zAL)}}y1(##@X7uQ1-m}Py3FWK`d)GzP|AD_{e#u+-*YyFmDoVaefJlhDl2HKVc2Li
zmDc!>1L5GA;gaHkn;se34<hlSc&(#rS~H%WL{@vBB|DF$OqU<;`%|npD8jRBz@C{)
zUOty4`W;%NPneExX3Zq`)+szAS)Ka3{6wRMHPh~J2hdXd)BFUyI2fU!_UDtWedbTk
zcGlXIKgtD8(<6p=cZ5tU&Zv%_=sw_{5o;BZ%1GeDx12q$coQ705r27T_n_L=MlUdb
z`sRN!{u_z=`$C@{+S4mL;ndsQ<J6&Bbt=X&Uw*7(=2@{x?{E(`_74flOZW1Y9esn%
z^6q+M2J7wu5RkbR?%Arr{d4n<zv2FI=uDz^he5&k=L0So=KBNcrXQ(?Z)KEdo<9&+
zGJRD^_nkZ1^s@UR|E%$`40s%7tdX~qK53`kk*s%x`ZwJ7s=k~sdQNmZxkGE~_v2%^
zN%ahh;6in%wPOJ>)eaTQ)oQ%Ibom_7!??cxui!iCkgJ;y#@2FAcHa!ksIZ!QnCQ2x
z(cyY@i}jrQ7L&Q7Ho@KQX&264_YJIS(HXRyt+3W_oCeghuD6tA=1cc*tj2B=3#awH
zXrYpQeW#z+?C#wV1>YivfA8!?EJFr6k8HjMn4X$pp|*WF1$%A1`@6On6SN}64qtQA
zzTtqKzcsKr?nOt{^oY-GTauTTd$HhWa$T4&_IwHbN^1}DZD3zi{EJfC&bX;^CDo^0
z4;_of1=-Htc^i@Nl;Z+(2&?N6y#aKJB{zh+E$5WL{?#1Bz*Oo=>5^YNzxh7^e*d-M
z%|cx}U!i@31^?OHAYAGs&AMr&jNT<>LHV0>CC8P~kLL*K98GWs<xJn;xrpJAhaWR|
zTEYS)TQ5sGgE+SAqI(Plx}=jI@_4(_F%Nm6dk|B)O4HthkImA0T*xcARF+hsr1yc0
zFD%MSiAz@|CYRUERdd|VQgei|jIZ0VjFaqGZ#7}km9UfKo)KRsC8dWX8?anM-BIk6
z^zLll-*!gnF)p-csVR>!iQ7;`E<WzdF5IKbufqF~I>Ah3&S53$Kh)*^1l5(9{ebr{
z`wlN%d35cHpXq+=JY(Lg!ATj~bMvb7c^u@t`k0<D>wwD;1k_V<@h!q?#(W(#c)kwt
z-|Rk{?=l4316>dq$>5<9lMYOYsnT>iFjK9y+~)e1qIUUSA?uqFs3nXktn0a<6aX#^
z4I=4P%yvu(yB%`|pAH%YZWOu_(vwjyT?px*hw3e`=aQ18pBu`#xpD}CT_rTWu|oJa
zzRx<aVu+S##!hy#GnsQgzhcfW_S_Ie$?KJ@9_$GY#%f5@HPxhA+zrwapq(rqy_0o2
znN8r)9(pV3{zyA!s;L%+h*6Uc@KuH&fx5!1n`@K;0<jiRX8s;CMOJD8u(?c4b*W^f
zD<lKd58c_j?IK(W?g5XSTgl*x0sh({;Qu1~-={a$!jRiw8(I#|QCq2@UC__fQwp=*
zaODt@LxYe_i8gcYEo<g7Ei|Qd-cad4y;88~wY!K2a#OPUvt`Ne$^T^apvRJO3Jg{Y
zv+6e}18xPwdz~)=9%?C|k@;$&eV^+^_!9e};eFu|yS2kR{~OR-_PR|_9^&ruwdxT-
zLRsE>#a_s6ND1^j5bb@c9I><4w+DeTN2PTi(3Vt80dk3vmkP)|jy$SxRdrSfDrWsm
zd(ZY@QK8FQL7C)+F-Zl46sdrIqLO(RMXZ=Qg*~W|k)#3=%1O@^ZW1*KbQ;E|_bNQl
z&WbF)E>D{e?+xftc*H9|-bC_&XY$)}-LUZ~;GYD(+hdixLU1V)nk`qQA5{owI$>5*
z*s{;h6%P+(%iSe55@^_@($S}M1ZB(p`o4TqcmT@#%9<7RBmKQ7B|xEERq=-+6<Vgu
zt|@^bOmohaai1$BtSr6(AF#Ti3gR*~obu(WP!|QOZ_?ea53L-|GY`76!dL?e=G;tD
z-XTiqdgQ@Xy1f2&L6xqg9)(dd6omcVSSamNcyQg7BY^S|PK_Q?XJx{_6e@tHg2ld9
zuXK*@?!_oaDN^>pQB_`qRd$$S^!4xNhF-;la+=DR&lLmp`h38Hs+@}Yt;!VP%Aho(
z2P(=SPi3~(tHI37sTT+CVP9k2v#U>cq^CVKvE7SduZz<9;_!vI_~crWmpP`FxcDHb
z`r-rj=O?IQi=?-=8xmX_5?p+?_*Lp4Dz`w_zFIuiW}4!bguk!vm&J{FX`NNHsN8Bv
z`8@4{syZ(#L@~!sGskxX=t#}UJH5{r+RL*%-hJ_l4facVT-bP!5>~~#aB<<>9Ax}j
zw2h8T8&E}d#<jU#p0rU40xzz~G$dL?t#b=*KkGYhD@p<u9A!ZtUOYLS8{Nh|;gn$T
zkeQY6yO)Zx&<OnvtwOLW>YPAS!q{GG4OiRSk`FIHv)-P2pu4S?@Fo2B#T7PA;&ojr
zz&44LANgOKvTAsmKlcUa{mo=ey!R&bgwzEU3)djFYnnEz&YKiAo?Fn;tJ9Tz*tVUc
z5wsG`5UF->@lksiYxPt65#q%#KQYHj=sc{4NmObj@ed{K^lY%*$0Ec;HhED0wsND_
zTC6o^&v)&<>~?%%lyXa!zdB$gnAX+AE`<C|SvO(mi)P&gn*T}DKZnJrVRs8=L}tAc
zy}&|v+;)LqALAa=UtO_l>6y`MbHNpCh*%Zj-TWkXc_I<&@Mwd5IwEp7MBMiV@gSPv
z(}z6d?a|ty*m(v1q!K<y^`GNseydqP-t}ttbpib(ulat{xR=YN8PDZLlCAQhRGln-
zwVrO<%1XcuPPJjI9#yS)drs{21}KC4UCBwT`QcNvo*ZYO9rn$q5=fEo92Bz$?pZAf
z$mg=u=45q;;9r(rJLE*me<mFvoxjiAq!YEn#QaxOVu|W1pDcadC>6J^Dxgd9a`PY$
zeb6Z|FE@Hsc2<-*B~yNv`G&lQ5~wz&Mad-J;aC)%n6y+DK%5g!H<dL@XF?v&vgS|w
zoBQ<<A42~k_mtMcKgM`&V`W8+2DSfFU(h_Ff17X3hyKOdwNBJFZT3-|#hW?gf_JvB
zj*1wC|DoLFMoO2LWDKnvTP4u3Q<Z2jD=tUo)*frdMIJE8-qf3I)UD3hb5p~Jl0Lr8
zSlGWQE#h#Q{DLxJN(tS~Ve@bi^CuML(6pjAy9aXoisi@ndaDHRzDBcZ_LaKL^t<)O
zIZGk3qsf*Ln<~~#`z_GD+M`@%_J}#@X`btpRdD>m2zuQ#)`S3ts9M}M+1sqtIaH8d
z=-Z)JWG7j9;h#`<7E5g^w>h{hQ1rt57AV=5!YUjhP3We@9v<jw8G(6vcxw_txhHb*
zT$}XX{k(fo+|YH?$OWR_l8s+d-74L~-JT0A^>FcpDA;yzz5MKC%Lg%Tb_4O+(e9Xq
zS3_#!H%C+OJ4b!QD3y1}m$Ir)%j=8tg=bOUwnXDqeV#p(>&Pl?R#vQo80J`J6BtGb
z`x#YfeCvds;P!?zw00S`)4{oLhs=SYt34^vCWe08WYB!Tr1<y{_dP2uHBG5aEuCMe
z20U$EPH;D6Z^E2ygP)F+_+=pUtqe~uP&?{K0g9BIZZU6@5_R~<(j%wq#8ccC?g~e`
z^xF71+B;|%AT;5Vt2&kcQs8spjuOgm_Hqvc1B!#FLMR%y%6XMMt3{3!<~<rXqsgNH
z@5J8*9x>7zKYd|fr8BC~dV6?tlze8U(+7rKXsfQ`t9cTNlVbO1C&_zXXR&)zunI}0
z2_X9Q*&2{`b$#Xn@~(IUmN{t}N>&Vu5X<A?CJ1{4<HrjtByn+9vXnXa0LAdh36J3B
QD4?k)_GixgwsOV)0fhDMiU0rr

literal 0
HcmV?d00001

diff --git a/vignettes/tlsh.Rmd b/vignettes/tlsh.Rmd
new file mode 100644
index 0000000..f265a6b
--- /dev/null
+++ b/vignettes/tlsh.Rmd
@@ -0,0 +1,86 @@
+---
+title: "tlsh"
+author: "Rebecca C. Steorts"
+date: "`r Sys.Date()`"
+output: 
+    rmarkdown::html_vignette:
+        fig_caption: yes
+vignette: >
+  %\VignetteIndexEntry{tlsh}
+  %\VignetteEngine{knitr::rmarkdown}
+  %\usepackage[utf8]{inputenc}
+---
+We present a small example from Steorts, R., Ventura, S., Sadinle, M., and Fienberg, S. (2014). "Blocking Comparisons for Record Linkage." Privacy in Statistical Databases (Lecture Notes in Computer Science 8744), ed. J Domingo-Ferrer, Springer, 252-268, <https://doi.org/10.1007/978-3-319-11257-2_20>. We will be using the blink package in R and the RLdata500 data set, which was previously available in the RecordLinkage package (but has been deprecated). Here, we illustrate transitive LSH. 
+
+In a record linkage task one wants to remove duplicate 
+entries from multiple databases. However, before performing this task, one needs to perform a means of dimension reduction so that the record linkage task is computationally scalable. 
+ 
+We illustrate the TLSH algorithm implemented in this package on a German dataset composed of first and last name and full date of birth. 
+
+Our goals include
+
+- Presenting the RLdata500 dataset with summary information.
+- Illustrating how we can format the RLdata500 dataset to work with the tlsh package
+- Running TLSH on the RLdata500 data set to create blocks
+- Explaining the tuning parameters of TLSH and how to choose these in practice with evaluation metrics.
+- Sample output and visualizations 
+
+## Understanding the RLdata500 dataset
+
+The RLdata500 dataset exists already in the blink package in R. We review this data set for the user. 
+
+The RLdata500 data consists of 500 records with 10 percent duplication. Thus, there are 450 unique individuals. There is full information on each record containing first name, last name, and full date of birth. 
+
+We first load the blink package and load the RLdata500 data set. We also provide the first few lines of the data. We then remove the columns with missing values (columns 2 and 4, which are all missing in this data set). 
+
+```{r, echo=TRUE, message=FALSE, knitr::opts_chunk$set(cache=TRUE)}
+library(blink)
+library(plyr)
+library(tlsh)
+data(RLdata500)
+head(RLdata500)
+data.500 <- RLdata500[-c(2,4)]
+head(data.500)
+```
+
+## TLSH applied to RLdata500
+
+
+
+We now explain how to run TLSH on the RLdata500 data set, piece by piece. 
+
+1. We first must create a universal set of tokens.
+2. We then find the number of tokens in the universal set.
+3. Then we must generate a vector of random hash functions.
+4. Next, we must create an index vector and apply the hash functions to each record.
+5. Then we build an edgelist, divide the graph into communities initially, sub-divide the communities more if needed
+6. Finally, we have our blocks.
+7. Then we can compute the dimension reduction and the recall. 
+
+The function that finds the blocks is called **block_setup_v2**. 
+
+```{r} 
+ blocks <- block_setup_v2(RLdata500, b=22, k=2)
+ summary(blocks)
+```
+
+where b is the number of **buckets** and k is the **shingle size**. 
+
+Observe that the blocks are roughly the same size; however, this does not have to be the case.  
+
+
+The function that allows us to find the recall is **eval.blocksetup**.
+
+```{r}
+eval.blocksetup(RLdata500, b=26, key=identity.RLdata500)
+```
+
+The function that allows us to find the reduction ratio is **reduction.ratio.from.blocking**.
+
+```{r}
+(rr <- reduction.ratio.from.blocking(blocks)) 
+```
+
+To summarize, we have reduced the entire space by roughly 66 percent and the recall is 0.90, which means we are only splitting records across blocks 10 percent of the time. 
+
+