-
Notifications
You must be signed in to change notification settings - Fork 13
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
b9baca0
commit 2c3f2d6
Showing
15 changed files
with
264 additions
and
24 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
^.*\.Rproj$ | ||
^\.Rproj\.user$ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
.Rproj.user | ||
.Rhistory | ||
.RData | ||
src/*.o | ||
src/*.so | ||
src/*.dll |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
Package: LSHR | ||
Type: Package | ||
Title: Locality Sensitive Hashing In R | ||
Version: 0.1.0 | ||
Date: 2015-06-03 | ||
Authors@R: person("Dmitriy", "Selivanov", role = c("aut", "cre"), | ||
email = "selivanov.dmitriy@gmail.com") | ||
Maintainer: Dmitriy Selivanov <selivanov.dmitriy@gmail.com> | ||
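Description: Locality sensitive hashing for finding candidate pairs of similar sets. Computes minhash signature matrices and applies the banding technique to them. | ||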
License: MIT + file LICENSE | ||
URL: https://github.com/dselivanov/LSHR | ||
BugReports: https://github.com/dselivanov/LSHR/issues | ||
VignetteBuilder: knitr | ||
LazyData: TRUE | ||
Imports: | ||
data.table(>= 1.9.4), | ||
fastmatch (>= 1.0-4), | ||
magrittr (>= 1.5), | ||
parallel, | ||
Rcpp (>= 0.11.5) | ||
LinkingTo: | ||
Rcpp | ||
Suggests: | ||
testthat, | ||
roxygen2, | ||
knitr |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
YEAR: 2015 | ||
COPYRIGHT HOLDER: Dmitriy Selivanov |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
Version: 1.0 | ||
|
||
RestoreWorkspace: Default | ||
SaveWorkspace: Default | ||
AlwaysSaveHistory: Default | ||
|
||
EnableCodeIndexing: Yes | ||
UseSpacesForTab: Yes | ||
NumSpacesForTab: 2 | ||
Encoding: UTF-8 | ||
|
||
RnwWeave: Sweave | ||
LaTeX: pdfLaTeX | ||
|
||
AutoAppendNewline: Yes | ||
StripTrailingWhitespace: Yes | ||
|
||
BuildType: Package | ||
PackageUseDevtools: Yes | ||
PackageInstallArgs: --no-multiarch --with-keep.source | ||
PackageRoxygenize: rd,collate,namespace,vignette |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
# Generated by roxygen2 (4.1.1): do not edit by hand | ||
|
||
export(get_candidate_pairs) | ||
export(get_signature_matrix) | ||
import(data.table) | ||
useDynLib(LSHR) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
#' LSHR: Locality Sensitive Hashing in R | ||
#' | ||
#' @docType package | ||
#' @name LSHR | ||
#' @useDynLib LSHR | ||
#' @import data.table | ||
NULL |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
# This file was generated by Rcpp::compileAttributes | ||
# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 | ||
|
||
# Thin R wrapper around the compiled routine `LSHR_get_hash_matrix`.
# The three arguments are forwarded positionally, in the order given, to the
# native code; the result of the native call is returned unchanged.
# NOTE(review): the native implementation is not shown here - presumably it
# returns a matrix of hash values with one row per shingle; confirm in src/.
get_hash_matrix <- function(unique_shingles_length, hashfun_number = 60L, cores = 2L) {
  .Call(
    "LSHR_get_hash_matrix",
    PACKAGE = "LSHR",
    unique_shingles_length,
    hashfun_number,
    cores
  )
}
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,59 @@ | ||
#' @export
#' @name get_candidate_pairs
#' @title Calculating candidate pairs using locality sensitive hashing.
#'
#' @param signature_matrix input signature matrix - numeric
#'   \code{\link{matrix}} with one column per set, e.g. the result of
#'   \code{\link{get_signature_matrix}}.
#' @param bands_number number of bands for LSH algorithm. Must be a single
#'   positive whole number which divides \code{nrow(signature_matrix)}.
#' @param similarity target value of Jaccard similarity we are looking for.
#'   It is only used for the information printed when \code{verbose = TRUE}.
#' @param verbose - \link{logical} print useful algorithm information.
#' @return candidate pairs - \code{\link{data.table}} with 3 columns:
#'   index1, index2, N - index of first candidate, index of second candidate,
#'   and number of bands where they share the same bucket value. Only pairs
#'   with \code{N > 1} are returned. N is provided only for information.
#'   (The intuition is as follows: the bigger N, the bigger the similarity.)
#'
#'
#' @examples
#' sets <- lapply(1:10, function(x) sample(letters, sample(5:15, 1)))
#' # add set similar to first set to the end of list
#' sets <- c(sets, list(c(sets[[1]], sample(letters, 5))))
#' sm <- get_signature_matrix(sets, 12, cores = 4)
#' get_candidate_pairs(sm, 6, 0.9)
get_candidate_pairs <- function(signature_matrix, bands_number, similarity, verbose = TRUE) {
  # Validate inputs up front so that bad arguments produce a readable error
  # instead of failing later inside `if ()` or the band-splitting helpers.
  sm_nrow <- nrow(signature_matrix)
  if (is.null(sm_nrow)) {
    stop("signature_matrix should be a matrix (an object with rows and columns)")
  }
  if (!is.numeric(bands_number) || length(bands_number) != 1L ||
      is.na(bands_number) || bands_number < 1 ||
      bands_number != round(bands_number)) {
    stop("bands_number should be a single positive whole number")
  }
  if (sm_nrow < 1L) {
    stop("signature_matrix should have at least one row")
  }
  if (sm_nrow %% bands_number != 0) {
    stop("number of bands should be divisor of number of rows of signature matrix: 0 == nrow(signature_matrix) %% bands_number")
  }

  if (verbose) {
    # Probability that a pair with the given similarity shares at least one band.
    rows_per_band <- sm_nrow / bands_number
    prob_become_candidate <- 1 - (1 - similarity ^ rows_per_band) ^ bands_number
    print(paste('Looking for sets with similarity',
                round(similarity, 2),
                'with probablity of becoming candidate pair =',
                prob_become_candidate))
  }

  # calculate band borders (first row, last row) for splitting signature matrix
  splits <- split_vector(x = seq_len(sm_nrow), splits = bands_number)

  # one bucket value per column (set) for every band
  buckets <- lapply(splits, hash_bucket, signature_matrix = signature_matrix)

  # pairs of columns that fall into the same bucket, band by band
  candidate_pairs <- lapply(buckets, detect_buckets)
  dt <- rbindlist(candidate_pairs)
  # N = number of bands in which the pair collided; keep pairs with N > 1
  dt[, .N, keyby = c('index1', 'index2')][N > 1]
}
|
||
# Find pairs of positions in `bucket` that hold the same value.
# `bucket` is a vector with one bucket value per position; the result is a
# data.table with columns index1 and index2 (index1 < index2), one row per
# pair of positions sharing a value.
detect_buckets <- function(bucket) {
  members <- data.table(index = seq_along(bucket), value = bucket, key = c('value'))
  # values that occur more than once, with their counts
  repeated_values <- members[, .N, keyby = value][N > 1]
  # keep only the positions whose value is repeated
  members <- members[repeated_values]
  # self-join on value enumerates every ordered pair within a bucket
  all_pairs <- members[members, list(index1 = index, index2 = i.index), allow.cartesian = TRUE]
  # drop self-pairs and mirrored duplicates
  all_pairs[index1 < index2]
}
|
||
# Hash one band of the signature matrix.
# `row_index_bounds` holds the first and last row of the band; the result is
# one value per column of `signature_matrix`.
hash_bucket <- function(row_index_bounds, signature_matrix) {
  first_row <- row_index_bounds[[1]]
  last_row <- row_index_bounds[[2]]
  band <- signature_matrix[first_row:last_row, , drop = FALSE]
  # A plain column sum serves as the hash of each signature chunk.
  # A better hash function for integer sequences could (and should) replace it.
  colSums(band)
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
# Build the minhash signature matrix.
#
# hash_matrix    - matrix of hash values; its rows are selected by the indices
#                  stored in TDM and its column-wise minima fill one signature
#                  column (so it is expected to have hashfun_number columns).
# TDM            - list with one element per set; each element is a vector of
#                  row indices into hash_matrix.
# hashfun_number - number of rows of the resulting signature matrix.
#
# Returns a numeric matrix with hashfun_number rows and length(TDM) columns.
# For an empty TDM a matrix with zero columns is returned.
#
# NOTE(review): matrixStats is called via `::` but is not listed under
# Imports in DESCRIPTION - it should be declared there.
minhashing <- function(hash_matrix, TDM, hashfun_number) {
  p <- length(TDM)
  m_sig <- matrix(Inf, nrow = hashfun_number, ncol = p)
  # seq_along() instead of 1:length(): the latter iterates over c(1, 0)
  # when TDM is empty and fails on TDM[[1]]
  for (set_index in seq_along(TDM)) {
    set_rows <- TDM[[set_index]]
    set_hashes <- hash_matrix[set_rows, , drop = FALSE]
    m_sig[, set_index] <- pmin.int(m_sig[, set_index], matrixStats::colMins(set_hashes))
  }
  m_sig
}
|
||
#' @export
#' @name get_signature_matrix
#' @title Calculating signature matrix using minhashing technique.
#'
#' @param sets input sets in a form of \code{\link{list}} of \code{\link{vector}}s
#' @param hashfun_number number of hash functions to calculate signature matrix.
#' @param cores number of CPU threads to use while calculating hashes
#'
#' @return The signature matrix - numeric \code{\link{matrix}} with
#'   \code{hashfun_number} rows and \code{length(sets)} columns
#'
#' @examples
#' sets <- lapply(1:10, function(x) sample(letters, sample(5:15, 1)))
#' sm <- get_signature_matrix(sets, 10, cores = 4)
get_signature_matrix <- function (sets, hashfun_number, cores = parallel::detectCores()) {
  # Plain nested calls instead of the magrittr pipe: `%>%` is not imported in
  # NAMESPACE, so it is not available unless the user attaches magrittr.
  shingles <- unique(unlist(sets))
  unique_shingles_length <- length(shingles)
  # make sparse term-document matrix : rows = elements of set, cols = set's ids
  # values = [TRUE,FALSE] - whether given set contains given element of set
  # we store matrix as list of arrays. So we keep only TRUE values:
  # each element of list is an array which contains row numbers where elements are TRUE
  TDM <- lapply(sets, function(set) fastmatch::fmatch(x = set, table = shingles))
  # calculate hashes for each hash function and each row number
  hash_matrix <- get_hash_matrix(unique_shingles_length = unique_shingles_length,
                                 hashfun_number = hashfun_number,
                                 cores = cores)
  minhashing(hash_matrix, TDM, hashfun_number)
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
# Split the positions 1..length(x) into `splits` consecutive chunks.
# Returns a list with one element per chunk; each element is a vector
# c(first_position, last_position). When x is shorter than `splits`, a warning
# is raised and a single chunk covering the whole input is returned.
split_vector <- function(x, splits) {
  if (!is.vector(x)) {
    stop("x must be vector or list")
  }
  n <- length(x)
  if (n < splits) {
    warning("Length of input is too small for splitting for a given number of splits. Assuming no splits.")
    return(list(c(1, n)))
  }
  # chunk boundaries: splits + 1 knots spread evenly over 1..(n + 1)
  knots <- ceiling(seq.int(from = 1, to = n + 1, length.out = splits + 1))
  lower <- knots[-length(knots)]
  upper <- knots[-1] - 1
  lapply(seq_along(lower), function(i) c(lower[i], upper[i]))
}
|
||
# Jaccard similarity of two sets given as vectors:
# size of the intersection divided by size of the union.
jaccard <- function(x, y) {
  shared <- intersect(x, y)
  combined <- union(x, y)
  length(shared) / length(combined)
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
% Generated by roxygen2 (4.1.1): do not edit by hand | ||
% Please edit documentation in R/LSHR.R | ||
\docType{package} | ||
\name{LSHR} | ||
\alias{LSHR} | ||
\alias{LSHR-package} | ||
\title{LSHR: Locality Sensitive Hashing in R} | ||
\description{ | ||
LSHR: Locality Sensitive Hashing in R | ||
} | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
% Generated by roxygen2 (4.1.1): do not edit by hand | ||
% Please edit documentation in R/lsh.R | ||
\name{get_candidate_pairs} | ||
\alias{get_candidate_pairs} | ||
\title{Calculating candidate pairs using locality sensitive hashing.} | ||
\usage{ | ||
get_candidate_pairs(signature_matrix, bands_number, similarity, | ||
verbose = TRUE) | ||
} | ||
\arguments{ | ||
\item{signature_matrix}{input signature matrix - \code{\link{integer}} \code{\link{matrix}}} | ||
|
||
\item{bands_number}{number of bands for LSH algorithm.} | ||
|
||
\item{similarity}{target value of Jaccard similarity we are looking for.} | ||
|
||
\item{verbose}{- \link{logical} print useful algorithm information.} | ||
} | ||
\value{ | ||
pairs of candidates with similarity >= \code{similarity} - | ||
\code{\link{data.table}} with 3 columns: index1, index2, N - | ||
index of first candidate, index of second candidate, | ||
and number of buckets where they share same value. The latter is provided | ||
only for information. | ||
(The intuition is as follows: the bigger N, the bigger the similarity) | ||
} | ||
\description{ | ||
Calculating candidate pairs using locality sensitive hashing. | ||
} | ||
\examples{ | ||
sets <- lapply(1:10, function(x) sample(letters, sample(5:15))) | ||
# add set similar to first set to the end of list | ||
sets <- c(sets, list(c(sets[[1]], sample(letters, 5)))) | ||
sm <- get_signature_matrix(sets, 12, cores = 4) | ||
get_candidate_pairs(sm, 6, 0.9) | ||
} | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
% Generated by roxygen2 (4.1.1): do not edit by hand | ||
% Please edit documentation in R/minhash.R | ||
\name{get_signature_matrix} | ||
\alias{get_signature_matrix} | ||
\title{Calculating signature matrix using minhashing technique.} | ||
\usage{ | ||
get_signature_matrix(sets, hashfun_number, cores = parallel::detectCores()) | ||
} | ||
\arguments{ | ||
\item{sets}{input sets in a form of \code{\link{list}} of \code{\link{vector}}s} | ||
|
||
\item{hashfun_number}{number of hash functions to calculate signature matrix.} | ||
|
||
\item{cores}{number of CPU threads to use while calculating hashes} | ||
} | ||
\value{ | ||
The signature matrix - \code{\link{integer}} \code{\link{matrix}} with dimension \code{hashfun_number * length(sets)} | ||
} | ||
\description{ | ||
Calculating signature matrix using minhashing technique. | ||
} | ||
\examples{ | ||
sets <- lapply(1:10, function(x) sample(letters, sample(5:15))) | ||
sm <- get_signature_matrix(sets, 10, cores = 4) | ||
} | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters