Skip to content

Commit

Permalink
minimal functionality functions
Browse files Browse the repository at this point in the history
  • Loading branch information
dselivanov committed Jun 10, 2015
1 parent b9baca0 commit 2c3f2d6
Show file tree
Hide file tree
Showing 15 changed files with 264 additions and 24 deletions.
2 changes: 2 additions & 0 deletions .Rbuildignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
^.*\.Rproj$
^\.Rproj\.user$
6 changes: 6 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
.Rproj.user
.Rhistory
.RData
src/*.o
src/*.so
src/*.dll
26 changes: 26 additions & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
Package: LSHR
Type: Package
Title: Locality Sensitive Hashing In R
Version: 0.1.0
Date: 2015-06-03
Authors@R: person("Dmitriy", "Selivanov", role = c("aut", "cre"),
email = "selivanov.dmitriy@gmail.com")
Maintainer: Dmitriy Selivanov <selivanov.dmitriy@gmail.com>
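Description: Locality sensitive hashing (LSH) for finding similar sets. Builds minhash signature matrices and detects candidate pairs of similar sets by hashing bands of the signature matrix.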
License: MIT + file LICENSE
URL: https://github.com/dselivanov/LSHR
BugReports: https://github.com/dselivanov/LSHR/issues
VignetteBuilder: knitr
LazyData: TRUE
Imports:
data.table(>= 1.9.4),
fastmatch (>= 1.0-4),
magrittr (>= 1.5),
parallel,
Rcpp (>= 0.11.5)
LinkingTo:
Rcpp
Suggests:
testthat,
roxygen2,
knitr
2 changes: 2 additions & 0 deletions LICENSE
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
YEAR: 2015
COPYRIGHT HOLDER: Dmitriy Selivanov
21 changes: 21 additions & 0 deletions LSHR.Rproj
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
Version: 1.0

RestoreWorkspace: Default
SaveWorkspace: Default
AlwaysSaveHistory: Default

EnableCodeIndexing: Yes
UseSpacesForTab: Yes
NumSpacesForTab: 2
Encoding: UTF-8

RnwWeave: Sweave
LaTeX: pdfLaTeX

AutoAppendNewline: Yes
StripTrailingWhitespace: Yes

BuildType: Package
PackageUseDevtools: Yes
PackageInstallArgs: --no-multiarch --with-keep.source
PackageRoxygenize: rd,collate,namespace,vignette
6 changes: 6 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Generated by roxygen2 (4.1.1): do not edit by hand

export(get_candidate_pairs)
export(get_signature_matrix)
import(data.table)
useDynLib(LSHR)
7 changes: 7 additions & 0 deletions R/LSHR.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
#' LSHR: Locality Sensitive Hashing in R
#'
#' @docType package
#' @name LSHR
#' @useDynLib LSHR
#' @import data.table
NULL
7 changes: 7 additions & 0 deletions R/RcppExports.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# This file was generated by Rcpp::compileAttributes
# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393

# R-side binding for the compiled routine registered as 'LSHR_get_hash_matrix'.
# All three arguments are forwarded to the native code unchanged, in order.
get_hash_matrix <- function(unique_shingles_length, hashfun_number = 60L, cores = 2L) {
  .Call(
    "LSHR_get_hash_matrix",
    PACKAGE = "LSHR",
    unique_shingles_length,
    hashfun_number,
    cores
  )
}

59 changes: 59 additions & 0 deletions R/lsh.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
#' @export
#' @name get_candidate_pairs
#' @title Calculating candidate pairs using locality sensitive hashing.
#'
#' @param signature_matrix input signature matrix - a numeric
#' \code{\link{matrix}} with one row per hash function and one column per set,
#' e.g. the result of \code{\link{get_signature_matrix}}.
#' @param bands_number number of bands for the LSH algorithm. Must be a
#' positive divisor of \code{nrow(signature_matrix)}.
#' @param similarity target value of Jaccard similarity we are looking for.
#' It is only used for the informational message shown when
#' \code{verbose = TRUE}.
#' @param verbose \link{logical} - report useful algorithm information
#' (emitted with \code{\link{message}}).
#' @return candidate pairs - a \code{\link{data.table}} with 3 columns:
#' index1, index2, N - index of the first candidate, index of the second
#' candidate, and the number of bands in which both fall into the same bucket.
#' Only pairs with \code{N > 1} are returned. N is provided only for
#' information (intuition: the bigger N, the bigger the similarity).
#'
#'
#' @examples
#' sets <- lapply(1:10, function(x) sample(letters, sample(5:15, 1)))
#' # add set similar to first set to the end of list
#' sets <- c(sets, list(c(sets[[1]], sample(letters, 5))))
#' sm <- get_signature_matrix(sets, 12, cores = 4)
#' get_candidate_pairs(sm, 6, 0.9)
get_candidate_pairs <- function(signature_matrix, bands_number, similarity, verbose = TRUE) {
  sm_nrow <- nrow(signature_matrix)
  # nrow() is NULL for plain vectors and 0 for empty matrices; both used to
  # surface later as obscure errors (1:0 yields c(1, 0), not an empty range).
  if (is.null(sm_nrow) || sm_nrow < 1L) {
    stop("signature_matrix must have at least one row", call. = FALSE)
  }
  if (length(bands_number) != 1L || is.na(bands_number) || bands_number < 1) {
    stop("bands_number must be a single positive number", call. = FALSE)
  }
  if (sm_nrow %% bands_number != 0) {
    stop("number of bands should be divisor of number of rows of signature matrix: 0 == nrow(signature_matrix) %% bands_number")
  }

  if (verbose) {
    rows_per_band <- sm_nrow / bands_number
    # Banding estimate: probability that two sets with the given similarity
    # agree on every row of at least one band.
    prob_become_candidate <- 1 - (1 - similarity ^ rows_per_band) ^ bands_number
    message(paste('Looking for sets with similarity',
                  round(similarity, 2),
                  'with probablity of becoming candidate pair =',
                  prob_become_candidate))
  }

  # calculate band borders (first row, last row) for splitting signature matrix
  band_bounds <- split_vector(x = seq_len(sm_nrow), splits = bands_number)

  # one vector of bucket values (one value per set) for every band
  buckets <- lapply(band_bounds, hash_bucket, signature_matrix = signature_matrix)

  # pairs of sets sharing a bucket, computed band by band and then stacked
  candidate_pairs <- lapply(buckets, detect_buckets)
  dt <- rbindlist(candidate_pairs)

  # NOTE(review): the probability reported above describes sharing at least
  # one band, while this filter keeps only pairs that collide in two or more
  # bands - confirm that the stricter threshold is intended.
  dt[, .N, keyby = c('index1', 'index2')][N > 1]
}

# List every pair of positions in `bucket` that hold the same value.
# `bucket` is a vector of bucket values; the result is a data.table with
# columns index1 and index2 (index1 < index2), one row per matching pair.
detect_buckets <- function(bucket) {
  members <- data.table(index = seq_along(bucket), value = bucket, key = "value")

  # bucket values that occur more than once
  crowded <- members[, .N, keyby = value][N > 1]
  members <- members[crowded]

  # self-join on the bucket value enumerates all ordered pairs inside a bucket;
  # index1 < index2 then drops self-pairs and mirrored duplicates
  pairings <- members[members,
                      list(index1 = index, index2 = i.index),
                      allow.cartesian = TRUE]
  pairings[index1 < index2]
}

# Compute one bucket value per column of `signature_matrix` for a single band.
# `row_index_bounds` holds the first and the last row of the band.
hash_bucket <- function(row_index_bounds, signature_matrix) {
  first_row <- row_index_bounds[[1]]
  last_row <- row_index_bounds[[2]]
  band <- signature_matrix[first_row:last_row, , drop = FALSE]
  # A plain column sum serves as the hash of the band's signature chunk.
  # A stronger hash for integer sequences can (and should) replace it.
  colSums(band)
}
40 changes: 40 additions & 0 deletions R/minhash.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# Build the minhash signature matrix.
#
# hash_matrix    matrix of hash values; its rows are selected with the entries
#                of TDM and the minimum of every column is taken.
# TDM            list; element j holds the row numbers of hash_matrix that
#                belong to set j.
# hashfun_number number of rows of the result
#                (presumably equal to ncol(hash_matrix) - TODO confirm).
#
# Returns a numeric matrix with hashfun_number rows and length(TDM) columns;
# column j holds the column-wise minima of the hash_matrix rows of set j.
# Cells start as Inf, so the matrix is of type double.
#
# NOTE(review): matrixStats is called here but is not listed under Imports in
# DESCRIPTION - it should be declared there.
minhashing <- function(hash_matrix, TDM, hashfun_number) {
  sets_count <- length(TDM)
  m_sig <- matrix(Inf, nrow = hashfun_number, ncol = sets_count)
  # seq_along() makes an empty TDM a no-op; 1:length(TDM) would iterate over
  # c(1, 0) and fail on TDM[[1]].
  for (set_id in seq_along(TDM)) {
    set_rows <- TDM[[set_id]]
    set_hashes <- hash_matrix[set_rows, , drop = FALSE]
    m_sig[, set_id] <- pmin.int(m_sig[, set_id], matrixStats::colMins(set_hashes))
  }
  m_sig
}

#' @export
#' @name get_signature_matrix
#' @title Calculating signature matrix using minhashing technique.
#'
#' @param sets input sets in a form of \code{\link{list}} of \code{\link{vector}}s
#' @param hashfun_number number of hash functions to calculate signature matrix.
#' @param cores number of CPU threads to use while calculating hashes
#'
#' @return The signature matrix - a \code{\link{numeric}} \code{\link{matrix}}
#' with \code{hashfun_number} rows and \code{length(sets)} columns.
#'
#' @examples
#' sets <- lapply(1:10, function(x) sample(letters, sample(5:15, 1)))
#' sm <- get_signature_matrix(sets, 10, cores = 4)
get_signature_matrix <- function (sets, hashfun_number, cores = parallel::detectCores()) {
  # Dictionary of all distinct elements. Base calls are used instead of the
  # magrittr pipe because NAMESPACE does not import `%>%`.
  shingles <- unique(unlist(sets))
  unique_shingles_length <- length(shingles)
  # make sparse term-document matrix : rows = elements of set, cols = set's ids
  # values = [TRUE,FALSE] - whether given set contains given element of set
  # we store the matrix as a list of vectors, keeping only the TRUE values:
  # each list element contains the row numbers where the column is TRUE
  TDM <- lapply(sets, function(set) fastmatch::fmatch(x = set, table = shingles))
  # calculate hashes for each hash function and each row number
  hash_matrix <- get_hash_matrix(unique_shingles_length = unique_shingles_length,
                                 hashfun_number = hashfun_number,
                                 cores = cores)
  minhashing(hash_matrix, TDM, hashfun_number)
}
14 changes: 14 additions & 0 deletions R/utils.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Split the index range of `x` into `splits` consecutive chunks.
#
# x      vector or list whose positions 1..length(x) are partitioned.
# splits number of chunks; a single positive number.
#
# Returns a list with one element per chunk, each a numeric vector
# c(lower, upper) holding the first and the last position of the chunk.
# If `x` is shorter than `splits`, a warning is raised and a single chunk
# c(1, length(x)) is returned.
split_vector <- function(x, splits) {
  if (!is.vector(x)) stop("x must be vector or list")
  if (length(splits) != 1L || is.na(splits) || splits < 1) {
    stop("splits must be a single positive number", call. = FALSE)
  }
  n <- length(x)
  if (n < splits) {
    warning("Length of input is too small for splitting for a given number of splits. Assuming no splits.")
    return(list(c(1, n)))
  }
  # splits + 1 evenly spaced knots over 1..(n + 1); chunk i spans
  # knots[i] .. knots[i + 1] - 1
  knots <- ceiling(seq.int(from = 1, to = n + 1, length.out = splits + 1))
  Map(function(lower, upper) c(lower, upper),
      knots[-length(knots)],
      knots[-1] - 1)
}

# Jaccard similarity of two sets given as vectors:
# size of the intersection divided by size of the union.
jaccard <- function(x, y) {
  shared <- intersect(x, y)
  combined <- union(x, y)
  length(shared) / length(combined)
}
11 changes: 11 additions & 0 deletions man/LSHR.Rd
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
% Generated by roxygen2 (4.1.1): do not edit by hand
% Please edit documentation in R/LSHR.R
\docType{package}
\name{LSHR}
\alias{LSHR}
\alias{LSHR-package}
\title{LSHR: Locality Sensitive Hashing in R}
\description{
LSHR: Locality Sensitive Hashing in R
}

37 changes: 37 additions & 0 deletions man/get_candidate_pairs.Rd
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
% Generated by roxygen2 (4.1.1): do not edit by hand
% Please edit documentation in R/lsh.R
\name{get_candidate_pairs}
\alias{get_candidate_pairs}
\title{Calculating candidate pairs using locality sensitive hashing.}
\usage{
get_candidate_pairs(signature_matrix, bands_number, similarity,
verbose = TRUE)
}
\arguments{
\item{signature_matrix}{input signature matrix - \code{\link{integer}} \code{\link{matrix}}}

\item{bands_number}{number of bands for LSH algorithm.}

\item{similarity}{target value of Jaccard similarity we are looking for.}

\item{verbose}{- \link{logical} print useful algorithm information.}
}
\value{
pairs of candidates with similarity >= \code{similarity} -
\code{\link{data.table}} with 3 columns: index1, index2, N -
index of first candidate, index of second candidate,
and number of buckets where they share the same value. The latter is provided
only for information.
(Intuition is the following: the bigger N, the bigger the similarity)
}
\description{
Calculating candidate pairs using locality sensitive hashing.
}
\examples{
sets <- lapply(1:10, function(x) sample(letters, sample(5:15)))
# add set similar to first set to the end of list
sets <- c(sets, list(c(sets[[1]], sample(letters, 5))))
sm <- get_signature_matrix(sets, 12, cores = 4)
get_candidate_pairs(sm, 6, 0.9)
}

26 changes: 26 additions & 0 deletions man/get_signature_matrix.Rd
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
% Generated by roxygen2 (4.1.1): do not edit by hand
% Please edit documentation in R/minhash.R
\name{get_signature_matrix}
\alias{get_signature_matrix}
\title{Calculating signature matrix using minhashing technique.}
\usage{
get_signature_matrix(sets, hashfun_number, cores = parallel::detectCores())
}
\arguments{
\item{sets}{input sets in a form of \code{\link{list}} of \code{\link{vector}}s}

\item{hashfun_number}{number of hash functions to calculate signature matrix.}

\item{cores}{number of CPU threads to use while calculating hashes}
}
\value{
The signature matrix - \code{\link{numeric}} \code{\link{matrix}} with dimension \code{hashfun_number * length(sets)}
}
\description{
Calculating signature matrix using minhashing technique.
}
\examples{
sets <- lapply(1:10, function(x) sample(letters, sample(5:15)))
sm <- get_signature_matrix(sets, 10, cores = 4)
}

24 changes: 0 additions & 24 deletions src/RcppExports.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,30 +5,6 @@

using namespace Rcpp;

// hashfun_1
// C entry point callable from R for hashfun_1(): converts the two SEXP
// arguments to an IntegerVector and an int, calls hashfun_1 and returns the
// result wrapped back into a SEXP.
// NOTE(review): this looks like glue emitted by Rcpp::compileAttributes;
// if so, it should be regenerated rather than edited by hand - confirm.
Rcpp::IntegerVector hashfun_1(IntegerVector vec, int cores);
RcppExport SEXP LSHR_hashfun_1(SEXP vecSEXP, SEXP coresSEXP) {
// BEGIN_RCPP / END_RCPP are Rcpp macros that bracket the body; presumably they
// turn C++ exceptions into R errors - see the Rcpp documentation.
BEGIN_RCPP
Rcpp::RObject __result;
// presumably keeps R's random number generator state in sync for the
// duration of the call - verify against the Rcpp documentation
Rcpp::RNGScope __rngScope;
Rcpp::traits::input_parameter< IntegerVector >::type vec(vecSEXP);
Rcpp::traits::input_parameter< int >::type cores(coresSEXP);
__result = Rcpp::wrap(hashfun_1(vec, cores));
return __result;
END_RCPP
}
// hashfun_2
// C entry point callable from R for hashfun_2(): converts the two SEXP
// arguments to an IntegerVector and an int, calls hashfun_2 and returns the
// result wrapped back into a SEXP.
// NOTE(review): this looks like glue emitted by Rcpp::compileAttributes;
// if so, it should be regenerated rather than edited by hand - confirm.
Rcpp::IntegerVector hashfun_2(IntegerVector vec, int cores);
RcppExport SEXP LSHR_hashfun_2(SEXP vecSEXP, SEXP coresSEXP) {
// BEGIN_RCPP / END_RCPP are Rcpp macros that bracket the body; presumably they
// turn C++ exceptions into R errors - see the Rcpp documentation.
BEGIN_RCPP
Rcpp::RObject __result;
// presumably keeps R's random number generator state in sync for the
// duration of the call - verify against the Rcpp documentation
Rcpp::RNGScope __rngScope;
Rcpp::traits::input_parameter< IntegerVector >::type vec(vecSEXP);
Rcpp::traits::input_parameter< int >::type cores(coresSEXP);
__result = Rcpp::wrap(hashfun_2(vec, cores));
return __result;
END_RCPP
}
// get_hash_matrix
IntegerMatrix get_hash_matrix(int unique_shingles_length, int hashfun_number, int cores);
RcppExport SEXP LSHR_get_hash_matrix(SEXP unique_shingles_lengthSEXP, SEXP hashfun_numberSEXP, SEXP coresSEXP) {
Expand Down

0 comments on commit 2c3f2d6

Please sign in to comment.