Skip to content

Commit

Permalink
Do not test spookyhash on big endian platforms (#206)
Browse files Browse the repository at this point in the history
* Do not test spookyhash on big endian platforms

* Do not use a header file named endian.h, embed declarations

* Refine spookyhash and endianness checks
  • Loading branch information
eddelbuettel committed Jun 23, 2024
1 parent a8aeaa9 commit 26ab7a6
Show file tree
Hide file tree
Showing 6 changed files with 144 additions and 96 deletions.
20 changes: 20 additions & 0 deletions ChangeLog
Original file line number Diff line number Diff line change
@@ -1,3 +1,23 @@
2024-06-23 Dirk Eddelbuettel <edd@debian.org>

* DESCRIPTION (Version, Date): Roll micro version and date

* src/digest.c (is_little_endian): Define alternate helper too
* man/digest.Rd: Compare to spookyhash ref only on little endian

2024-06-22 Dirk Eddelbuettel <edd@debian.org>

* src/digest.c (is_big_endian): Use endian-ness definition from
Rconfig.h and define one-line helper

* inst/tinytest/test_digest.R: Skip spookyhash test on big endian

2024-06-21 Sergey Fedorov <vital.had@gmail.com>

* src/blake3.c: Upstream patch for big endian systems
* src/blake3_impl.h: Idem
* src/blake3_portable.c: Idem

2024-06-15 Dirk Eddelbuettel <edd@debian.org>

* .github/workflows/ci.yaml (jobs): Update to r-ci-setup action
Expand Down
4 changes: 2 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@ Author: Dirk Eddelbuettel <edd@debian.org> with contributions
Michel Lang, Viliam Simko, Kurt Hornik, Radford Neal, Kendon Bell,
Matthew de Queljoe, Ion Suruceanu, Bill Denney, Dirk Schumacher,
Winston Chang, Dean Attali, and Michael Chirico.
Version: 0.6.35.2
Date: 2024-05-16
Version: 0.6.35.3
Date: 2024-06-23
Maintainer: Dirk Eddelbuettel <edd@debian.org>
Title: Create Compact Hash Digests of R Objects
Description: Implementation of a function 'digest()' for the creation of hash
Expand Down
2 changes: 1 addition & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
## package has a dynamic library
useDynLib(digest, digest_impl=digest, vdigest_impl=vdigest, digest2int_impl=digest2int, AESinit, AESencryptECB, AESdecryptECB, spookydigest_impl, .registration=TRUE)
useDynLib(digest, digest_impl=digest, vdigest_impl=vdigest, digest2int_impl=digest2int, AESinit, AESencryptECB, AESdecryptECB, spookydigest_impl, is_little_endian, is_big_endian, .registration=TRUE)

importFrom(utils, packageVersion)

Expand Down
181 changes: 89 additions & 92 deletions inst/tinytest/test_digest.R
Original file line number Diff line number Diff line change
Expand Up @@ -208,100 +208,97 @@ expect_identical(murmur32(murmur32Input, serialize = FALSE), murmur32Output)

## tests for digest spooky

## the package has to load before any of the spookyhash checks below can run
expect_true(require(digest))

## test vectors (originally for md5)
spookyInput <-
    c("",
      "a",
      "abc",
      "message digest",
      "abcdefghijklmnopqrstuvwxyz",
      "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789",
      paste0("12345678901234567890123456789012345678901234567890123456789012",
             "345678901234567890"))

## The reference digests below were produced with this Python snippet:
##
# from spooky import hash128
# from binascii import hexlify
#
# spookyInput = [
#     "",
#     "a",
#     "abc",
#     "message digest",
#     "abcdefghijklmnopqrstuvwxyz",
#     "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789",
#     "12345678901234567890123456789012345678901234567890123456789012345678901234567890"
# ]
#
# for s in spookyInput:
#     hexlify(hash128(s).to_bytes(16, 'little')).decode()
#
# '1909f56bfc062723c751e8b465ee728b'
# 'bdc9bba09181101a922a4161f0584275'
# '67c93775f715ab8ab01178caf86713c6'
# '9630c2a55c0987a0db44434f9d67a192'
# '5172de938ce149a98f4d06d3c3168ffe'
# 'b5b3b2d0f08b58aa07f551895f929f81'
# '3621ec01112dafa1610a4bd23041966b'

spookyOutputPython <-
    c(
        '1909f56bfc062723c751e8b465ee728b',
        'bdc9bba09181101a922a4161f0584275',
        '67c93775f715ab8ab01178caf86713c6',
        '9630c2a55c0987a0db44434f9d67a192',
        '5172de938ce149a98f4d06d3c3168ffe',
        'b5b3b2d0f08b58aa07f551895f929f81',
        '3621ec01112dafa1610a4bd23041966b'
    )

## spooky raw output test: one digest() call per input string
for (i in seq_along(spookyInput)) {
    # skip = 30 skips the entire serialization header for a length 1 character vector
    # this is equivalent to raw = TRUE and matches the python spooky implementation for those vectors
    spooky <- digest(spookyInput[i], algo = "spookyhash", skip = 30)
    ## expect_identical() reports the mismatch on failure, unlike expect_true(identical())
    expect_identical(spooky, spookyOutputPython[i])
    #cat(spooky, "\n")
}

## the vectorised interface has to agree with the element-wise results
expect_identical(
    getVDigest(algo = 'spookyhash')(spookyInput, skip = 30),
    spookyOutputPython
)

## some extras to get coverage up - these aren't tested against reference output,
## just output from R 3.6.0
spookyInput <- c("a", "aaaaaaaaa", "aaaaaaaaaaaaa")
spookyOutput <- c(
    "b7a3573ba6139dfdc52db30acba87f46",
    "fd876ecaa5d1e442600333118f223e02",
    "91848873bf91d06ad321bbd47400a556"
)
for (i in seq_along(spookyInput)) {
    spooky <- digest(spookyInput[i], algo = "spookyhash")
    expect_identical(spooky, spookyOutput[i])
    #cat(spooky, "\n")
}

expect_identical(
    getVDigest(algo = 'spookyhash')(spookyInput),
    spookyOutput
)

# test a bigger object
spooky <- digest(iris, algo = "spookyhash")
expect_identical(spooky, "af58add8b4f7044582b331083bc239ff")
expect_identical(getVDigest('spookyhash')(list(iris)),
                 "af58add8b4f7044582b331083bc239ff")
#cat(spooky, "\n")
## Per PR 205, see comment in https://github.com/facebook/folly/blob/4c603f8c2add8d0228de0e073c5ae3ce9b02b6f3/folly/hash/SpookyHashV2.h#L35-L36
## Values ought to be sensible on big endian too but will differ from the little endian
## reference values used below, so the spookyhash checks only run on little endian.
if (isTRUE(.Call(digest:::is_little_endian))) {

    ## test vectors (originally for md5)
    spookyInput <- c("",
                     "a",
                     "abc",
                     "message digest",
                     "abcdefghijklmnopqrstuvwxyz",
                     "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789",
                     paste0("12345678901234567890123456789012345678901234567890123456789012",
                            "345678901234567890"))

    ## The reference digests below were produced with this Python snippet:
    ##
    # from spooky import hash128
    # from binascii import hexlify
    #
    # spookyInput = [
    #     "",
    #     "a",
    #     "abc",
    #     "message digest",
    #     "abcdefghijklmnopqrstuvwxyz",
    #     "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789",
    #     "12345678901234567890123456789012345678901234567890123456789012345678901234567890"
    # ]
    #
    # for s in spookyInput:
    #     hexlify(hash128(s).to_bytes(16, 'little')).decode()
    #
    # '1909f56bfc062723c751e8b465ee728b'
    # 'bdc9bba09181101a922a4161f0584275'
    # '67c93775f715ab8ab01178caf86713c6'
    # '9630c2a55c0987a0db44434f9d67a192'
    # '5172de938ce149a98f4d06d3c3168ffe'
    # 'b5b3b2d0f08b58aa07f551895f929f81'
    # '3621ec01112dafa1610a4bd23041966b'

    spookyOutputPython <- c('1909f56bfc062723c751e8b465ee728b',
                            'bdc9bba09181101a922a4161f0584275',
                            '67c93775f715ab8ab01178caf86713c6',
                            '9630c2a55c0987a0db44434f9d67a192',
                            '5172de938ce149a98f4d06d3c3168ffe',
                            'b5b3b2d0f08b58aa07f551895f929f81',
                            '3621ec01112dafa1610a4bd23041966b')

    ## spooky raw output test: one digest() call per input string
    for (i in seq_along(spookyInput)) {
        # skip = 30 skips the entire serialization header for a length 1 character vector
        # this is equivalent to raw = TRUE and matches the python spooky implementation for those vectors
        spooky <- digest(spookyInput[i], algo = "spookyhash", skip = 30)
        ## expect_identical() reports the mismatch on failure, unlike expect_true(identical())
        expect_identical(spooky, spookyOutputPython[i])
        #cat(spooky, "\n")
    }

    ## the vectorised interface has to agree with the element-wise results
    expect_identical(
        getVDigest(algo = 'spookyhash')(spookyInput, skip = 30),
        spookyOutputPython
    )

    ## some extras to get coverage up - these aren't tested against reference output,
    ## just output from R 3.6.0
    spookyInput <- c("a", "aaaaaaaaa", "aaaaaaaaaaaaa")
    spookyOutput <- c("b7a3573ba6139dfdc52db30acba87f46",
                      "fd876ecaa5d1e442600333118f223e02",
                      "91848873bf91d06ad321bbd47400a556")
    for (i in seq_along(spookyInput)) {
        spooky <- digest(spookyInput[i], algo = "spookyhash")
        expect_identical(spooky, spookyOutput[i])
        ##cat(spooky, "\n")
    }

    expect_identical(
        getVDigest(algo = 'spookyhash')(spookyInput),
        spookyOutput
    )

    ## test a bigger object
    spooky <- digest(iris, algo = "spookyhash")
    expect_identical(spooky, "af58add8b4f7044582b331083bc239ff")
    expect_identical(getVDigest('spookyhash')(list(iris)),
                     "af58add8b4f7044582b331083bc239ff")
    ##cat(spooky, "\n")

    # test error message
    #error.message <- try(digest(spookyInput[i], algo = "spookyhash", serialize = FALSE))
    #expect_true(
    #    grepl("spookyhash algorithm is not available without serialization.", error.message)
    #)
}

## Ensure that all values of algo are actually allowed (in case a new one is
## added in the future). The call to match.arg() passes choices explicitly
Expand Down
4 changes: 3 additions & 1 deletion man/digest.Rd
Original file line number Diff line number Diff line change
Expand Up @@ -304,7 +304,9 @@ for (i in seq(along=spookyInput)) {
# skip = 30 skips the serialization header and just hashes the strings
spooky <- digest(spookyInput[i], algo="spookyhash", skip = 30)
cat(spooky, "\n")
stopifnot(identical(spooky, spookyOutput[i]))
## we can only compare to reference output on little-endian systems
if (isTRUE(.Call(digest:::is_little_endian)))
stopifnot(identical(spooky, spookyOutput[i]))
}
## blake3 example
Expand Down
29 changes: 29 additions & 0 deletions src/digest.c
Original file line number Diff line number Diff line change
Expand Up @@ -620,3 +620,32 @@ SEXP vdigest(SEXP Txt, SEXP Algo, SEXP Length, SEXP Skip, SEXP Leave_raw, SEXP S
UNPROTECT(1);
return ans;
}


// Endianness detection; the same BYTE_ORDER convention is also already used in sha2.h
//
// We can rely on WORDS_BIGENDIAN being defined only on big endian systems thanks to Rconfig.h.
//
// A number of other #define-based tests are in other source files here for different hash
// algorithm implementations, notably crc32c, pmurhash, sha2 and xxhash.
//
// A small and elegant test is also in package qs, based on https://stackoverflow.com/a/1001373

// edd 02 Dec 2013 use Rconfig.h to define BYTE_ORDER, unless already defined
// (fallback only: if BYTE_ORDER is already defined, nothing below takes effect)
#ifndef BYTE_ORDER
// see sha2.c comments, and on the internet at large
#define LITTLE_ENDIAN 1234
#define BIG_ENDIAN 4321
#ifdef WORDS_BIGENDIAN
#define BYTE_ORDER BIG_ENDIAN
#else
#define BYTE_ORDER LITTLE_ENDIAN
#endif
#endif

// Report whether the package was compiled for a big endian platform.
//
// Takes no arguments and returns a length-one logical: TRUE when BYTE_ORDER
// equals BIG_ENDIAN, FALSE otherwise. The parameter list is spelled (void) so
// that the definition is a real prototype; an empty () is an old-style
// declaration before C23 and is flagged by -Wstrict-prototypes.
SEXP is_big_endian(void) {
    return Rf_ScalarLogical(BYTE_ORDER == BIG_ENDIAN);
}
// Report whether the package was compiled for a little endian platform.
//
// Takes no arguments and returns a length-one logical: TRUE when BYTE_ORDER
// equals LITTLE_ENDIAN, FALSE otherwise. The parameter list is spelled (void)
// so that the definition is a real prototype; an empty () is an old-style
// declaration before C23 and is flagged by -Wstrict-prototypes.
SEXP is_little_endian(void) {
    return Rf_ScalarLogical(BYTE_ORDER == LITTLE_ENDIAN);
}

0 comments on commit 26ab7a6

Please sign in to comment.