Do not test spookyhash on big endian platforms (#206)

* Do not test spookyhash on big endian platforms * Do not use a header file named endian.h, embed declarations * Refine spookyhash and endianness checks
eddelbuettel · Jun 23, 2024 · 26ab7a6 · 26ab7a6
1 parent a8aeaa9
commit 26ab7a6
Show file tree

Hide file tree

Showing 6 changed files with 144 additions and 96 deletions.
diff --git a/ChangeLog b/ChangeLog
@@ -1,3 +1,23 @@
+2024-06-23  Dirk Eddelbuettel  <edd@debian.org>
+
+	* DESCRIPTION (Version, Date): Roll micro version and date
+
+	* src/digest.c (is_little_endian): Define alternate helper too
+	* man/digest.Rd: Compare to spookyhash ref only on little endian
+
+2024-06-22  Dirk Eddelbuettel  <edd@debian.org>
+
+	* src/digest.c (is_big_endian): Use endian-ness definition from
+	Rconfig.h and define one-line helper
+
+	* inst/tinytest/test_digest.R: Skip spookyhash test on big endian
+
+2024-06-21  Sergey Fedorov  <vital.had@gmail.com>
+
+	* src/blake3.c: Upstream patch for big endian systems
+	* src/blake3_impl.h: Idem
+	* src/blake3_portable.c: Idem
+
 2024-06-15  Dirk Eddelbuettel  <edd@debian.org>
 
 	* .github/workflows/ci.yaml (jobs): Update to r-ci-setup action

diff --git a/DESCRIPTION b/DESCRIPTION
@@ -6,8 +6,8 @@ Author: Dirk Eddelbuettel <edd@debian.org> with contributions
  Michel Lang, Viliam Simko, Kurt Hornik, Radford Neal, Kendon Bell,
  Matthew de Queljoe, Ion Suruceanu, Bill Denney, Dirk Schumacher,
  Winston Chang, Dean Attali, and Michael Chirico.
-Version: 0.6.35.2
-Date: 2024-05-16
+Version: 0.6.35.3
+Date: 2024-06-23
 Maintainer: Dirk Eddelbuettel <edd@debian.org>
 Title: Create Compact Hash Digests of R Objects
 Description: Implementation of a function 'digest()' for the creation of hash

diff --git a/NAMESPACE b/NAMESPACE
@@ -1,5 +1,5 @@
 ## package has a dynamic library
-useDynLib(digest, digest_impl=digest, vdigest_impl=vdigest, digest2int_impl=digest2int, AESinit, AESencryptECB, AESdecryptECB, spookydigest_impl, .registration=TRUE)
+useDynLib(digest, digest_impl=digest, vdigest_impl=vdigest, digest2int_impl=digest2int, AESinit, AESencryptECB, AESdecryptECB, spookydigest_impl, is_little_endian, is_big_endian, .registration=TRUE)
 
 importFrom(utils, packageVersion)
 

diff --git a/inst/tinytest/test_digest.R b/inst/tinytest/test_digest.R
@@ -208,100 +208,97 @@ expect_identical(murmur32(murmur32Input, serialize = FALSE), murmur32Output)
 
 ## tests for digest spooky
 
-expect_true(require(digest))
-
-## test vectors (originally for md5)
-spookyInput <-
-  c("",
-    "a",
-    "abc",
-    "message digest",
-    "abcdefghijklmnopqrstuvwxyz",
-    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789",
-    paste("12345678901234567890123456789012345678901234567890123456789012",
-          "345678901234567890", sep=""))
-
-# from spooky import hash128
-# from binascii import hexlify
-#
-# spookyInput = [
-#     "",
-#       "a",
-#       "abc",
-#       "message digest",
-#       "abcdefghijklmnopqrstuvwxyz",
-#       "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789",
-#       "12345678901234567890123456789012345678901234567890123456789012345678901234567890"
-#     ]
-#
-# for s in spookyInput:
-#     hexlify(hash128(s).to_bytes(16, 'little')).decode()
-#
-# '1909f56bfc062723c751e8b465ee728b'
-# 'bdc9bba09181101a922a4161f0584275'
-# '67c93775f715ab8ab01178caf86713c6'
-# '9630c2a55c0987a0db44434f9d67a192'
-# '5172de938ce149a98f4d06d3c3168ffe'
-# 'b5b3b2d0f08b58aa07f551895f929f81'
-# '3621ec01112dafa1610a4bd23041966b'
-
-spookyOutputPython <-
-  c(
-    '1909f56bfc062723c751e8b465ee728b',
-    'bdc9bba09181101a922a4161f0584275',
-    '67c93775f715ab8ab01178caf86713c6',
-    '9630c2a55c0987a0db44434f9d67a192',
-    '5172de938ce149a98f4d06d3c3168ffe',
-    'b5b3b2d0f08b58aa07f551895f929f81',
-    '3621ec01112dafa1610a4bd23041966b'
-  )
-
-## spooky raw output test
-for (i in seq(along.with=spookyInput)) {
-  # skip = 30 skips the entire serialization header for a length 1 character vector
-  # this is equivalent to raw = TRUE and matches the python spooky implementation for those vectors
-  spooky <- digest(spookyInput[i], algo = "spookyhash", skip = 30)
-  expect_true(identical(spooky, spookyOutputPython[i]))
-  #cat(spooky, "\n")
-}
-
-expect_identical(
-  getVDigest(algo = 'spookyhash')(spookyInput, skip = 30),
-  spookyOutputPython
-)
-
-## some extras to get coverage up - these aren't tested against reference output,
-## just output from R 3.6.0
-spookyInput <- c("a", "aaaaaaaaa", "aaaaaaaaaaaaa")
-spookyOutput <- c(
-  "b7a3573ba6139dfdc52db30acba87f46",
-  "fd876ecaa5d1e442600333118f223e02",
-  "91848873bf91d06ad321bbd47400a556"
-)
-for (i in seq(along.with=spookyInput)) {
-  spooky <- digest(spookyInput[i], algo = "spookyhash")
-  expect_true(identical(spooky, spookyOutput[i]))
-  #cat(spooky, "\n")
-}
-
-expect_identical(
-  getVDigest(algo = 'spookyhash')(spookyInput),
-  spookyOutput
-)
-
-# test a bigger object
-spooky <- digest(iris, algo = "spookyhash")
-expect_true(identical(spooky, "af58add8b4f7044582b331083bc239ff"))
-expect_identical(getVDigest('spookyhash')(list(iris)),
-                 "af58add8b4f7044582b331083bc239ff")
-#cat(spooky, "\n")
+## Per PR 205, see comment in https://github.com/facebook/folly/blob/4c603f8c2add8d0228de0e073c5ae3ce9b02b6f3/folly/hash/SpookyHashV2.h#L35-L36
+## Values ought to be sensible on big endian too but different from little endian reference
+## so we do not test on big endian
+if (isTRUE(.Call(digest:::is_little_endian))) {
+
+    ## test vectors (originally for md5)
+    spookyInput <- c("",
+                     "a",
+                     "abc",
+                     "message digest",
+                     "abcdefghijklmnopqrstuvwxyz",
+                     "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789",
+                     paste("12345678901234567890123456789012345678901234567890123456789012",
+                           "345678901234567890", sep=""))
+
+    # from spooky import hash128
+    # from binascii import hexlify
+    #
+    # spookyInput = [
+    #     "",
+    #       "a",
+    #       "abc",
+    #       "message digest",
+    #       "abcdefghijklmnopqrstuvwxyz",
+    #       "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789",
+    #       "12345678901234567890123456789012345678901234567890123456789012345678901234567890"
+    #     ]
+    #
+    # for s in spookyInput:
+    #     hexlify(hash128(s).to_bytes(16, 'little')).decode()
+    #
+    # '1909f56bfc062723c751e8b465ee728b'
+    # 'bdc9bba09181101a922a4161f0584275'
+    # '67c93775f715ab8ab01178caf86713c6'
+    # '9630c2a55c0987a0db44434f9d67a192'
+    # '5172de938ce149a98f4d06d3c3168ffe'
+    # 'b5b3b2d0f08b58aa07f551895f929f81'
+    # '3621ec01112dafa1610a4bd23041966b'
+
+    spookyOutputPython <-   c('1909f56bfc062723c751e8b465ee728b',
+                              'bdc9bba09181101a922a4161f0584275',
+                              '67c93775f715ab8ab01178caf86713c6',
+                              '9630c2a55c0987a0db44434f9d67a192',
+                              '5172de938ce149a98f4d06d3c3168ffe',
+                              'b5b3b2d0f08b58aa07f551895f929f81',
+                              '3621ec01112dafa1610a4bd23041966b')
+
+    ## spooky raw output test
+    for (i in seq(along.with=spookyInput)) {
+      # skip = 30 skips the entire serialization header for a length 1 character vector
+      # this is equivalent to raw = TRUE and matches the python spooky implementation for those vectors
+      spooky <- digest(spookyInput[i], algo = "spookyhash", skip = 30)
+      expect_true(identical(spooky, spookyOutputPython[i]))
+      #cat(spooky, "\n")
+    }
+
+    expect_identical(
+        getVDigest(algo = 'spookyhash')(spookyInput, skip = 30),
+        spookyOutputPython
+    )
 
-# test error message
-#error.message <- try(digest(spookyInput[i], algo = "spookyhash", serialize = FALSE))
-#expect_true(
-#  grepl("spookyhash algorithm is not available without serialization.", error.message)
-#)
+    ## some extras to get coverage up - these aren't tested against reference output,
+    ## just output from R 3.6.0
+    spookyInput <- c("a", "aaaaaaaaa", "aaaaaaaaaaaaa")
+    spookyOutput <- c("b7a3573ba6139dfdc52db30acba87f46",
+                      "fd876ecaa5d1e442600333118f223e02",
+                      "91848873bf91d06ad321bbd47400a556")
+    for (i in seq(along.with=spookyInput)) {
+        spooky <- digest(spookyInput[i], algo = "spookyhash")
+        expect_true(identical(spooky, spookyOutput[i]))
+        ##cat(spooky, "\n")
+    }
+
+    expect_identical(
+        getVDigest(algo = 'spookyhash')(spookyInput),
+        spookyOutput
+    )
 
+    ## test a bigger object
+    spooky <- digest(iris, algo = "spookyhash")
+    expect_true(identical(spooky, "af58add8b4f7044582b331083bc239ff"))
+    expect_identical(getVDigest('spookyhash')(list(iris)),
+                     "af58add8b4f7044582b331083bc239ff")
+    ##cat(spooky, "\n")
+
+    # test error message
+    #error.message <- try(digest(spookyInput[i], algo = "spookyhash", serialize = FALSE))
+    #expect_true(
+    #  grepl("spookyhash algorithm is not available without serialization.", error.message)
+    #)
+}
 
 ## Ensure that all values of algo are actually allowed (in case a new one is
 ## added in the future). The call to match.arg() passes choices explicitly

diff --git a/man/digest.Rd b/man/digest.Rd
@@ -304,7 +304,9 @@ for (i in seq(along=spookyInput)) {
     # skip = 30 skips the serialization header and just hashes the strings
     spooky <- digest(spookyInput[i], algo="spookyhash", skip = 30)
     cat(spooky, "\n")
-    stopifnot(identical(spooky, spookyOutput[i]))
+    ## we can only compare to reference output on little-endian systems
+    if (isTRUE(.Call(digest:::is_little_endian)))
+        stopifnot(identical(spooky, spookyOutput[i]))
 }
 
 ## blake3 example

diff --git a/src/digest.c b/src/digest.c
@@ -620,3 +620,32 @@ SEXP vdigest(SEXP Txt, SEXP Algo, SEXP Length, SEXP Skip, SEXP Leave_raw, SEXP S
     UNPROTECT(1);
     return ans;
 }
+
+
+// Also already used in sha2.h
+//
+// We can rely on WORDS_BIGENDIAN only being defined on big endian systems thanks to Rconfig.h.
+//
+// A number of other #define based tests are in other source files here for different hash
+// algorithm implementations notably crc32c, pmurhash, sha2 and xxhash
+//
+// A small and elegant test is also in package qs based on https://stackoverflow.com/a/1001373
+
+// edd 02 Dec 2013  use Rconfig.h to define BYTE_ORDER, unless already defined
+#ifndef BYTE_ORDER
+    // see sha2.c comments, and on the internet at large
+    #define LITTLE_ENDIAN 1234
+    #define BIG_ENDIAN    4321
+#ifdef WORDS_BIGENDIAN
+    #define BYTE_ORDER  BIG_ENDIAN
+#else
+    #define BYTE_ORDER  LITTLE_ENDIAN
+#endif
+#endif
+
+SEXP is_big_endian() {
+    return Rf_ScalarLogical(BYTE_ORDER == BIG_ENDIAN);
+}
+SEXP is_little_endian() {
+    return Rf_ScalarLogical(BYTE_ORDER == LITTLE_ENDIAN);
+}