From f5444a32c91e50889d3fc3526fac0b26f8938050 Mon Sep 17 00:00:00 2001
From: Hamish Ivey-Law <hamish.ivey-law@data61.csiro.au>
Date: Mon, 15 Jan 2018 12:03:36 +1100
Subject: [PATCH 01/49] Refactor main C++ function to use "constant" memory
 and avoid new/delete.

---
 _cffi_build/dice_one_against_many.cpp | 44 ++++++++++++---------------
 1 file changed, 20 insertions(+), 24 deletions(-)
diff --git a/_cffi_build/dice_one_against_many.cpp b/_cffi_build/dice_one_against_many.cpp
index 6182585c..9f5a87b9 100644
--- a/_cffi_build/dice_one_against_many.cpp
+++ b/_cffi_build/dice_one_against_many.cpp
@@ -139,14 +139,17 @@ extern "C"
         const uint64_t *comp1 = (const uint64_t *) one;
         const uint64_t *comp2 = (const uint64_t *) many;
 
+        // TODO: Given that k is 10 by default, often 5 in practice,
+        // and probably never ever more than 20 or so, the use of a
+        // priority_queue here is expensive overkill.  Better to just
+        // store the scores in an array and do a linear search every
+        // time.
         std::priority_queue<Node, std::vector<Node>, score_cmp> max_k_scores;
 
         uint32_t count_one = builtin_popcnt_unrolled_errata_manual(comp1);
 
         uint64_t combined[KEYWORDS];
 
-        double *all_scores = new double[n];
-
         uint32_t max_popcnt_delta = 1024;
         if(threshold > 0) {
             max_popcnt_delta = calculate_max_difference(count_one, threshold);
@@ -155,39 +158,32 @@ extern "C"
 
         for (int j = 0; j < n; j++) {
             const uint64_t *current = comp2 + j * KEYWORDS;
+            const uint32_t counts_many_j = counts_many[j];
 
-            if(count_one > counts_many[j]){
-                current_delta = count_one - counts_many[j];
+            if (count_one > counts_many_j) {
+                current_delta = count_one - counts_many_j;
             } else {
-                current_delta = counts_many[j] - count_one;
+                current_delta = counts_many_j - count_one;
             }
 
-            if(current_delta <= max_popcnt_delta){
-                for (unsigned int i = 0 ; i < KEYWORDS; i++ ) {
+            if (current_delta <= max_popcnt_delta) {
+                for (int i = 0; i < (int)KEYWORDS; i++) {
                     combined[i] = current[i] & comp1[i];
                 }
 
                 uint32_t count_curr = builtin_popcnt_unrolled_errata_manual(combined);
 
-                double score = 2 * count_curr / (double) (count_one + counts_many[j]);
-                all_scores[j] = score;
-            } else {
-                // Skipping because popcount difference too large
-                all_scores[j] = -1;
-            }
-        }
-
-        for (int j = 0; j < n; j++) {
-
-            if(all_scores[j] >= threshold) {
-                max_k_scores.push(Node(j, all_scores[j]));
-            }
-
-            if(max_k_scores.size() > k) max_k_scores.pop();
+                // TODO: double precision is overkill for this
+                // problem; just use float.
+                double score = 2 * count_curr / (double) (count_one + counts_many_j);
+                if (score >= threshold) {
+                    max_k_scores.push(Node(j, score));
+                    if (max_k_scores.size() > k)
+                        max_k_scores.pop();
+                }
+            } // else skip because popcount difference too large
         }
 
-        delete[] all_scores;
-
         int i = 0;
         while (!max_k_scores.empty()) {
 

From 5d5337b84256fe4cb7e558800ad152e862a9a326 Mon Sep 17 00:00:00 2001
From: Hamish Ivey-Law <hamish.ivey-law@data61.csiro.au>
Date: Mon, 15 Jan 2018 14:37:09 +1100
Subject: [PATCH 02/49] Implement popcount on (almost) arbitrary length arrays.

---
 _cffi_build/dice_one_against_many.cpp | 57 +++++++++++++++++++++------
 1 file changed, 45 insertions(+), 12 deletions(-)

diff --git a/_cffi_build/dice_one_against_many.cpp b/_cffi_build/dice_one_against_many.cpp
index 9f5a87b9..6b17a8fd 100644
--- a/_cffi_build/dice_one_against_many.cpp
+++ b/_cffi_build/dice_one_against_many.cpp
@@ -24,7 +24,7 @@
 // explicitly loading the contents of buf into registers and using
 // these same registers for the intermediate popcnts.
 static inline uint32_t
-builtin_popcnt_unrolled_errata_manual(const uint64_t* buf) {
+builtin_popcnt_unrolled_errata_manual(const uint64_t* buf, int n) {
   uint64_t b0, b1, b2, b3;
   uint64_t c0, c1, c2, c3;
   c0 = c1 = c2 = c3 = 0;
@@ -47,14 +47,48 @@ builtin_popcnt_unrolled_errata_manual(const uint64_t* buf) {
       "+r" (b0), "+r" (b1), "+r" (b2), "+r" (b3));                \
   } while (0)
 
-  LOOP_BODY(0);
-  LOOP_BODY(4);
-  LOOP_BODY(8);
-  LOOP_BODY(12);
+  // Here we assume that 4|n and n <= 16.  This means that n/4 is
+  // either 4, 3, 2 or 1, and these values correspond to the switch
+  // cases, which in turn determine whether we read and popcnt 16, 12,
+  // 8 or 4 elements from buf.  The __attribute__ ((fallthrough));
+  // thingo is to let the compiler know that we are falling through
+  // the switch case statements deliberately (otherwise this elicits
+  // a warning with -Wextra).
+  switch (n >> 2) { //  n/4
+  case 4:
+      LOOP_BODY(12);
+      __attribute__ ((fallthrough));
+  case 3:
+      LOOP_BODY(8);
+      __attribute__ ((fallthrough));
+  case 2:
+      LOOP_BODY(4);
+      __attribute__ ((fallthrough));
+  case 1:
+      LOOP_BODY(0);
+      __attribute__ ((fallthrough));
+  }
 
   return c0 + c1 + c2 + c3;
 }
 
+/**
+ * Bit population count of the n 64-bit words (8n bytes) starting at
+ * buf.  n must be a multiple of 4.  Words are consumed in chunks of
+ * at most 16, the most the unrolled popcount handles in one call.
+ */
+static uint32_t
+popcount_array(const uint64_t *buf, int n) {
+    assert(n % 4 == 0);
+    uint32_t pc = 0;
+    // Advance buf past each full chunk so no word is counted twice.
+    for (; n >= 16; n -= 16, buf += 16)
+        pc += builtin_popcnt_unrolled_errata_manual(buf, 16);
+    if (n > 0)  // remaining 4, 8 or 12 words
+        pc += builtin_popcnt_unrolled_errata_manual(buf, n);
+    return pc;
+}
+
 /**
  * Compute the Dice coefficient similarity measure of two bit patterns.
  */
@@ -65,8 +99,8 @@ dice_coeff_1024(const char *e1, const char *e2) {
 
     uint32_t count_both = 0;
 
-    count_both += builtin_popcnt_unrolled_errata_manual(comp1);
-    count_both += builtin_popcnt_unrolled_errata_manual(comp2);
+    count_both += popcount_array(comp1, KEYWORDS);
+    count_both += popcount_array(comp2, KEYWORDS);
     if(count_both == 0) {
         return 0.0;
     }
@@ -76,7 +110,7 @@ dice_coeff_1024(const char *e1, const char *e2) {
         combined[i] = comp1[i] & comp2[i];
     }
 
-    uint32_t count_and = builtin_popcnt_unrolled_errata_manual(combined);
+    uint32_t count_and = popcount_array(combined, KEYWORDS);
 
     return 2 * count_and / (double)count_both;
 }
@@ -108,7 +142,7 @@ struct score_cmp{
 static void popcount_1024_array(const char *many, int n, uint32_t *counts_many) {
     for (int i = 0; i < n; i++) {
         const uint64_t *sig = (const uint64_t *) many + i * KEYWORDS;
-        counts_many[i] = builtin_popcnt_unrolled_errata_manual(sig);
+        counts_many[i] = popcount_array(sig, KEYWORDS);
     }
 }
 
@@ -146,7 +180,7 @@ extern "C"
         // time.
         std::priority_queue<Node, std::vector<Node>, score_cmp> max_k_scores;
 
-        uint32_t count_one = builtin_popcnt_unrolled_errata_manual(comp1);
+        uint32_t count_one = popcount_array(comp1, KEYWORDS);
 
         uint64_t combined[KEYWORDS];
 
@@ -171,7 +205,7 @@ extern "C"
                     combined[i] = current[i] & comp1[i];
                 }
 
-                uint32_t count_curr = builtin_popcnt_unrolled_errata_manual(combined);
+                uint32_t count_curr = popcount_array(combined, KEYWORDS);
 
                 // TODO: double precision is overkill for this
                 // problem; just use float.
@@ -210,4 +244,3 @@ extern "C"
         return res;
     }
 }
-

From 3864028f84d9b92ab6f790c998b8e6f6632ad258 Mon Sep 17 00:00:00 2001
From: Hamish Ivey-Law <hamish.ivey-law@data61.csiro.au>
Date: Mon, 15 Jan 2018 15:13:11 +1100
Subject: [PATCH 03/49] First pass at integrating arbitrary length keys. Slows
 things down a bit.

---
 _cffi_build/build_matcher.py          |  2 +-
 _cffi_build/dice_one_against_many.cpp | 40 +++++++++++++++++----------
 anonlink/entitymatch.py               |  9 ++++--
 3 files changed, 33 insertions(+), 18 deletions(-)

diff --git a/_cffi_build/build_matcher.py b/_cffi_build/build_matcher.py
index fb7e1161..16a4ed3f 100644
--- a/_cffi_build/build_matcher.py
+++ b/_cffi_build/build_matcher.py
@@ -20,7 +20,7 @@
 
 ffibuilder.cdef("""
     int match_one_against_many_dice(const char * one, const char * many, int n, double * score);
-    int match_one_against_many_dice_1024_k_top(const char *one, const char *many, const uint32_t *counts_many, int n, uint32_t k, double threshold, int *indices, double *scores);
+    int match_one_against_many_dice_k_top(const char *one, const char *many, const uint32_t *counts_many, int n, int keybytes, uint32_t k, double threshold, int *indices, double *scores);
     double dice_coeff_1024(const char *e1, const char *e2);
 """)
 
diff --git a/_cffi_build/dice_one_against_many.cpp b/_cffi_build/dice_one_against_many.cpp
index 6b17a8fd..ca69eb22 100644
--- a/_cffi_build/dice_one_against_many.cpp
+++ b/_cffi_build/dice_one_against_many.cpp
@@ -157,14 +157,16 @@ static uint32_t calculate_max_difference(uint32_t popcnt_a, double threshold)
 extern "C"
 {
     /**
-     * Calculate up to the top k indices and scores.
-     * Returns the number matched above a threshold.
+     * Calculate up to the top k indices and scores.  Returns the
+     * number matched above a threshold or -1 if keybytes is not a
+     * multiple of 32.
      */
-    int match_one_against_many_dice_1024_k_top(
+    int match_one_against_many_dice_k_top(
         const char *one,
         const char *many,
         const uint32_t *counts_many,
         int n,
+        int keybytes,
         uint32_t k,
         double threshold,
         int *indices,
@@ -173,6 +175,13 @@ extern "C"
         const uint64_t *comp1 = (const uint64_t *) one;
         const uint64_t *comp2 = (const uint64_t *) many;
 
+        // keybytes must be divisible by 32, because keywords must be
+        // divisible by 4 for the builtin popcount function to work
+        // and keywords = keybytes / 8.
+        if (keybytes % (4 * WORDBYTES) != 0)  // (keybytes & 31)
+            return -1;
+        int keywords = keybytes / WORDBYTES;
+
         // TODO: Given that k is 10 by default, often 5 in practice,
         // and probably never ever more than 20 or so, the use of a
         // priority_queue here is expensive overkill.  Better to just
@@ -180,19 +189,21 @@ extern "C"
         // time.
         std::priority_queue<Node, std::vector<Node>, score_cmp> max_k_scores;
 
-        uint32_t count_one = popcount_array(comp1, KEYWORDS);
-
-        uint64_t combined[KEYWORDS];
-
-        uint32_t max_popcnt_delta = 1024;
+        uint32_t count_one = popcount_array(comp1, keywords);
+        uint32_t max_popcnt_delta = keywords * WORDBITS; // = bits per key
         if(threshold > 0) {
             max_popcnt_delta = calculate_max_difference(count_one, threshold);
         }
-        uint32_t current_delta;
 
+        // TODO: This allocation could be avoided by writing a special
+        // popcount_array_combined() function that does the AND
+        // itself; this would almost certainly be faster than the
+        // new/delete pair and would require no memory overhead.
+        uint64_t *combined = new uint64_t[keywords];
         for (int j = 0; j < n; j++) {
-            const uint64_t *current = comp2 + j * KEYWORDS;
+            const uint64_t *current = comp2 + j * keywords;
             const uint32_t counts_many_j = counts_many[j];
+            uint32_t current_delta;
 
             if (count_one > counts_many_j) {
                 current_delta = count_one - counts_many_j;
@@ -201,11 +212,11 @@ extern "C"
             }
 
             if (current_delta <= max_popcnt_delta) {
-                for (int i = 0; i < (int)KEYWORDS; i++) {
+                for (int i = 0; i < keywords; i++) {
                     combined[i] = current[i] & comp1[i];
                 }
 
-                uint32_t count_curr = popcount_array(combined, KEYWORDS);
+                uint32_t count_curr = popcount_array(combined, keywords);
 
                 // TODO: double precision is overkill for this
                 // problem; just use float.
@@ -217,6 +228,7 @@ extern "C"
                 }
             } // else skip because popcount difference too large
         }
+        delete[] combined;
 
         int i = 0;
         while (!max_k_scores.empty()) {
@@ -237,8 +249,8 @@ extern "C"
         int idx_unused;
         uint32_t *counts_many = new uint32_t[n];
         popcount_1024_array(many, n, counts_many);
-        int res = match_one_against_many_dice_1024_k_top(
-            one, many, counts_many, n, k, threshold, &idx_unused, score);
+        int res = match_one_against_many_dice_k_top(
+            one, many, counts_many, n, 128, k, threshold, &idx_unused, score);
         delete[] counts_many;
 
         return res;
diff --git a/anonlink/entitymatch.py b/anonlink/entitymatch.py
index 1484b12d..06752c1d 100644
--- a/anonlink/entitymatch.py
+++ b/anonlink/entitymatch.py
@@ -38,8 +38,8 @@ def cffi_filter_similarity_k(filters1, filters2, k, threshold):
     length_f1 = len(filters1)
     length_f2 = len(filters2)
 
-    # We assume the length is 1024 bit = 128 Bytes
-    match_one_against_many_dice_1024_k_top = lib.match_one_against_many_dice_1024_k_top
+    # Key length must be a multiple of 256 bits (32 bytes); 128-byte keys are still hard-coded below.
+    match_one_against_many_dice_k_top = lib.match_one_against_many_dice_k_top
 
     # An array of the *one* filter
     clist1 = [ffi.new("char[128]", bytes(f[0].tobytes()))
@@ -69,16 +69,19 @@ def cffi_filter_similarity_k(filters1, filters2, k, threshold):
     for i, f1 in enumerate(filters1):
         assert len(clist1[i]) == 128
         assert len(carr2) % 64 == 0
-        matches = match_one_against_many_dice_1024_k_top(
+        matches = match_one_against_many_dice_k_top(
             clist1[i],
             carr2,
             c_popcounts,
             length_f2,
+            128,
             k,
             threshold,
             c_indices,
             c_scores)
 
+        if matches < 0:
+            raise Exception('Internel error: Bad key length')
         for j in range(matches):
             ind = c_indices[j]
             assert ind < len(filters2)

From 5d5338f6818c7fcd6932f33e6a75a44aa8d44fea Mon Sep 17 00:00:00 2001
From: Hamish Ivey-Law <hamish.ivey-law@data61.csiro.au>
Date: Thu, 1 Feb 2018 11:03:20 +1100
Subject: [PATCH 04/49] Refactor Dice coefficient calculation.

---
 _cffi_build/dice_one_against_many.cpp | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/_cffi_build/dice_one_against_many.cpp b/_cffi_build/dice_one_against_many.cpp
index 6182585c..15b7f56a 100644
--- a/_cffi_build/dice_one_against_many.cpp
+++ b/_cffi_build/dice_one_against_many.cpp
@@ -120,6 +120,17 @@ static uint32_t calculate_max_difference(uint32_t popcnt_a, double threshold)
     return 2 * popcnt_a * (1/threshold - 1);
 }
 
+static double
+dice_coeff(const uint64_t *u, uint32_t u_popc, const uint64_t *v, uint32_t v_popc)
+{
+    uint64_t uv[KEYWORDS];
+    for (unsigned int i = 0 ; i < KEYWORDS; i++ ) {
+        uv[i] = u[i] & v[i];
+    }
+    uint32_t uv_popc = builtin_popcnt_unrolled_errata_manual(uv);
+    return (2 * uv_popc) / (double) (u_popc + v_popc);
+}
+
 extern "C"
 {
     /**
@@ -143,8 +154,6 @@ extern "C"
 
         uint32_t count_one = builtin_popcnt_unrolled_errata_manual(comp1);
 
-        uint64_t combined[KEYWORDS];
-
         double *all_scores = new double[n];
 
         uint32_t max_popcnt_delta = 1024;
@@ -163,14 +172,7 @@ extern "C"
             }
 
             if(current_delta <= max_popcnt_delta){
-                for (unsigned int i = 0 ; i < KEYWORDS; i++ ) {
-                    combined[i] = current[i] & comp1[i];
-                }
-
-                uint32_t count_curr = builtin_popcnt_unrolled_errata_manual(combined);
-
-                double score = 2 * count_curr / (double) (count_one + counts_many[j]);
-                all_scores[j] = score;
+                all_scores[j] = dice_coeff(comp1, count_one, current, counts_many[j]);
             } else {
                 // Skipping because popcount difference too large
                 all_scores[j] = -1;

From 88e3625376edbfb94921ac5a4e7ee2f2c2de64d7 Mon Sep 17 00:00:00 2001
From: Hamish Ivey-Law <hamish.ivey-law@data61.csiro.au>
Date: Thu, 1 Feb 2018 11:04:03 +1100
Subject: [PATCH 05/49] Temporary fiddling with benchmark code.

---
 anonlink/benchmark.py | 63 +++++++++++++++++--------------------------
 1 file changed, 25 insertions(+), 38 deletions(-)

diff --git a/anonlink/benchmark.py b/anonlink/benchmark.py
index c1fba03d..eb0cdd97 100644
--- a/anonlink/benchmark.py
+++ b/anonlink/benchmark.py
@@ -8,7 +8,6 @@
 
 from anonlink.entitymatch import *
 from anonlink.util import popcount_vector, generate_clks, generate_bitarray
-from anonlink.distributed_processing import calculate_filter_similarity
 
 
 some_filters = generate_clks(10000)
@@ -25,15 +24,16 @@ def compute_popcount_speed(n):
     elapsed_time = end - start
     print("{:6d} x 1024 bit popcounts in {:.6f} seconds".format(n, elapsed_time))
     speed_in_MiB = n / (1024 * 8 * elapsed_time)
-    print("Popcount speed: {:.2f} MiB/s".format(speed_in_MiB))
+    print("Popcount speed: {:.2f} MiB/s (bitarray.count())".format(speed_in_MiB))
     return speed_in_MiB
 
 
-def print_comparison_header():
-    print("Size 1 | Size 2 | Comparisons  | Compute Time | Million Comparisons per second")
+def print_comparison_header(threshold):
+    print("Threshold = ", threshold)
+    print("Size 1 | Size 2 | Comparisons  | Total Time (simat/solv) | Million Comparisons per second")
 
 
-def compute_comparison_speed(n1=100, n2=100):
+def compute_comparison_speed(n1=100, n2=100, threshold=0.75):
     """
     Using the greedy solver, how fast can hashes be computed using one core.
     """
@@ -42,30 +42,20 @@ def compute_comparison_speed(n1=100, n2=100):
     filters2 = [some_filters[random.randrange(2000, 10000)] for _ in range(n2)]
 
     start = timer()
-    result3 = calculate_mapping_greedy(filters1, filters2)
+    sparse_matrix = calculate_filter_similarity(filters1, filters2, k=len(filters2), threshold=threshold)
+    t1 = timer()
+    res = greedy_solver(sparse_matrix)
     end = timer()
-    elapsed_time = end - start
-    print("{:6d} | {:6d} | {:12d} | {:8.3f}s    |  {:12.3f}".format(
-        n1, n2, n1*n2, elapsed_time, (n1*n2)/(1e6*elapsed_time)))
-    return elapsed_time
-
-
-def compute_comparison_speed_parallel(n1=100, n2=100):
-    """
-    Using the greedy solver in chunks, how fast can hashes be computed.
-    """
-
-    filters1 = [some_filters[random.randrange(0, 8000)] for _ in range(n1)]
-    filters2 = [some_filters[random.randrange(2000, 10000)] for _ in range(n2)]
-
-
-    start = timer()
-    calculate_filter_similarity(filters1, filters2)
 
-    end = timer()
+    #print("mat size = ", len(sparse_matrix))
+    similarity_time = t1 - start
+    solver_time = end - t1
     elapsed_time = end - start
-    print("{:6d} | {:6d} | {:12d} | {:8.3f}s    |  {:12.3f}".format(
-        n1, n2, n1*n2, elapsed_time, (n1*n2)/(1e6*elapsed_time)))
+    print("{:6d} | {:6d} | {:12d} | {:7.3f}s ({:3.1f}% / {:3.1f}%) |  {:12.3f}  -- {:8d} = {:2.1f}%".format(
+        n1, n2, n1*n2, elapsed_time,
+        100.0*similarity_time/elapsed_time,
+        100.0*solver_time/elapsed_time,
+        (n1*n2)/(1e6*similarity_time), len(sparse_matrix), 100.0*len(sparse_matrix)/(n1*n2)))
     return elapsed_time
 
 
@@ -116,8 +106,6 @@ def benchmark(size, compare):
 
     compute_popcount_speed(100000)
 
-    print_comparison_header()
-
     possible_test_sizes = [
         1000, 2000, 3000, 4000,
         5000, 6000, 7000, 8000, 9000,
@@ -127,15 +115,14 @@ def benchmark(size, compare):
         2000000
     ]
 
-    for test_size in possible_test_sizes:
-        if test_size <= size:
-            compute_comparison_speed_parallel(
-                test_size, test_size
-            )
-
-    print("Single Core:")
-    compute_comparison_speed(5000, 5000)
-
+    #for thld in [0.95, 0.85, 0.75, 0.7, 0.65, 0.6, 0.55, 0.5]:
+    for thld in [0.22, 0.25, 0.27, 0.48, 0.49, 0.5, 0.51, 0.52, 0.54, 0.56, 0.58, 0.6, 0.62, 0.64]:
+        #print_comparison_header(thld)
+        print("threshold = ", thld)
+        for test_size in possible_test_sizes:
+            if test_size <= size:
+                compute_comparison_speed(test_size, test_size, thld)
 
 if __name__ == '__main__':
-    benchmark(20000, False)
\ No newline at end of file
+    benchmark(1000, False)
+    #benchmark(20000, False)

From a705de8b47643a36c22e6d6f5fe97fe365d779db Mon Sep 17 00:00:00 2001
From: Hamish Ivey-Law <hamish.ivey-law@data61.csiro.au>
Date: Thu, 1 Feb 2018 22:40:36 +1100
Subject: [PATCH 06/49] Calculate and report popcount speed from native code
 implementation.

---
 _cffi_build/build_matcher.py          |  1 +
 _cffi_build/dice_one_against_many.cpp | 38 ++++++++++++++++++++-------
 anonlink/benchmark.py                 | 26 +++++++++++++++---
 anonlink/util.py                      | 35 +++++++++++++++---------
 4 files changed, 73 insertions(+), 27 deletions(-)

diff --git a/_cffi_build/build_matcher.py b/_cffi_build/build_matcher.py
index fb7e1161..4065e4ee 100644
--- a/_cffi_build/build_matcher.py
+++ b/_cffi_build/build_matcher.py
@@ -22,6 +22,7 @@
     int match_one_against_many_dice(const char * one, const char * many, int n, double * score);
     int match_one_against_many_dice_1024_k_top(const char *one, const char *many, const uint32_t *counts_many, int n, uint32_t k, double threshold, int *indices, double *scores);
     double dice_coeff_1024(const char *e1, const char *e2);
+    double popcount_1024_array(const char *many, int n, uint32_t *counts_many);
 """)
 
 
diff --git a/_cffi_build/dice_one_against_many.cpp b/_cffi_build/dice_one_against_many.cpp
index 15b7f56a..7edb78dc 100644
--- a/_cffi_build/dice_one_against_many.cpp
+++ b/_cffi_build/dice_one_against_many.cpp
@@ -3,6 +3,7 @@
 #include <queue>
 #include <cstdint>
 #include <cstdlib>
+#include <ctime>
 #include <iostream>
 
 
@@ -102,16 +103,6 @@ struct score_cmp{
 };
 
 
-/**
- * Count lots of bits.
- */
-static void popcount_1024_array(const char *many, int n, uint32_t *counts_many) {
-    for (int i = 0; i < n; i++) {
-        const uint64_t *sig = (const uint64_t *) many + i * KEYWORDS;
-        counts_many[i] = builtin_popcnt_unrolled_errata_manual(sig);
-    }
-}
-
 /**
  *
  */
@@ -131,8 +122,35 @@ dice_coeff(const uint64_t *u, uint32_t u_popc, const uint64_t *v, uint32_t v_pop
     return (2 * uv_popc) / (double) (u_popc + v_popc);
 }
 
+static inline double to_millis(clock_t t)
+{
+    static constexpr double CPS = (double)CLOCKS_PER_SEC;
+    return t * 1.0E3 / CPS;
+}
+
 extern "C"
 {
+    /**
+     * Calculate population counts of an array of inputs; return how
+     * long it took in milliseconds.
+     *
+     * 'many' must point to n*KEYWORDS*sizeof(uint64_t) (== 128*n) bytes
+     * 'counts_many' must point to n*sizeof(uint32_t) bytes.
+     * For i = 0 to n - 1, the population count of the 1024 bits
+     *
+     *   many[i * KEYWORDS] ... many[(i + 1) * KEYWORDS - 1]
+     *
+     * is put in counts_many[i].
+     */
+    double popcount_1024_array(const char *many, int n, uint32_t *counts_many) {
+        clock_t t = clock();
+        for (int i = 0; i < n; i++) {
+            const uint64_t *sig = (const uint64_t *) many + i * KEYWORDS;
+            counts_many[i] = builtin_popcnt_unrolled_errata_manual(sig);
+        }
+        return to_millis(clock() - t);
+    }
+
     /**
      * Calculate up to the top k indices and scores.
      * Returns the number matched above a threshold.
diff --git a/anonlink/benchmark.py b/anonlink/benchmark.py
index eb0cdd97..4b96b9ab 100644
--- a/anonlink/benchmark.py
+++ b/anonlink/benchmark.py
@@ -18,13 +18,31 @@ def compute_popcount_speed(n):
     Just do as much counting of bits.
     """
     clks = [generate_bitarray(1024) for _ in range(n)]
+
+    print("{:6d} x 1024 bit popcounts".format(n))
+    print("Implementation              | Time (ms) | Bandwidth (MiB/s)")
+    start = timer()
+    popcounts = popcount_vector(clks, use_native=False)
+    end = timer()
+    elapsed_time = end - start
+    speed_in_MiB = (n * 128) / ((1 << 20) * elapsed_time)
+    print("Python (bitarray.count()):  |  {:7.2f}  |  {:9.2f} "
+          .format(elapsed_time * 1e3, speed_in_MiB))
+
+    # Native
     start = timer()
-    popcounts = popcount_vector(clks)
+    popcounts, ms = popcount_vector(clks, use_native=True)
     end = timer()
     elapsed_time = end - start
-    print("{:6d} x 1024 bit popcounts in {:.6f} seconds".format(n, elapsed_time))
-    speed_in_MiB = n / (1024 * 8 * elapsed_time)
-    print("Popcount speed: {:.2f} MiB/s (bitarray.count())".format(speed_in_MiB))
+    elapsed_nocopy = ms / 1e3
+    copy_percent = 100*(elapsed_time - elapsed_nocopy) / elapsed_time
+    speed_in_MiB = (n * 128) / ((1 << 20) * elapsed_time)
+    speed_in_MiB_nocopy = (n * 128) / ((1 << 20) * elapsed_nocopy)
+    print("Native code (no copy):      |  {:7.2f}  |  {:9.2f} "
+          .format(ms, speed_in_MiB_nocopy))
+    print("Native code (w/ copy):      |  {:7.2f}  |  {:9.2f}   ({:.1f}% copying)"
+          .format(elapsed_time * 1e3, speed_in_MiB, copy_percent))
+
     return speed_in_MiB
 
 
diff --git a/anonlink/util.py b/anonlink/util.py
index dcafa80f..24982adb 100644
--- a/anonlink/util.py
+++ b/anonlink/util.py
@@ -2,8 +2,10 @@
 
 import os
 import random
+import time
 from bitarray import bitarray
 
+from anonlink._entitymatcher import ffi, lib
 
 def generate_bitarray(length):
     a = bitarray(endian=['little', 'big'][random.randint(0, 1)])
@@ -19,25 +21,32 @@ def generate_clks(n):
     return res
 
 
-def popcount_vector(bitarrays):
-    """
-    Note, due to the overhead of converting bitarrays into
-    bytes, it is more expensive to call our C implementation
+def popcount_vector(bitarrays, use_native=False):
+    """Return an array containing the popcounts of the elements of
+    bitarrays. If use_native is True, use the native code
+    implementation and return the time spent (in milliseconds) in the
+    native code as a second return value.
+
+    Note, due to the overhead of converting bitarrays into bytes,
+    it is currently more expensive to call our C implementation
     than just calling bitarray.count()
 
     """
-    return [clk.count() for clk in bitarrays]
+    # Use Python
+    if not use_native:
+        return [clk.count() for clk in bitarrays]
+
+    # Use native code
+    n = len(bitarrays)
+    c_popcounts = ffi.new("uint32_t[{}]".format(n))
+    many = ffi.new("char[{}]".format(128 * n),
+                    bytes([b for f in bitarrays for b in f.tobytes()]))
+    ms = lib.popcount_1024_array(many, n, c_popcounts)
 
-    # n = len(clks)
-    # c_popcounts = ffi.new("uint32_t[{}]".format(n))
-    # many = ffi.new("char[{}]".format(128 * n),
-    #                 bytes([b for f in clks for b in f.tobytes()]))
-    # lib.popcount_1024_array(many, n, c_popcounts)
-    #
-    # return [c_popcounts[i] for i in range(n)]
+    return [c_popcounts[i] for i in range(n)], ms
 
 
 def chunks(l, n):
     """Yield successive n-sized chunks from l."""
     for i in range(0, len(l), n):
-        yield l[i:i + n]
\ No newline at end of file
+        yield l[i:i + n]

From cff1cb6d22b502bbd3b56b1248639dd41c5225d4 Mon Sep 17 00:00:00 2001
From: Hamish Ivey-Law <hamish.ivey-law@data61.csiro.au>
Date: Fri, 2 Feb 2018 11:01:29 +1100
Subject: [PATCH 07/49] Give some values more sensible variable names.

---
 anonlink/benchmark.py | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/anonlink/benchmark.py b/anonlink/benchmark.py
index 4b96b9ab..ab854542 100644
--- a/anonlink/benchmark.py
+++ b/anonlink/benchmark.py
@@ -17,15 +17,19 @@ def compute_popcount_speed(n):
     """
     Just do as much counting of bits.
     """
-    clks = [generate_bitarray(1024) for _ in range(n)]
+    clk_bits = 1024
+    clk_bytes = clk_bits / 8
+    clks_MiB = n * clk_bytes * 1.0 / (1 << 20)
 
-    print("{:6d} x 1024 bit popcounts".format(n))
+    clks = [generate_bitarray(clk_bits) for _ in range(n)]
+
+    print("{:6d} x {:d} bit popcounts".format(n, clk_bits))
     print("Implementation              | Time (ms) | Bandwidth (MiB/s)")
     start = timer()
     popcounts = popcount_vector(clks, use_native=False)
     end = timer()
     elapsed_time = end - start
-    speed_in_MiB = (n * 128) / ((1 << 20) * elapsed_time)
+    speed_in_MiB = clks_MiB / elapsed_time
     print("Python (bitarray.count()):  |  {:7.2f}  |  {:9.2f} "
           .format(elapsed_time * 1e3, speed_in_MiB))
 
@@ -36,8 +40,8 @@ def compute_popcount_speed(n):
     elapsed_time = end - start
     elapsed_nocopy = ms / 1e3
     copy_percent = 100*(elapsed_time - elapsed_nocopy) / elapsed_time
-    speed_in_MiB = (n * 128) / ((1 << 20) * elapsed_time)
-    speed_in_MiB_nocopy = (n * 128) / ((1 << 20) * elapsed_nocopy)
+    speed_in_MiB = clks_MiB / elapsed_time
+    speed_in_MiB_nocopy = clks_MiB / elapsed_nocopy
     print("Native code (no copy):      |  {:7.2f}  |  {:9.2f} "
           .format(ms, speed_in_MiB_nocopy))
     print("Native code (w/ copy):      |  {:7.2f}  |  {:9.2f}   ({:.1f}% copying)"

From 603b6d47669da2279c16a38690ae23853e7ad40f Mon Sep 17 00:00:00 2001
From: Hamish Ivey-Law <hamish.ivey-law@data61.csiro.au>
Date: Fri, 2 Feb 2018 11:02:39 +1100
Subject: [PATCH 08/49] Remove unused import.

---
 anonlink/util.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/anonlink/util.py b/anonlink/util.py
index 24982adb..dfadfc54 100644
--- a/anonlink/util.py
+++ b/anonlink/util.py
@@ -2,7 +2,6 @@
 
 import os
 import random
-import time
 from bitarray import bitarray
 
 from anonlink._entitymatcher import ffi, lib

From de33a67c723b3b74ce4a097c5882481e5048df39 Mon Sep 17 00:00:00 2001
From: Hamish Ivey-Law <hamish.ivey-law@data61.csiro.au>
Date: Fri, 2 Feb 2018 11:05:54 +1100
Subject: [PATCH 09/49] Add documentation.

---
 _cffi_build/dice_one_against_many.cpp | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/_cffi_build/dice_one_against_many.cpp b/_cffi_build/dice_one_against_many.cpp
index 7edb78dc..23d5c6bd 100644
--- a/_cffi_build/dice_one_against_many.cpp
+++ b/_cffi_build/dice_one_against_many.cpp
@@ -122,6 +122,12 @@ dice_coeff(const uint64_t *u, uint32_t u_popc, const uint64_t *v, uint32_t v_pop
     return (2 * uv_popc) / (double) (u_popc + v_popc);
 }
 
+/**
+ * Convert clock measurement t to milliseconds.
+ *
+ * t should have been obtained as the difference of calls to clock()
+ * for this to make sense.
+ */
 static inline double to_millis(clock_t t)
 {
     static constexpr double CPS = (double)CLOCKS_PER_SEC;

From a458ed023c348a3d7eed975e6041a8a6c227f657 Mon Sep 17 00:00:00 2001
From: Hamish Ivey-Law <hamish.ivey-law@data61.csiro.au>
Date: Fri, 2 Feb 2018 14:55:20 +1100
Subject: [PATCH 10/49] Expand reporting of various measurements.

---
 anonlink/benchmark.py | 40 +++++++++++++++++++++++++---------------
 1 file changed, 25 insertions(+), 15 deletions(-)

diff --git a/anonlink/benchmark.py b/anonlink/benchmark.py
index ab854542..83b050fc 100644
--- a/anonlink/benchmark.py
+++ b/anonlink/benchmark.py
@@ -51,11 +51,11 @@ def compute_popcount_speed(n):
 
 
 def print_comparison_header(threshold):
-    print("Threshold = ", threshold)
-    print("Size 1 | Size 2 | Comparisons  | Total Time (simat/solv) | Million Comparisons per second")
+    print("\nThreshold:", threshold)
+    print("Size 1 | Size 2 | Comparisons (match %) | Total Time (simat/solv) | Throughput (1e6 cmp/s)")
 
 
-def compute_comparison_speed(n1=100, n2=100, threshold=0.75):
+def compute_comparison_speed(n1, n2, threshold):
     """
     Using the greedy solver, how fast can hashes be computed using one core.
     """
@@ -69,15 +69,16 @@ def compute_comparison_speed(n1=100, n2=100, threshold=0.75):
     res = greedy_solver(sparse_matrix)
     end = timer()
 
-    #print("mat size = ", len(sparse_matrix))
     similarity_time = t1 - start
     solver_time = end - t1
     elapsed_time = end - start
-    print("{:6d} | {:6d} | {:12d} | {:7.3f}s ({:3.1f}% / {:3.1f}%) |  {:12.3f}  -- {:8d} = {:2.1f}%".format(
-        n1, n2, n1*n2, elapsed_time,
+    print("{:6d} | {:6d} |  {:6d}e6  ({:5.2f}%)   | {:6.3f}s ({:4.1f}% / {:4.1f}%) |  {:8.3f}".format(
+        n1, n2, n1*n2 // 1000000,
+        100.0*len(sparse_matrix)/(n1*n2),
+        elapsed_time,
         100.0*similarity_time/elapsed_time,
         100.0*solver_time/elapsed_time,
-        (n1*n2)/(1e6*similarity_time), len(sparse_matrix), 100.0*len(sparse_matrix)/(n1*n2)))
+        (n1*n2)/(1e6*similarity_time)))
     return elapsed_time
 
 
@@ -137,14 +138,23 @@ def benchmark(size, compare):
         2000000
     ]
 
-    #for thld in [0.95, 0.85, 0.75, 0.7, 0.65, 0.6, 0.55, 0.5]:
-    for thld in [0.22, 0.25, 0.27, 0.48, 0.49, 0.5, 0.51, 0.52, 0.54, 0.56, 0.58, 0.6, 0.62, 0.64]:
-        #print_comparison_header(thld)
-        print("threshold = ", thld)
-        for test_size in possible_test_sizes:
-            if test_size <= size:
-                compute_comparison_speed(test_size, test_size, thld)
+    # Testing two things:
+    # - the Dice coefficient calculation
+    # - picking the top k candidates
+
+    thld = 0.5
+    print_comparison_header(thld)
+    for test_size in possible_test_sizes:
+        if test_size <= size:
+            compute_comparison_speed(test_size, test_size, thld)
+
+    thld = 0.7
+    print_comparison_header(thld)
+    size *= 5
+    for test_size in possible_test_sizes:
+        if test_size <= size:
+            compute_comparison_speed(test_size, test_size, thld)
 
 if __name__ == '__main__':
-    benchmark(1000, False)
+    benchmark(4000, False)
     #benchmark(20000, False)

From 7d2e66c7b7d68f6211d23dcf65b445e53ff7495f Mon Sep 17 00:00:00 2001
From: Hamish Ivey-Law <hamish.ivey-law@data61.csiro.au>
Date: Mon, 5 Feb 2018 12:51:27 +1100
Subject: [PATCH 11/49] Comments.

---
 anonlink/benchmark.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/anonlink/benchmark.py b/anonlink/benchmark.py
index 83b050fc..c4c78819 100644
--- a/anonlink/benchmark.py
+++ b/anonlink/benchmark.py
@@ -25,6 +25,8 @@ def compute_popcount_speed(n):
 
     print("{:6d} x {:d} bit popcounts".format(n, clk_bits))
     print("Implementation              | Time (ms) | Bandwidth (MiB/s)")
+
+    # Python
     start = timer()
     popcounts = popcount_vector(clks, use_native=False)
     end = timer()
@@ -124,6 +126,8 @@ def compare_python_c(ntotal=10000, nsubset=6000, frac=0.8):
 
 def benchmark(size, compare):
 
+    print("Anonlink benchmark -- see README for explanation")
+    print("------------------------------------------------")
     if compare:
         print(compare_python_c(ntotal=1000, nsubset=600))
 
@@ -138,10 +142,6 @@ def benchmark(size, compare):
         2000000
     ]
 
-    # Testing two things:
-    # - the Dice coefficient calculation
-    # - picking the top k candidates
-
     thld = 0.5
     print_comparison_header(thld)
     for test_size in possible_test_sizes:

From 9666eae628585e61429a7eef415871dbd0b61449 Mon Sep 17 00:00:00 2001
From: Hamish Ivey-Law <hamish.ivey-law@data61.csiro.au>
Date: Mon, 5 Feb 2018 13:54:04 +1100
Subject: [PATCH 12/49] Update README.

---
 README.rst | 78 +++++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 59 insertions(+), 19 deletions(-)

diff --git a/README.rst b/README.rst
index 9c025eb5..8fe60808 100644
--- a/README.rst
+++ b/README.rst
@@ -43,28 +43,68 @@ For linux with:
 Benchmark
 ---------
 
+You can run the benchmark with:
+
 ::
 
     $ python -m anonlink.benchmark
-    100000 x 1024 bit popcounts in 0.016376 seconds
-    Popcount speed: 745.42 MiB/s
-    Size 1 | Size 2 | Comparisons  | Compute Time | Million Comparisons per second
-      1000 |   1000 |      1000000 |    0.060s    |        16.632
-      2000 |   2000 |      4000000 |    0.159s    |        25.232
-      3000 |   3000 |      9000000 |    0.316s    |        28.524
-      4000 |   4000 |     16000000 |    0.486s    |        32.943
-      5000 |   5000 |     25000000 |    0.584s    |        42.825
-      6000 |   6000 |     36000000 |    0.600s    |        60.027
-      7000 |   7000 |     49000000 |    0.621s    |        78.875
-      8000 |   8000 |     64000000 |    0.758s    |        84.404
-      9000 |   9000 |     81000000 |    0.892s    |        90.827
-     10000 |  10000 |    100000000 |    1.228s    |        81.411
-     20000 |  20000 |    400000000 |    3.980s    |       100.504
-     30000 |  30000 |    900000000 |    9.280s    |        96.986
-     40000 |  40000 |   1600000000 |   17.318s    |        92.391
-
-C++ version uses cpu instruction ``POPCNT`` for bitcount in a 64bit
-word. http://wm.ite.pl/articles/sse-popcount.html
+    Anonlink benchmark -- see README for explanation
+    ------------------------------------------------
+    100000 x 1024 bit popcounts
+    Implementation              | Time (ms) | Bandwidth (MiB/s)
+    Python (bitarray.count()):  |    20.83  |     586.12
+    Native code (no copy):      |     0.91  |   13443.87
+    Native code (w/ copy):      |   381.83  |      31.97   (99.8% copying)
+
+    Threshold: 0.5
+    Size 1 | Size 2 | Comparisons (match %) | Total Time (simat/solv) | Throughput (1e6 cmp/s)
+      1000 |   1000 |       1e6  (49.59%)   |  0.293s (89.7% / 10.3%) |     3.812
+      2000 |   2000 |       4e6  (50.33%)   |  1.151s (89.2% / 10.8%) |     3.899
+      3000 |   3000 |       9e6  (50.94%)   |  2.611s (88.7% / 11.3%) |     3.886
+      4000 |   4000 |      16e6  (50.54%)   |  4.635s (88.3% / 11.7%) |     3.910
+
+    Threshold: 0.7
+    Size 1 | Size 2 | Comparisons (match %) | Total Time (simat/solv) | Throughput (1e6 cmp/s)
+      1000 |   1000 |       1e6  ( 0.01%)   |  0.018s (99.8% /  0.2%) |    54.846
+      2000 |   2000 |       4e6  ( 0.01%)   |  0.067s (99.9% /  0.1%) |    59.983
+      3000 |   3000 |       9e6  ( 0.01%)   |  0.131s (99.8% /  0.2%) |    68.958
+      4000 |   4000 |      16e6  ( 0.01%)   |  0.219s (99.9% /  0.1%) |    73.092
+      5000 |   5000 |      25e6  ( 0.01%)   |  0.333s (99.9% /  0.1%) |    75.280
+      6000 |   6000 |      36e6  ( 0.01%)   |  0.472s (99.9% /  0.1%) |    76.373
+      7000 |   7000 |      49e6  ( 0.01%)   |  0.629s (99.9% /  0.1%) |    78.030
+      8000 |   8000 |      64e6  ( 0.01%)   |  0.809s (99.9% /  0.1%) |    79.255
+      9000 |   9000 |      81e6  ( 0.01%)   |  1.024s (99.9% /  0.1%) |    79.212
+     10000 |  10000 |     100e6  ( 0.01%)   |  1.386s (99.9% /  0.1%) |    72.233
+     20000 |  20000 |     400e6  ( 0.01%)   |  4.932s (99.9% /  0.1%) |    81.185
+
+The tables are interpreted as follows. The first section compares the
+bandwidth doing popcounts through (i) the Python bitarray library and
+(ii) a native code implementation in assembler.  The latter
+implementation is measured in two ways: the first measures just the
+time taken to compute the popcounts, while the second includes the
+time taken to copy the data out of the running Python instance as well
+as copying the result back into Python. The "% copying" measure is the
+proportion of time spent doing this copying.
+
+The second section includes two tables that measure the throughput of
+the Dice coefficient comparison function. The two tables correspond to
+two different choices of "matching threshold", 0.5 and 0.7, which were
+chosen to characterise two different performance scenarios. Since the
+data used for comparisons is randomly generated, the first threshold
+value will cause about 50% of the candidates to "match", while the
+second threshold value will cause <0.01% of the candidates to match
+(these values are reported in the "match %" column). In the first
+case, the large number of matches means that much of the time is spent
+keeping the candidates in order so that the top `k` matches can be
+returned. In the latter case, the tiny number of candidate matches
+means that the throughput is determined primarily by the comparison
+code itself.
+
+Finally, the Total Time column includes indications as to the
+proportion of time spent calculating the (sparse) similarity matrix
+(`simat`) and the proportion of time spent in the greedy solver
+(`solv`). This latter is determined by the size of the similarity
+matrix, which will be approximately `#comparisons * match% / 100`.
 
 Tests
 =====

From 6fe3663972842255d97a288952b9865b0cae2e44 Mon Sep 17 00:00:00 2001
From: Hamish Ivey-Law <hamish.ivey-law@data61.csiro.au>
Date: Mon, 5 Feb 2018 15:30:04 +1100
Subject: [PATCH 13/49] Bring test suite up-to-date.

---
 anonlink/benchmark.py   | 14 +++++++-------
 tests/test_benchmark.py |  5 +----
 2 files changed, 8 insertions(+), 11 deletions(-)

diff --git a/anonlink/benchmark.py b/anonlink/benchmark.py
index c4c78819..5a43c101 100644
--- a/anonlink/benchmark.py
+++ b/anonlink/benchmark.py
@@ -31,9 +31,9 @@ def compute_popcount_speed(n):
     popcounts = popcount_vector(clks, use_native=False)
     end = timer()
     elapsed_time = end - start
-    speed_in_MiB = clks_MiB / elapsed_time
+    python_speed_in_MiB = clks_MiB / elapsed_time
     print("Python (bitarray.count()):  |  {:7.2f}  |  {:9.2f} "
-          .format(elapsed_time * 1e3, speed_in_MiB))
+          .format(elapsed_time * 1e3, python_speed_in_MiB))
 
     # Native
     start = timer()
@@ -42,14 +42,14 @@ def compute_popcount_speed(n):
     elapsed_time = end - start
     elapsed_nocopy = ms / 1e3
     copy_percent = 100*(elapsed_time - elapsed_nocopy) / elapsed_time
-    speed_in_MiB = clks_MiB / elapsed_time
-    speed_in_MiB_nocopy = clks_MiB / elapsed_nocopy
+    native_speed_in_MiB = clks_MiB / elapsed_time
+    native_speed_in_MiB_nocopy = clks_MiB / elapsed_nocopy
     print("Native code (no copy):      |  {:7.2f}  |  {:9.2f} "
-          .format(ms, speed_in_MiB_nocopy))
+          .format(ms, native_speed_in_MiB_nocopy))
     print("Native code (w/ copy):      |  {:7.2f}  |  {:9.2f}   ({:.1f}% copying)"
-          .format(elapsed_time * 1e3, speed_in_MiB, copy_percent))
+          .format(elapsed_time * 1e3, native_speed_in_MiB, copy_percent))
 
-    return speed_in_MiB
+    return python_speed_in_MiB
 
 
 def print_comparison_header(threshold):
diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py
index acc6fc06..cb6f0a2f 100644
--- a/tests/test_benchmark.py
+++ b/tests/test_benchmark.py
@@ -11,10 +11,7 @@ def test_benchmarking_popcount(self):
         self.assertGreater(speed, 50, "Popcounting at less than 50MiB/s")
 
     def test_comparison_speed_benchmark(self):
-        benchmark.compute_comparison_speed()
-
-    def test_parallel_comparison_speed_benchmark(self):
-        benchmark.compute_comparison_speed_parallel()
+        benchmark.compute_comparison_speed(100, 100, 0.7)
 
     def test_comparing_python_c_bench(self):
         benchmark.compare_python_c(500, 30, frac=0.8)

From 66d9b6e644606216e5a9b07d6b42067ae96e5b0a Mon Sep 17 00:00:00 2001
From: Hamish Ivey-Law <hamish.ivey-law@data61.csiro.au>
Date: Mon, 15 Jan 2018 12:03:36 +1100
Subject: [PATCH 14/49] Refactor main C++ function to use "constant" memory
 and avoid new/delete.

---
 _cffi_build/dice_one_against_many.cpp | 31 +++++++++------------------
 1 file changed, 10 insertions(+), 21 deletions(-)

diff --git a/_cffi_build/dice_one_against_many.cpp b/_cffi_build/dice_one_against_many.cpp
index 23d5c6bd..66565fdd 100644
--- a/_cffi_build/dice_one_against_many.cpp
+++ b/_cffi_build/dice_one_against_many.cpp
@@ -177,9 +177,6 @@ extern "C"
         std::priority_queue<Node, std::vector<Node>, score_cmp> max_k_scores;
 
         uint32_t count_one = builtin_popcnt_unrolled_errata_manual(comp1);
-
-        double *all_scores = new double[n];
-
         uint32_t max_popcnt_delta = 1024;
         if(threshold > 0) {
             max_popcnt_delta = calculate_max_difference(count_one, threshold);
@@ -188,32 +185,24 @@ extern "C"
 
         for (int j = 0; j < n; j++) {
             const uint64_t *current = comp2 + j * KEYWORDS;
+            const uint32_t counts_many_j = counts_many[j];
 
-            if(count_one > counts_many[j]){
-                current_delta = count_one - counts_many[j];
+            if (count_one > counts_many_j) {
+                current_delta = count_one - counts_many_j;
             } else {
-                current_delta = counts_many[j] - count_one;
+                current_delta = counts_many_j - count_one;
             }
 
             if(current_delta <= max_popcnt_delta){
-                all_scores[j] = dice_coeff(comp1, count_one, current, counts_many[j]);
-            } else {
-                // Skipping because popcount difference too large
-                all_scores[j] = -1;
-            }
-        }
-
-        for (int j = 0; j < n; j++) {
-
-            if(all_scores[j] >= threshold) {
-                max_k_scores.push(Node(j, all_scores[j]));
+                double score = dice_coeff(comp1, count_one, current, counts_many[j]);
+                if (score >= threshold) {
+                    max_k_scores.push(Node(j, score));
+                    if (max_k_scores.size() > k)
+                        max_k_scores.pop();
+                }
             }
-
-            if(max_k_scores.size() > k) max_k_scores.pop();
         }
 
-        delete[] all_scores;
-
         int i = 0;
         while (!max_k_scores.empty()) {
 

From 3a55dc4145c67f7eafc46b92bf853549511776d1 Mon Sep 17 00:00:00 2001
From: Hamish Ivey-Law <hamish.ivey-law@data61.csiro.au>
Date: Tue, 6 Feb 2018 22:00:18 +1100
Subject: [PATCH 15/49] Screw everything up by unrolling with C++ templates,
 apparently.

---
 _cffi_build/dice_one_against_many.cpp | 29 ++++++++++++++-------------
 1 file changed, 15 insertions(+), 14 deletions(-)

diff --git a/_cffi_build/dice_one_against_many.cpp b/_cffi_build/dice_one_against_many.cpp
index 72f80d84..b4302578 100644
--- a/_cffi_build/dice_one_against_many.cpp
+++ b/_cffi_build/dice_one_against_many.cpp
@@ -8,16 +8,17 @@
 
 
 template<int n>
-uint32_t popcount(const uint64_t *buf) {
-    return popcount<4>(buf) + popcount<n - 4>(buf + 4);
+void popcount(uint64_t &c0, uint64_t &c1, uint64_t &c2, uint64_t &c3, const uint64_t *buf) {
+    popcount<4>(c0, c1, c2, c3, buf);
+    popcount<n - 4>(c0, c1, c2, c3, buf + 4);
 }
 
 template<>
-uint32_t popcount<4>(const uint64_t* buf) {
-    uint64_t b0, b1, b2, b3;
-    uint64_t c0, c1, c2, c3;
-    c0 = c1 = c2 = c3 = 0;
+void popcount<4>(
+    uint64_t &c0, uint64_t &c1, uint64_t &c2, uint64_t &c3,
+    const uint64_t* buf) {
 
+    uint64_t b0, b1, b2, b3;
     b0 = buf[0]; b1 = buf[1]; b2 = buf[2]; b3 = buf[3];
     __asm__(
         "popcnt %4, %4  \n\t"
@@ -30,8 +31,6 @@ uint32_t popcount<4>(const uint64_t* buf) {
         "add %7, %3     \n\t"
         : "+r" (c0), "+r" (c1), "+r" (c2), "+r" (c3),
           "+r" (b0), "+r" (b1), "+r" (b2), "+r" (b3));
-
-    return c0 + c1 + c2 + c3;
 }
 
 static uint32_t
@@ -40,10 +39,11 @@ popcount_array(const uint64_t *buf, int n) {
     // iteration. Currently 16, which corresponds to 16*64 = 1024 bits.
     static constexpr int WORDS_PER_POPCOUNT = 16;
     assert(n % WORDS_PER_POPCOUNT == 0);
-    uint32_t pc = 0;
+    uint64_t c0, c1, c2, c3;
+    c0 = c1 = c2 = c3 = 0;
     for (int i = 0; i < n; i += WORDS_PER_POPCOUNT)
-        pc += popcount<WORDS_PER_POPCOUNT>(buf + i);
-    return pc;
+        popcount<WORDS_PER_POPCOUNT>(c0, c1, c2, c3, buf + i);
+    return c0 + c1 + c2 + c3;
 }
 
 static uint32_t
@@ -52,14 +52,15 @@ popcount_combined_array(const uint64_t *__restrict__ buf1, const uint64_t *__res
     // iteration. Currently 16, which corresponds to 16*64 = 1024 bits.
     static constexpr int WORDS_PER_POPCOUNT = 16;
     assert(n % WORDS_PER_POPCOUNT == 0);
-    uint32_t pc = 0;
     uint64_t combined[WORDS_PER_POPCOUNT];
+    uint64_t c0, c1, c2, c3;
+    c0 = c1 = c2 = c3 = 0;
     for (int i = 0; i < n; i += WORDS_PER_POPCOUNT) {
         for (int j = 0; j < WORDS_PER_POPCOUNT; ++j)
             combined[j] = buf1[i + j] & buf2[i + j];
-        pc += popcount<WORDS_PER_POPCOUNT>(combined);
+        popcount<WORDS_PER_POPCOUNT>(c0, c1, c2, c3, combined);
     }
-    return pc;
+    return c0 + c1 + c2 + c3;
 }
 
 /**

From b94c555f0bdf1168d786097493379b30c31136a2 Mon Sep 17 00:00:00 2001
From: Hamish Ivey-Law <hamish.ivey-law@data61.csiro.au>
Date: Wed, 7 Feb 2018 12:54:45 +1100
Subject: [PATCH 16/49] Magical argument that makes the compiler generate the
 correct (performant) code.

---
 _cffi_build/build_matcher.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/_cffi_build/build_matcher.py b/_cffi_build/build_matcher.py
index cebbf15f..8592e183 100644
--- a/_cffi_build/build_matcher.py
+++ b/_cffi_build/build_matcher.py
@@ -15,7 +15,8 @@
     "_entitymatcher",
     source,
     source_extension='.cpp',
-    extra_compile_args=['-Wall', '-Wextra', '-Werror', '-O3', '-std=c++11', '-mssse3', '-mpopcnt'],
+    extra_compile_args=['-Wall', '-Wextra', '-Werror', '-O3', '-std=c++11', '-mssse3', '-mpopcnt', '-fvisibility=hidden'
+    ],
 )
 
 ffibuilder.cdef("""

From 166f6e95058628971945469e3c72cde578acb35a Mon Sep 17 00:00:00 2001
From: Hamish Ivey-Law <hamish.ivey-law@data61.csiro.au>
Date: Wed, 7 Feb 2018 13:45:15 +1100
Subject: [PATCH 17/49] Address Brian's comments.

---
 README.rst            | 64 +++++++++++++++++++++++--------------------
 anonlink/benchmark.py | 16 +++++------
 anonlink/util.py      | 22 +++++++++------
 3 files changed, 54 insertions(+), 48 deletions(-)

diff --git a/README.rst b/README.rst
index 8fe60808..f3c42eae 100644
--- a/README.rst
+++ b/README.rst
@@ -46,36 +46,39 @@ Benchmark
 You can run the benchmark with:
 
 ::
-
-    $ python -m anonlink.benchmark
+    $ python3 -m anonlink.benchmark
     Anonlink benchmark -- see README for explanation
     ------------------------------------------------
     100000 x 1024 bit popcounts
     Implementation              | Time (ms) | Bandwidth (MiB/s)
-    Python (bitarray.count()):  |    20.83  |     586.12
-    Native code (no copy):      |     0.91  |   13443.87
-    Native code (w/ copy):      |   381.83  |      31.97   (99.8% copying)
+    Python (bitarray.count()):  |    18.40  |     663.30
+    Native code (no copy):      |     0.97  |   12558.67
+    Native code (w/ copy):      |   347.66  |      35.11   (99.7% copying)
 
     Threshold: 0.5
-    Size 1 | Size 2 | Comparisons (match %) | Total Time (simat/solv) | Throughput (1e6 cmp/s)
-      1000 |   1000 |       1e6  (49.59%)   |  0.293s (89.7% / 10.3%) |     3.812
-      2000 |   2000 |       4e6  (50.33%)   |  1.151s (89.2% / 10.8%) |     3.899
-      3000 |   3000 |       9e6  (50.94%)   |  2.611s (88.7% / 11.3%) |     3.886
-      4000 |   4000 |      16e6  (50.54%)   |  4.635s (88.3% / 11.7%) |     3.910
+    Size 1 | Size 2 | Comparisons      | Total Time (s)          | Throughput
+           |        |        (match %) | (comparisons / matching)|  (1e6 cmp/s)
+    -------+--------+------------------+-------------------------+-------------
+      1000 |   1000 |    1e6  (50.20%) |  0.249  (88.6% / 11.4%) |     4.525
+      2000 |   2000 |    4e6  (50.51%) |  1.069  (88.5% / 11.5%) |     4.227
+      3000 |   3000 |    9e6  (50.51%) |  2.412  (85.3% / 14.7%) |     4.375
+      4000 |   4000 |   16e6  (50.56%) |  4.316  (83.6% / 16.4%) |     4.434
 
     Threshold: 0.7
-    Size 1 | Size 2 | Comparisons (match %) | Total Time (simat/solv) | Throughput (1e6 cmp/s)
-      1000 |   1000 |       1e6  ( 0.01%)   |  0.018s (99.8% /  0.2%) |    54.846
-      2000 |   2000 |       4e6  ( 0.01%)   |  0.067s (99.9% /  0.1%) |    59.983
-      3000 |   3000 |       9e6  ( 0.01%)   |  0.131s (99.8% /  0.2%) |    68.958
-      4000 |   4000 |      16e6  ( 0.01%)   |  0.219s (99.9% /  0.1%) |    73.092
-      5000 |   5000 |      25e6  ( 0.01%)   |  0.333s (99.9% /  0.1%) |    75.280
-      6000 |   6000 |      36e6  ( 0.01%)   |  0.472s (99.9% /  0.1%) |    76.373
-      7000 |   7000 |      49e6  ( 0.01%)   |  0.629s (99.9% /  0.1%) |    78.030
-      8000 |   8000 |      64e6  ( 0.01%)   |  0.809s (99.9% /  0.1%) |    79.255
-      9000 |   9000 |      81e6  ( 0.01%)   |  1.024s (99.9% /  0.1%) |    79.212
-     10000 |  10000 |     100e6  ( 0.01%)   |  1.386s (99.9% /  0.1%) |    72.233
-     20000 |  20000 |     400e6  ( 0.01%)   |  4.932s (99.9% /  0.1%) |    81.185
+    Size 1 | Size 2 | Comparisons      | Total Time (s)          | Throughput
+           |        |        (match %) | (comparisons / matching)|  (1e6 cmp/s)
+    -------+--------+------------------+-------------------------+-------------
+      1000 |   1000 |    1e6  ( 0.01%) |  0.017  (99.8% /  0.2%) |    59.605
+      2000 |   2000 |    4e6  ( 0.01%) |  0.056  (99.8% /  0.2%) |    71.484
+      3000 |   3000 |    9e6  ( 0.01%) |  0.118  (99.9% /  0.1%) |    76.500
+      4000 |   4000 |   16e6  ( 0.01%) |  0.202  (99.9% /  0.1%) |    79.256
+      5000 |   5000 |   25e6  ( 0.01%) |  0.309  (99.9% /  0.1%) |    81.093
+      6000 |   6000 |   36e6  ( 0.01%) |  0.435  (99.9% /  0.1%) |    82.841
+      7000 |   7000 |   49e6  ( 0.01%) |  0.590  (99.9% /  0.1%) |    83.164
+      8000 |   8000 |   64e6  ( 0.01%) |  0.757  (99.9% /  0.1%) |    84.619
+      9000 |   9000 |   81e6  ( 0.01%) |  0.962  (99.8% /  0.2%) |    84.358
+     10000 |  10000 |  100e6  ( 0.01%) |  1.166  (99.8% /  0.2%) |    85.895
+     20000 |  20000 |  400e6  ( 0.01%) |  4.586  (99.9% /  0.1%) |    87.334
 
 The tables are interpreted as follows. The first section compares the
 bandwidth doing popcounts through (i) the Python bitarray library and
@@ -93,17 +96,18 @@ chosen to characterise two different performance scenarios. Since the
 data used for comparisons is randomly generated, the first threshold
 value will cause about 50% of the candidates to "match", while the
 second threshold value will cause <0.01% of the candidates to match
-(these values are reported in the "match %" column). In the first
-case, the large number of matches means that much of the time is spent
-keeping the candidates in order so that the top `k` matches can be
-returned. In the latter case, the tiny number of candidate matches
-means that the throughput is determined primarily by the comparison
-code itself.
+(these values are reported in the "match %" column).  In both cases,
+all matches above the threshold are returned and passed to the
+solver. In the first case, the large number of matches means that much
+of the time is spent keeping the candidates in order so that the top
+`k` matches can be returned. In the latter case, the tiny number of
+candidate matches means that the throughput is determined primarily by
+the comparison code itself.
 
 Finally, the Total Time column includes indications as to the
 proportion of time spent calculating the (sparse) similarity matrix
-(`simat`) and the proportion of time spent in the greedy solver
-(`solv`). This latter is determined by the size of the similarity
+`comparisons` and the proportion of time spent `matching` in the
+greedy solver. This latter is determined by the size of the similarity
 matrix, which will be approximately `#comparisons * match% / 100`.
 
 Tests
diff --git a/anonlink/benchmark.py b/anonlink/benchmark.py
index 5a43c101..14bf38e4 100644
--- a/anonlink/benchmark.py
+++ b/anonlink/benchmark.py
@@ -27,25 +27,21 @@ def compute_popcount_speed(n):
     print("Implementation              | Time (ms) | Bandwidth (MiB/s)")
 
     # Python
-    start = timer()
-    popcounts = popcount_vector(clks, use_native=False)
-    end = timer()
-    elapsed_time = end - start
+    popcounts, elapsed_time = popcount_vector(clks, use_python=True)
     python_speed_in_MiB = clks_MiB / elapsed_time
     print("Python (bitarray.count()):  |  {:7.2f}  |  {:9.2f} "
           .format(elapsed_time * 1e3, python_speed_in_MiB))
 
     # Native
     start = timer()
-    popcounts, ms = popcount_vector(clks, use_native=True)
+    popcounts, elapsed_nocopy = popcount_vector(clks, use_python=False)
     end = timer()
     elapsed_time = end - start
-    elapsed_nocopy = ms / 1e3
     copy_percent = 100*(elapsed_time - elapsed_nocopy) / elapsed_time
     native_speed_in_MiB = clks_MiB / elapsed_time
     native_speed_in_MiB_nocopy = clks_MiB / elapsed_nocopy
     print("Native code (no copy):      |  {:7.2f}  |  {:9.2f} "
-          .format(ms, native_speed_in_MiB_nocopy))
+          .format(elapsed_nocopy * 1e3, native_speed_in_MiB_nocopy))
     print("Native code (w/ copy):      |  {:7.2f}  |  {:9.2f}   ({:.1f}% copying)"
           .format(elapsed_time * 1e3, native_speed_in_MiB, copy_percent))
 
@@ -54,7 +50,9 @@ def compute_popcount_speed(n):
 
 def print_comparison_header(threshold):
     print("\nThreshold:", threshold)
-    print("Size 1 | Size 2 | Comparisons (match %) | Total Time (simat/solv) | Throughput (1e6 cmp/s)")
+    print("Size 1 | Size 2 | Comparisons      | Total Time (s)          | Throughput")
+    print("       |        |        (match %) | (comparisons / matching)|  (1e6 cmp/s)")
+    print("-------+--------+------------------+-------------------------+-------------")
 
 
 def compute_comparison_speed(n1, n2, threshold):
@@ -74,7 +72,7 @@ def compute_comparison_speed(n1, n2, threshold):
     similarity_time = t1 - start
     solver_time = end - t1
     elapsed_time = end - start
-    print("{:6d} | {:6d} |  {:6d}e6  ({:5.2f}%)   | {:6.3f}s ({:4.1f}% / {:4.1f}%) |  {:8.3f}".format(
+    print("{:6d} | {:6d} | {:4d}e6  ({:5.2f}%) | {:6.3f}  ({:4.1f}% / {:4.1f}%) |  {:8.3f}".format(
         n1, n2, n1*n2 // 1000000,
         100.0*len(sparse_matrix)/(n1*n2),
         elapsed_time,
diff --git a/anonlink/util.py b/anonlink/util.py
index dfadfc54..b0f1cb2c 100644
--- a/anonlink/util.py
+++ b/anonlink/util.py
@@ -3,6 +3,7 @@
 import os
 import random
 from bitarray import bitarray
+from timeit import default_timer as timer
 
 from anonlink._entitymatcher import ffi, lib
 
@@ -20,20 +21,23 @@ def generate_clks(n):
     return res
 
 
-def popcount_vector(bitarrays, use_native=False):
-    """Return an array containing the popcounts of the elements of
-    bitarrays. If use_native is True, use the native code
-    implementation and return the time spent (in milliseconds) in the
-    native code as a second return value.
+def popcount_vector(bitarrays, use_python=True):
+    """Return a list containing the popcounts of the elements of
+    bitarrays, and the time (in seconds) it took. If use_python is
+    False, use the native code implementation instead of Python; in
+    this case the returned time is the time spent in the native code,
+    NOT including copying to and from the Python runtime.
 
     Note, due to the overhead of converting bitarrays into bytes,
     it is currently more expensive to call our C implementation
     than just calling bitarray.count()
-
     """
     # Use Python
-    if not use_native:
-        return [clk.count() for clk in bitarrays]
+    if use_python:
+        start = timer()
+        counts = [clk.count() for clk in bitarrays]
+        elapsed = timer() - start
+        return counts, elapsed
 
     # Use native code
     n = len(bitarrays)
@@ -42,7 +46,7 @@ def popcount_vector(bitarrays, use_native=False):
                     bytes([b for f in bitarrays for b in f.tobytes()]))
     ms = lib.popcount_1024_array(many, n, c_popcounts)
 
-    return [c_popcounts[i] for i in range(n)], ms
+    return [c_popcounts[i] for i in range(n)], ms * 1e-3
 
 
 def chunks(l, n):

From 9cbc2432f5deb7a99ffdbb6ade773be05a0a1164 Mon Sep 17 00:00:00 2001
From: Hamish Ivey-Law <hamish.ivey-law@data61.csiro.au>
Date: Wed, 7 Feb 2018 13:53:06 +1100
Subject: [PATCH 18/49] Update tests; also test native code version.

---
 tests/test_util.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/tests/test_util.py b/tests/test_util.py
index 521cada5..2dae61d5 100644
--- a/tests/test_util.py
+++ b/tests/test_util.py
@@ -21,8 +21,13 @@ def test_generate_clks(self):
 
     def test_popcount_vector(self):
         bas = [util.generate_bitarray(1024) for i in range(100)]
-        popcounts = util.popcount_vector(bas)
 
+        popcounts, _ = util.popcount_vector(bas, use_python=True)
+        self.assertEquals(len(popcounts), 100)
+        for i, cnt in enumerate(popcounts):
+            self.assertEquals(cnt, bas[i].count())
+
+        popcounts, _ = util.popcount_vector(bas, use_python=False)
         self.assertEquals(len(popcounts), 100)
         for i, cnt in enumerate(popcounts):
             self.assertEquals(cnt, bas[i].count())

From cf26901cb9734e5b7160a245d727fbd4a5a018d0 Mon Sep 17 00:00:00 2001
From: Hamish Ivey-Law <hamish.ivey-law@data61.csiro.au>
Date: Fri, 9 Feb 2018 10:56:45 +1100
Subject: [PATCH 19/49] Print popcount throughput; give some variables better
 names.

---
 anonlink/benchmark.py | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/anonlink/benchmark.py b/anonlink/benchmark.py
index 14bf38e4..2fd0e438 100644
--- a/anonlink/benchmark.py
+++ b/anonlink/benchmark.py
@@ -24,13 +24,15 @@ def compute_popcount_speed(n):
     clks = [generate_bitarray(clk_bits) for _ in range(n)]
 
     print("{:6d} x {:d} bit popcounts".format(n, clk_bits))
-    print("Implementation              | Time (ms) | Bandwidth (MiB/s)")
+    print("Implementation              | Time (ms) | Bandwidth (MiB/s) | Throughput (1e6 popc/s)")
 
     # Python
     popcounts, elapsed_time = popcount_vector(clks, use_python=True)
     python_speed_in_MiB = clks_MiB / elapsed_time
-    print("Python (bitarray.count()):  |  {:7.2f}  |  {:9.2f} "
-          .format(elapsed_time * 1e3, python_speed_in_MiB))
+    python_Mops = n / (1e6 * elapsed_time)
+    elapsed_time_ms = elapsed_time * 1e3
+    print("Python (bitarray.count()):  |  {:7.2f}  |   {:9.2f}       | {:7.2f}"
+          .format(elapsed_time_ms, python_speed_in_MiB, python_Mops))
 
     # Native
     start = timer()
@@ -38,12 +40,16 @@ def compute_popcount_speed(n):
     end = timer()
     elapsed_time = end - start
     copy_percent = 100*(elapsed_time - elapsed_nocopy) / elapsed_time
+    elapsed_time_ms = elapsed_time * 1e3
+    elapsed_nocopy_ms = elapsed_nocopy * 1e3
     native_speed_in_MiB = clks_MiB / elapsed_time
     native_speed_in_MiB_nocopy = clks_MiB / elapsed_nocopy
-    print("Native code (no copy):      |  {:7.2f}  |  {:9.2f} "
-          .format(elapsed_nocopy * 1e3, native_speed_in_MiB_nocopy))
-    print("Native code (w/ copy):      |  {:7.2f}  |  {:9.2f}   ({:.1f}% copying)"
-          .format(elapsed_time * 1e3, native_speed_in_MiB, copy_percent))
+    native_Mops = n / (1e6 * elapsed_time)
+    native_Mops_nocopy = n / (1e6 * elapsed_nocopy)
+    print("Native code (no copy):      |  {:7.2f}  |   {:9.2f}       | {:7.2f}"
+          .format(elapsed_nocopy_ms, native_speed_in_MiB_nocopy, native_Mops_nocopy))
+    print("Native code (w/ copy):      |  {:7.2f}  |   {:9.2f}       | {:7.2f} ({:.1f}% copying)"
+          .format(elapsed_time_ms, native_speed_in_MiB, native_Mops, copy_percent))
 
     return python_speed_in_MiB
 

From d02f23a06d4be731ae444ee3d431f59298b688a5 Mon Sep 17 00:00:00 2001
From: Hamish Ivey-Law <hamish.ivey-law@data61.csiro.au>
Date: Fri, 9 Feb 2018 11:59:13 +1100
Subject: [PATCH 20/49] Make some functions static inline.

---
 _cffi_build/dice_one_against_many.cpp | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/_cffi_build/dice_one_against_many.cpp b/_cffi_build/dice_one_against_many.cpp
index b4302578..66f0f94d 100644
--- a/_cffi_build/dice_one_against_many.cpp
+++ b/_cffi_build/dice_one_against_many.cpp
@@ -108,12 +108,13 @@ struct score_cmp{
 /**
  *
  */
-static uint32_t calculate_max_difference(uint32_t popcnt_a, double threshold)
+static inline uint32_t
+calculate_max_difference(uint32_t popcnt_a, double threshold)
 {
     return 2 * popcnt_a * (1/threshold - 1);
 }
 
-static double
+static inline double
 dice_coeff(const uint64_t *u, uint32_t u_popc, const uint64_t *v, uint32_t v_popc, int n)
 {
     uint32_t uv_popc = popcount_combined_array(u, v, n);
@@ -126,12 +127,20 @@ dice_coeff(const uint64_t *u, uint32_t u_popc, const uint64_t *v, uint32_t v_pop
  * t should have been obtained as the difference of calls to clock()
  * for this to make sense.
  */
-static inline double to_millis(clock_t t)
+static inline double
+to_millis(clock_t t)
 {
     static constexpr double CPS = (double)CLOCKS_PER_SEC;
     return t * 1.0E3 / CPS;
 }
 
+static inline uint32_t
+abs_diff(uint32_t a, uint32_t b) {
+    if (a > b)
+        return a - b;
+    return b - a;
+}
+
 extern "C"
 {
     /**

From 888e989bedd16297a521cbae74c77fa8e26ddaaf Mon Sep 17 00:00:00 2001
From: Hamish Ivey-Law <hamish.ivey-law@data61.csiro.au>
Date: Fri, 9 Feb 2018 13:19:08 +1100
Subject: [PATCH 21/49] Tidy up some expressions.

---
 _cffi_build/dice_one_against_many.cpp | 33 +++++++++++----------------
 1 file changed, 13 insertions(+), 20 deletions(-)

diff --git a/_cffi_build/dice_one_against_many.cpp b/_cffi_build/dice_one_against_many.cpp
index 66f0f94d..48628496 100644
--- a/_cffi_build/dice_one_against_many.cpp
+++ b/_cffi_build/dice_one_against_many.cpp
@@ -167,8 +167,8 @@ extern "C"
 
     /**
      * Calculate up to the top k indices and scores.  Returns the
-     * number matched above a threshold or -1 if keybytes is not a
-     * multiple of 32.
+     * number matched above the given threshold or -1 if keybytes is
+     * not a multiple of 8.
      */
     int match_one_against_many_dice_k_top(
         const char *one,
@@ -184,35 +184,28 @@ extern "C"
         const uint64_t *comp1 = (const uint64_t *) one;
         const uint64_t *comp2 = (const uint64_t *) many;
 
-        // FIXME: This comment needs to be updated
-        // keybytes must be divisible by 32, because keywords must be
-        // divisible by 4 for the builtin popcount function to work
-        // and keywords = keybytes / 8.
         static constexpr int WORDBYTES = sizeof(uint64_t);
-        int keywords = keybytes / WORDBYTES;
-        if (keywords % 16 != 0)
+        if (keybytes % WORDBYTES != 0)
             return -1;
+        int keywords = keybytes / WORDBYTES;
 
-        std::priority_queue<Node, std::vector<Node>, score_cmp> max_k_scores;
+        typedef std::vector<Node> node_vector;
+        typedef std::priority_queue<Node, std::vector<Node>, score_cmp> node_queue;
+        node_vector vec;
+        vec.reserve(k + 1);
+        node_queue max_k_scores(score_cmp(), std::move(vec));
 
         uint32_t count_one = popcount_array(comp1, keywords);
-        uint32_t max_popcnt_delta = keywords * WORDBYTES * 8; // = bits per key
+        uint32_t max_popcnt_delta = keybytes * 8; // = bits per key
         if(threshold > 0) {
             max_popcnt_delta = calculate_max_difference(count_one, threshold);
         }
 
-        for (int j = 0; j < n; j++) {
-            const uint64_t *current = comp2 + j * keywords;
+        const uint64_t *current = comp2;
+        for (int j = 0; j < n; j++, current += keywords) {
             const uint32_t counts_many_j = counts_many[j];
-            uint32_t current_delta;
-
-            if (count_one > counts_many_j) {
-                current_delta = count_one - counts_many_j;
-            } else {
-                current_delta = counts_many_j - count_one;
-            }
 
-            if (current_delta <= max_popcnt_delta) {
+            if (abs_diff(count_one, counts_many_j) <= max_popcnt_delta) {
                 double score = dice_coeff(comp1, count_one, current, counts_many_j, keywords);
                 if (score >= threshold) {
                     max_k_scores.push(Node(j, score));

From c6780f0d1358384be2c640fa53f471ea5432af22 Mon Sep 17 00:00:00 2001
From: Hamish Ivey-Law <hamish.ivey-law@data61.csiro.au>
Date: Fri, 9 Feb 2018 13:21:34 +1100
Subject: [PATCH 22/49] Put some braces in the right place; make fn inline.

---
 _cffi_build/dice_one_against_many.cpp | 17 ++++++-----------
 1 file changed, 6 insertions(+), 11 deletions(-)

diff --git a/_cffi_build/dice_one_against_many.cpp b/_cffi_build/dice_one_against_many.cpp
index 48628496..5b56de02 100644
--- a/_cffi_build/dice_one_against_many.cpp
+++ b/_cffi_build/dice_one_against_many.cpp
@@ -33,7 +33,7 @@ void popcount<4>(
           "+r" (b0), "+r" (b1), "+r" (b2), "+r" (b3));
 }
 
-static uint32_t
+static inline uint32_t
 popcount_array(const uint64_t *buf, int n) {
     // WORDS_PER_POPCOUNT is how many elements of buf we process each
     // iteration. Currently 16, which corresponds to 16*64 = 1024 bits.
@@ -93,13 +93,11 @@ class Node {
 
     // Constructor with default
     Node( int n_index = -1, double n_score = -1.0 )
-        :index(n_index), score( n_score )
-        {
-        }
+        :index(n_index), score( n_score ) { }
 };
 
 struct score_cmp{
-    bool operator()(const Node& a, const Node& b) const{
+    bool operator()(const Node& a, const Node& b) const {
         return a.score > b.score;
     }
 };
@@ -109,14 +107,12 @@ struct score_cmp{
  *
  */
 static inline uint32_t
-calculate_max_difference(uint32_t popcnt_a, double threshold)
-{
+calculate_max_difference(uint32_t popcnt_a, double threshold) {
     return 2 * popcnt_a * (1/threshold - 1);
 }
 
 static inline double
-dice_coeff(const uint64_t *u, uint32_t u_popc, const uint64_t *v, uint32_t v_popc, int n)
-{
+dice_coeff(const uint64_t *u, uint32_t u_popc, const uint64_t *v, uint32_t v_popc, int n) {
     uint32_t uv_popc = popcount_combined_array(u, v, n);
     return (2 * uv_popc) / (double) (u_popc + v_popc);
 }
@@ -128,8 +124,7 @@ dice_coeff(const uint64_t *u, uint32_t u_popc, const uint64_t *v, uint32_t v_pop
  * for this to make sense.
  */
 static inline double
-to_millis(clock_t t)
-{
+to_millis(clock_t t) {
     static constexpr double CPS = (double)CLOCKS_PER_SEC;
     return t * 1.0E3 / CPS;
 }

From 3f1104f09c05e5eb2c3809a00b594e8985e377cd Mon Sep 17 00:00:00 2001
From: Hamish Ivey-Law <hamish.ivey-law@data61.csiro.au>
Date: Fri, 9 Feb 2018 13:33:09 +1100
Subject: [PATCH 23/49] Reinstate comment on origin of popcount assembler.

---
 _cffi_build/dice_one_against_many.cpp | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/_cffi_build/dice_one_against_many.cpp b/_cffi_build/dice_one_against_many.cpp
index 5b56de02..2e9fc049 100644
--- a/_cffi_build/dice_one_against_many.cpp
+++ b/_cffi_build/dice_one_against_many.cpp
@@ -13,6 +13,16 @@ void popcount(uint64_t &c0, uint64_t &c1, uint64_t &c2, uint64_t &c3, const uint
     popcount<n - 4>(c0, c1, c2, c3, buf + 4);
 }
 
+// Source: http://danluu.com/assembly-intrinsics/
+// https://stackoverflow.com/questions/25078285/replacing-a-32-bit-loop-count-variable-with-64-bit-introduces-crazy-performance
+//
+// NB: Dan Luu's original assembly is incorrect because it
+// clobbers registers marked as "input only" (see warning at
+// https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html#InputOperands
+// -- this mistake does not materialise with GCC (4.9), but it
+// does with Clang (3.6 and 3.8)).  We fix the mistake by
+// explicitly loading the contents of buf into registers and using
+// these same registers for the intermediate popcnts.
 template<>
 void popcount<4>(
     uint64_t &c0, uint64_t &c1, uint64_t &c2, uint64_t &c3,

From edc7c2bee494db412a255b7e92a5a6402f89b1bc Mon Sep 17 00:00:00 2001
From: Hamish Ivey-Law <hamish.ivey-law@data61.csiro.au>
Date: Fri, 9 Feb 2018 15:15:07 +1100
Subject: [PATCH 24/49] Make constant a template parameter.

---
 _cffi_build/dice_one_against_many.cpp | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/_cffi_build/dice_one_against_many.cpp b/_cffi_build/dice_one_against_many.cpp
index 2e9fc049..2c9d408b 100644
--- a/_cffi_build/dice_one_against_many.cpp
+++ b/_cffi_build/dice_one_against_many.cpp
@@ -43,11 +43,11 @@ void popcount<4>(
           "+r" (b0), "+r" (b1), "+r" (b2), "+r" (b3));
 }
 
-static inline uint32_t
+// WORDS_PER_POPCOUNT is how many elements of buf we process each
+// iteration. Currently 16, which corresponds to 16*64 = 1024 bits.
+template< int WORDS_PER_POPCOUNT = 16 >
+uint32_t
 popcount_array(const uint64_t *buf, int n) {
-    // WORDS_PER_POPCOUNT is how many elements of buf we process each
-    // iteration. Currently 16, which corresponds to 16*64 = 1024 bits.
-    static constexpr int WORDS_PER_POPCOUNT = 16;
     assert(n % WORDS_PER_POPCOUNT == 0);
     uint64_t c0, c1, c2, c3;
     c0 = c1 = c2 = c3 = 0;
@@ -56,11 +56,11 @@ popcount_array(const uint64_t *buf, int n) {
     return c0 + c1 + c2 + c3;
 }
 
-static uint32_t
+// WORDS_PER_POPCOUNT is how many elements of buf we process each
+// iteration. Currently 16, which corresponds to 16*64 = 1024 bits.
+template< int WORDS_PER_POPCOUNT = 16 >
+uint32_t
 popcount_combined_array(const uint64_t *__restrict__ buf1, const uint64_t *__restrict__ buf2, int n) {
-    // WORDS_PER_POPCOUNT is how many elements of buf we process each
-    // iteration. Currently 16, which corresponds to 16*64 = 1024 bits.
-    static constexpr int WORDS_PER_POPCOUNT = 16;
     assert(n % WORDS_PER_POPCOUNT == 0);
     uint64_t combined[WORDS_PER_POPCOUNT];
     uint64_t c0, c1, c2, c3;

From 892c599167a47c26e331400b11a98951a287f5fe Mon Sep 17 00:00:00 2001
From: Hamish Ivey-Law <hamish.ivey-law@data61.csiro.au>
Date: Thu, 15 Feb 2018 11:12:28 +1100
Subject: [PATCH 25/49] Comment.

---
 _cffi_build/dice_one_against_many.cpp | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/_cffi_build/dice_one_against_many.cpp b/_cffi_build/dice_one_against_many.cpp
index 2c9d408b..1c500a2b 100644
--- a/_cffi_build/dice_one_against_many.cpp
+++ b/_cffi_build/dice_one_against_many.cpp
@@ -194,6 +194,11 @@ extern "C"
             return -1;
         int keywords = keybytes / WORDBYTES;
 
+        // Here we create max_k_scores on the stack by providing it
+        // with a vector in which to put its elements. We do this so
+        // that we can reserve the amount of space needed for the
+        // scores in advance and avoid potential memory reallocation
+        // and copying.
         typedef std::vector<Node> node_vector;
         typedef std::priority_queue<Node, std::vector<Node>, score_cmp> node_queue;
         node_vector vec;

From f500231263c8bd8a0257e9139e0acb8bf43f1577 Mon Sep 17 00:00:00 2001
From: Hamish Ivey-Law <hamish.ivey-law@data61.csiro.au>
Date: Thu, 15 Feb 2018 18:16:19 +1100
Subject: [PATCH 26/49] Complete version working with multiples of 1024 bits.

---
 _cffi_build/build_matcher.py          |   4 +-
 _cffi_build/dice_one_against_many.cpp | 187 +++++++++++++++-----------
 anonlink/bloommatcher.py              |   5 +-
 anonlink/util.py                      |   2 +-
 4 files changed, 113 insertions(+), 85 deletions(-)

diff --git a/_cffi_build/build_matcher.py b/_cffi_build/build_matcher.py
index 8592e183..9e0c685a 100644
--- a/_cffi_build/build_matcher.py
+++ b/_cffi_build/build_matcher.py
@@ -22,8 +22,8 @@
 ffibuilder.cdef("""
     int match_one_against_many_dice(const char * one, const char * many, int n, double * score);
     int match_one_against_many_dice_k_top(const char *one, const char *many, const uint32_t *counts_many, int n, int keybytes, uint32_t k, double threshold, int *indices, double *scores);
-    double dice_coeff_1024(const char *e1, const char *e2);
-    double popcount_1024_array(const char *many, int n, uint32_t *counts_many);
+    double dice_coeff(const char *array1, const char *array2, int array_bytes);
+    double popcount_arrays(uint32_t *counts, const char *arrays, int narrays, int array_bytes);
 """)
 
 
diff --git a/_cffi_build/dice_one_against_many.cpp b/_cffi_build/dice_one_against_many.cpp
index 1c500a2b..a5eeda9c 100644
--- a/_cffi_build/dice_one_against_many.cpp
+++ b/_cffi_build/dice_one_against_many.cpp
@@ -5,7 +5,13 @@
 #include <cstdlib>
 #include <ctime>
 #include <cassert>
+#include <climits>
 
+// WORDS_PER_POPCOUNT determines how much we unroll the popcounting in
+// each iteration of a loop. Currently 16, which corresponds to 16*64
+// = 1024 bits per loop.
+static constexpr int WORDS_PER_POPCOUNT = 16;
+static constexpr int WORD_BYTES = sizeof(uint64_t);
 
 template<int n>
 void popcount(uint64_t &c0, uint64_t &c1, uint64_t &c2, uint64_t &c3, const uint64_t *buf) {
@@ -16,13 +22,14 @@ void popcount(uint64_t &c0, uint64_t &c1, uint64_t &c2, uint64_t &c3, const uint
 // Source: http://danluu.com/assembly-intrinsics/
 // https://stackoverflow.com/questions/25078285/replacing-a-32-bit-loop-count-variable-with-64-bit-introduces-crazy-performance
 //
-// NB: Dan Luu's original assembly is incorrect because it
-// clobbers registers marked as "input only" (see warning at
+// NB: Dan Luu's original assembly (and the SO answer it was based on)
+// is incorrect because it clobbers registers marked as "input only"
+// (see warning at
 // https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html#InputOperands
-// -- this mistake does not materialise with GCC (4.9), but it
-// does with Clang (3.6 and 3.8)).  We fix the mistake by
-// explicitly loading the contents of buf into registers and using
-// these same registers for the intermediate popcnts.
+// -- this mistake does not materialise with GCC (4.9), but it does
+// with Clang (3.6 and 3.8)).  We fix the mistake by explicitly
+// loading the contents of buf into registers and using these same
+// registers for the intermediate popcnts.
 template<>
 void popcount<4>(
     uint64_t &c0, uint64_t &c1, uint64_t &c2, uint64_t &c3,
@@ -43,60 +50,46 @@ void popcount<4>(
           "+r" (b0), "+r" (b1), "+r" (b2), "+r" (b3));
 }
 
-// WORDS_PER_POPCOUNT is how many elements of buf we process each
-// iteration. Currently 16, which corresponds to 16*64 = 1024 bits.
-template< int WORDS_PER_POPCOUNT = 16 >
-uint32_t
-popcount_array(const uint64_t *buf, int n) {
-    assert(n % WORDS_PER_POPCOUNT == 0);
+// "Assumes" WORDS_PER_POPCOUNT divides nwords
+static uint32_t
+_popcount_array(const uint64_t *array, int nwords) {
     uint64_t c0, c1, c2, c3;
     c0 = c1 = c2 = c3 = 0;
-    for (int i = 0; i < n; i += WORDS_PER_POPCOUNT)
-        popcount<WORDS_PER_POPCOUNT>(c0, c1, c2, c3, buf + i);
+    for (int i = 0; i < nwords; i += WORDS_PER_POPCOUNT)
+        popcount<WORDS_PER_POPCOUNT>(c0, c1, c2, c3, array + i);
     return c0 + c1 + c2 + c3;
 }
 
-// WORDS_PER_POPCOUNT is how many elements of buf we process each
-// iteration. Currently 16, which corresponds to 16*64 = 1024 bits.
-template< int WORDS_PER_POPCOUNT = 16 >
-uint32_t
-popcount_combined_array(const uint64_t *__restrict__ buf1, const uint64_t *__restrict__ buf2, int n) {
-    assert(n % WORDS_PER_POPCOUNT == 0);
+// "Assumes" WORDS_PER_POPCOUNT divides nwords
+static uint32_t
+_popcount_combined_array(
+        const uint64_t *array1,
+        const uint64_t *array2,
+        int nwords) {
     uint64_t combined[WORDS_PER_POPCOUNT];
     uint64_t c0, c1, c2, c3;
     c0 = c1 = c2 = c3 = 0;
-    for (int i = 0; i < n; i += WORDS_PER_POPCOUNT) {
+    for (int i = 0; i < nwords; i += WORDS_PER_POPCOUNT) {
         for (int j = 0; j < WORDS_PER_POPCOUNT; ++j)
-            combined[j] = buf1[i + j] & buf2[i + j];
+            combined[j] = array1[i + j] & array2[i + j];
         popcount<WORDS_PER_POPCOUNT>(c0, c1, c2, c3, combined);
     }
     return c0 + c1 + c2 + c3;
 }
 
-/**
- * Compute the Dice coefficient similarity measure of two bit patterns.
- */
-static double
-dice_coeff_1024(const char *e1, const char *e2) {
-    const uint64_t *comp1 = (const uint64_t *) e1;
-    const uint64_t *comp2 = (const uint64_t *) e2;
-
-    static constexpr int KEYWORDS = 16;
-    uint32_t count_both = 0;
-
-    count_both += popcount_array(comp1, KEYWORDS);
-    count_both += popcount_array(comp2, KEYWORDS);
-    if(count_both == 0) {
-        return 0.0;
-    }
-    uint32_t count_and = popcount_combined_array(comp1, comp2, KEYWORDS);
-
-    return 2 * count_and / (double)count_both;
+// "Assumes" WORDS_PER_POPCOUNT divides nwords
+// assumes u_popc or v_popc is nonzero.
+static inline double
+_dice_coeff(
+        const uint64_t *u, uint32_t u_popc,
+        const uint64_t *v, uint32_t v_popc,
+        int nwords) {
+    uint32_t uv_popc = _popcount_combined_array(u, v, nwords);
+    return (2 * uv_popc) / (double) (u_popc + v_popc);
 }
 
 
 class Node {
-
 public:
     int index;
     double score;
@@ -106,7 +99,7 @@ class Node {
         :index(n_index), score( n_score ) { }
 };
 
-struct score_cmp{
+struct score_cmp {
     bool operator()(const Node& a, const Node& b) const {
         return a.score > b.score;
     }
@@ -121,12 +114,6 @@ calculate_max_difference(uint32_t popcnt_a, double threshold) {
     return 2 * popcnt_a * (1/threshold - 1);
 }
 
-static inline double
-dice_coeff(const uint64_t *u, uint32_t u_popc, const uint64_t *v, uint32_t v_popc, int n) {
-    uint32_t uv_popc = popcount_combined_array(u, v, n);
-    return (2 * uv_popc) / (double) (u_popc + v_popc);
-}
-
 /**
  * Convert clock measurement t to milliseconds.
  *
@@ -146,6 +133,7 @@ abs_diff(uint32_t a, uint32_t b) {
     return b - a;
 }
 
+
 extern "C"
 {
     /**
@@ -160,39 +148,74 @@ extern "C"
      *
      * is put in counts_many[i].
      */
-    double popcount_1024_array(const char *many, int n, uint32_t *counts_many) {
-        static constexpr int KEYWORDS = 16;
+    double
+    popcount_arrays(
+            uint32_t *counts,
+            const char *arrays, int narrays, int array_bytes) {
+        // assumes WORD_BYTES divides array_bytes
+        int nwords = array_bytes / WORD_BYTES;
+        const uint64_t *u = reinterpret_cast<const uint64_t *>(arrays);
+
+        // assumes WORD_PER_POPCOUNT divides nwords
         clock_t t = clock();
-        for (int i = 0; i < n; i++) {
-            const uint64_t *sig = (const uint64_t *) many + i * KEYWORDS;
-            counts_many[i] = popcount_array(sig, KEYWORDS);
-        }
+        for (int i = 0; i < narrays; ++i, u += nwords)
+            counts[i] = _popcount_array(u, nwords);
         return to_millis(clock() - t);
     }
 
+    /**
+     * Compute the Dice coefficient similarity measure of two arrays.
+     */
+    double
+    dice_coeff(
+            const char *array1,
+            const char *array2,
+            int array_bytes) {
+        const uint64_t *u, *v;
+        uint32_t u_popc, v_popc;
+        // assumes WORD_BYTES divides array_bytes
+        int nwords = array_bytes / WORD_BYTES;
+
+        u = reinterpret_cast<const uint64_t *>(array1);
+        v = reinterpret_cast<const uint64_t *>(array2);
+
+        // assumes WORD_PER_POPCOUNT divides array_words
+
+        // If the popcount of one of the arrays is zero, then the
+        // popcount of the "intersection" (logical AND) will be zero,
+        // hence the whole Dice coefficient will be zero.
+        u_popc = _popcount_array(u, nwords);
+        if (u_popc == 0)
+            return 0.0;
+        v_popc = _popcount_array(v, nwords);
+        if (v_popc == 0)
+            return 0.0;
+
+        return _dice_coeff(u, u_popc, v, v_popc, nwords);
+    }
+
     /**
      * Calculate up to the top k indices and scores.  Returns the
      * number matched above the given threshold or -1 if keybytes is
      * not a multiple of 8.
      */
     int match_one_against_many_dice_k_top(
-        const char *one,
-        const char *many,
-        const uint32_t *counts_many,
-        int n,
-        int keybytes,
-        uint32_t k,
-        double threshold,
-        int *indices,
-        double *scores) {
+            const char *one,
+            const char *many,
+            const uint32_t *counts_many,
+            int n,
+            int keybytes,
+            uint32_t k,
+            double threshold,
+            int *indices,
+            double *scores) {
 
         const uint64_t *comp1 = (const uint64_t *) one;
         const uint64_t *comp2 = (const uint64_t *) many;
 
-        static constexpr int WORDBYTES = sizeof(uint64_t);
-        if (keybytes % WORDBYTES != 0)
+        if (keybytes % WORD_BYTES != 0)
             return -1;
-        int keywords = keybytes / WORDBYTES;
+        int keywords = keybytes / WORD_BYTES;
 
         // Here we create max_k_scores on the stack by providing it
         // with a vector in which to put its elements. We do this so
@@ -203,10 +226,10 @@ extern "C"
         typedef std::priority_queue<Node, std::vector<Node>, score_cmp> node_queue;
         node_vector vec;
         vec.reserve(k + 1);
-        node_queue max_k_scores(score_cmp(), std::move(vec));
+        node_queue top_k_scores(score_cmp(), std::move(vec));
 
-        uint32_t count_one = popcount_array(comp1, keywords);
-        uint32_t max_popcnt_delta = keybytes * 8; // = bits per key
+        uint32_t count_one = _popcount_array(comp1, keywords);
+        uint32_t max_popcnt_delta = keybytes * CHAR_BIT; // = bits per key
         if(threshold > 0) {
             max_popcnt_delta = calculate_max_difference(count_one, threshold);
         }
@@ -216,20 +239,23 @@ extern "C"
             const uint32_t counts_many_j = counts_many[j];
 
             if (abs_diff(count_one, counts_many_j) <= max_popcnt_delta) {
-                double score = dice_coeff(comp1, count_one, current, counts_many_j, keywords);
+                double score = _dice_coeff(comp1, count_one, current, counts_many_j, keywords);
                 if (score >= threshold) {
-                    max_k_scores.push(Node(j, score));
-                    if (max_k_scores.size() > k)
-                        max_k_scores.pop();
+                    top_k_scores.push(Node(j, score));
+                    if (top_k_scores.size() > k) {
+                        // Popping the top element is O(log(k))!
+                        top_k_scores.pop();
+                    }
                 }
             }
         }
 
         int i = 0;
-        while ( ! max_k_scores.empty()) {
-           scores[i] = max_k_scores.top().score;
-           indices[i] = max_k_scores.top().index;
-           max_k_scores.pop();
+        while ( ! top_k_scores.empty()) {
+           scores[i] = top_k_scores.top().score;
+           indices[i] = top_k_scores.top().index;
+           // Popping the top element is O(log(k))!
+           top_k_scores.pop();
            i += 1;
         }
         return i;
@@ -237,13 +263,14 @@ extern "C"
 
     int match_one_against_many_dice(const char *one, const char *many, int n, double *score) {
 
+        static const int array_bytes = 128;
         static const double threshold = 0.0;
         static const int k = 1;
         int idx_unused;
         uint32_t *counts_many = new uint32_t[n];
-        popcount_1024_array(many, n, counts_many);
+        popcount_arrays(counts_many, many, n, array_bytes);
         int res = match_one_against_many_dice_k_top(
-            one, many, counts_many, n, 128, k, threshold, &idx_unused, score);
+            one, many, counts_many, n, array_bytes, k, threshold, &idx_unused, score);
         delete[] counts_many;
 
         return res;
diff --git a/anonlink/bloommatcher.py b/anonlink/bloommatcher.py
index a8977166..d1a55acf 100644
--- a/anonlink/bloommatcher.py
+++ b/anonlink/bloommatcher.py
@@ -29,10 +29,11 @@ def dicecoeff(e1, e2):
 
     :return: real 0-1 similarity measure
     """
-    if len(e1) == 1024 and len(e2) == 1024:
+    # TODO: Remove restriction to lengths divisible by 128 bytes
+    if e1.length() == e2.length() and (e1.length()/8) % (8*16) == 0:
         e1array = ffi.new("char[]", e1.tobytes())
         e2array = ffi.new("char[]", e2.tobytes())
-        return lib.dice_coeff_1024(e1array, e2array)
+        return lib.dice_coeff(e1array, e2array, len(e1array))
     else:
         return dicecoeff_pure_python(e1, e2)
 
diff --git a/anonlink/util.py b/anonlink/util.py
index b0f1cb2c..d8280c29 100644
--- a/anonlink/util.py
+++ b/anonlink/util.py
@@ -44,7 +44,7 @@ def popcount_vector(bitarrays, use_python=True):
     c_popcounts = ffi.new("uint32_t[{}]".format(n))
     many = ffi.new("char[{}]".format(128 * n),
                     bytes([b for f in bitarrays for b in f.tobytes()]))
-    ms = lib.popcount_1024_array(many, n, c_popcounts)
+    ms = lib.popcount_arrays(c_popcounts, many, n, 128)
 
     return [c_popcounts[i] for i in range(n)], ms * 1e-3
 

From 063115a54ddabb045329360e8c6a0270dde3b44f Mon Sep 17 00:00:00 2001
From: Hamish Ivey-Law <hamish.ivey-law@data61.csiro.au>
Date: Mon, 19 Feb 2018 10:12:10 +1100
Subject: [PATCH 27/49] Add -march=native compiler option.

---
 _cffi_build/build_matcher.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/_cffi_build/build_matcher.py b/_cffi_build/build_matcher.py
index 9e0c685a..d191cac0 100644
--- a/_cffi_build/build_matcher.py
+++ b/_cffi_build/build_matcher.py
@@ -15,7 +15,7 @@
     "_entitymatcher",
     source,
     source_extension='.cpp',
-    extra_compile_args=['-Wall', '-Wextra', '-Werror', '-O3', '-std=c++11', '-mssse3', '-mpopcnt', '-fvisibility=hidden'
+    extra_compile_args=['-Wall', '-Wextra', '-Werror', '-O3', '-std=c++11', '-march=native', '-mssse3', '-mpopcnt', '-fvisibility=hidden'
     ],
 )
 

From c9134d01ee3c19e93051b09cf35f25a05a622ce2 Mon Sep 17 00:00:00 2001
From: Hamish Ivey-Law <hamish.ivey-law@data61.csiro.au>
Date: Mon, 19 Feb 2018 10:15:51 +1100
Subject: [PATCH 28/49] Implementation of arbitrary length CLKs.

---
 _cffi_build/dice_one_against_many.cpp | 69 ++++++++++++++++++++++-----
 anonlink/util.py                      |  5 +-
 2 files changed, 60 insertions(+), 14 deletions(-)

diff --git a/_cffi_build/dice_one_against_many.cpp b/_cffi_build/dice_one_against_many.cpp
index a5eeda9c..b8059da6 100644
--- a/_cffi_build/dice_one_against_many.cpp
+++ b/_cffi_build/dice_one_against_many.cpp
@@ -14,11 +14,15 @@ static constexpr int WORDS_PER_POPCOUNT = 16;
 static constexpr int WORD_BYTES = sizeof(uint64_t);
 
 template<int n>
-void popcount(uint64_t &c0, uint64_t &c1, uint64_t &c2, uint64_t &c3, const uint64_t *buf) {
+void popcount(
+        uint64_t &c0, uint64_t &c1, uint64_t &c2, uint64_t &c3,
+        const uint64_t *buf) {
     popcount<4>(c0, c1, c2, c3, buf);
     popcount<n - 4>(c0, c1, c2, c3, buf + 4);
 }
 
+// Fast Path
+//
 // Source: http://danluu.com/assembly-intrinsics/
 // https://stackoverflow.com/questions/25078285/replacing-a-32-bit-loop-count-variable-with-64-bit-introduces-crazy-performance
 //
@@ -32,9 +36,8 @@ void popcount(uint64_t &c0, uint64_t &c1, uint64_t &c2, uint64_t &c3, const uint
 // registers for the intermediate popcnts.
 template<>
 void popcount<4>(
-    uint64_t &c0, uint64_t &c1, uint64_t &c2, uint64_t &c3,
-    const uint64_t* buf) {
-
+        uint64_t &c0, uint64_t &c1, uint64_t &c2, uint64_t &c3,
+        const uint64_t* buf) {
     uint64_t b0, b1, b2, b3;
     b0 = buf[0]; b1 = buf[1]; b2 = buf[2]; b3 = buf[3];
     __asm__(
@@ -50,17 +53,61 @@ void popcount<4>(
           "+r" (b0), "+r" (b1), "+r" (b2), "+r" (b3));
 }
 
-// "Assumes" WORDS_PER_POPCOUNT divides nwords
+// Slow paths
+// TODO: Assumes sizeof(long) == 8
+template<>
+void popcount<3>(
+        uint64_t &c0, uint64_t &c1, uint64_t &c2, uint64_t &,
+        const uint64_t* buf) {
+    c0 = __builtin_popcountl(buf[0]);
+    c1 = __builtin_popcountl(buf[1]);
+    c2 = __builtin_popcountl(buf[2]);
+}
+template<>
+void popcount<2>(
+        uint64_t &c0, uint64_t &c1, uint64_t &, uint64_t &,
+        const uint64_t* buf) {
+    c0 = __builtin_popcountl(buf[0]);
+    c1 = __builtin_popcountl(buf[1]);
+}
+template<>
+void popcount<1>(
+        uint64_t &c0, uint64_t &, uint64_t &, uint64_t &,
+        const uint64_t* buf) {
+    c0 = __builtin_popcountl(buf[0]);
+}
+
+
 static uint32_t
 _popcount_array(const uint64_t *array, int nwords) {
     uint64_t c0, c1, c2, c3;
     c0 = c1 = c2 = c3 = 0;
-    for (int i = 0; i < nwords; i += WORDS_PER_POPCOUNT)
-        popcount<WORDS_PER_POPCOUNT>(c0, c1, c2, c3, array + i);
+
+    while (nwords >= 16) {
+        popcount<16>(c0, c1, c2, c3, array += 16);
+        nwords -= 16;
+    }
+    // nwords < 16
+    if (nwords >= 8) {
+        popcount<8>(c0, c1, c2, c3, array += 8);
+        nwords -= 8;
+    }
+    // nwords < 8
+    if (nwords >= 4) {
+        popcount<4>(c0, c1, c2, c3, array += 4);
+        nwords -= 4;
+    }
+    // nwords < 4
+    if (nwords >= 2) {
+        popcount<2>(c0, c1, c2, c3, array += 2);
+        nwords -= 2;
+    }
+    // nwords < 2
+    if (nwords == 1)
+        popcount<1>(c0, c1, c2, c3, array + 1);
     return c0 + c1 + c2 + c3;
 }
 
-// "Assumes" WORDS_PER_POPCOUNT divides nwords
 static uint32_t
 _popcount_combined_array(
         const uint64_t *array1,
@@ -77,7 +124,6 @@ _popcount_combined_array(
     return c0 + c1 + c2 + c3;
 }
 
-// "Assumes" WORDS_PER_POPCOUNT divides nwords
 // assumes u_popc or v_popc is nonzero.
 static inline double
 _dice_coeff(
@@ -210,12 +256,11 @@ extern "C"
             int *indices,
             double *scores) {
 
-        const uint64_t *comp1 = (const uint64_t *) one;
-        const uint64_t *comp2 = (const uint64_t *) many;
-
         if (keybytes % WORD_BYTES != 0)
             return -1;
         int keywords = keybytes / WORD_BYTES;
+        const uint64_t *comp1 = (const uint64_t *) one;
+        const uint64_t *comp2 = (const uint64_t *) many;
 
         // Here we create max_k_scores on the stack by providing it
         // with a vector in which to put its elements. We do this so
diff --git a/anonlink/util.py b/anonlink/util.py
index d8280c29..1fec0d96 100644
--- a/anonlink/util.py
+++ b/anonlink/util.py
@@ -41,10 +41,11 @@ def popcount_vector(bitarrays, use_python=True):
 
     # Use native code
     n = len(bitarrays)
+    arr_bytes = bitarrays[0].length() // 8
     c_popcounts = ffi.new("uint32_t[{}]".format(n))
-    many = ffi.new("char[{}]".format(128 * n),
+    many = ffi.new("char[{}]".format(arr_bytes * n),
                     bytes([b for f in bitarrays for b in f.tobytes()]))
-    ms = lib.popcount_arrays(c_popcounts, many, n, 128)
+    ms = lib.popcount_arrays(c_popcounts, many, n, arr_bytes)
 
     return [c_popcounts[i] for i in range(n)], ms * 1e-3
 

From b2435f9b2065b70d7cacb6f3f43fc556cf78da7b Mon Sep 17 00:00:00 2001
From: Hamish Ivey-Law <hamish.ivey-law@data61.csiro.au>
Date: Mon, 19 Feb 2018 11:09:40 +1100
Subject: [PATCH 29/49] Fix dumb mistakes in updating array pointer and
 popcounts.

---
 _cffi_build/dice_one_against_many.cpp | 28 ++++++++++++++++-----------
 1 file changed, 17 insertions(+), 11 deletions(-)

diff --git a/_cffi_build/dice_one_against_many.cpp b/_cffi_build/dice_one_against_many.cpp
index b8059da6..59368a12 100644
--- a/_cffi_build/dice_one_against_many.cpp
+++ b/_cffi_build/dice_one_against_many.cpp
@@ -59,22 +59,24 @@ template<>
 void popcount<3>(
         uint64_t &c0, uint64_t &c1, uint64_t &c2, uint64_t &,
         const uint64_t* buf) {
-    c0 = __builtin_popcountl(buf[0]);
-    c1 = __builtin_popcountl(buf[1]);
-    c2 = __builtin_popcountl(buf[2]);
+    c0 += __builtin_popcountl(buf[0]);
+    c1 += __builtin_popcountl(buf[1]);
+    c2 += __builtin_popcountl(buf[2]);
 }
+
 template<>
 void popcount<2>(
         uint64_t &c0, uint64_t &c1, uint64_t &, uint64_t &,
         const uint64_t* buf) {
-    c0 = __builtin_popcountl(buf[0]);
-    c1 = __builtin_popcountl(buf[1]);
+    c0 += __builtin_popcountl(buf[0]);
+    c1 += __builtin_popcountl(buf[1]);
 }
+
 template<>
 void popcount<1>(
         uint64_t &c0, uint64_t &, uint64_t &, uint64_t &,
         const uint64_t* buf) {
-    c0 = __builtin_popcountl(buf[0]);
+    c0 += __builtin_popcountl(buf[0]);
 }
 
 
@@ -84,27 +86,31 @@ _popcount_array(const uint64_t *array, int nwords) {
     c0 = c1 = c2 = c3 = 0;
 
     while (nwords >= 16) {
-        popcount<16>(c0, c1, c2, c3, array += 16);
+        popcount<16>(c0, c1, c2, c3, array);
+        array += 16;
         nwords -= 16;
     }
     // nwords < 16
     if (nwords >= 8) {
-        popcount<8>(c0, c1, c2, c3, array += 8);
+        popcount<8>(c0, c1, c2, c3, array);
+        array += 8;
         nwords -= 8;
     }
     // nwords < 8
     if (nwords >= 4) {
-        popcount<4>(c0, c1, c2, c3, array += 4);
+        popcount<4>(c0, c1, c2, c3, array);
+        array += 4;
         nwords -= 4;
     }
     // nwords < 4
     if (nwords >= 2) {
-        popcount<2>(c0, c1, c2, c3, array += 2);
+        popcount<2>(c0, c1, c2, c3, array);
+        array += 2;
         nwords -= 2;
     }
     // nwords < 2
     if (nwords == 1)
-        popcount<1>(c0, c1, c2, c3, array + 1);
+        popcount<1>(c0, c1, c2, c3, array);
     return c0 + c1 + c2 + c3;
 }
 

From 4acd62f02bb57c4420c723a125adce1619ec9d65 Mon Sep 17 00:00:00 2001
From: Hamish Ivey-Law <hamish.ivey-law@data61.csiro.au>
Date: Mon, 19 Feb 2018 16:54:45 +1100
Subject: [PATCH 30/49] Tests for arbitrary length popcounts.

---
 tests/test_util.py | 39 ++++++++++++++++++++++++++++-----------
 1 file changed, 28 insertions(+), 11 deletions(-)

diff --git a/tests/test_util.py b/tests/test_util.py
index 2dae61d5..059dab49 100644
--- a/tests/test_util.py
+++ b/tests/test_util.py
@@ -1,7 +1,9 @@
 #!/usr/bin/env python3.4
 
 import unittest
+from itertools import combinations_with_replacement
 from anonlink import util
+from bitarray import bitarray
 
 class TestUtilDataGeneration(unittest.TestCase):
 
@@ -19,15 +21,30 @@ def test_generate_clks(self):
             self.assertEqual(len(clk[0]), 1024)
             self.assertEqual(clk[0].count(), clk[2])
 
-    def test_popcount_vector(self):
-        bas = [util.generate_bitarray(1024) for i in range(100)]
 
-        popcounts, _ = util.popcount_vector(bas, use_python=True)
-        self.assertEquals(len(popcounts), 100)
-        for i, cnt in enumerate(popcounts):
-            self.assertEquals(cnt, bas[i].count())
-
-        popcounts, _ = util.popcount_vector(bas, use_python=False)
-        self.assertEquals(len(popcounts), 100)
-        for i, cnt in enumerate(popcounts):
-            self.assertEquals(cnt, bas[i].count())
+def concat_bitarrays(products):
+    for p in products:
+        yield sum(p, bitarray())
+
+# Generate bit arrays that are combinations of words 0, 1, 2^63, 2^64 - 1
+# of various lengths between 1 and 65 words.
+def test_generator():
+    key_lengths = [1, 2, 3, 4, 8, 9, 10, 15, 16, 17,
+                   23, 24, 25, 30, 31, 32, 33, 63, 64, 65]
+    special_words = [64*bitarray('0'),
+                     63*bitarray('0') + bitarray('1'),
+                     bitarray('1') + 63*bitarray('0'),
+                     64*bitarray('1')]
+    for L in key_lengths:
+        words = combinations_with_replacement(special_words, L)
+        # '+' on bitarrays is concatenation
+        bas = [sum(w, bitarray()) for w in words]
+        yield check_popcount_vector, bas
+
+def check_popcount_vector(bas):
+    bas_counts = [b.count() for b in bas]
+
+    popcounts, _ = util.popcount_vector(bas, use_python=True)
+    assert(popcounts == bas_counts)
+    popcounts, _ = util.popcount_vector(bas, use_python=False)
+    assert(popcounts == bas_counts)

From 38ca3ce4be6355b8229e4ca7a44a5b36923c58d3 Mon Sep 17 00:00:00 2001
From: Hamish Ivey-Law <hamish.ivey-law@data61.csiro.au>
Date: Mon, 19 Feb 2018 17:26:51 +1100
Subject: [PATCH 31/49] Update some comments.

---
 _cffi_build/dice_one_against_many.cpp | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/_cffi_build/dice_one_against_many.cpp b/_cffi_build/dice_one_against_many.cpp
index 59368a12..3a553b6d 100644
--- a/_cffi_build/dice_one_against_many.cpp
+++ b/_cffi_build/dice_one_against_many.cpp
@@ -192,13 +192,15 @@ extern "C"
      * Calculate population counts of an array of inputs; return how
      * long it took in milliseconds.
      *
-     * 'many' must point to n*KEYWORDS*sizeof(uint64_t) (== 128*n) bytes
-     * 'counts_many' must point to n*sizeof(uint32_t) bytes.
-     * For i = 0 to n - 1, the population count of the 1024 bits
+     * 'arrays' must point to narrays*array_bytes bytes
+     * 'counts' must point to narrays*sizeof(uint32_t) bytes.
+     * For i = 0 to narrays - 1, the population count of the array_bytes*8 bits
      *
-     *   many[i * KEYWORDS] ... many[(i + 1) * KEYWORDS - 1]
+     *   arrays[i * array_bytes] ... arrays[(i + 1) * array_bytes - 1]
      *
-     * is put in counts_many[i].
+     * is put in counts[i].
+     *
+     * ASSUMES: array_bytes is divisible by 8.
      */
     double
     popcount_arrays(
@@ -216,7 +218,10 @@ extern "C"
     }
 
     /**
-     * Compute the Dice coefficient similarity measure of two arrays.
+     * Compute the Dice coefficient similarity measure of two arrays
+     * of length array_bytes.
+     *
+     * ASSUMES: array_bytes is divisible by 8.
      */
     double
     dice_coeff(
@@ -231,8 +236,6 @@ extern "C"
         u = reinterpret_cast<const uint64_t *>(array1);
         v = reinterpret_cast<const uint64_t *>(array2);
 
-        // assumes WORD_PER_POPCOUNT divides array_words
-
         // If the popcount of one of the arrays is zero, then the
         // popcount of the "intersection" (logical AND) will be zero,
         // hence the whole Dice coefficient will be zero.

From e8c77bc94fa1fbfcbd9b4f43a891ae3115d2e085 Mon Sep 17 00:00:00 2001
From: Hamish Ivey-Law <hamish.ivey-law@data61.csiro.au>
Date: Mon, 19 Feb 2018 17:51:54 +1100
Subject: [PATCH 32/49] Arbitrary length Dice coefficient.

---
 _cffi_build/dice_one_against_many.cpp | 34 ++++++++++++++++++---------
 1 file changed, 23 insertions(+), 11 deletions(-)

diff --git a/_cffi_build/dice_one_against_many.cpp b/_cffi_build/dice_one_against_many.cpp
index 3a553b6d..375cbdc3 100644
--- a/_cffi_build/dice_one_against_many.cpp
+++ b/_cffi_build/dice_one_against_many.cpp
@@ -7,10 +7,6 @@
 #include <cassert>
 #include <climits>
 
-// WORDS_PER_POPCOUNT determines how much we unroll the popcounting in
-// each iteration of a loop. Currently 16, which corresponds to 16*64
-// = 1024 bits per loop.
-static constexpr int WORDS_PER_POPCOUNT = 16;
 static constexpr int WORD_BYTES = sizeof(uint64_t);
 
 template<int n>
@@ -114,20 +110,36 @@ _popcount_array(const uint64_t *array, int nwords) {
     return c0 + c1 + c2 + c3;
 }
 
+static inline void
+logand_array(uint64_t *out, const uint64_t *arr1, const uint64_t *arr2, int n) {
+    for (int j = 0; j < n; ++j)
+        out[j] = arr1[j] & arr2[j];
+}
+
 static uint32_t
 _popcount_combined_array(
         const uint64_t *array1,
         const uint64_t *array2,
         int nwords) {
-    uint64_t combined[WORDS_PER_POPCOUNT];
-    uint64_t c0, c1, c2, c3;
+    const uint64_t *arr1 = array1, *arr2 = array2;
+    int n = nwords;
+    static constexpr int BUF_WORDS = 16;
+    uint64_t combined[BUF_WORDS];
+    uint64_t c0, c1, c2, c3, rest;
+
     c0 = c1 = c2 = c3 = 0;
-    for (int i = 0; i < nwords; i += WORDS_PER_POPCOUNT) {
-        for (int j = 0; j < WORDS_PER_POPCOUNT; ++j)
-            combined[j] = array1[i + j] & array2[i + j];
-        popcount<WORDS_PER_POPCOUNT>(c0, c1, c2, c3, combined);
+
+    while (n >= BUF_WORDS) {
+        logand_array(combined, arr1, arr2, BUF_WORDS);
+        popcount<BUF_WORDS>(c0, c1, c2, c3, combined);
+        arr1 += BUF_WORDS;
+        arr2 += BUF_WORDS;
+        n -= BUF_WORDS;
     }
-    return c0 + c1 + c2 + c3;
+    logand_array(combined, arr1, arr2, n);
+    rest = _popcount_array(combined, n);
+
+    return c0 + c1 + c2 + c3 + rest;
 }
 
 // assumes u_popc or v_popc is nonzero.

From 1febd65bfe3efab60740576f965e53f60f17178c Mon Sep 17 00:00:00 2001
From: Hamish Ivey-Law <hamish.ivey-law@data61.csiro.au>
Date: Mon, 19 Feb 2018 17:56:09 +1100
Subject: [PATCH 33/49] Rename function.

---
 _cffi_build/dice_one_against_many.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/_cffi_build/dice_one_against_many.cpp b/_cffi_build/dice_one_against_many.cpp
index 375cbdc3..2c1b1e61 100644
--- a/_cffi_build/dice_one_against_many.cpp
+++ b/_cffi_build/dice_one_against_many.cpp
@@ -117,7 +117,7 @@ logand_array(uint64_t *out, const uint64_t *arr1, const uint64_t *arr2, int n) {
 }
 
 static uint32_t
-_popcount_combined_array(
+_popcount_logand_array(
         const uint64_t *array1,
         const uint64_t *array2,
         int nwords) {
@@ -148,7 +148,7 @@ _dice_coeff(
         const uint64_t *u, uint32_t u_popc,
         const uint64_t *v, uint32_t v_popc,
         int nwords) {
-    uint32_t uv_popc = _popcount_combined_array(u, v, nwords);
+    uint32_t uv_popc = _popcount_logand_array(u, v, nwords);
     return (2 * uv_popc) / (double) (u_popc + v_popc);
 }
 

From 21390c4c0d81cc8f06391a19bca3d83bd878bbf7 Mon Sep 17 00:00:00 2001
From: Hamish Ivey-Law <hamish.ivey-law@data61.csiro.au>
Date: Tue, 20 Feb 2018 15:21:49 +1100
Subject: [PATCH 34/49] Move native dicecoeff calculation into its own
 function.

---
 anonlink/bloommatcher.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/anonlink/bloommatcher.py b/anonlink/bloommatcher.py
index d1a55acf..102c8653 100644
--- a/anonlink/bloommatcher.py
+++ b/anonlink/bloommatcher.py
@@ -22,6 +22,10 @@ def dicecoeff_pure_python(e1, e2):
     else:
         return 2.0 * overlap_count / combined_count
 
+def dicecoeff_native(e1, e2):
+    e1array = ffi.new("char[]", e1.tobytes())
+    e2array = ffi.new("char[]", e2.tobytes())
+    return lib.dice_coeff(e1array, e2array, len(e1array))
 
 def dicecoeff(e1, e2):
     """
@@ -29,11 +33,8 @@ def dicecoeff(e1, e2):
 
     :return: real 0-1 similarity measure
     """
-    # TODO: Remove restriction to lengths divisible by 128 bytes
-    if e1.length() == e2.length() and (e1.length()/8) % (8*16) == 0:
-        e1array = ffi.new("char[]", e1.tobytes())
-        e2array = ffi.new("char[]", e2.tobytes())
-        return lib.dice_coeff(e1array, e2array, len(e1array))
+    if e1.length() == e2.length() and (e1.length()/8) % 8 == 0:
+        return dicecoeff_native(e1, e2)
     else:
         return dicecoeff_pure_python(e1, e2)
 

From 75cef8e19bdb8e0ec12f11ffe78a6ab02fcb30f0 Mon Sep 17 00:00:00 2001
From: Hamish Ivey-Law <hamish.ivey-law@data61.csiro.au>
Date: Tue, 20 Feb 2018 15:22:18 +1100
Subject: [PATCH 35/49] Add tests for native Dice coefficient calculation.

---
 tests/test_util.py | 53 ++++++++++++++++++++++++++++++++++------------
 1 file changed, 40 insertions(+), 13 deletions(-)

diff --git a/tests/test_util.py b/tests/test_util.py
index ceb880e3..edfca8cb 100644
--- a/tests/test_util.py
+++ b/tests/test_util.py
@@ -1,9 +1,12 @@
 #!/usr/bin/env python3.4
 
 import unittest
+import os
 from itertools import combinations_with_replacement
+from collections import deque
 from anonlink import util
 from bitarray import bitarray
+from anonlink import bloommatcher as bm
 
 class TestUtilDataGeneration(unittest.TestCase):
 
@@ -21,24 +24,27 @@ def test_generate_clks(self):
             self.assertEqual(len(clk[0]), 1024)
             self.assertEqual(clk[0].count(), clk[2])
 
-def concat_bitarrays(products):
-    for p in products:
-        yield sum(p, bitarray())
-
-# Generate bit arrays that are combinations of words 0, 1, 2^63, 2^64 - 1
-# of various lengths between 1 and 65 words.
-def test_generator():
-    key_lengths = [1, 2, 3, 4, 8, 9, 10, 15, 16, 17,
-                   23, 24, 25, 30, 31, 32, 33, 63, 64, 65]
+# Return a bit array of length L*64 whose contents are combinations of
+# the words 0, 2^64-1, 1 or 2^63 (ie. all zeros, all ones, or a one in
+# the least or most significant position).
+def bitarrays_of_length(L):
     special_words = [64*bitarray('0'),
                      63*bitarray('0') + bitarray('1'),
                      bitarray('1') + 63*bitarray('0'),
                      64*bitarray('1')]
+    # '+' on bitarrays is concatenation
+    return [sum(word, bitarray())
+            for word in combinations_with_replacement(special_words, L)]
+
+# Interesting key lengths (usually around 2^something +/-1).
+key_lengths = [1, 2, 3, 4, 8, 9, 10, 15, 16, 17,
+               23, 24, 25, 30, 31, 32, 33, 63, 64, 65]
+
+# Generate bit arrays that are combinations of words 0, 1, 2^63, 2^64 - 1
+# of various lengths between 1 and 65 words.
+def test_popcount_vector():
     for L in key_lengths:
-        words = combinations_with_replacement(special_words, L)
-        # '+' on bitarrays is concatenation
-        bas = [sum(w, bitarray()) for w in words]
-        yield check_popcount_vector, bas
+        yield check_popcount_vector, bitarrays_of_length(L)
 
 def check_popcount_vector(bas):
     bas_counts = [b.count() for b in bas]
@@ -47,3 +53,24 @@ def check_popcount_vector(bas):
     assert(popcounts == bas_counts)
     popcounts, _ = util.popcount_vector(bas, use_python=False)
     assert(popcounts == bas_counts)
+
+def test_dicecoeff():
+    for L in key_lengths:
+        yield check_dicecoeff, bitarrays_of_length(L)
+
+def check_dicecoeff(bas):
+    # Test the Dice coefficient of bitarrays in bas with other
+    # bitarrays of bas.  rotations is the number of times we rotate
+    # bas to generate pairs to test the Dice coefficient; 10 takes
+    # around 10s, 100 around 60s.
+    rotations = 100 if "INCLUDE_10K" in os.environ else 10;
+
+    # We check that the native code and Python versions of dicecoeff
+    # don't ever differ by more than 10^{-6}.
+    eps = 0.000001
+    d = deque(bas)
+    for _ in range(min(rotations, len(bas))):
+        for a, b in zip(bas, d):
+            diff = bm.dicecoeff_pure_python(a, b) - bm.dicecoeff_native(a, b)
+            assert(abs(diff) < eps)
+        d.rotate(1)

From c338c3295510c69db0e6a479062d60712d511a71 Mon Sep 17 00:00:00 2001
From: Hamish Ivey-Law <hamish.ivey-law@data61.csiro.au>
Date: Tue, 20 Feb 2018 15:48:26 +1100
Subject: [PATCH 36/49] Move dicecoeff tests to bloommatcher tests; move common
 bitarray utilities to their own file.

---
 tests/bitarray_utils.py    | 18 +++++++++++++++
 tests/test_bloommatcher.py | 25 +++++++++++++++++++++
 tests/test_util.py         | 45 +++-----------------------------------
 3 files changed, 46 insertions(+), 42 deletions(-)
 create mode 100644 tests/bitarray_utils.py

diff --git a/tests/bitarray_utils.py b/tests/bitarray_utils.py
new file mode 100644
index 00000000..4574cf2b
--- /dev/null
+++ b/tests/bitarray_utils.py
@@ -0,0 +1,18 @@
+from bitarray import bitarray
+from itertools import combinations_with_replacement
+
+# Return a bit array of length L*64 whose contents are combinations of
+# the words 0, 2^64-1, 1 or 2^63 (ie. all zeros, all ones, or a one in
+# the least or most significant position).
+def bitarrays_of_length(L):
+    special_words = [64*bitarray('0'),
+                     63*bitarray('0') + bitarray('1'),
+                     bitarray('1') + 63*bitarray('0'),
+                     64*bitarray('1')]
+    # '+' on bitarrays is concatenation
+    return [sum(word, bitarray())
+            for word in combinations_with_replacement(special_words, L)]
+
+# Interesting key lengths (usually around 2^something +/-1).
+key_lengths = [1, 2, 3, 4, 8, 9, 10, 15, 16, 17,
+               23, 24, 25, 30, 31, 32, 33, 63, 64, 65]
diff --git a/tests/test_bloommatcher.py b/tests/test_bloommatcher.py
index 4c0f06fd..2eb0fa9e 100644
--- a/tests/test_bloommatcher.py
+++ b/tests/test_bloommatcher.py
@@ -1,8 +1,11 @@
 import unittest
 import random
+import os
+from collections import deque
 from bitarray import bitarray
 
 from anonlink import bloommatcher as bm
+from tests import bitarray_utils
 
 __author__ = 'shardy'
 
@@ -70,6 +73,28 @@ def test_dice_4_c(self):
 
         self.assertEqual(result, 0.0)
 
+# Generate bit arrays that are combinations of words 0, 1, 2^63, 2^64 - 1
+# of various lengths between 1 and 65 words.
+def test_dicecoeff():
+    for L in bitarray_utils.key_lengths:
+        yield check_dicecoeff, bitarray_utils.bitarrays_of_length(L)
+
+def check_dicecoeff(bas):
+    # Test the Dice coefficient of bitarrays in bas with other
+    # bitarrays of bas.  rotations is the number of times we rotate
+    # bas to generate pairs to test the Dice coefficient; 10 takes
+    # around 10s, 100 around 60s.
+    rotations = 100 if "INCLUDE_10K" in os.environ else 10;
+
+    # We check that the native code and Python versions of dicecoeff
+    # don't ever differ by more than 10^{-6}.
+    eps = 0.000001
+    d = deque(bas)
+    for _ in range(min(rotations, len(bas))):
+        for a, b in zip(bas, d):
+            diff = bm.dicecoeff_pure_python(a, b) - bm.dicecoeff_native(a, b)
+            assert(abs(diff) < eps)
+        d.rotate(1)
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/test_util.py b/tests/test_util.py
index edfca8cb..3c8e4312 100644
--- a/tests/test_util.py
+++ b/tests/test_util.py
@@ -1,12 +1,9 @@
 #!/usr/bin/env python3.4
 
 import unittest
-import os
-from itertools import combinations_with_replacement
-from collections import deque
 from anonlink import util
-from bitarray import bitarray
 from anonlink import bloommatcher as bm
+from tests import bitarray_utils
 
 class TestUtilDataGeneration(unittest.TestCase):
 
@@ -24,27 +21,11 @@ def test_generate_clks(self):
             self.assertEqual(len(clk[0]), 1024)
             self.assertEqual(clk[0].count(), clk[2])
 
-# Return a bit array of length L*64 whose contents are combinations of
-# the words 0, 2^64-1, 1 or 2^63 (ie. all zeros, all ones, or a one in
-# the least or most significant position).
-def bitarrays_of_length(L):
-    special_words = [64*bitarray('0'),
-                     63*bitarray('0') + bitarray('1'),
-                     bitarray('1') + 63*bitarray('0'),
-                     64*bitarray('1')]
-    # '+' on bitarrays is concatenation
-    return [sum(word, bitarray())
-            for word in combinations_with_replacement(special_words, L)]
-
-# Interesting key lengths (usually around 2^something +/-1).
-key_lengths = [1, 2, 3, 4, 8, 9, 10, 15, 16, 17,
-               23, 24, 25, 30, 31, 32, 33, 63, 64, 65]
-
 # Generate bit arrays that are combinations of words 0, 1, 2^63, 2^64 - 1
 # of various lengths between 1 and 65 words.
 def test_popcount_vector():
-    for L in key_lengths:
-        yield check_popcount_vector, bitarrays_of_length(L)
+    for L in bitarray_utils.key_lengths:
+        yield check_popcount_vector, bitarray_utils.bitarrays_of_length(L)
 
 def check_popcount_vector(bas):
     bas_counts = [b.count() for b in bas]
@@ -54,23 +35,3 @@ def check_popcount_vector(bas):
     popcounts, _ = util.popcount_vector(bas, use_python=False)
     assert(popcounts == bas_counts)
 
-def test_dicecoeff():
-    for L in key_lengths:
-        yield check_dicecoeff, bitarrays_of_length(L)
-
-def check_dicecoeff(bas):
-    # Test the Dice coefficient of bitarrays in bas with other
-    # bitarrays of bas.  rotations is the number of times we rotate
-    # bas to generate pairs to test the Dice coefficient; 10 takes
-    # around 10s, 100 around 60s.
-    rotations = 100 if "INCLUDE_10K" in os.environ else 10;
-
-    # We check that the native code and Python versions of dicecoeff
-    # don't ever differ by more than 10^{-6}.
-    eps = 0.000001
-    d = deque(bas)
-    for _ in range(min(rotations, len(bas))):
-        for a, b in zip(bas, d):
-            diff = bm.dicecoeff_pure_python(a, b) - bm.dicecoeff_native(a, b)
-            assert(abs(diff) < eps)
-        d.rotate(1)

From 4d74b1c2939cf66b53f8399090605c09cefed152 Mon Sep 17 00:00:00 2001
From: Hamish Ivey-Law <hamish.ivey-law@data61.csiro.au>
Date: Tue, 20 Feb 2018 16:13:54 +1100
Subject: [PATCH 37/49] Simplify slow path / reduce branches in fast path.

---
 _cffi_build/dice_one_against_many.cpp | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/_cffi_build/dice_one_against_many.cpp b/_cffi_build/dice_one_against_many.cpp
index 2c1b1e61..e30a8e60 100644
--- a/_cffi_build/dice_one_against_many.cpp
+++ b/_cffi_build/dice_one_against_many.cpp
@@ -125,7 +125,7 @@ _popcount_logand_array(
     int n = nwords;
     static constexpr int BUF_WORDS = 16;
     uint64_t combined[BUF_WORDS];
-    uint64_t c0, c1, c2, c3, rest;
+    uint64_t c0, c1, c2, c3;
 
     c0 = c1 = c2 = c3 = 0;
 
@@ -136,10 +136,12 @@ _popcount_logand_array(
         arr2 += BUF_WORDS;
         n -= BUF_WORDS;
     }
-    logand_array(combined, arr1, arr2, n);
-    rest = _popcount_array(combined, n);
+    if (n > 0) {
+        logand_array(combined, arr1, arr2, n);
+        c0 += _popcount_array(combined, n);
+    }
 
-    return c0 + c1 + c2 + c3 + rest;
+    return c0 + c1 + c2 + c3;
 }
 
 // assumes u_popc or v_popc is nonzero.

From 2d6b5f7be1fa6a5557937d4813d3702b07df1e8e Mon Sep 17 00:00:00 2001
From: Hamish Ivey-Law <hamish.ivey-law@data61.csiro.au>
Date: Tue, 20 Feb 2018 16:28:57 +1100
Subject: [PATCH 38/49] Adapt entitymatcher to arbitrary length CLK interface.

---
 anonlink/entitymatch.py | 24 +++++++++++++++++-------
 1 file changed, 17 insertions(+), 7 deletions(-)

diff --git a/anonlink/entitymatch.py b/anonlink/entitymatch.py
index ba03416d..cbcf1251 100644
--- a/anonlink/entitymatch.py
+++ b/anonlink/entitymatch.py
@@ -33,15 +33,26 @@ def python_filter_similarity(filters1, filters2):
 
 def cffi_filter_similarity_k(filters1, filters2, k, threshold):
     """Accelerated method for determining Bloom Filter similarity.
+
+    Assumes all filters are the same length, being a multiple of 64
+    bits.
+
     """
     length_f1 = len(filters1)
     length_f2 = len(filters2)
 
-    # We assume the length is a multple of 128 bits.
+    if length_f1 == 0:
+        return []
+
+    # Length must be a multple of 64 bits.
+    assert(len(filters1[0][0]) % 8 == 0)
+    filter_bytes = len(filters1[0][0]) // 8
+    assert(filter_bytes % 8 == 0)
+
     match_one_against_many_dice_k_top = lib.match_one_against_many_dice_k_top
 
     # An array of the *one* filter
-    clist1 = [ffi.new("char[128]", bytes(f[0].tobytes()))
+    clist1 = [ffi.new("char[{}]".format(filter_bytes), bytes(f[0].tobytes()))
               for f in filters1]
 
     if sys.version_info < (3, 0):
@@ -51,10 +62,10 @@ def cffi_filter_similarity_k(filters1, filters2, k, threshold):
             for b in f[0].tobytes():
                 data.append(b)
 
-        carr2 = ffi.new("char[{}]".format(128 * length_f2), ''.join(data))
+        carr2 = ffi.new("char[{}]".format(filter_bytes * length_f2), ''.join(data))
     else:
         # Works in Python 3+
-        carr2 = ffi.new("char[{}]".format(128 * length_f2),
+        carr2 = ffi.new("char[{}]".format(filter_bytes * length_f2),
                         bytes([b for f in filters2 for b in f[0].tobytes()]))
 
     c_popcounts = ffi.new("uint32_t[{}]".format(length_f2), [f[2] for f in filters2])
@@ -66,14 +77,13 @@ def cffi_filter_similarity_k(filters1, filters2, k, threshold):
 
     result = []
     for i, f1 in enumerate(filters1):
-        assert len(clist1[i]) == 128
-        assert len(carr2) % 64 == 0
+        assert len(clist1[i]) == filter_bytes
         matches = match_one_against_many_dice_k_top(
             clist1[i],
             carr2,
             c_popcounts,
             length_f2,
-            128,
+            filter_bytes,
             k,
             threshold,
             c_indices,

From ab45ea86d474e39a1c871f6fab08aaf906a7636a Mon Sep 17 00:00:00 2001
From: Hamish Ivey-Law <hamish.ivey-law@data61.csiro.au>
Date: Tue, 20 Feb 2018 16:31:04 +1100
Subject: [PATCH 39/49] Remove unused function.

---
 _cffi_build/build_matcher.py          |  1 -
 _cffi_build/dice_one_against_many.cpp | 15 ---------------
 2 files changed, 16 deletions(-)

diff --git a/_cffi_build/build_matcher.py b/_cffi_build/build_matcher.py
index d191cac0..6277f3df 100644
--- a/_cffi_build/build_matcher.py
+++ b/_cffi_build/build_matcher.py
@@ -20,7 +20,6 @@
 )
 
 ffibuilder.cdef("""
-    int match_one_against_many_dice(const char * one, const char * many, int n, double * score);
     int match_one_against_many_dice_k_top(const char *one, const char *many, const uint32_t *counts_many, int n, int keybytes, uint32_t k, double threshold, int *indices, double *scores);
     double dice_coeff(const char *array1, const char *array2, int array_bytes);
     double popcount_arrays(uint32_t *counts, const char *arrays, int narrays, int array_bytes);
diff --git a/_cffi_build/dice_one_against_many.cpp b/_cffi_build/dice_one_against_many.cpp
index e30a8e60..f0a8b930 100644
--- a/_cffi_build/dice_one_against_many.cpp
+++ b/_cffi_build/dice_one_against_many.cpp
@@ -328,19 +328,4 @@ extern "C"
         }
         return i;
     }
-
-    int match_one_against_many_dice(const char *one, const char *many, int n, double *score) {
-
-        static const int array_bytes = 128;
-        static const double threshold = 0.0;
-        static const int k = 1;
-        int idx_unused;
-        uint32_t *counts_many = new uint32_t[n];
-        popcount_arrays(counts_many, many, n, array_bytes);
-        int res = match_one_against_many_dice_k_top(
-            one, many, counts_many, n, array_bytes, k, threshold, &idx_unused, score);
-        delete[] counts_many;
-
-        return res;
-    }
 }

From 9ccaa8d13654eea8a057a9736d478cfdd2d3d947 Mon Sep 17 00:00:00 2001
From: Hamish Ivey-Law <hamish.ivey-law@data61.csiro.au>
Date: Tue, 20 Feb 2018 16:47:57 +1100
Subject: [PATCH 40/49] Update README.

---
 README.rst | 2 --
 1 file changed, 2 deletions(-)

diff --git a/README.rst b/README.rst
index 483590ee..81c64d1b 100644
--- a/README.rst
+++ b/README.rst
@@ -135,8 +135,6 @@ Limitations
 -  The linkage process has order n^2 time complexity - although algorithms exist to
    significantly speed this up. Several possible speedups are described
    in http://dbs.uni-leipzig.de/file/P4Join-BTW2015.pdf
--  The C++ code makes an assumption of 1024 bit keys (although this would be easy
-   to change).
 
 
 License

From e515b34008685c768afc62dc943d011f534d8fbd Mon Sep 17 00:00:00 2001
From: Hamish Ivey-Law <hamish.ivey-law@data61.csiro.au>
Date: Thu, 22 Feb 2018 10:07:23 +1100
Subject: [PATCH 41/49] Address Brian's comments.

---
 _cffi_build/dice_one_against_many.cpp |  6 +++++-
 anonlink/bloommatcher.py              | 12 +++++++++++-
 anonlink/entitymatch.py               |  9 ++++-----
 tests/bitarray_utils.py               |  8 +++++---
 tests/test_bloommatcher.py            | 21 ++++++++++-----------
 tests/test_util.py                    | 12 ++++--------
 6 files changed, 39 insertions(+), 29 deletions(-)

diff --git a/_cffi_build/dice_one_against_many.cpp b/_cffi_build/dice_one_against_many.cpp
index f0a8b930..b47441ec 100644
--- a/_cffi_build/dice_one_against_many.cpp
+++ b/_cffi_build/dice_one_against_many.cpp
@@ -51,6 +51,10 @@ void popcount<4>(
 
 // Slow paths
 // TODO: Assumes sizeof(long) == 8
+//
+// NB: The specialisation to n=3 is not currently used but included
+// for completeness (i.e. so that popcount<n> is defined for all
+// non-negative n) and in anticipation of its use in the near future.
 template<>
 void popcount<3>(
         uint64_t &c0, uint64_t &c1, uint64_t &c2, uint64_t &,
@@ -285,7 +289,7 @@ extern "C"
         const uint64_t *comp1 = (const uint64_t *) one;
         const uint64_t *comp2 = (const uint64_t *) many;
 
-        // Here we create max_k_scores on the stack by providing it
+        // Here we create top_k_scores on the stack by providing it
         // with a vector in which to put its elements. We do this so
         // that we can reserve the amount of space needed for the
         // scores in advance and avoid potential memory reallocation
diff --git a/anonlink/bloommatcher.py b/anonlink/bloommatcher.py
index 102c8653..24675c3b 100644
--- a/anonlink/bloommatcher.py
+++ b/anonlink/bloommatcher.py
@@ -10,7 +10,9 @@ def dicecoeff_pure_python(e1, e2):
     """
     Dice coefficient measures the similarity of two bit patterns.
 
-    :param e1,e2: bitset arrays of same length
+    Implemented exclusively in Python.
+
+    :param e1, e2: bitarrays of same length
     :return: real 0-1 similarity measure
     """
     count1 = e1.count()
@@ -23,6 +25,14 @@ def dicecoeff_pure_python(e1, e2):
         return 2.0 * overlap_count / combined_count
 
 def dicecoeff_native(e1, e2):
+    """
+    Dice coefficient measures the similarity of two bit patterns.
+
+    Implemented via an external library.
+
+    :param e1, e2: bitarrays of same length
+    :return: real 0-1 similarity measure
+    """
     e1array = ffi.new("char[]", e1.tobytes())
     e2array = ffi.new("char[]", e2.tobytes())
     return lib.dice_coeff(e1array, e2array, len(e1array))
diff --git a/anonlink/entitymatch.py b/anonlink/entitymatch.py
index cbcf1251..fae7b4f7 100644
--- a/anonlink/entitymatch.py
+++ b/anonlink/entitymatch.py
@@ -44,10 +44,9 @@ def cffi_filter_similarity_k(filters1, filters2, k, threshold):
     if length_f1 == 0:
         return []
 
-    # Length must be a multple of 64 bits.
-    assert(len(filters1[0][0]) % 8 == 0)
-    filter_bytes = len(filters1[0][0]) // 8
-    assert(filter_bytes % 8 == 0)
+    filter_bits = len(filters1[0][0])
+    assert filter_bits % 64 == 0, 'Filter length must be a multiple of 64 bits.'
+    filter_bytes = filter_bits // 8
 
     match_one_against_many_dice_k_top = lib.match_one_against_many_dice_k_top
 
@@ -90,7 +89,7 @@ def cffi_filter_similarity_k(filters1, filters2, k, threshold):
             c_scores)
 
         if matches < 0:
-            raise Exception('Internel error: Bad key length')
+            raise ValueError('Internal error: Bad key length')
         for j in range(matches):
             ind = c_indices[j]
             assert ind < len(filters2)
diff --git a/tests/bitarray_utils.py b/tests/bitarray_utils.py
index 4574cf2b..76426911 100644
--- a/tests/bitarray_utils.py
+++ b/tests/bitarray_utils.py
@@ -1,10 +1,12 @@
 from bitarray import bitarray
 from itertools import combinations_with_replacement
 
-# Return a bit array of length L*64 whose contents are combinations of
-# the words 0, 2^64-1, 1 or 2^63 (ie. all zeros, all ones, or a one in
-# the least or most significant position).
 def bitarrays_of_length(L):
+    """
+    Return a list of bitarrays of length L*64 whose contents are combinations
+    of the words 0, 2^64-1, 1 or 2^63 (ie. all zeros, all ones, or a one in
+    the least or most significant position).
+    """
     special_words = [64*bitarray('0'),
                      63*bitarray('0') + bitarray('1'),
                      bitarray('1') + 63*bitarray('0'),
diff --git a/tests/test_bloommatcher.py b/tests/test_bloommatcher.py
index 2eb0fa9e..8fff895e 100644
--- a/tests/test_bloommatcher.py
+++ b/tests/test_bloommatcher.py
@@ -1,4 +1,5 @@
 import unittest
+import pytest
 import random
 import os
 from collections import deque
@@ -73,18 +74,16 @@ def test_dice_4_c(self):
 
         self.assertEqual(result, 0.0)
 
-# Generate bit arrays that are combinations of words 0, 1, 2^63, 2^64 - 1
-# of various lengths between 1 and 65 words.
-def test_dicecoeff():
-    for L in bitarray_utils.key_lengths:
-        yield check_dicecoeff, bitarray_utils.bitarrays_of_length(L)
-
-def check_dicecoeff(bas):
-    # Test the Dice coefficient of bitarrays in bas with other
-    # bitarrays of bas.  rotations is the number of times we rotate
-    # bas to generate pairs to test the Dice coefficient; 10 takes
-    # around 10s, 100 around 60s.
+@pytest.mark.parametrize("L", bitarray_utils.key_lengths)
+def test_dicecoeff(L):
+    """
+    Test the Dice coefficient of bitarrays in bas with other
+    bitarrays of bas.  rotations is the number of times we rotate
+    bas to generate pairs to test the Dice coefficient; 10 takes
+    around 10s, 100 around 60s.
+    """
     rotations = 100 if "INCLUDE_10K" in os.environ else 10;
+    bas = bitarray_utils.bitarrays_of_length(L)
 
     # We check that the native code and Python versions of dicecoeff
     # don't ever differ by more than 10^{-6}.
diff --git a/tests/test_util.py b/tests/test_util.py
index 3c8e4312..2715de3d 100644
--- a/tests/test_util.py
+++ b/tests/test_util.py
@@ -1,6 +1,7 @@
 #!/usr/bin/env python3.4
 
 import unittest
+import pytest
 from anonlink import util
 from anonlink import bloommatcher as bm
 from tests import bitarray_utils
@@ -21,17 +22,12 @@ def test_generate_clks(self):
             self.assertEqual(len(clk[0]), 1024)
             self.assertEqual(clk[0].count(), clk[2])
 
-# Generate bit arrays that are combinations of words 0, 1, 2^63, 2^64 - 1
-# of various lengths between 1 and 65 words.
-def test_popcount_vector():
-    for L in bitarray_utils.key_lengths:
-        yield check_popcount_vector, bitarray_utils.bitarrays_of_length(L)
-
-def check_popcount_vector(bas):
+@pytest.mark.parametrize("L", bitarray_utils.key_lengths)
+def test_popcount_vector(L):
+    bas = bitarray_utils.bitarrays_of_length(L)
     bas_counts = [b.count() for b in bas]
 
     popcounts, _ = util.popcount_vector(bas, use_python=True)
     assert(popcounts == bas_counts)
     popcounts, _ = util.popcount_vector(bas, use_python=False)
     assert(popcounts == bas_counts)
-

From 446033f607edea2dde5a3ff104adced9f8fbcd6f Mon Sep 17 00:00:00 2001
From: Hamish Ivey-Law <hamish.ivey-law@data61.csiro.au>
Date: Thu, 22 Feb 2018 10:27:52 +1100
Subject: [PATCH 42/49] Exit early if filter is zero.

---
 _cffi_build/dice_one_against_many.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/_cffi_build/dice_one_against_many.cpp b/_cffi_build/dice_one_against_many.cpp
index b47441ec..b5a5af97 100644
--- a/_cffi_build/dice_one_against_many.cpp
+++ b/_cffi_build/dice_one_against_many.cpp
@@ -301,6 +301,9 @@ extern "C"
         node_queue top_k_scores(score_cmp(), std::move(vec));
 
         uint32_t count_one = _popcount_array(comp1, keywords);
+        if (count_one == 0)
+            return 0;
+
         uint32_t max_popcnt_delta = keybytes * CHAR_BIT; // = bits per key
         if(threshold > 0) {
             max_popcnt_delta = calculate_max_difference(count_one, threshold);

From dea0a0d272326375c45fbc516c3bcd5a8df90cdf Mon Sep 17 00:00:00 2001
From: Hamish Ivey-Law <hamish.ivey-law@data61.csiro.au>
Date: Fri, 23 Feb 2018 14:11:39 +1100
Subject: [PATCH 43/49] Specialise popcount arrays calls on array length.

---
 _cffi_build/dice_one_against_many.cpp | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/_cffi_build/dice_one_against_many.cpp b/_cffi_build/dice_one_against_many.cpp
index b5a5af97..44943424 100644
--- a/_cffi_build/dice_one_against_many.cpp
+++ b/_cffi_build/dice_one_against_many.cpp
@@ -80,6 +80,16 @@ void popcount<1>(
 }
 
 
+template<int nwords>
+void _my_popcount_arrays(uint32_t *counts, const uint64_t *arrays, int narrays) {
+    uint64_t c0, c1, c2, c3;
+    for (int i = 0; i < narrays; ++i, arrays += nwords) {
+        c0 = c1 = c2 = c3 = 0;
+        popcount<nwords>(c0, c1, c2, c3, arrays);
+        counts[i] = c0 + c1 + c2 + c3;
+    }
+}
+
 static uint32_t
 _popcount_array(const uint64_t *array, int nwords) {
     uint64_t c0, c1, c2, c3;
@@ -230,8 +240,14 @@ extern "C"
 
         // assumes WORD_PER_POPCOUNT divides nwords
         clock_t t = clock();
-        for (int i = 0; i < narrays; ++i, u += nwords)
-            counts[i] = _popcount_array(u, nwords);
+        switch (nwords) {
+        case 32: _my_popcount_arrays<32>(counts, u, narrays); break;
+        case 16: _my_popcount_arrays<16>(counts, u, narrays); break;
+        case  8: _my_popcount_arrays< 8>(counts, u, narrays); break;
+        default:
+            for (int i = 0; i < narrays; ++i, u += nwords)
+                counts[i] = _popcount_array(u, nwords);
+        }
         return to_millis(clock() - t);
     }
 

From d3671a22bfd0bf8ddb4a8d250989c17cc1479355 Mon Sep 17 00:00:00 2001
From: Hamish Ivey-Law <hamish.ivey-law@data61.csiro.au>
Date: Fri, 2 Mar 2018 12:03:47 +1100
Subject: [PATCH 44/49] Fix performance regression.

---
 _cffi_build/dice_one_against_many.cpp | 137 +++++++++++++++++---------
 1 file changed, 91 insertions(+), 46 deletions(-)

diff --git a/_cffi_build/dice_one_against_many.cpp b/_cffi_build/dice_one_against_many.cpp
index 44943424..3eccb2b7 100644
--- a/_cffi_build/dice_one_against_many.cpp
+++ b/_cffi_build/dice_one_against_many.cpp
@@ -10,7 +10,8 @@
 static constexpr int WORD_BYTES = sizeof(uint64_t);
 
 template<int n>
-void popcount(
+static inline void
+popcount(
         uint64_t &c0, uint64_t &c1, uint64_t &c2, uint64_t &c3,
         const uint64_t *buf) {
     popcount<4>(c0, c1, c2, c3, buf);
@@ -31,7 +32,8 @@ void popcount(
 // loading the contents of buf into registers and using these same
 // registers for the intermediate popcnts.
 template<>
-void popcount<4>(
+static inline void
+popcount<4>(
         uint64_t &c0, uint64_t &c1, uint64_t &c2, uint64_t &c3,
         const uint64_t* buf) {
     uint64_t b0, b1, b2, b3;
@@ -56,7 +58,8 @@ void popcount<4>(
 // for completeness (i.e. so that popcount<n> is defined for all
 // non-negative n) and in anticipation of its use in the near future.
 template<>
-void popcount<3>(
+static inline void
+popcount<3>(
         uint64_t &c0, uint64_t &c1, uint64_t &c2, uint64_t &,
         const uint64_t* buf) {
     c0 += __builtin_popcountl(buf[0]);
@@ -65,7 +68,8 @@ void popcount<3>(
 }
 
 template<>
-void popcount<2>(
+static inline void
+popcount<2>(
         uint64_t &c0, uint64_t &c1, uint64_t &, uint64_t &,
         const uint64_t* buf) {
     c0 += __builtin_popcountl(buf[0]);
@@ -73,7 +77,8 @@ void popcount<2>(
 }
 
 template<>
-void popcount<1>(
+static inline void
+popcount<1>(
         uint64_t &c0, uint64_t &, uint64_t &, uint64_t &,
         const uint64_t* buf) {
     c0 += __builtin_popcountl(buf[0]);
@@ -81,7 +86,8 @@ void popcount<1>(
 
 
 template<int nwords>
-void _my_popcount_arrays(uint32_t *counts, const uint64_t *arrays, int narrays) {
+static void
+_popcount_arrays(uint32_t *counts, const uint64_t *arrays, int narrays) {
     uint64_t c0, c1, c2, c3;
     for (int i = 0; i < narrays; ++i, arrays += nwords) {
         c0 = c1 = c2 = c3 = 0;
@@ -124,43 +130,57 @@ _popcount_array(const uint64_t *array, int nwords) {
     return c0 + c1 + c2 + c3;
 }
 
+template<int n>
 static inline void
-logand_array(uint64_t *out, const uint64_t *arr1, const uint64_t *arr2, int n) {
-    for (int j = 0; j < n; ++j)
-        out[j] = arr1[j] & arr2[j];
+popcount_logand(
+        uint64_t &c0, uint64_t &c1, uint64_t &c2, uint64_t &c3,
+        const uint64_t *buf1, const uint64_t *buf2) {
+    popcount_logand<4>(c0, c1, c2, c3, buf1, buf2);
+    popcount_logand<n - 4>(c0, c1, c2, c3, buf1 + 4, buf2 + 4);
+}
+
+template<>
+static inline void
+popcount_logand<4>(
+        uint64_t &c0, uint64_t &c1, uint64_t &c2, uint64_t &c3,
+        const uint64_t* buf1, const uint64_t *buf2) {
+    uint64_t b[4];
+    b[0] = buf1[0] & buf2[0];
+    b[1] = buf1[1] & buf2[1];
+    b[2] = buf1[2] & buf2[2];
+    b[3] = buf1[3] & buf2[3];
+    popcount<4>(c0, c1, c2, c3, b);
 }
 
 static uint32_t
-_popcount_logand_array(
-        const uint64_t *array1,
-        const uint64_t *array2,
-        int nwords) {
-    const uint64_t *arr1 = array1, *arr2 = array2;
-    int n = nwords;
-    static constexpr int BUF_WORDS = 16;
-    uint64_t combined[BUF_WORDS];
+_popcount_logand_array(const uint64_t* u, const uint64_t* v, int len) {
+    // NB: The switch statement at the end of this function must have
+    // cases for all i = 1, ..., LOOP_LEN - 1.
+    static constexpr int LOOP_LEN = 4;
     uint64_t c0, c1, c2, c3;
-
     c0 = c1 = c2 = c3 = 0;
 
-    while (n >= BUF_WORDS) {
-        logand_array(combined, arr1, arr2, BUF_WORDS);
-        popcount<BUF_WORDS>(c0, c1, c2, c3, combined);
-        arr1 += BUF_WORDS;
-        arr2 += BUF_WORDS;
-        n -= BUF_WORDS;
+    int i = 0;
+    for ( ; i + LOOP_LEN <= len; i += LOOP_LEN) {
+        popcount_logand<LOOP_LEN>(c0, c1, c2, c3, u, v);
+        u += LOOP_LEN;
+        v += LOOP_LEN;
     }
-    if (n > 0) {
-        logand_array(combined, arr1, arr2, n);
-        c0 += _popcount_array(combined, n);
+
+    // NB: The "fall through" comments are necessary to tell GCC and
+    // Clang not to complain about the fact that the case clauses
+    // don't have break statements in them.
+    switch (len - i) {
+    case 3: c2 += __builtin_popcountl(u[2] & v[2]);  /* fall through */
+    case 2: c1 += __builtin_popcountl(u[1] & v[1]);  /* fall through */
+    case 1: c0 += __builtin_popcountl(u[0] & v[0]);  /* fall through */
     }
 
     return c0 + c1 + c2 + c3;
 }
 
-// assumes u_popc or v_popc is nonzero.
 static inline double
-_dice_coeff(
+_dice_coeff_generic(
         const uint64_t *u, uint32_t u_popc,
         const uint64_t *v, uint32_t v_popc,
         int nwords) {
@@ -168,6 +188,17 @@ _dice_coeff(
     return (2 * uv_popc) / (double) (u_popc + v_popc);
 }
 
+template<int nwords>
+static inline double
+_dice_coeff(
+        const uint64_t *u, uint32_t u_popc,
+        const uint64_t *v, uint32_t v_popc) {
+    uint64_t c0, c1, c2, c3;
+    c0 = c1 = c2 = c3 = 0;
+    popcount_logand<nwords>(c0, c1, c2, c3, u, v);
+    uint32_t uv_popc = c0 + c1 + c2 + c3;
+    return (2 * uv_popc) / (double) (u_popc + v_popc);
+}
 
 class Node {
 public:
@@ -241,9 +272,9 @@ extern "C"
         // assumes WORD_PER_POPCOUNT divides nwords
         clock_t t = clock();
         switch (nwords) {
-        case 32: _my_popcount_arrays<32>(counts, u, narrays); break;
-        case 16: _my_popcount_arrays<16>(counts, u, narrays); break;
-        case  8: _my_popcount_arrays< 8>(counts, u, narrays); break;
+        case 32: _popcount_arrays<32>(counts, u, narrays); break;
+        case 16: _popcount_arrays<16>(counts, u, narrays); break;
+        case  8: _popcount_arrays< 8>(counts, u, narrays); break;
         default:
             for (int i = 0; i < narrays; ++i, u += nwords)
                 counts[i] = _popcount_array(u, nwords);
@@ -280,7 +311,7 @@ extern "C"
         if (v_popc == 0)
             return 0.0;
 
-        return _dice_coeff(u, u_popc, v, v_popc, nwords);
+        return _dice_coeff_generic(u, u_popc, v, v_popc, nwords);
     }
 
     /**
@@ -302,8 +333,8 @@ extern "C"
         if (keybytes % WORD_BYTES != 0)
             return -1;
         int keywords = keybytes / WORD_BYTES;
-        const uint64_t *comp1 = (const uint64_t *) one;
-        const uint64_t *comp2 = (const uint64_t *) many;
+        const uint64_t *comp1 = reinterpret_cast<const uint64_t *>(one);
+        const uint64_t *comp2 = reinterpret_cast<const uint64_t *>(many);
 
         // Here we create top_k_scores on the stack by providing it
         // with a vector in which to put its elements. We do this so
@@ -325,18 +356,31 @@ extern "C"
             max_popcnt_delta = calculate_max_difference(count_one, threshold);
         }
 
+        auto push_score = [&](double score, int idx) {
+            if (score >= threshold) {
+                top_k_scores.push(Node(idx, score));
+                if (top_k_scores.size() > k) {
+                    // Popping the top element is O(log(k))!
+                    top_k_scores.pop();
+                }
+            }
+        };
+
         const uint64_t *current = comp2;
-        for (int j = 0; j < n; j++, current += keywords) {
-            const uint32_t counts_many_j = counts_many[j];
-
-            if (abs_diff(count_one, counts_many_j) <= max_popcnt_delta) {
-                double score = _dice_coeff(comp1, count_one, current, counts_many_j, keywords);
-                if (score >= threshold) {
-                    top_k_scores.push(Node(j, score));
-                    if (top_k_scores.size() > k) {
-                        // Popping the top element is O(log(k))!
-                        top_k_scores.pop();
-                    }
+        if (keywords == 16) {
+            for (int j = 0; j < n; j++, current += 16) {
+                const uint32_t counts_many_j = counts_many[j];
+                if (abs_diff(count_one, counts_many_j) <= max_popcnt_delta) {
+                    double score = _dice_coeff<16>(comp1, count_one, current, counts_many_j);
+                    push_score(score, j);
+                }
+            }
+        } else {
+            for (int j = 0; j < n; j++, current += keywords) {
+                const uint32_t counts_many_j = counts_many[j];
+                if (abs_diff(count_one, counts_many_j) <= max_popcnt_delta) {
+                    double score = _dice_coeff_generic(comp1, count_one, current, counts_many_j, keywords);
+                    push_score(score, j);
                 }
             }
         }
@@ -349,6 +393,7 @@ extern "C"
            top_k_scores.pop();
            i += 1;
         }
+
         return i;
     }
 }

From 93abfae8f223727fb601bd6395865ea114cdb607 Mon Sep 17 00:00:00 2001
From: Hamish Ivey-Law <hamish.ivey-law@data61.csiro.au>
Date: Fri, 2 Mar 2018 12:53:57 +1100
Subject: [PATCH 45/49] Remove storage class specifiers from explicit template
 specialisations.

---
 _cffi_build/dice_one_against_many.cpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/_cffi_build/dice_one_against_many.cpp b/_cffi_build/dice_one_against_many.cpp
index 3eccb2b7..128cc4f4 100644
--- a/_cffi_build/dice_one_against_many.cpp
+++ b/_cffi_build/dice_one_against_many.cpp
@@ -32,7 +32,7 @@ popcount(
 // loading the contents of buf into registers and using these same
 // registers for the intermediate popcnts.
 template<>
-static inline void
+inline void
 popcount<4>(
         uint64_t &c0, uint64_t &c1, uint64_t &c2, uint64_t &c3,
         const uint64_t* buf) {
@@ -58,7 +58,7 @@ popcount<4>(
 // for completeness (i.e. so that popcount<n> is defined for all
 // non-negative n) and in anticipation of its use in the near future.
 template<>
-static inline void
+inline void
 popcount<3>(
         uint64_t &c0, uint64_t &c1, uint64_t &c2, uint64_t &,
         const uint64_t* buf) {
@@ -68,7 +68,7 @@ popcount<3>(
 }
 
 template<>
-static inline void
+inline void
 popcount<2>(
         uint64_t &c0, uint64_t &c1, uint64_t &, uint64_t &,
         const uint64_t* buf) {
@@ -77,7 +77,7 @@ popcount<2>(
 }
 
 template<>
-static inline void
+inline void
 popcount<1>(
         uint64_t &c0, uint64_t &, uint64_t &, uint64_t &,
         const uint64_t* buf) {
@@ -140,7 +140,7 @@ popcount_logand(
 }
 
 template<>
-static inline void
+inline void
 popcount_logand<4>(
         uint64_t &c0, uint64_t &c1, uint64_t &c2, uint64_t &c3,
         const uint64_t* buf1, const uint64_t *buf2) {

From e9706ff5e0f996cc4aa7db7639c485b9d1c9cfe0 Mon Sep 17 00:00:00 2001
From: Hamish Ivey-Law <hamish.ivey-law@data61.csiro.au>
Date: Fri, 2 Mar 2018 14:05:17 +1100
Subject: [PATCH 46/49] Update README and requirements.txt files.

---
 README.rst       | 20 ++++++++++++++------
 requirements.txt |  4 ++--
 2 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/README.rst b/README.rst
index 483590ee..16e33001 100644
--- a/README.rst
+++ b/README.rst
@@ -113,16 +113,24 @@ matrix, which will be approximately `#comparisons * match% / 100`.
 Tests
 =====
 
-Run unit tests with nose
+Run unit tests with `pytest`:
 
 ::
 
-    $ python -m nose
-    ......................SS..............................
-    ----------------------------------------------------------------------
-    Ran 54 tests in 6.615s
+    $ pytest
+    ====================================== test session starts ======================================
+    platform linux -- Python 3.6.4, pytest-3.2.5, py-1.4.34, pluggy-0.4.0
+    rootdir: /home/hlaw/src/n1-anonlink, inifile:
+    collected 71 items
 
-    OK (SKIP=2)
+    tests/test_benchmark.py ...
+    tests/test_bloommatcher.py ..............
+    tests/test_e2e.py .............ss....
+    tests/test_matcher.py ..x.....x......x....x..
+    tests/test_similarity.py .........
+    tests/test_util.py ...
+
+    ======================== 65 passed, 2 skipped, 4 xfailed in 4.01 seconds ========================
 
 To enable slightly larger tests add the following environment variables:
 
diff --git a/requirements.txt b/requirements.txt
index f4a36d17..8ea3e2fa 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
 bitarray==0.8.1
 networkx==1.11
 cffi>=1.7
-nose==1.3.7
-clkhash==0.8.0
\ No newline at end of file
+pytest>=3.4
+clkhash==0.8.0

From ed0968787c1eb2dee8290add467bab3c2c942680 Mon Sep 17 00:00:00 2001
From: Hamish Ivey-Law <hamish.ivey-law@data61.csiro.au>
Date: Thu, 8 Mar 2018 15:41:48 +1100
Subject: [PATCH 47/49] Disable unused function.

---
 _cffi_build/dice_one_against_many.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/_cffi_build/dice_one_against_many.cpp b/_cffi_build/dice_one_against_many.cpp
index d288a2dc..54f315b6 100644
--- a/_cffi_build/dice_one_against_many.cpp
+++ b/_cffi_build/dice_one_against_many.cpp
@@ -57,6 +57,7 @@ popcount<4>(
 // NB: The specialisation to n=3 is not currently used but included
 // for completeness (i.e. so that popcount<n> is defined for all
 // non-negative n) and in anticipation of its use in the near future.
+#if 0
 template<>
 inline void
 popcount<3>(
@@ -66,6 +67,7 @@ popcount<3>(
     c1 += __builtin_popcountl(buf[1]);
     c2 += __builtin_popcountl(buf[2]);
 }
+#endif
 
 template<>
 inline void

From 63cc6e0af8e4c9b89f5b0ad25eb2a4d85a402a91 Mon Sep 17 00:00:00 2001
From: Hamish Ivey-Law <hamish.ivey-law@data61.csiro.au>
Date: Fri, 9 Mar 2018 16:16:01 +1100
Subject: [PATCH 48/49] Put stars in their proper place.

---
 _cffi_build/dice_one_against_many.cpp | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/_cffi_build/dice_one_against_many.cpp b/_cffi_build/dice_one_against_many.cpp
index 54f315b6..5285a208 100644
--- a/_cffi_build/dice_one_against_many.cpp
+++ b/_cffi_build/dice_one_against_many.cpp
@@ -35,7 +35,7 @@ template<>
 inline void
 popcount<4>(
         uint64_t &c0, uint64_t &c1, uint64_t &c2, uint64_t &c3,
-        const uint64_t* buf) {
+        const uint64_t *buf) {
     uint64_t b0, b1, b2, b3;
     b0 = buf[0]; b1 = buf[1]; b2 = buf[2]; b3 = buf[3];
     __asm__(
@@ -62,7 +62,7 @@ template<>
 inline void
 popcount<3>(
         uint64_t &c0, uint64_t &c1, uint64_t &c2, uint64_t &,
-        const uint64_t* buf) {
+        const uint64_t *buf) {
     c0 += __builtin_popcountl(buf[0]);
     c1 += __builtin_popcountl(buf[1]);
     c2 += __builtin_popcountl(buf[2]);
@@ -73,7 +73,7 @@ template<>
 inline void
 popcount<2>(
         uint64_t &c0, uint64_t &c1, uint64_t &, uint64_t &,
-        const uint64_t* buf) {
+        const uint64_t *buf) {
     c0 += __builtin_popcountl(buf[0]);
     c1 += __builtin_popcountl(buf[1]);
 }
@@ -82,7 +82,7 @@ template<>
 inline void
 popcount<1>(
         uint64_t &c0, uint64_t &, uint64_t &, uint64_t &,
-        const uint64_t* buf) {
+        const uint64_t *buf) {
     c0 += __builtin_popcountl(buf[0]);
 }
 
@@ -145,7 +145,7 @@ template<>
 inline void
 popcount_logand<4>(
         uint64_t &c0, uint64_t &c1, uint64_t &c2, uint64_t &c3,
-        const uint64_t* buf1, const uint64_t *buf2) {
+        const uint64_t *buf1, const uint64_t *buf2) {
     uint64_t b[4];
     b[0] = buf1[0] & buf2[0];
     b[1] = buf1[1] & buf2[1];
@@ -155,7 +155,7 @@ popcount_logand<4>(
 }
 
 static uint32_t
-_popcount_logand_array(const uint64_t* u, const uint64_t* v, int len) {
+_popcount_logand_array(const uint64_t *u, const uint64_t *v, int len) {
     // NB: The switch statement at the end of this function must have
     // cases for all i = 1, ..., LOOP_LEN - 1.
     static constexpr int LOOP_LEN = 4;

From ef82759e5e4be898a434ccf7b3cdd901b488fb30 Mon Sep 17 00:00:00 2001
From: Hamish Ivey-Law <hamish.ivey-law@data61.csiro.au>
Date: Fri, 9 Mar 2018 16:35:27 +1100
Subject: [PATCH 49/49] Add documentation.

---
 _cffi_build/dice_one_against_many.cpp | 50 +++++++++++++++++++++++++++
 1 file changed, 50 insertions(+)

diff --git a/_cffi_build/dice_one_against_many.cpp b/_cffi_build/dice_one_against_many.cpp
index 5285a208..c5093365 100644
--- a/_cffi_build/dice_one_against_many.cpp
+++ b/_cffi_build/dice_one_against_many.cpp
@@ -9,6 +9,9 @@
 
 static constexpr int WORD_BYTES = sizeof(uint64_t);
 
+/**
+ * The popcount of n elements of buf is the sum of c0, c1, c2, c3.
+ */
 template<int n>
 static inline void
 popcount(
@@ -31,6 +34,9 @@ popcount(
 // with Clang (3.6 and 3.8)).  We fix the mistake by explicitly
 // loading the contents of buf into registers and using these same
 // registers for the intermediate popcnts.
+/**
+ * The popcount of 4 elements of buf is the sum of c0, c1, c2, c3.
+ */
 template<>
 inline void
 popcount<4>(
@@ -69,6 +75,9 @@ popcount<3>(
 }
 #endif
 
+/**
+ * The popcount of 2 elements of buf is the sum of c0, c1.
+ */
 template<>
 inline void
 popcount<2>(
@@ -78,6 +87,9 @@ popcount<2>(
     c1 += __builtin_popcountl(buf[1]);
 }
 
+/**
+ * The popcount of *buf is in c0.
+ */
 template<>
 inline void
 popcount<1>(
@@ -87,6 +99,17 @@ popcount<1>(
 }
 
 
+/**
+ * Calculate population counts of an array of inputs of nwords elements.
+ *
+ * 'arrays' must point to narrays*nwords*WORD_BYTES bytes.
+ * 'counts' must point to narrays*sizeof(uint32_t) bytes.
+ * For i = 0 to narrays - 1, the population count of the nwords elements
+ *
+ *   arrays[i * nwords] ... arrays[(i + 1) * nwords - 1]
+ *
+ * is put in counts[i].
+ */
 template<int nwords>
 static void
 _popcount_arrays(uint32_t *counts, const uint64_t *arrays, int narrays) {
@@ -98,6 +121,9 @@ _popcount_arrays(uint32_t *counts, const uint64_t *arrays, int narrays) {
     }
 }
 
+/**
+ * Return the popcount of the nwords elements starting at array.
+ */
 static uint32_t
 _popcount_array(const uint64_t *array, int nwords) {
     uint64_t c0, c1, c2, c3;
@@ -132,6 +158,10 @@ _popcount_array(const uint64_t *array, int nwords) {
     return c0 + c1 + c2 + c3;
 }
 
+/**
+ * The popcount of the logical AND of n corresponding elements of buf1
+ * and buf2 is the sum of c0, c1, c2, c3.
+ */
 template<int n>
 static inline void
 popcount_logand(
@@ -141,6 +171,10 @@ popcount_logand(
     popcount_logand<n - 4>(c0, c1, c2, c3, buf1 + 4, buf2 + 4);
 }
 
+/**
+ * The popcount of the logical AND of 4 corresponding elements of buf1
+ * and buf2 is the sum of c0, c1, c2, c3.
+ */
 template<>
 inline void
 popcount_logand<4>(
@@ -154,6 +188,10 @@ popcount_logand<4>(
     popcount<4>(c0, c1, c2, c3, b);
 }
 
+/**
+ * Return the popcount of the logical AND of len corresponding
+ * elements of u and v.
+ */
 static uint32_t
 _popcount_logand_array(const uint64_t *u, const uint64_t *v, int len) {
     // NB: The switch statement at the end of this function must have
@@ -181,6 +219,10 @@ _popcount_logand_array(const uint64_t *u, const uint64_t *v, int len) {
     return c0 + c1 + c2 + c3;
 }
 
+/**
+ * Return the Sorensen-Dice coefficient of nwords length arrays u and
+ * v, whose popcounts are given in u_popc and v_popc respectively.
+ */
 static inline double
 _dice_coeff_generic(
         const uint64_t *u, uint32_t u_popc,
@@ -190,6 +232,10 @@ _dice_coeff_generic(
     return (2 * uv_popc) / (double) (u_popc + v_popc);
 }
 
+/**
+ * Return the Sorensen-Dice coefficient of nwords length arrays u and
+ * v, whose popcounts are given in u_popc and v_popc respectively.
+ */
 template<int nwords>
 static inline double
 _dice_coeff(
@@ -369,6 +415,10 @@ extern "C"
         };
 
         const uint64_t *current = comp2;
+
+        // NB: For any key length that must run at maximum speed, we
+        // need to specialise a block in the following 'if' statement
+        // (which is an example of specialising to keywords == 16).
         if (keywords == 16) {
             for (int j = 0; j < n; j++, current += 16) {
                 const uint32_t counts_many_j = counts_many[j];