From f5444a32c91e50889d3fc3526fac0b26f8938050 Mon Sep 17 00:00:00 2001 From: Hamish Ivey-Law Date: Mon, 15 Jan 2018 12:03:36 +1100 Subject: [PATCH 01/49] Refactor main C++ function to avoid use "constant" memory and avoid new/delete. --- _cffi_build/dice_one_against_many.cpp | 44 ++++++++++++--------------- 1 file changed, 20 insertions(+), 24 deletions(-) diff --git a/_cffi_build/dice_one_against_many.cpp b/_cffi_build/dice_one_against_many.cpp index 6182585c..9f5a87b9 100644 --- a/_cffi_build/dice_one_against_many.cpp +++ b/_cffi_build/dice_one_against_many.cpp @@ -139,14 +139,17 @@ extern "C" const uint64_t *comp1 = (const uint64_t *) one; const uint64_t *comp2 = (const uint64_t *) many; + // TODO: Given that k is 10 by default, often 5 in practice, + // and probably never ever more than 20 or so, the use of a + // priority_queue here is expensive overkill. Better to just + // store the scores in an array and do a linear search every + // time. std::priority_queue, score_cmp> max_k_scores; uint32_t count_one = builtin_popcnt_unrolled_errata_manual(comp1); uint64_t combined[KEYWORDS]; - double *all_scores = new double[n]; - uint32_t max_popcnt_delta = 1024; if(threshold > 0) { max_popcnt_delta = calculate_max_difference(count_one, threshold); @@ -155,39 +158,32 @@ extern "C" for (int j = 0; j < n; j++) { const uint64_t *current = comp2 + j * KEYWORDS; + const uint32_t counts_many_j = counts_many[j]; - if(count_one > counts_many[j]){ - current_delta = count_one - counts_many[j]; + if (count_one > counts_many_j) { + current_delta = count_one - counts_many_j; } else { - current_delta = counts_many[j] - count_one; + current_delta = counts_many_j - count_one; } - if(current_delta <= max_popcnt_delta){ - for (unsigned int i = 0 ; i < KEYWORDS; i++ ) { + if (current_delta <= max_popcnt_delta) { + for (int i = 0; i < (int)KEYWORDS; i++) { combined[i] = current[i] & comp1[i]; } uint32_t count_curr = builtin_popcnt_unrolled_errata_manual(combined); - double score = 2 * 
count_curr / (double) (count_one + counts_many[j]); - all_scores[j] = score; - } else { - // Skipping because popcount difference too large - all_scores[j] = -1; - } - } - - for (int j = 0; j < n; j++) { - - if(all_scores[j] >= threshold) { - max_k_scores.push(Node(j, all_scores[j])); - } - - if(max_k_scores.size() > k) max_k_scores.pop(); + // TODO: double precision is overkill for this + // problem; just use float. + double score = 2 * count_curr / (double) (count_one + counts_many_j); + if (score >= threshold) { + max_k_scores.push(Node(j, score)); + if (max_k_scores.size() > k) + max_k_scores.pop(); + } + } // else skip because popcount difference too large } - delete[] all_scores; - int i = 0; while (!max_k_scores.empty()) { From 5d5337b84256fe4cb7e558800ad152e862a9a326 Mon Sep 17 00:00:00 2001 From: Hamish Ivey-Law Date: Mon, 15 Jan 2018 14:37:09 +1100 Subject: [PATCH 02/49] Implement popcount on (almost) arbitrary length arrays. --- _cffi_build/dice_one_against_many.cpp | 57 +++++++++++++++++++++------ 1 file changed, 45 insertions(+), 12 deletions(-) diff --git a/_cffi_build/dice_one_against_many.cpp b/_cffi_build/dice_one_against_many.cpp index 9f5a87b9..6b17a8fd 100644 --- a/_cffi_build/dice_one_against_many.cpp +++ b/_cffi_build/dice_one_against_many.cpp @@ -24,7 +24,7 @@ // explicitly loading the contents of buf into registers and using // these same registers for the intermediate popcnts. static inline uint32_t -builtin_popcnt_unrolled_errata_manual(const uint64_t* buf) { +builtin_popcnt_unrolled_errata_manual(const uint64_t* buf, int n) { uint64_t b0, b1, b2, b3; uint64_t c0, c1, c2, c3; c0 = c1 = c2 = c3 = 0; @@ -47,14 +47,48 @@ builtin_popcnt_unrolled_errata_manual(const uint64_t* buf) { "+r" (b0), "+r" (b1), "+r" (b2), "+r" (b3)); \ } while (0) - LOOP_BODY(0); - LOOP_BODY(4); - LOOP_BODY(8); - LOOP_BODY(12); + // Here we assume that 4|n and n <= 16. 
This means that n/4 is + // either 4, 3, 2 or 1, and these values correspond to the switch + // cases, which in turn determine whether we read and popcnt 16, 12, + // 8 or 4 elements from buf. The __attribute__ ((fallthrough)); + // thingo is to let the compiler know that we are falling through + // the switch case statements deliberately (otherwise this elicits + // a warning with -Wextra). + switch (n >> 2) { // n/4 + case 4: + LOOP_BODY(12); + __attribute__ ((fallthrough)); + case 3: + LOOP_BODY(8); + __attribute__ ((fallthrough)); + case 2: + LOOP_BODY(4); + __attribute__ ((fallthrough)); + case 1: + LOOP_BODY(0); + __attribute__ ((fallthrough)); + } return c0 + c1 + c2 + c3; } +/** + * Bit population count of the 8n bytes of memory starting at buf (8 = + * sizeof(uint64_t)). + */ +static uint32_t +popcount_array(const uint64_t *buf, int n) { + assert(n % 4 == 0); + uint32_t pc = 0; + while (n >= 16) { + pc += builtin_popcnt_unrolled_errata_manual(buf, 16); + n -= 16; + } + if (n > 0) + pc += builtin_popcnt_unrolled_errata_manual(buf, n); + return pc; +} + /** * Compute the Dice coefficient similarity measure of two bit patterns. 
*/ @@ -65,8 +99,8 @@ dice_coeff_1024(const char *e1, const char *e2) { uint32_t count_both = 0; - count_both += builtin_popcnt_unrolled_errata_manual(comp1); - count_both += builtin_popcnt_unrolled_errata_manual(comp2); + count_both += popcount_array(comp1, KEYWORDS); + count_both += popcount_array(comp2, KEYWORDS); if(count_both == 0) { return 0.0; } @@ -76,7 +110,7 @@ dice_coeff_1024(const char *e1, const char *e2) { combined[i] = comp1[i] & comp2[i]; } - uint32_t count_and = builtin_popcnt_unrolled_errata_manual(combined); + uint32_t count_and = popcount_array(combined, KEYWORDS); return 2 * count_and / (double)count_both; } @@ -108,7 +142,7 @@ struct score_cmp{ static void popcount_1024_array(const char *many, int n, uint32_t *counts_many) { for (int i = 0; i < n; i++) { const uint64_t *sig = (const uint64_t *) many + i * KEYWORDS; - counts_many[i] = builtin_popcnt_unrolled_errata_manual(sig); + counts_many[i] = popcount_array(sig, KEYWORDS); } } @@ -146,7 +180,7 @@ extern "C" // time. std::priority_queue, score_cmp> max_k_scores; - uint32_t count_one = builtin_popcnt_unrolled_errata_manual(comp1); + uint32_t count_one = popcount_array(comp1, KEYWORDS); uint64_t combined[KEYWORDS]; @@ -171,7 +205,7 @@ extern "C" combined[i] = current[i] & comp1[i]; } - uint32_t count_curr = builtin_popcnt_unrolled_errata_manual(combined); + uint32_t count_curr = popcount_array(combined, KEYWORDS); // TODO: double precision is overkill for this // problem; just use float. @@ -210,4 +244,3 @@ extern "C" return res; } } - From 3864028f84d9b92ab6f790c998b8e6f6632ad258 Mon Sep 17 00:00:00 2001 From: Hamish Ivey-Law Date: Mon, 15 Jan 2018 15:13:11 +1100 Subject: [PATCH 03/49] First pass at integrating arbitrary length keys. Slows things down a bit. 
--- _cffi_build/build_matcher.py | 2 +- _cffi_build/dice_one_against_many.cpp | 40 +++++++++++++++++---------- anonlink/entitymatch.py | 9 ++++-- 3 files changed, 33 insertions(+), 18 deletions(-) diff --git a/_cffi_build/build_matcher.py b/_cffi_build/build_matcher.py index fb7e1161..16a4ed3f 100644 --- a/_cffi_build/build_matcher.py +++ b/_cffi_build/build_matcher.py @@ -20,7 +20,7 @@ ffibuilder.cdef(""" int match_one_against_many_dice(const char * one, const char * many, int n, double * score); - int match_one_against_many_dice_1024_k_top(const char *one, const char *many, const uint32_t *counts_many, int n, uint32_t k, double threshold, int *indices, double *scores); + int match_one_against_many_dice_k_top(const char *one, const char *many, const uint32_t *counts_many, int n, int keybytes, uint32_t k, double threshold, int *indices, double *scores); double dice_coeff_1024(const char *e1, const char *e2); """) diff --git a/_cffi_build/dice_one_against_many.cpp b/_cffi_build/dice_one_against_many.cpp index 6b17a8fd..ca69eb22 100644 --- a/_cffi_build/dice_one_against_many.cpp +++ b/_cffi_build/dice_one_against_many.cpp @@ -157,14 +157,16 @@ static uint32_t calculate_max_difference(uint32_t popcnt_a, double threshold) extern "C" { /** - * Calculate up to the top k indices and scores. - * Returns the number matched above a threshold. + * Calculate up to the top k indices and scores. Returns the + * number matched above a threshold or -1 if keybytes is not a + * multiple of 32. 
*/ - int match_one_against_many_dice_1024_k_top( + int match_one_against_many_dice_k_top( const char *one, const char *many, const uint32_t *counts_many, int n, + int keybytes, uint32_t k, double threshold, int *indices, @@ -173,6 +175,13 @@ extern "C" const uint64_t *comp1 = (const uint64_t *) one; const uint64_t *comp2 = (const uint64_t *) many; + // keybytes must be divisible by 32, because keywords must be + // divisible by 4 for the builtin popcount function to work + // and keywords = keybytes / 8. + if (keybytes % (4 * WORDBYTES) != 0) // (keybytes & 31) + return -1; + int keywords = keybytes / WORDBYTES; + // TODO: Given that k is 10 by default, often 5 in practice, // and probably never ever more than 20 or so, the use of a // priority_queue here is expensive overkill. Better to just @@ -180,19 +189,21 @@ extern "C" // time. std::priority_queue, score_cmp> max_k_scores; - uint32_t count_one = popcount_array(comp1, KEYWORDS); - - uint64_t combined[KEYWORDS]; - - uint32_t max_popcnt_delta = 1024; + uint32_t count_one = popcount_array(comp1, keywords); + uint32_t max_popcnt_delta = keywords * WORDBITS; // = bits per key if(threshold > 0) { max_popcnt_delta = calculate_max_difference(count_one, threshold); } - uint32_t current_delta; + // TODO: This allocation could be avoided by writing a special + // popcount_array_combined() function that does the AND + // itself; this would almost certainly be faster than the + // new/delete pair and would require no memory overhead. 
+ uint64_t *combined = new uint64_t[keywords]; for (int j = 0; j < n; j++) { - const uint64_t *current = comp2 + j * KEYWORDS; + const uint64_t *current = comp2 + j * keywords; const uint32_t counts_many_j = counts_many[j]; + uint32_t current_delta; if (count_one > counts_many_j) { current_delta = count_one - counts_many_j; @@ -201,11 +212,11 @@ extern "C" } if (current_delta <= max_popcnt_delta) { - for (int i = 0; i < (int)KEYWORDS; i++) { + for (int i = 0; i < keywords; i++) { combined[i] = current[i] & comp1[i]; } - uint32_t count_curr = popcount_array(combined, KEYWORDS); + uint32_t count_curr = popcount_array(combined, keywords); // TODO: double precision is overkill for this // problem; just use float. @@ -217,6 +228,7 @@ extern "C" } } // else skip because popcount difference too large } + delete[] combined; int i = 0; while (!max_k_scores.empty()) { @@ -237,8 +249,8 @@ extern "C" int idx_unused; uint32_t *counts_many = new uint32_t[n]; popcount_1024_array(many, n, counts_many); - int res = match_one_against_many_dice_1024_k_top( - one, many, counts_many, n, k, threshold, &idx_unused, score); + int res = match_one_against_many_dice_k_top( + one, many, counts_many, n, 128, k, threshold, &idx_unused, score); delete[] counts_many; return res; diff --git a/anonlink/entitymatch.py b/anonlink/entitymatch.py index 1484b12d..06752c1d 100644 --- a/anonlink/entitymatch.py +++ b/anonlink/entitymatch.py @@ -38,8 +38,8 @@ def cffi_filter_similarity_k(filters1, filters2, k, threshold): length_f1 = len(filters1) length_f2 = len(filters2) - # We assume the length is 1024 bit = 128 Bytes - match_one_against_many_dice_1024_k_top = lib.match_one_against_many_dice_1024_k_top + # We assume the length is a multiple of 128 bits. 
+ match_one_against_many_dice_k_top = lib.match_one_against_many_dice_k_top # An array of the *one* filter clist1 = [ffi.new("char[128]", bytes(f[0].tobytes())) @@ -69,16 +69,19 @@ def cffi_filter_similarity_k(filters1, filters2, k, threshold): for i, f1 in enumerate(filters1): assert len(clist1[i]) == 128 assert len(carr2) % 64 == 0 - matches = match_one_against_many_dice_1024_k_top( + matches = match_one_against_many_dice_k_top( clist1[i], carr2, c_popcounts, length_f2, + 128, k, threshold, c_indices, c_scores) + if matches < 0: + raise Exception('Internel error: Bad key length') for j in range(matches): ind = c_indices[j] assert ind < len(filters2) From 5d5338f6818c7fcd6932f33e6a75a44aa8d44fea Mon Sep 17 00:00:00 2001 From: Hamish Ivey-Law Date: Thu, 1 Feb 2018 11:03:20 +1100 Subject: [PATCH 04/49] Refactor Dice coefficient calculation. --- _cffi_build/dice_one_against_many.cpp | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/_cffi_build/dice_one_against_many.cpp b/_cffi_build/dice_one_against_many.cpp index 6182585c..15b7f56a 100644 --- a/_cffi_build/dice_one_against_many.cpp +++ b/_cffi_build/dice_one_against_many.cpp @@ -120,6 +120,17 @@ static uint32_t calculate_max_difference(uint32_t popcnt_a, double threshold) return 2 * popcnt_a * (1/threshold - 1); } +static double +dice_coeff(const uint64_t *u, uint32_t u_popc, const uint64_t *v, uint32_t v_popc) +{ + uint64_t uv[KEYWORDS]; + for (unsigned int i = 0 ; i < KEYWORDS; i++ ) { + uv[i] = u[i] & v[i]; + } + uint32_t uv_popc = builtin_popcnt_unrolled_errata_manual(uv); + return (2 * uv_popc) / (double) (u_popc + v_popc); +} + extern "C" { /** @@ -143,8 +154,6 @@ extern "C" uint32_t count_one = builtin_popcnt_unrolled_errata_manual(comp1); - uint64_t combined[KEYWORDS]; - double *all_scores = new double[n]; uint32_t max_popcnt_delta = 1024; @@ -163,14 +172,7 @@ extern "C" } if(current_delta <= max_popcnt_delta){ - for (unsigned int i = 0 ; i < KEYWORDS; i++ ) { - 
combined[i] = current[i] & comp1[i]; - } - - uint32_t count_curr = builtin_popcnt_unrolled_errata_manual(combined); - - double score = 2 * count_curr / (double) (count_one + counts_many[j]); - all_scores[j] = score; + all_scores[j] = dice_coeff(comp1, count_one, current, counts_many[j]); } else { // Skipping because popcount difference too large all_scores[j] = -1; From 88e3625376edbfb94921ac5a4e7ee2f2c2de64d7 Mon Sep 17 00:00:00 2001 From: Hamish Ivey-Law Date: Thu, 1 Feb 2018 11:04:03 +1100 Subject: [PATCH 05/49] Temporary fiddling with benchmark code. --- anonlink/benchmark.py | 63 +++++++++++++++++-------------------------- 1 file changed, 25 insertions(+), 38 deletions(-) diff --git a/anonlink/benchmark.py b/anonlink/benchmark.py index c1fba03d..eb0cdd97 100644 --- a/anonlink/benchmark.py +++ b/anonlink/benchmark.py @@ -8,7 +8,6 @@ from anonlink.entitymatch import * from anonlink.util import popcount_vector, generate_clks, generate_bitarray -from anonlink.distributed_processing import calculate_filter_similarity some_filters = generate_clks(10000) @@ -25,15 +24,16 @@ def compute_popcount_speed(n): elapsed_time = end - start print("{:6d} x 1024 bit popcounts in {:.6f} seconds".format(n, elapsed_time)) speed_in_MiB = n / (1024 * 8 * elapsed_time) - print("Popcount speed: {:.2f} MiB/s".format(speed_in_MiB)) + print("Popcount speed: {:.2f} MiB/s (bitarray.count())".format(speed_in_MiB)) return speed_in_MiB -def print_comparison_header(): - print("Size 1 | Size 2 | Comparisons | Compute Time | Million Comparisons per second") +def print_comparison_header(threshold): + print("Threshold = ", threshold) + print("Size 1 | Size 2 | Comparisons | Total Time (simat/solv) | Million Comparisons per second") -def compute_comparison_speed(n1=100, n2=100): +def compute_comparison_speed(n1=100, n2=100, threshold=0.75): """ Using the greedy solver, how fast can hashes be computed using one core. 
""" @@ -42,30 +42,20 @@ def compute_comparison_speed(n1=100, n2=100): filters2 = [some_filters[random.randrange(2000, 10000)] for _ in range(n2)] start = timer() - result3 = calculate_mapping_greedy(filters1, filters2) + sparse_matrix = calculate_filter_similarity(filters1, filters2, k=len(filters2), threshold=threshold) + t1 = timer() + res = greedy_solver(sparse_matrix) end = timer() - elapsed_time = end - start - print("{:6d} | {:6d} | {:12d} | {:8.3f}s | {:12.3f}".format( - n1, n2, n1*n2, elapsed_time, (n1*n2)/(1e6*elapsed_time))) - return elapsed_time - - -def compute_comparison_speed_parallel(n1=100, n2=100): - """ - Using the greedy solver in chunks, how fast can hashes be computed. - """ - - filters1 = [some_filters[random.randrange(0, 8000)] for _ in range(n1)] - filters2 = [some_filters[random.randrange(2000, 10000)] for _ in range(n2)] - - - start = timer() - calculate_filter_similarity(filters1, filters2) - end = timer() + #print("mat size = ", len(sparse_matrix)) + similarity_time = t1 - start + solver_time = end - t1 elapsed_time = end - start - print("{:6d} | {:6d} | {:12d} | {:8.3f}s | {:12.3f}".format( - n1, n2, n1*n2, elapsed_time, (n1*n2)/(1e6*elapsed_time))) + print("{:6d} | {:6d} | {:12d} | {:7.3f}s ({:3.1f}% / {:3.1f}%) | {:12.3f} -- {:8d} = {:2.1f}%".format( + n1, n2, n1*n2, elapsed_time, + 100.0*similarity_time/elapsed_time, + 100.0*solver_time/elapsed_time, + (n1*n2)/(1e6*similarity_time), len(sparse_matrix), 100.0*len(sparse_matrix)/(n1*n2))) return elapsed_time @@ -116,8 +106,6 @@ def benchmark(size, compare): compute_popcount_speed(100000) - print_comparison_header() - possible_test_sizes = [ 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, @@ -127,15 +115,14 @@ def benchmark(size, compare): 2000000 ] - for test_size in possible_test_sizes: - if test_size <= size: - compute_comparison_speed_parallel( - test_size, test_size - ) - - print("Single Core:") - compute_comparison_speed(5000, 5000) - + #for thld in [0.95, 0.85, 0.75, 0.7, 
0.65, 0.6, 0.55, 0.5]: + for thld in [0.22, 0.25, 0.27, 0.48, 0.49, 0.5, 0.51, 0.52, 0.54, 0.56, 0.58, 0.6, 0.62, 0.64]: + #print_comparison_header(thld) + print("threshold = ", thld) + for test_size in possible_test_sizes: + if test_size <= size: + compute_comparison_speed(test_size, test_size, thld) if __name__ == '__main__': - benchmark(20000, False) \ No newline at end of file + benchmark(1000, False) + #benchmark(20000, False) From a705de8b47643a36c22e6d6f5fe97fe365d779db Mon Sep 17 00:00:00 2001 From: Hamish Ivey-Law Date: Thu, 1 Feb 2018 22:40:36 +1100 Subject: [PATCH 06/49] Calculate and report popcount speed from native code implementation. --- _cffi_build/build_matcher.py | 1 + _cffi_build/dice_one_against_many.cpp | 38 ++++++++++++++++++++------- anonlink/benchmark.py | 26 +++++++++++++++--- anonlink/util.py | 35 +++++++++++++++--------- 4 files changed, 73 insertions(+), 27 deletions(-) diff --git a/_cffi_build/build_matcher.py b/_cffi_build/build_matcher.py index fb7e1161..4065e4ee 100644 --- a/_cffi_build/build_matcher.py +++ b/_cffi_build/build_matcher.py @@ -22,6 +22,7 @@ int match_one_against_many_dice(const char * one, const char * many, int n, double * score); int match_one_against_many_dice_1024_k_top(const char *one, const char *many, const uint32_t *counts_many, int n, uint32_t k, double threshold, int *indices, double *scores); double dice_coeff_1024(const char *e1, const char *e2); + double popcount_1024_array(const char *many, int n, uint32_t *counts_many); """) diff --git a/_cffi_build/dice_one_against_many.cpp b/_cffi_build/dice_one_against_many.cpp index 15b7f56a..7edb78dc 100644 --- a/_cffi_build/dice_one_against_many.cpp +++ b/_cffi_build/dice_one_against_many.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include @@ -102,16 +103,6 @@ struct score_cmp{ }; -/** - * Count lots of bits. 
- */ -static void popcount_1024_array(const char *many, int n, uint32_t *counts_many) { - for (int i = 0; i < n; i++) { - const uint64_t *sig = (const uint64_t *) many + i * KEYWORDS; - counts_many[i] = builtin_popcnt_unrolled_errata_manual(sig); - } -} - /** * */ @@ -131,8 +122,35 @@ dice_coeff(const uint64_t *u, uint32_t u_popc, const uint64_t *v, uint32_t v_pop return (2 * uv_popc) / (double) (u_popc + v_popc); } +static inline double to_millis(clock_t t) +{ + static constexpr double CPS = (double)CLOCKS_PER_SEC; + return t * 1.0E3 / CPS; +} + extern "C" { + /** + * Calculate population counts of an array of inputs; return how + * long it took in milliseconds. + * + * 'many' must point to n*KEYWORDS*sizeof(uint64_t) (== 128*n) bytes + * 'counts_many' must point to n*sizeof(uint32_t) bytes. + * For i = 0 to n - 1, the population count of the 1024 bits + * + * many[i * KEYWORDS] ... many[(i + 1) * KEYWORDS - 1] + * + * is put in counts_many[i]. + */ + double popcount_1024_array(const char *many, int n, uint32_t *counts_many) { + clock_t t = clock(); + for (int i = 0; i < n; i++) { + const uint64_t *sig = (const uint64_t *) many + i * KEYWORDS; + counts_many[i] = builtin_popcnt_unrolled_errata_manual(sig); + } + return to_millis(clock() - t); + } + /** * Calculate up to the top k indices and scores. * Returns the number matched above a threshold. diff --git a/anonlink/benchmark.py b/anonlink/benchmark.py index eb0cdd97..4b96b9ab 100644 --- a/anonlink/benchmark.py +++ b/anonlink/benchmark.py @@ -18,13 +18,31 @@ def compute_popcount_speed(n): Just do as much counting of bits. 
""" clks = [generate_bitarray(1024) for _ in range(n)] + + print("{:6d} x 1024 bit popcounts".format(n)) + print("Implementation | Time (ms) | Bandwidth (MiB/s)") + start = timer() + popcounts = popcount_vector(clks, use_native=False) + end = timer() + elapsed_time = end - start + speed_in_MiB = (n * 128) / ((1 << 20) * elapsed_time) + print("Python (bitarray.count()): | {:7.2f} | {:9.2f} " + .format(elapsed_time * 1e3, speed_in_MiB)) + + # Native start = timer() - popcounts = popcount_vector(clks) + popcounts, ms = popcount_vector(clks, use_native=True) end = timer() elapsed_time = end - start - print("{:6d} x 1024 bit popcounts in {:.6f} seconds".format(n, elapsed_time)) - speed_in_MiB = n / (1024 * 8 * elapsed_time) - print("Popcount speed: {:.2f} MiB/s (bitarray.count())".format(speed_in_MiB)) + elapsed_nocopy = ms / 1e3 + copy_percent = 100*(elapsed_time - elapsed_nocopy) / elapsed_time + speed_in_MiB = (n * 128) / ((1 << 20) * elapsed_time) + speed_in_MiB_nocopy = (n * 128) / ((1 << 20) * elapsed_nocopy) + print("Native code (no copy): | {:7.2f} | {:9.2f} " + .format(ms, speed_in_MiB_nocopy)) + print("Native code (w/ copy): | {:7.2f} | {:9.2f} ({:.1f}% copying)" + .format(elapsed_time * 1e3, speed_in_MiB, copy_percent)) + return speed_in_MiB diff --git a/anonlink/util.py b/anonlink/util.py index dcafa80f..24982adb 100644 --- a/anonlink/util.py +++ b/anonlink/util.py @@ -2,8 +2,10 @@ import os import random +import time from bitarray import bitarray +from anonlink._entitymatcher import ffi, lib def generate_bitarray(length): a = bitarray(endian=['little', 'big'][random.randint(0, 1)]) @@ -19,25 +21,32 @@ def generate_clks(n): return res -def popcount_vector(bitarrays): - """ - Note, due to the overhead of converting bitarrays into - bytes, it is more expensive to call our C implementation +def popcount_vector(bitarrays, use_native=False): + """Return an array containing the popcounts of the elements of + bitarrays. 
If use_native is True, use the native code + implementation and return the time spent (in milliseconds) in the + native code as a second return value. + + Note, due to the overhead of converting bitarrays into bytes, + it is currently more expensive to call our C implementation than just calling bitarray.count() """ - return [clk.count() for clk in bitarrays] + # Use Python + if not use_native: + return [clk.count() for clk in bitarrays] + + # Use native code + n = len(bitarrays) + c_popcounts = ffi.new("uint32_t[{}]".format(n)) + many = ffi.new("char[{}]".format(128 * n), + bytes([b for f in bitarrays for b in f.tobytes()])) + ms = lib.popcount_1024_array(many, n, c_popcounts) - # n = len(clks) - # c_popcounts = ffi.new("uint32_t[{}]".format(n)) - # many = ffi.new("char[{}]".format(128 * n), - # bytes([b for f in clks for b in f.tobytes()])) - # lib.popcount_1024_array(many, n, c_popcounts) - # - # return [c_popcounts[i] for i in range(n)] + return [c_popcounts[i] for i in range(n)], ms def chunks(l, n): """Yield successive n-sized chunks from l.""" for i in range(0, len(l), n): - yield l[i:i + n] \ No newline at end of file + yield l[i:i + n] From cff1cb6d22b502bbd3b56b1248639dd41c5225d4 Mon Sep 17 00:00:00 2001 From: Hamish Ivey-Law Date: Fri, 2 Feb 2018 11:01:29 +1100 Subject: [PATCH 07/49] Give some values more sensible variable names. --- anonlink/benchmark.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/anonlink/benchmark.py b/anonlink/benchmark.py index 4b96b9ab..ab854542 100644 --- a/anonlink/benchmark.py +++ b/anonlink/benchmark.py @@ -17,15 +17,19 @@ def compute_popcount_speed(n): """ Just do as much counting of bits. 
""" - clks = [generate_bitarray(1024) for _ in range(n)] + clk_bits = 1024 + clk_bytes = clk_bits / 8 + clks_MiB = n * clk_bytes * 1.0 / (1 << 20) - print("{:6d} x 1024 bit popcounts".format(n)) + clks = [generate_bitarray(clk_bits) for _ in range(n)] + + print("{:6d} x {:d} bit popcounts".format(n, clk_bits)) print("Implementation | Time (ms) | Bandwidth (MiB/s)") start = timer() popcounts = popcount_vector(clks, use_native=False) end = timer() elapsed_time = end - start - speed_in_MiB = (n * 128) / ((1 << 20) * elapsed_time) + speed_in_MiB = clks_MiB / elapsed_time print("Python (bitarray.count()): | {:7.2f} | {:9.2f} " .format(elapsed_time * 1e3, speed_in_MiB)) @@ -36,8 +40,8 @@ def compute_popcount_speed(n): elapsed_time = end - start elapsed_nocopy = ms / 1e3 copy_percent = 100*(elapsed_time - elapsed_nocopy) / elapsed_time - speed_in_MiB = (n * 128) / ((1 << 20) * elapsed_time) - speed_in_MiB_nocopy = (n * 128) / ((1 << 20) * elapsed_nocopy) + speed_in_MiB = clks_MiB / elapsed_time + speed_in_MiB_nocopy = clks_MiB / elapsed_nocopy print("Native code (no copy): | {:7.2f} | {:9.2f} " .format(ms, speed_in_MiB_nocopy)) print("Native code (w/ copy): | {:7.2f} | {:9.2f} ({:.1f}% copying)" From 603b6d47669da2279c16a38690ae23853e7ad40f Mon Sep 17 00:00:00 2001 From: Hamish Ivey-Law Date: Fri, 2 Feb 2018 11:02:39 +1100 Subject: [PATCH 08/49] Remove unused import. --- anonlink/util.py | 1 - 1 file changed, 1 deletion(-) diff --git a/anonlink/util.py b/anonlink/util.py index 24982adb..dfadfc54 100644 --- a/anonlink/util.py +++ b/anonlink/util.py @@ -2,7 +2,6 @@ import os import random -import time from bitarray import bitarray from anonlink._entitymatcher import ffi, lib From de33a67c723b3b74ce4a097c5882481e5048df39 Mon Sep 17 00:00:00 2001 From: Hamish Ivey-Law Date: Fri, 2 Feb 2018 11:05:54 +1100 Subject: [PATCH 09/49] Add documentation. 
--- _cffi_build/dice_one_against_many.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/_cffi_build/dice_one_against_many.cpp b/_cffi_build/dice_one_against_many.cpp index 7edb78dc..23d5c6bd 100644 --- a/_cffi_build/dice_one_against_many.cpp +++ b/_cffi_build/dice_one_against_many.cpp @@ -122,6 +122,12 @@ dice_coeff(const uint64_t *u, uint32_t u_popc, const uint64_t *v, uint32_t v_pop return (2 * uv_popc) / (double) (u_popc + v_popc); } +/** + * Convert clock measurement t to milliseconds. + * + * t should have been obtained as the difference of calls to clock() + * for this to make sense. + */ static inline double to_millis(clock_t t) { static constexpr double CPS = (double)CLOCKS_PER_SEC; From a458ed023c348a3d7eed975e6041a8a6c227f657 Mon Sep 17 00:00:00 2001 From: Hamish Ivey-Law Date: Fri, 2 Feb 2018 14:55:20 +1100 Subject: [PATCH 10/49] Expand reporting of various measurements. --- anonlink/benchmark.py | 40 +++++++++++++++++++++++++--------------- 1 file changed, 25 insertions(+), 15 deletions(-) diff --git a/anonlink/benchmark.py b/anonlink/benchmark.py index ab854542..83b050fc 100644 --- a/anonlink/benchmark.py +++ b/anonlink/benchmark.py @@ -51,11 +51,11 @@ def compute_popcount_speed(n): def print_comparison_header(threshold): - print("Threshold = ", threshold) - print("Size 1 | Size 2 | Comparisons | Total Time (simat/solv) | Million Comparisons per second") + print("\nThreshold:", threshold) + print("Size 1 | Size 2 | Comparisons (match %) | Total Time (simat/solv) | Throughput (1e6 cmp/s)") -def compute_comparison_speed(n1=100, n2=100, threshold=0.75): +def compute_comparison_speed(n1, n2, threshold): """ Using the greedy solver, how fast can hashes be computed using one core. 
""" @@ -69,15 +69,16 @@ def compute_comparison_speed(n1=100, n2=100, threshold=0.75): res = greedy_solver(sparse_matrix) end = timer() - #print("mat size = ", len(sparse_matrix)) similarity_time = t1 - start solver_time = end - t1 elapsed_time = end - start - print("{:6d} | {:6d} | {:12d} | {:7.3f}s ({:3.1f}% / {:3.1f}%) | {:12.3f} -- {:8d} = {:2.1f}%".format( - n1, n2, n1*n2, elapsed_time, + print("{:6d} | {:6d} | {:6d}e6 ({:5.2f}%) | {:6.3f}s ({:4.1f}% / {:4.1f}%) | {:8.3f}".format( + n1, n2, n1*n2 // 1000000, + 100.0*len(sparse_matrix)/(n1*n2), + elapsed_time, 100.0*similarity_time/elapsed_time, 100.0*solver_time/elapsed_time, - (n1*n2)/(1e6*similarity_time), len(sparse_matrix), 100.0*len(sparse_matrix)/(n1*n2))) + (n1*n2)/(1e6*similarity_time))) return elapsed_time @@ -137,14 +138,23 @@ def benchmark(size, compare): 2000000 ] - #for thld in [0.95, 0.85, 0.75, 0.7, 0.65, 0.6, 0.55, 0.5]: - for thld in [0.22, 0.25, 0.27, 0.48, 0.49, 0.5, 0.51, 0.52, 0.54, 0.56, 0.58, 0.6, 0.62, 0.64]: - #print_comparison_header(thld) - print("threshold = ", thld) - for test_size in possible_test_sizes: - if test_size <= size: - compute_comparison_speed(test_size, test_size, thld) + # Testing two things: + # - the Dice coefficient calculation + # - picking the top k candidates + + thld = 0.5 + print_comparison_header(thld) + for test_size in possible_test_sizes: + if test_size <= size: + compute_comparison_speed(test_size, test_size, thld) + + thld = 0.7 + print_comparison_header(thld) + size *= 5 + for test_size in possible_test_sizes: + if test_size <= size: + compute_comparison_speed(test_size, test_size, thld) if __name__ == '__main__': - benchmark(1000, False) + benchmark(4000, False) #benchmark(20000, False) From 7d2e66c7b7d68f6211d23dcf65b445e53ff7495f Mon Sep 17 00:00:00 2001 From: Hamish Ivey-Law Date: Mon, 5 Feb 2018 12:51:27 +1100 Subject: [PATCH 11/49] Comments. 
--- anonlink/benchmark.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/anonlink/benchmark.py b/anonlink/benchmark.py index 83b050fc..c4c78819 100644 --- a/anonlink/benchmark.py +++ b/anonlink/benchmark.py @@ -25,6 +25,8 @@ def compute_popcount_speed(n): print("{:6d} x {:d} bit popcounts".format(n, clk_bits)) print("Implementation | Time (ms) | Bandwidth (MiB/s)") + + # Python start = timer() popcounts = popcount_vector(clks, use_native=False) end = timer() @@ -124,6 +126,8 @@ def compare_python_c(ntotal=10000, nsubset=6000, frac=0.8): def benchmark(size, compare): + print("Anonlink benchmark -- see README for explanation") + print("------------------------------------------------") if compare: print(compare_python_c(ntotal=1000, nsubset=600)) @@ -138,10 +142,6 @@ def benchmark(size, compare): 2000000 ] - # Testing two things: - # - the Dice coefficient calculation - # - picking the top k candidates - thld = 0.5 print_comparison_header(thld) for test_size in possible_test_sizes: From 9666eae628585e61429a7eef415871dbd0b61449 Mon Sep 17 00:00:00 2001 From: Hamish Ivey-Law Date: Mon, 5 Feb 2018 13:54:04 +1100 Subject: [PATCH 12/49] Update README. 
--- README.rst | 78 +++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 59 insertions(+), 19 deletions(-) diff --git a/README.rst b/README.rst index 9c025eb5..8fe60808 100644 --- a/README.rst +++ b/README.rst @@ -43,28 +43,68 @@ For linux with: Benchmark --------- +You can run the benchmark with: + :: $ python -m anonlink.benchmark - 100000 x 1024 bit popcounts in 0.016376 seconds - Popcount speed: 745.42 MiB/s - Size 1 | Size 2 | Comparisons | Compute Time | Million Comparisons per second - 1000 | 1000 | 1000000 | 0.060s | 16.632 - 2000 | 2000 | 4000000 | 0.159s | 25.232 - 3000 | 3000 | 9000000 | 0.316s | 28.524 - 4000 | 4000 | 16000000 | 0.486s | 32.943 - 5000 | 5000 | 25000000 | 0.584s | 42.825 - 6000 | 6000 | 36000000 | 0.600s | 60.027 - 7000 | 7000 | 49000000 | 0.621s | 78.875 - 8000 | 8000 | 64000000 | 0.758s | 84.404 - 9000 | 9000 | 81000000 | 0.892s | 90.827 - 10000 | 10000 | 100000000 | 1.228s | 81.411 - 20000 | 20000 | 400000000 | 3.980s | 100.504 - 30000 | 30000 | 900000000 | 9.280s | 96.986 - 40000 | 40000 | 1600000000 | 17.318s | 92.391 - -C++ version uses cpu instruction ``POPCNT`` for bitcount in a 64bit -word. 
http://wm.ite.pl/articles/sse-popcount.html + Anonlink benchmark -- see README for explanation + ------------------------------------------------ + 100000 x 1024 bit popcounts + Implementation | Time (ms) | Bandwidth (MiB/s) + Python (bitarray.count()): | 20.83 | 586.12 + Native code (no copy): | 0.91 | 13443.87 + Native code (w/ copy): | 381.83 | 31.97 (99.8% copying) + + Threshold: 0.5 + Size 1 | Size 2 | Comparisons (match %) | Total Time (simat/solv) | Throughput (1e6 cmp/s) + 1000 | 1000 | 1e6 (49.59%) | 0.293s (89.7% / 10.3%) | 3.812 + 2000 | 2000 | 4e6 (50.33%) | 1.151s (89.2% / 10.8%) | 3.899 + 3000 | 3000 | 9e6 (50.94%) | 2.611s (88.7% / 11.3%) | 3.886 + 4000 | 4000 | 16e6 (50.54%) | 4.635s (88.3% / 11.7%) | 3.910 + + Threshold: 0.7 + Size 1 | Size 2 | Comparisons (match %) | Total Time (simat/solv) | Throughput (1e6 cmp/s) + 1000 | 1000 | 1e6 ( 0.01%) | 0.018s (99.8% / 0.2%) | 54.846 + 2000 | 2000 | 4e6 ( 0.01%) | 0.067s (99.9% / 0.1%) | 59.983 + 3000 | 3000 | 9e6 ( 0.01%) | 0.131s (99.8% / 0.2%) | 68.958 + 4000 | 4000 | 16e6 ( 0.01%) | 0.219s (99.9% / 0.1%) | 73.092 + 5000 | 5000 | 25e6 ( 0.01%) | 0.333s (99.9% / 0.1%) | 75.280 + 6000 | 6000 | 36e6 ( 0.01%) | 0.472s (99.9% / 0.1%) | 76.373 + 7000 | 7000 | 49e6 ( 0.01%) | 0.629s (99.9% / 0.1%) | 78.030 + 8000 | 8000 | 64e6 ( 0.01%) | 0.809s (99.9% / 0.1%) | 79.255 + 9000 | 9000 | 81e6 ( 0.01%) | 1.024s (99.9% / 0.1%) | 79.212 + 10000 | 10000 | 100e6 ( 0.01%) | 1.386s (99.9% / 0.1%) | 72.233 + 20000 | 20000 | 400e6 ( 0.01%) | 4.932s (99.9% / 0.1%) | 81.185 + +The tables are interpreted as follows. The first section compares the +bandwidth doing popcounts through (i) the Python bitarray library and +(ii) a native code implementation in assembler. The latter +implementation is measured in two ways: the first measures just the +time taken to compute the popcounts, while the second includes the +time taken to copy the data out of the running Python instance as well +as copying the result back into Python. 
The "% copying" measure is the +proportion of time spent doing this copying. + +The second section includes two tables that measure the throughput of +the Dice coefficient comparison function. The two tables correspond to +two different choices of "matching threshold", 0.5 and 0.7, which were +chosen to characterise two different performance scenarios. Since the +data used for comparisons is randomly generated, the first threshold +value will cause about 50% of the candidates to "match", while the +second threshold value will cause <0.01% of the candidates to match +(these values are reported in the "match %" column). In the first +case, the large number of matches means that much of the time is spent +keeping the candidates in order so that the top `k` matches can be +returned. In the latter case, the tiny number of candidate matches +means that the throughput is determined primarily by the comparison +code itself. + +Finally, the Total Time column includes indications as to the +proportion of time spent calculating the (sparse) similarity matrix +(`simat`) and the proportion of time spent in the greedy solver +(`solv`). This latter is determined by the size of the similarity +matrix, which will be approximately `#comparisons * match% / 100`. Tests ===== From 6fe3663972842255d97a288952b9865b0cae2e44 Mon Sep 17 00:00:00 2001 From: Hamish Ivey-Law Date: Mon, 5 Feb 2018 15:30:04 +1100 Subject: [PATCH 13/49] Bring test suite up-to-date. 
--- anonlink/benchmark.py | 14 +++++++------- tests/test_benchmark.py | 5 +---- 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/anonlink/benchmark.py b/anonlink/benchmark.py index c4c78819..5a43c101 100644 --- a/anonlink/benchmark.py +++ b/anonlink/benchmark.py @@ -31,9 +31,9 @@ def compute_popcount_speed(n): popcounts = popcount_vector(clks, use_native=False) end = timer() elapsed_time = end - start - speed_in_MiB = clks_MiB / elapsed_time + python_speed_in_MiB = clks_MiB / elapsed_time print("Python (bitarray.count()): | {:7.2f} | {:9.2f} " - .format(elapsed_time * 1e3, speed_in_MiB)) + .format(elapsed_time * 1e3, python_speed_in_MiB)) # Native start = timer() @@ -42,14 +42,14 @@ def compute_popcount_speed(n): elapsed_time = end - start elapsed_nocopy = ms / 1e3 copy_percent = 100*(elapsed_time - elapsed_nocopy) / elapsed_time - speed_in_MiB = clks_MiB / elapsed_time - speed_in_MiB_nocopy = clks_MiB / elapsed_nocopy + native_speed_in_MiB = clks_MiB / elapsed_time + native_speed_in_MiB_nocopy = clks_MiB / elapsed_nocopy print("Native code (no copy): | {:7.2f} | {:9.2f} " - .format(ms, speed_in_MiB_nocopy)) + .format(ms, native_speed_in_MiB_nocopy)) print("Native code (w/ copy): | {:7.2f} | {:9.2f} ({:.1f}% copying)" - .format(elapsed_time * 1e3, speed_in_MiB, copy_percent)) + .format(elapsed_time * 1e3, native_speed_in_MiB, copy_percent)) - return speed_in_MiB + return python_speed_in_MiB def print_comparison_header(threshold): diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py index acc6fc06..cb6f0a2f 100644 --- a/tests/test_benchmark.py +++ b/tests/test_benchmark.py @@ -11,10 +11,7 @@ def test_benchmarking_popcount(self): self.assertGreater(speed, 50, "Popcounting at less than 50MiB/s") def test_comparison_speed_benchmark(self): - benchmark.compute_comparison_speed() - - def test_parallel_comparison_speed_benchmark(self): - benchmark.compute_comparison_speed_parallel() + benchmark.compute_comparison_speed(100, 100, 0.7) def 
test_comparing_python_c_bench(self): benchmark.compare_python_c(500, 30, frac=0.8) From 66d9b6e644606216e5a9b07d6b42067ae96e5b0a Mon Sep 17 00:00:00 2001 From: Hamish Ivey-Law Date: Mon, 15 Jan 2018 12:03:36 +1100 Subject: [PATCH 14/49] Refactor main C++ function to avoid use "constant" memory and avoid new/delete. --- _cffi_build/dice_one_against_many.cpp | 31 +++++++++------------------ 1 file changed, 10 insertions(+), 21 deletions(-) diff --git a/_cffi_build/dice_one_against_many.cpp b/_cffi_build/dice_one_against_many.cpp index 23d5c6bd..66565fdd 100644 --- a/_cffi_build/dice_one_against_many.cpp +++ b/_cffi_build/dice_one_against_many.cpp @@ -177,9 +177,6 @@ extern "C" std::priority_queue, score_cmp> max_k_scores; uint32_t count_one = builtin_popcnt_unrolled_errata_manual(comp1); - - double *all_scores = new double[n]; - uint32_t max_popcnt_delta = 1024; if(threshold > 0) { max_popcnt_delta = calculate_max_difference(count_one, threshold); @@ -188,32 +185,24 @@ extern "C" for (int j = 0; j < n; j++) { const uint64_t *current = comp2 + j * KEYWORDS; + const uint32_t counts_many_j = counts_many[j]; - if(count_one > counts_many[j]){ - current_delta = count_one - counts_many[j]; + if (count_one > counts_many_j) { + current_delta = count_one - counts_many_j; } else { - current_delta = counts_many[j] - count_one; + current_delta = counts_many_j - count_one; } if(current_delta <= max_popcnt_delta){ - all_scores[j] = dice_coeff(comp1, count_one, current, counts_many[j]); - } else { - // Skipping because popcount difference too large - all_scores[j] = -1; - } - } - - for (int j = 0; j < n; j++) { - - if(all_scores[j] >= threshold) { - max_k_scores.push(Node(j, all_scores[j])); + double score = dice_coeff(comp1, count_one, current, counts_many[j]); + if (score >= threshold) { + max_k_scores.push(Node(j, score)); + if (max_k_scores.size() > k) + max_k_scores.pop(); + } } - - if(max_k_scores.size() > k) max_k_scores.pop(); } - delete[] all_scores; - int i = 0; while 
(!max_k_scores.empty()) { From 3a55dc4145c67f7eafc46b92bf853549511776d1 Mon Sep 17 00:00:00 2001 From: Hamish Ivey-Law Date: Tue, 6 Feb 2018 22:00:18 +1100 Subject: [PATCH 15/49] Screw everything up by unrolling with C++ templates, apparently. --- _cffi_build/dice_one_against_many.cpp | 29 ++++++++++++++------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/_cffi_build/dice_one_against_many.cpp b/_cffi_build/dice_one_against_many.cpp index 72f80d84..b4302578 100644 --- a/_cffi_build/dice_one_against_many.cpp +++ b/_cffi_build/dice_one_against_many.cpp @@ -8,16 +8,17 @@ template -uint32_t popcount(const uint64_t *buf) { - return popcount<4>(buf) + popcount(buf + 4); +void popcount(uint64_t &c0, uint64_t &c1, uint64_t &c2, uint64_t &c3, const uint64_t *buf) { + popcount<4>(c0, c1, c2, c3, buf); + popcount(c0, c1, c2, c3, buf + 4); } template<> -uint32_t popcount<4>(const uint64_t* buf) { - uint64_t b0, b1, b2, b3; - uint64_t c0, c1, c2, c3; - c0 = c1 = c2 = c3 = 0; +void popcount<4>( + uint64_t &c0, uint64_t &c1, uint64_t &c2, uint64_t &c3, + const uint64_t* buf) { + uint64_t b0, b1, b2, b3; b0 = buf[0]; b1 = buf[1]; b2 = buf[2]; b3 = buf[3]; __asm__( "popcnt %4, %4 \n\t" @@ -30,8 +31,6 @@ uint32_t popcount<4>(const uint64_t* buf) { "add %7, %3 \n\t" : "+r" (c0), "+r" (c1), "+r" (c2), "+r" (c3), "+r" (b0), "+r" (b1), "+r" (b2), "+r" (b3)); - - return c0 + c1 + c2 + c3; } static uint32_t @@ -40,10 +39,11 @@ popcount_array(const uint64_t *buf, int n) { // iteration. Currently 16, which corresponds to 16*64 = 1024 bits. 
static constexpr int WORDS_PER_POPCOUNT = 16; assert(n % WORDS_PER_POPCOUNT == 0); - uint32_t pc = 0; + uint64_t c0, c1, c2, c3; + c0 = c1 = c2 = c3 = 0; for (int i = 0; i < n; i += WORDS_PER_POPCOUNT) - pc += popcount(buf + i); - return pc; + popcount(c0, c1, c2, c3, buf + i); + return c0 + c1 + c2 + c3; } static uint32_t @@ -52,14 +52,15 @@ popcount_combined_array(const uint64_t *__restrict__ buf1, const uint64_t *__res // iteration. Currently 16, which corresponds to 16*64 = 1024 bits. static constexpr int WORDS_PER_POPCOUNT = 16; assert(n % WORDS_PER_POPCOUNT == 0); - uint32_t pc = 0; uint64_t combined[WORDS_PER_POPCOUNT]; + uint64_t c0, c1, c2, c3; + c0 = c1 = c2 = c3 = 0; for (int i = 0; i < n; i += WORDS_PER_POPCOUNT) { for (int j = 0; j < WORDS_PER_POPCOUNT; ++j) combined[j] = buf1[i + j] & buf2[i + j]; - pc += popcount(combined); + popcount(c0, c1, c2, c3, combined); } - return pc; + return c0 + c1 + c2 + c3; } /** From b94c555f0bdf1168d786097493379b30c31136a2 Mon Sep 17 00:00:00 2001 From: Hamish Ivey-Law Date: Wed, 7 Feb 2018 12:54:45 +1100 Subject: [PATCH 16/49] Magical argument that makes the compiler generate the correct (performant) code. --- _cffi_build/build_matcher.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/_cffi_build/build_matcher.py b/_cffi_build/build_matcher.py index cebbf15f..8592e183 100644 --- a/_cffi_build/build_matcher.py +++ b/_cffi_build/build_matcher.py @@ -15,7 +15,8 @@ "_entitymatcher", source, source_extension='.cpp', - extra_compile_args=['-Wall', '-Wextra', '-Werror', '-O3', '-std=c++11', '-mssse3', '-mpopcnt'], + extra_compile_args=['-Wall', '-Wextra', '-Werror', '-O3', '-std=c++11', '-mssse3', '-mpopcnt', '-fvisibility=hidden' + ], ) ffibuilder.cdef(""" From 166f6e95058628971945469e3c72cde578acb35a Mon Sep 17 00:00:00 2001 From: Hamish Ivey-Law Date: Wed, 7 Feb 2018 13:45:15 +1100 Subject: [PATCH 17/49] Address Brian's comments. 
--- README.rst | 64 +++++++++++++++++++++++-------------------- anonlink/benchmark.py | 16 +++++------ anonlink/util.py | 22 +++++++++------ 3 files changed, 54 insertions(+), 48 deletions(-) diff --git a/README.rst b/README.rst index 8fe60808..f3c42eae 100644 --- a/README.rst +++ b/README.rst @@ -46,36 +46,39 @@ Benchmark You can run the benchmark with: :: - - $ python -m anonlink.benchmark + $ python3 -m anonlink.benchmark Anonlink benchmark -- see README for explanation ------------------------------------------------ 100000 x 1024 bit popcounts Implementation | Time (ms) | Bandwidth (MiB/s) - Python (bitarray.count()): | 20.83 | 586.12 - Native code (no copy): | 0.91 | 13443.87 - Native code (w/ copy): | 381.83 | 31.97 (99.8% copying) + Python (bitarray.count()): | 18.40 | 663.30 + Native code (no copy): | 0.97 | 12558.67 + Native code (w/ copy): | 347.66 | 35.11 (99.7% copying) Threshold: 0.5 - Size 1 | Size 2 | Comparisons (match %) | Total Time (simat/solv) | Throughput (1e6 cmp/s) - 1000 | 1000 | 1e6 (49.59%) | 0.293s (89.7% / 10.3%) | 3.812 - 2000 | 2000 | 4e6 (50.33%) | 1.151s (89.2% / 10.8%) | 3.899 - 3000 | 3000 | 9e6 (50.94%) | 2.611s (88.7% / 11.3%) | 3.886 - 4000 | 4000 | 16e6 (50.54%) | 4.635s (88.3% / 11.7%) | 3.910 + Size 1 | Size 2 | Comparisons | Total Time (s) | Throughput + | | (match %) | (comparisons / matching)| (1e6 cmp/s) + -------+--------+------------------+-------------------------+------------- + 1000 | 1000 | 1e6 (50.20%) | 0.249 (88.6% / 11.4%) | 4.525 + 2000 | 2000 | 4e6 (50.51%) | 1.069 (88.5% / 11.5%) | 4.227 + 3000 | 3000 | 9e6 (50.51%) | 2.412 (85.3% / 14.7%) | 4.375 + 4000 | 4000 | 16e6 (50.56%) | 4.316 (83.6% / 16.4%) | 4.434 Threshold: 0.7 - Size 1 | Size 2 | Comparisons (match %) | Total Time (simat/solv) | Throughput (1e6 cmp/s) - 1000 | 1000 | 1e6 ( 0.01%) | 0.018s (99.8% / 0.2%) | 54.846 - 2000 | 2000 | 4e6 ( 0.01%) | 0.067s (99.9% / 0.1%) | 59.983 - 3000 | 3000 | 9e6 ( 0.01%) | 0.131s (99.8% / 0.2%) | 68.958 - 4000 | 
4000 | 16e6 ( 0.01%) | 0.219s (99.9% / 0.1%) | 73.092 - 5000 | 5000 | 25e6 ( 0.01%) | 0.333s (99.9% / 0.1%) | 75.280 - 6000 | 6000 | 36e6 ( 0.01%) | 0.472s (99.9% / 0.1%) | 76.373 - 7000 | 7000 | 49e6 ( 0.01%) | 0.629s (99.9% / 0.1%) | 78.030 - 8000 | 8000 | 64e6 ( 0.01%) | 0.809s (99.9% / 0.1%) | 79.255 - 9000 | 9000 | 81e6 ( 0.01%) | 1.024s (99.9% / 0.1%) | 79.212 - 10000 | 10000 | 100e6 ( 0.01%) | 1.386s (99.9% / 0.1%) | 72.233 - 20000 | 20000 | 400e6 ( 0.01%) | 4.932s (99.9% / 0.1%) | 81.185 + Size 1 | Size 2 | Comparisons | Total Time (s) | Throughput + | | (match %) | (comparisons / matching)| (1e6 cmp/s) + -------+--------+------------------+-------------------------+------------- + 1000 | 1000 | 1e6 ( 0.01%) | 0.017 (99.8% / 0.2%) | 59.605 + 2000 | 2000 | 4e6 ( 0.01%) | 0.056 (99.8% / 0.2%) | 71.484 + 3000 | 3000 | 9e6 ( 0.01%) | 0.118 (99.9% / 0.1%) | 76.500 + 4000 | 4000 | 16e6 ( 0.01%) | 0.202 (99.9% / 0.1%) | 79.256 + 5000 | 5000 | 25e6 ( 0.01%) | 0.309 (99.9% / 0.1%) | 81.093 + 6000 | 6000 | 36e6 ( 0.01%) | 0.435 (99.9% / 0.1%) | 82.841 + 7000 | 7000 | 49e6 ( 0.01%) | 0.590 (99.9% / 0.1%) | 83.164 + 8000 | 8000 | 64e6 ( 0.01%) | 0.757 (99.9% / 0.1%) | 84.619 + 9000 | 9000 | 81e6 ( 0.01%) | 0.962 (99.8% / 0.2%) | 84.358 + 10000 | 10000 | 100e6 ( 0.01%) | 1.166 (99.8% / 0.2%) | 85.895 + 20000 | 20000 | 400e6 ( 0.01%) | 4.586 (99.9% / 0.1%) | 87.334 The tables are interpreted as follows. The first section compares the bandwidth doing popcounts through (i) the Python bitarray library and @@ -93,17 +96,18 @@ chosen to characterise two different performance scenarios. Since the data used for comparisons is randomly generated, the first threshold value will cause about 50% of the candidates to "match", while the second threshold value will cause <0.01% of the candidates to match -(these values are reported in the "match %" column). 
In the first -case, the large number of matches means that much of the time is spent -keeping the candidates in order so that the top `k` matches can be -returned. In the latter case, the tiny number of candidate matches -means that the throughput is determined primarily by the comparison -code itself. +(these values are reported in the "match %" column). In both cases, +all matches above the threshold are returned and passed to the +solver. In the first case, the large number of matches means that much +of the time is spent keeping the candidates in order so that the top +`k` matches can be returned. In the latter case, the tiny number of +candidate matches means that the throughput is determined primarily by +the comparison code itself. Finally, the Total Time column includes indications as to the proportion of time spent calculating the (sparse) similarity matrix -(`simat`) and the proportion of time spent in the greedy solver -(`solv`). This latter is determined by the size of the similarity +`comparisons` and the proportion of time spent `matching` in the +greedy solver. This latter is determined by the size of the similarity matrix, which will be approximately `#comparisons * match% / 100`. 
Tests diff --git a/anonlink/benchmark.py b/anonlink/benchmark.py index 5a43c101..14bf38e4 100644 --- a/anonlink/benchmark.py +++ b/anonlink/benchmark.py @@ -27,25 +27,21 @@ def compute_popcount_speed(n): print("Implementation | Time (ms) | Bandwidth (MiB/s)") # Python - start = timer() - popcounts = popcount_vector(clks, use_native=False) - end = timer() - elapsed_time = end - start + popcounts, elapsed_time = popcount_vector(clks, use_python=True) python_speed_in_MiB = clks_MiB / elapsed_time print("Python (bitarray.count()): | {:7.2f} | {:9.2f} " .format(elapsed_time * 1e3, python_speed_in_MiB)) # Native start = timer() - popcounts, ms = popcount_vector(clks, use_native=True) + popcounts, elapsed_nocopy = popcount_vector(clks, use_python=False) end = timer() elapsed_time = end - start - elapsed_nocopy = ms / 1e3 copy_percent = 100*(elapsed_time - elapsed_nocopy) / elapsed_time native_speed_in_MiB = clks_MiB / elapsed_time native_speed_in_MiB_nocopy = clks_MiB / elapsed_nocopy print("Native code (no copy): | {:7.2f} | {:9.2f} " - .format(ms, native_speed_in_MiB_nocopy)) + .format(elapsed_nocopy * 1e3, native_speed_in_MiB_nocopy)) print("Native code (w/ copy): | {:7.2f} | {:9.2f} ({:.1f}% copying)" .format(elapsed_time * 1e3, native_speed_in_MiB, copy_percent)) @@ -54,7 +50,9 @@ def compute_popcount_speed(n): def print_comparison_header(threshold): print("\nThreshold:", threshold) - print("Size 1 | Size 2 | Comparisons (match %) | Total Time (simat/solv) | Throughput (1e6 cmp/s)") + print("Size 1 | Size 2 | Comparisons | Total Time (s) | Throughput") + print(" | | (match %) | (comparisons / matching)| (1e6 cmp/s)") + print("-------+--------+------------------+-------------------------+-------------") def compute_comparison_speed(n1, n2, threshold): @@ -74,7 +72,7 @@ def compute_comparison_speed(n1, n2, threshold): similarity_time = t1 - start solver_time = end - t1 elapsed_time = end - start - print("{:6d} | {:6d} | {:6d}e6 ({:5.2f}%) | {:6.3f}s ({:4.1f}% / 
{:4.1f}%) | {:8.3f}".format( + print("{:6d} | {:6d} | {:4d}e6 ({:5.2f}%) | {:6.3f} ({:4.1f}% / {:4.1f}%) | {:8.3f}".format( n1, n2, n1*n2 // 1000000, 100.0*len(sparse_matrix)/(n1*n2), elapsed_time, diff --git a/anonlink/util.py b/anonlink/util.py index dfadfc54..b0f1cb2c 100644 --- a/anonlink/util.py +++ b/anonlink/util.py @@ -3,6 +3,7 @@ import os import random from bitarray import bitarray +from timeit import default_timer as timer from anonlink._entitymatcher import ffi, lib @@ -20,20 +21,23 @@ def generate_clks(n): return res -def popcount_vector(bitarrays, use_native=False): - """Return an array containing the popcounts of the elements of - bitarrays. If use_native is True, use the native code - implementation and return the time spent (in milliseconds) in the - native code as a second return value. +def popcount_vector(bitarrays, use_python=True): + """Return a list containing the popcounts of the elements of + bitarrays, and the time (in seconds) it took. If use_python is + False, use the native code implementation instead of Python; in + this case the returned time is the time spent in the native code, + NOT including copying to and from the Python runtime. 
Note, due to the overhead of converting bitarrays into bytes, it is currently more expensive to call our C implementation than just calling bitarray.count() - """ # Use Python - if not use_native: - return [clk.count() for clk in bitarrays] + if use_python: + start = timer() + counts = [clk.count() for clk in bitarrays] + elapsed = timer() - start + return counts, elapsed # Use native code n = len(bitarrays) @@ -42,7 +46,7 @@ def popcount_vector(bitarrays, use_native=False): bytes([b for f in bitarrays for b in f.tobytes()])) ms = lib.popcount_1024_array(many, n, c_popcounts) - return [c_popcounts[i] for i in range(n)], ms + return [c_popcounts[i] for i in range(n)], ms * 1e-3 def chunks(l, n): From 9cbc2432f5deb7a99ffdbb6ade773be05a0a1164 Mon Sep 17 00:00:00 2001 From: Hamish Ivey-Law Date: Wed, 7 Feb 2018 13:53:06 +1100 Subject: [PATCH 18/49] Update tests; also test native code version. --- tests/test_util.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tests/test_util.py b/tests/test_util.py index 521cada5..2dae61d5 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -21,8 +21,13 @@ def test_generate_clks(self): def test_popcount_vector(self): bas = [util.generate_bitarray(1024) for i in range(100)] - popcounts = util.popcount_vector(bas) + popcounts, _ = util.popcount_vector(bas, use_python=True) + self.assertEquals(len(popcounts), 100) + for i, cnt in enumerate(popcounts): + self.assertEquals(cnt, bas[i].count()) + + popcounts, _ = util.popcount_vector(bas, use_python=False) self.assertEquals(len(popcounts), 100) for i, cnt in enumerate(popcounts): self.assertEquals(cnt, bas[i].count()) From cf26901cb9734e5b7160a245d727fbd4a5a018d0 Mon Sep 17 00:00:00 2001 From: Hamish Ivey-Law Date: Fri, 9 Feb 2018 10:56:45 +1100 Subject: [PATCH 19/49] Print popcount throughput; give some variables better names. 
--- anonlink/benchmark.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/anonlink/benchmark.py b/anonlink/benchmark.py index 14bf38e4..2fd0e438 100644 --- a/anonlink/benchmark.py +++ b/anonlink/benchmark.py @@ -24,13 +24,15 @@ def compute_popcount_speed(n): clks = [generate_bitarray(clk_bits) for _ in range(n)] print("{:6d} x {:d} bit popcounts".format(n, clk_bits)) - print("Implementation | Time (ms) | Bandwidth (MiB/s)") + print("Implementation | Time (ms) | Bandwidth (MiB/s) | Throughput (1e6 popc/s)") # Python popcounts, elapsed_time = popcount_vector(clks, use_python=True) python_speed_in_MiB = clks_MiB / elapsed_time - print("Python (bitarray.count()): | {:7.2f} | {:9.2f} " - .format(elapsed_time * 1e3, python_speed_in_MiB)) + python_Mops = n / (1e6 * elapsed_time) + elapsed_time_ms = elapsed_time * 1e3 + print("Python (bitarray.count()): | {:7.2f} | {:9.2f} | {:7.2f}" + .format(elapsed_time_ms, python_speed_in_MiB, python_Mops)) # Native start = timer() @@ -38,12 +40,16 @@ def compute_popcount_speed(n): end = timer() elapsed_time = end - start copy_percent = 100*(elapsed_time - elapsed_nocopy) / elapsed_time + elapsed_time_ms = elapsed_time * 1e3 + elapsed_nocopy_ms = elapsed_nocopy * 1e3 native_speed_in_MiB = clks_MiB / elapsed_time native_speed_in_MiB_nocopy = clks_MiB / elapsed_nocopy - print("Native code (no copy): | {:7.2f} | {:9.2f} " - .format(elapsed_nocopy * 1e3, native_speed_in_MiB_nocopy)) - print("Native code (w/ copy): | {:7.2f} | {:9.2f} ({:.1f}% copying)" - .format(elapsed_time * 1e3, native_speed_in_MiB, copy_percent)) + native_Mops = n / (1e6 * elapsed_time) + native_Mops_nocopy = n / (1e6 * elapsed_nocopy) + print("Native code (no copy): | {:7.2f} | {:9.2f} | {:7.2f}" + .format(elapsed_nocopy_ms, native_speed_in_MiB_nocopy, native_Mops_nocopy)) + print("Native code (w/ copy): | {:7.2f} | {:9.2f} | {:7.2f} ({:.1f}% copying)" + .format(elapsed_time_ms, native_speed_in_MiB, native_Mops, copy_percent)) 
return python_speed_in_MiB From d02f23a06d4be731ae444ee3d431f59298b688a5 Mon Sep 17 00:00:00 2001 From: Hamish Ivey-Law Date: Fri, 9 Feb 2018 11:59:13 +1100 Subject: [PATCH 20/49] Make some functions static inline. --- _cffi_build/dice_one_against_many.cpp | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/_cffi_build/dice_one_against_many.cpp b/_cffi_build/dice_one_against_many.cpp index b4302578..66f0f94d 100644 --- a/_cffi_build/dice_one_against_many.cpp +++ b/_cffi_build/dice_one_against_many.cpp @@ -108,12 +108,13 @@ struct score_cmp{ /** * */ -static uint32_t calculate_max_difference(uint32_t popcnt_a, double threshold) +static inline uint32_t +calculate_max_difference(uint32_t popcnt_a, double threshold) { return 2 * popcnt_a * (1/threshold - 1); } -static double +static inline double dice_coeff(const uint64_t *u, uint32_t u_popc, const uint64_t *v, uint32_t v_popc, int n) { uint32_t uv_popc = popcount_combined_array(u, v, n); @@ -126,12 +127,20 @@ dice_coeff(const uint64_t *u, uint32_t u_popc, const uint64_t *v, uint32_t v_pop * t should have been obtained as the difference of calls to clock() * for this to make sense. */ -static inline double to_millis(clock_t t) +static inline double +to_millis(clock_t t) { static constexpr double CPS = (double)CLOCKS_PER_SEC; return t * 1.0E3 / CPS; } +static inline uint32_t +abs_diff(uint32_t a, uint32_t b) { + if (a > b) + return a - b; + return b - a; +} + extern "C" { /** From 888e989bedd16297a521cbae74c77fa8e26ddaaf Mon Sep 17 00:00:00 2001 From: Hamish Ivey-Law Date: Fri, 9 Feb 2018 13:19:08 +1100 Subject: [PATCH 21/49] Tidy up some expressions. 
--- _cffi_build/dice_one_against_many.cpp | 33 +++++++++++---------------- 1 file changed, 13 insertions(+), 20 deletions(-) diff --git a/_cffi_build/dice_one_against_many.cpp b/_cffi_build/dice_one_against_many.cpp index 66f0f94d..48628496 100644 --- a/_cffi_build/dice_one_against_many.cpp +++ b/_cffi_build/dice_one_against_many.cpp @@ -167,8 +167,8 @@ extern "C" /** * Calculate up to the top k indices and scores. Returns the - * number matched above a threshold or -1 if keybytes is not a - * multiple of 32. + * number matched above the given threshold or -1 if keybytes is + * not a multiple of 8. */ int match_one_against_many_dice_k_top( const char *one, @@ -184,35 +184,28 @@ extern "C" const uint64_t *comp1 = (const uint64_t *) one; const uint64_t *comp2 = (const uint64_t *) many; - // FIXME: This comment needs to be updated - // keybytes must be divisible by 32, because keywords must be - // divisible by 4 for the builtin popcount function to work - // and keywords = keybytes / 8. static constexpr int WORDBYTES = sizeof(uint64_t); - int keywords = keybytes / WORDBYTES; - if (keywords % 16 != 0) + if (keybytes % WORDBYTES != 0) return -1; + int keywords = keybytes / WORDBYTES; - std::priority_queue, score_cmp> max_k_scores; + typedef std::vector node_vector; + typedef std::priority_queue, score_cmp> node_queue; + node_vector vec; + vec.reserve(k + 1); + node_queue max_k_scores(score_cmp(), std::move(vec)); uint32_t count_one = popcount_array(comp1, keywords); - uint32_t max_popcnt_delta = keywords * WORDBYTES * 8; // = bits per key + uint32_t max_popcnt_delta = keybytes * 8; // = bits per key if(threshold > 0) { max_popcnt_delta = calculate_max_difference(count_one, threshold); } - for (int j = 0; j < n; j++) { - const uint64_t *current = comp2 + j * keywords; + const uint64_t *current = comp2; + for (int j = 0; j < n; j++, current += keywords) { const uint32_t counts_many_j = counts_many[j]; - uint32_t current_delta; - - if (count_one > counts_many_j) { - 
current_delta = count_one - counts_many_j; - } else { - current_delta = counts_many_j - count_one; - } - if (current_delta <= max_popcnt_delta) { + if (abs_diff(count_one, counts_many_j) <= max_popcnt_delta) { double score = dice_coeff(comp1, count_one, current, counts_many_j, keywords); if (score >= threshold) { max_k_scores.push(Node(j, score)); From c6780f0d1358384be2c640fa53f471ea5432af22 Mon Sep 17 00:00:00 2001 From: Hamish Ivey-Law Date: Fri, 9 Feb 2018 13:21:34 +1100 Subject: [PATCH 22/49] Put some braces in the right place; make fn inline. --- _cffi_build/dice_one_against_many.cpp | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/_cffi_build/dice_one_against_many.cpp b/_cffi_build/dice_one_against_many.cpp index 48628496..5b56de02 100644 --- a/_cffi_build/dice_one_against_many.cpp +++ b/_cffi_build/dice_one_against_many.cpp @@ -33,7 +33,7 @@ void popcount<4>( "+r" (b0), "+r" (b1), "+r" (b2), "+r" (b3)); } -static uint32_t +static inline uint32_t popcount_array(const uint64_t *buf, int n) { // WORDS_PER_POPCOUNT is how many elements of buf we process each // iteration. Currently 16, which corresponds to 16*64 = 1024 bits. 
@@ -93,13 +93,11 @@ class Node { // Constructor with default Node( int n_index = -1, double n_score = -1.0 ) - :index(n_index), score( n_score ) - { - } + :index(n_index), score( n_score ) { } }; struct score_cmp{ - bool operator()(const Node& a, const Node& b) const{ + bool operator()(const Node& a, const Node& b) const { return a.score > b.score; } }; @@ -109,14 +107,12 @@ struct score_cmp{ * */ static inline uint32_t -calculate_max_difference(uint32_t popcnt_a, double threshold) -{ +calculate_max_difference(uint32_t popcnt_a, double threshold) { return 2 * popcnt_a * (1/threshold - 1); } static inline double -dice_coeff(const uint64_t *u, uint32_t u_popc, const uint64_t *v, uint32_t v_popc, int n) -{ +dice_coeff(const uint64_t *u, uint32_t u_popc, const uint64_t *v, uint32_t v_popc, int n) { uint32_t uv_popc = popcount_combined_array(u, v, n); return (2 * uv_popc) / (double) (u_popc + v_popc); } @@ -128,8 +124,7 @@ dice_coeff(const uint64_t *u, uint32_t u_popc, const uint64_t *v, uint32_t v_pop * for this to make sense. */ static inline double -to_millis(clock_t t) -{ +to_millis(clock_t t) { static constexpr double CPS = (double)CLOCKS_PER_SEC; return t * 1.0E3 / CPS; } From 3f1104f09c05e5eb2c3809a00b594e8985e377cd Mon Sep 17 00:00:00 2001 From: Hamish Ivey-Law Date: Fri, 9 Feb 2018 13:33:09 +1100 Subject: [PATCH 23/49] Reinstate comment on origin of popcount assembler. 
--- _cffi_build/dice_one_against_many.cpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/_cffi_build/dice_one_against_many.cpp b/_cffi_build/dice_one_against_many.cpp index 5b56de02..2e9fc049 100644 --- a/_cffi_build/dice_one_against_many.cpp +++ b/_cffi_build/dice_one_against_many.cpp @@ -13,6 +13,16 @@ void popcount(uint64_t &c0, uint64_t &c1, uint64_t &c2, uint64_t &c3, const uint popcount(c0, c1, c2, c3, buf + 4); } +// Source: http://danluu.com/assembly-intrinsics/ +// https://stackoverflow.com/questions/25078285/replacing-a-32-bit-loop-count-variable-with-64-bit-introduces-crazy-performance +// +// NB: Dan Luu's original assembly is incorrect because it +// clobbers registers marked as "input only" (see warning at +// https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html#InputOperands +// -- this mistake does not materialise with GCC (4.9), but it +// does with Clang (3.6 and 3.8)). We fix the mistake by +// explicitly loading the contents of buf into registers and using +// these same registers for the intermediate popcnts. template<> void popcount<4>( uint64_t &c0, uint64_t &c1, uint64_t &c2, uint64_t &c3, From edc7c2bee494db412a255b7e92a5a6402f89b1bc Mon Sep 17 00:00:00 2001 From: Hamish Ivey-Law Date: Fri, 9 Feb 2018 15:15:07 +1100 Subject: [PATCH 24/49] Make constant a template parameter. --- _cffi_build/dice_one_against_many.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/_cffi_build/dice_one_against_many.cpp b/_cffi_build/dice_one_against_many.cpp index 2e9fc049..2c9d408b 100644 --- a/_cffi_build/dice_one_against_many.cpp +++ b/_cffi_build/dice_one_against_many.cpp @@ -43,11 +43,11 @@ void popcount<4>( "+r" (b0), "+r" (b1), "+r" (b2), "+r" (b3)); } -static inline uint32_t +// WORDS_PER_POPCOUNT is how many elements of buf we process each +// iteration. Currently 16, which corresponds to 16*64 = 1024 bits. 
+template< int WORDS_PER_POPCOUNT = 16 > +uint32_t popcount_array(const uint64_t *buf, int n) { - // WORDS_PER_POPCOUNT is how many elements of buf we process each - // iteration. Currently 16, which corresponds to 16*64 = 1024 bits. - static constexpr int WORDS_PER_POPCOUNT = 16; assert(n % WORDS_PER_POPCOUNT == 0); uint64_t c0, c1, c2, c3; c0 = c1 = c2 = c3 = 0; @@ -56,11 +56,11 @@ popcount_array(const uint64_t *buf, int n) { return c0 + c1 + c2 + c3; } -static uint32_t +// WORDS_PER_POPCOUNT is how many elements of buf we process each +// iteration. Currently 16, which corresponds to 16*64 = 1024 bits. +template< int WORDS_PER_POPCOUNT = 16 > +uint32_t popcount_combined_array(const uint64_t *__restrict__ buf1, const uint64_t *__restrict__ buf2, int n) { - // WORDS_PER_POPCOUNT is how many elements of buf we process each - // iteration. Currently 16, which corresponds to 16*64 = 1024 bits. - static constexpr int WORDS_PER_POPCOUNT = 16; assert(n % WORDS_PER_POPCOUNT == 0); uint64_t combined[WORDS_PER_POPCOUNT]; uint64_t c0, c1, c2, c3; From 892c599167a47c26e331400b11a98951a287f5fe Mon Sep 17 00:00:00 2001 From: Hamish Ivey-Law Date: Thu, 15 Feb 2018 11:12:28 +1100 Subject: [PATCH 25/49] Comment. --- _cffi_build/dice_one_against_many.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/_cffi_build/dice_one_against_many.cpp b/_cffi_build/dice_one_against_many.cpp index 2c9d408b..1c500a2b 100644 --- a/_cffi_build/dice_one_against_many.cpp +++ b/_cffi_build/dice_one_against_many.cpp @@ -194,6 +194,11 @@ extern "C" return -1; int keywords = keybytes / WORDBYTES; + // Here we create max_k_scores on the stack by providing it + // with a vector in which to put its elements. We do this so + // that we can reserve the amount of space needed for the + // scores in advance and avoid potential memory reallocation + // and copying. 
typedef std::vector node_vector; typedef std::priority_queue, score_cmp> node_queue; node_vector vec; From f500231263c8bd8a0257e9139e0acb8bf43f1577 Mon Sep 17 00:00:00 2001 From: Hamish Ivey-Law Date: Thu, 15 Feb 2018 18:16:19 +1100 Subject: [PATCH 26/49] Complete version working with multiples of 1024 bits. --- _cffi_build/build_matcher.py | 4 +- _cffi_build/dice_one_against_many.cpp | 187 +++++++++++++++----------- anonlink/bloommatcher.py | 5 +- anonlink/util.py | 2 +- 4 files changed, 113 insertions(+), 85 deletions(-) diff --git a/_cffi_build/build_matcher.py b/_cffi_build/build_matcher.py index 8592e183..9e0c685a 100644 --- a/_cffi_build/build_matcher.py +++ b/_cffi_build/build_matcher.py @@ -22,8 +22,8 @@ ffibuilder.cdef(""" int match_one_against_many_dice(const char * one, const char * many, int n, double * score); int match_one_against_many_dice_k_top(const char *one, const char *many, const uint32_t *counts_many, int n, int keybytes, uint32_t k, double threshold, int *indices, double *scores); - double dice_coeff_1024(const char *e1, const char *e2); - double popcount_1024_array(const char *many, int n, uint32_t *counts_many); + double dice_coeff(const char *array1, const char *array2, int array_bytes); + double popcount_arrays(uint32_t *counts, const char *arrays, int narrays, int array_bytes); """) diff --git a/_cffi_build/dice_one_against_many.cpp b/_cffi_build/dice_one_against_many.cpp index 1c500a2b..a5eeda9c 100644 --- a/_cffi_build/dice_one_against_many.cpp +++ b/_cffi_build/dice_one_against_many.cpp @@ -5,7 +5,13 @@ #include #include #include +#include +// WORDS_PER_POPCOUNT determines how much we unroll the popcounting in +// each iteration of a loop. Currently 16, which corresponds to 16*64 +// = 1024 bits per loop. 
+static constexpr int WORDS_PER_POPCOUNT = 16; +static constexpr int WORD_BYTES = sizeof(uint64_t); template void popcount(uint64_t &c0, uint64_t &c1, uint64_t &c2, uint64_t &c3, const uint64_t *buf) { @@ -16,13 +22,14 @@ void popcount(uint64_t &c0, uint64_t &c1, uint64_t &c2, uint64_t &c3, const uint // Source: http://danluu.com/assembly-intrinsics/ // https://stackoverflow.com/questions/25078285/replacing-a-32-bit-loop-count-variable-with-64-bit-introduces-crazy-performance // -// NB: Dan Luu's original assembly is incorrect because it -// clobbers registers marked as "input only" (see warning at +// NB: Dan Luu's original assembly (and the SO answer it was based on) +// is incorrect because it clobbers registers marked as "input only" +// (see warning at // https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html#InputOperands -// -- this mistake does not materialise with GCC (4.9), but it -// does with Clang (3.6 and 3.8)). We fix the mistake by -// explicitly loading the contents of buf into registers and using -// these same registers for the intermediate popcnts. +// -- this mistake does not materialise with GCC (4.9), but it does +// with Clang (3.6 and 3.8)). We fix the mistake by explicitly +// loading the contents of buf into registers and using these same +// registers for the intermediate popcnts. template<> void popcount<4>( uint64_t &c0, uint64_t &c1, uint64_t &c2, uint64_t &c3, @@ -43,60 +50,46 @@ void popcount<4>( "+r" (b0), "+r" (b1), "+r" (b2), "+r" (b3)); } -// WORDS_PER_POPCOUNT is how many elements of buf we process each -// iteration. Currently 16, which corresponds to 16*64 = 1024 bits. 
-template< int WORDS_PER_POPCOUNT = 16 > -uint32_t -popcount_array(const uint64_t *buf, int n) { - assert(n % WORDS_PER_POPCOUNT == 0); +// "Assumes" WORDS_PER_POPCOUNT divides nwords +static uint32_t +_popcount_array(const uint64_t *array, int nwords) { uint64_t c0, c1, c2, c3; c0 = c1 = c2 = c3 = 0; - for (int i = 0; i < n; i += WORDS_PER_POPCOUNT) - popcount(c0, c1, c2, c3, buf + i); + for (int i = 0; i < nwords; i += WORDS_PER_POPCOUNT) + popcount(c0, c1, c2, c3, array + i); return c0 + c1 + c2 + c3; } -// WORDS_PER_POPCOUNT is how many elements of buf we process each -// iteration. Currently 16, which corresponds to 16*64 = 1024 bits. -template< int WORDS_PER_POPCOUNT = 16 > -uint32_t -popcount_combined_array(const uint64_t *__restrict__ buf1, const uint64_t *__restrict__ buf2, int n) { - assert(n % WORDS_PER_POPCOUNT == 0); +// "Assumes" WORDS_PER_POPCOUNT divides nwords +static uint32_t +_popcount_combined_array( + const uint64_t *array1, + const uint64_t *array2, + int nwords) { uint64_t combined[WORDS_PER_POPCOUNT]; uint64_t c0, c1, c2, c3; c0 = c1 = c2 = c3 = 0; - for (int i = 0; i < n; i += WORDS_PER_POPCOUNT) { + for (int i = 0; i < nwords; i += WORDS_PER_POPCOUNT) { for (int j = 0; j < WORDS_PER_POPCOUNT; ++j) - combined[j] = buf1[i + j] & buf2[i + j]; + combined[j] = array1[i + j] & array2[i + j]; popcount(c0, c1, c2, c3, combined); } return c0 + c1 + c2 + c3; } -/** - * Compute the Dice coefficient similarity measure of two bit patterns. 
- */ -static double -dice_coeff_1024(const char *e1, const char *e2) { - const uint64_t *comp1 = (const uint64_t *) e1; - const uint64_t *comp2 = (const uint64_t *) e2; - - static constexpr int KEYWORDS = 16; - uint32_t count_both = 0; - - count_both += popcount_array(comp1, KEYWORDS); - count_both += popcount_array(comp2, KEYWORDS); - if(count_both == 0) { - return 0.0; - } - uint32_t count_and = popcount_combined_array(comp1, comp2, KEYWORDS); - - return 2 * count_and / (double)count_both; +// "Assumes" WORDS_PER_POPCOUNT divides nwords +// assumes u_popc or v_popc is nonzero. +static inline double +_dice_coeff( + const uint64_t *u, uint32_t u_popc, + const uint64_t *v, uint32_t v_popc, + int nwords) { + uint32_t uv_popc = _popcount_combined_array(u, v, nwords); + return (2 * uv_popc) / (double) (u_popc + v_popc); } class Node { - public: int index; double score; @@ -106,7 +99,7 @@ class Node { :index(n_index), score( n_score ) { } }; -struct score_cmp{ +struct score_cmp { bool operator()(const Node& a, const Node& b) const { return a.score > b.score; } @@ -121,12 +114,6 @@ calculate_max_difference(uint32_t popcnt_a, double threshold) { return 2 * popcnt_a * (1/threshold - 1); } -static inline double -dice_coeff(const uint64_t *u, uint32_t u_popc, const uint64_t *v, uint32_t v_popc, int n) { - uint32_t uv_popc = popcount_combined_array(u, v, n); - return (2 * uv_popc) / (double) (u_popc + v_popc); -} - /** * Convert clock measurement t to milliseconds. * @@ -146,6 +133,7 @@ abs_diff(uint32_t a, uint32_t b) { return b - a; } + extern "C" { /** @@ -160,39 +148,74 @@ extern "C" * * is put in counts_many[i]. 
*/ - double popcount_1024_array(const char *many, int n, uint32_t *counts_many) { - static constexpr int KEYWORDS = 16; + double + popcount_arrays( + uint32_t *counts, + const char *arrays, int narrays, int array_bytes) { + // assumes WORD_BYTES divides array_bytes + int nwords = array_bytes / WORD_BYTES; + const uint64_t *u = reinterpret_cast(arrays); + + // assumes WORD_PER_POPCOUNT divides nwords clock_t t = clock(); - for (int i = 0; i < n; i++) { - const uint64_t *sig = (const uint64_t *) many + i * KEYWORDS; - counts_many[i] = popcount_array(sig, KEYWORDS); - } + for (int i = 0; i < narrays; ++i, u += nwords) + counts[i] = _popcount_array(u, nwords); return to_millis(clock() - t); } + /** + * Compute the Dice coefficient similarity measure of two arrays. + */ + double + dice_coeff( + const char *array1, + const char *array2, + int array_bytes) { + const uint64_t *u, *v; + uint32_t u_popc, v_popc; + // assumes WORD_BYTES divides array_bytes + int nwords = array_bytes / WORD_BYTES; + + u = reinterpret_cast(array1); + v = reinterpret_cast(array2); + + // assumes WORD_PER_POPCOUNT divides array_words + + // If the popcount of one of the arrays is zero, then the + // popcount of the "intersection" (logical AND) will be zero, + // hence the whole Dice coefficient will be zero. + u_popc = _popcount_array(u, nwords); + if (u_popc == 0) + return 0.0; + v_popc = _popcount_array(v, nwords); + if (v_popc == 0) + return 0.0; + + return _dice_coeff(u, u_popc, v, v_popc, nwords); + } + /** * Calculate up to the top k indices and scores. Returns the * number matched above the given threshold or -1 if keybytes is * not a multiple of 8. 
*/ int match_one_against_many_dice_k_top( - const char *one, - const char *many, - const uint32_t *counts_many, - int n, - int keybytes, - uint32_t k, - double threshold, - int *indices, - double *scores) { + const char *one, + const char *many, + const uint32_t *counts_many, + int n, + int keybytes, + uint32_t k, + double threshold, + int *indices, + double *scores) { const uint64_t *comp1 = (const uint64_t *) one; const uint64_t *comp2 = (const uint64_t *) many; - static constexpr int WORDBYTES = sizeof(uint64_t); - if (keybytes % WORDBYTES != 0) + if (keybytes % WORD_BYTES != 0) return -1; - int keywords = keybytes / WORDBYTES; + int keywords = keybytes / WORD_BYTES; // Here we create max_k_scores on the stack by providing it // with a vector in which to put its elements. We do this so @@ -203,10 +226,10 @@ extern "C" typedef std::priority_queue, score_cmp> node_queue; node_vector vec; vec.reserve(k + 1); - node_queue max_k_scores(score_cmp(), std::move(vec)); + node_queue top_k_scores(score_cmp(), std::move(vec)); - uint32_t count_one = popcount_array(comp1, keywords); - uint32_t max_popcnt_delta = keybytes * 8; // = bits per key + uint32_t count_one = _popcount_array(comp1, keywords); + uint32_t max_popcnt_delta = keybytes * CHAR_BIT; // = bits per key if(threshold > 0) { max_popcnt_delta = calculate_max_difference(count_one, threshold); } @@ -216,20 +239,23 @@ extern "C" const uint32_t counts_many_j = counts_many[j]; if (abs_diff(count_one, counts_many_j) <= max_popcnt_delta) { - double score = dice_coeff(comp1, count_one, current, counts_many_j, keywords); + double score = _dice_coeff(comp1, count_one, current, counts_many_j, keywords); if (score >= threshold) { - max_k_scores.push(Node(j, score)); - if (max_k_scores.size() > k) - max_k_scores.pop(); + top_k_scores.push(Node(j, score)); + if (top_k_scores.size() > k) { + // Popping the top element is O(log(k))! + top_k_scores.pop(); + } } } } int i = 0; - while ( ! 
max_k_scores.empty()) { - scores[i] = max_k_scores.top().score; - indices[i] = max_k_scores.top().index; - max_k_scores.pop(); + while ( ! top_k_scores.empty()) { + scores[i] = top_k_scores.top().score; + indices[i] = top_k_scores.top().index; + // Popping the top element is O(log(k))! + top_k_scores.pop(); i += 1; } return i; @@ -237,13 +263,14 @@ extern "C" int match_one_against_many_dice(const char *one, const char *many, int n, double *score) { + static const int array_bytes = 128; static const double threshold = 0.0; static const int k = 1; int idx_unused; uint32_t *counts_many = new uint32_t[n]; - popcount_1024_array(many, n, counts_many); + popcount_arrays(counts_many, many, n, array_bytes); int res = match_one_against_many_dice_k_top( - one, many, counts_many, n, 128, k, threshold, &idx_unused, score); + one, many, counts_many, n, array_bytes, k, threshold, &idx_unused, score); delete[] counts_many; return res; diff --git a/anonlink/bloommatcher.py b/anonlink/bloommatcher.py index a8977166..d1a55acf 100644 --- a/anonlink/bloommatcher.py +++ b/anonlink/bloommatcher.py @@ -29,10 +29,11 @@ def dicecoeff(e1, e2): :return: real 0-1 similarity measure """ - if len(e1) == 1024 and len(e2) == 1024: + # TODO: Remove restriction to lengths divisible by 128 bytes + if e1.length() == e2.length() and (e1.length()/8) % (8*16) == 0: e1array = ffi.new("char[]", e1.tobytes()) e2array = ffi.new("char[]", e2.tobytes()) - return lib.dice_coeff_1024(e1array, e2array) + return lib.dice_coeff(e1array, e2array, len(e1array)) else: return dicecoeff_pure_python(e1, e2) diff --git a/anonlink/util.py b/anonlink/util.py index b0f1cb2c..d8280c29 100644 --- a/anonlink/util.py +++ b/anonlink/util.py @@ -44,7 +44,7 @@ def popcount_vector(bitarrays, use_python=True): c_popcounts = ffi.new("uint32_t[{}]".format(n)) many = ffi.new("char[{}]".format(128 * n), bytes([b for f in bitarrays for b in f.tobytes()])) - ms = lib.popcount_1024_array(many, n, c_popcounts) + ms = 
lib.popcount_arrays(c_popcounts, many, n, 128) return [c_popcounts[i] for i in range(n)], ms * 1e-3 From 063115a54ddabb045329360e8c6a0270dde3b44f Mon Sep 17 00:00:00 2001 From: Hamish Ivey-Law Date: Mon, 19 Feb 2018 10:12:10 +1100 Subject: [PATCH 27/49] Add -march=native compiler option. --- _cffi_build/build_matcher.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/_cffi_build/build_matcher.py b/_cffi_build/build_matcher.py index 9e0c685a..d191cac0 100644 --- a/_cffi_build/build_matcher.py +++ b/_cffi_build/build_matcher.py @@ -15,7 +15,7 @@ "_entitymatcher", source, source_extension='.cpp', - extra_compile_args=['-Wall', '-Wextra', '-Werror', '-O3', '-std=c++11', '-mssse3', '-mpopcnt', '-fvisibility=hidden' + extra_compile_args=['-Wall', '-Wextra', '-Werror', '-O3', '-std=c++11', '-march=native', '-mssse3', '-mpopcnt', '-fvisibility=hidden' ], ) From c9134d01ee3c19e93051b09cf35f25a05a622ce2 Mon Sep 17 00:00:00 2001 From: Hamish Ivey-Law Date: Mon, 19 Feb 2018 10:15:51 +1100 Subject: [PATCH 28/49] Implementation of arbitrary length CLKs. 
--- _cffi_build/dice_one_against_many.cpp | 69 ++++++++++++++++++++++----- anonlink/util.py | 5 +- 2 files changed, 60 insertions(+), 14 deletions(-) diff --git a/_cffi_build/dice_one_against_many.cpp b/_cffi_build/dice_one_against_many.cpp index a5eeda9c..b8059da6 100644 --- a/_cffi_build/dice_one_against_many.cpp +++ b/_cffi_build/dice_one_against_many.cpp @@ -14,11 +14,15 @@ static constexpr int WORDS_PER_POPCOUNT = 16; static constexpr int WORD_BYTES = sizeof(uint64_t); template -void popcount(uint64_t &c0, uint64_t &c1, uint64_t &c2, uint64_t &c3, const uint64_t *buf) { +void popcount( + uint64_t &c0, uint64_t &c1, uint64_t &c2, uint64_t &c3, + const uint64_t *buf) { popcount<4>(c0, c1, c2, c3, buf); popcount(c0, c1, c2, c3, buf + 4); } +// Fast Path +// // Source: http://danluu.com/assembly-intrinsics/ // https://stackoverflow.com/questions/25078285/replacing-a-32-bit-loop-count-variable-with-64-bit-introduces-crazy-performance // @@ -32,9 +36,8 @@ void popcount(uint64_t &c0, uint64_t &c1, uint64_t &c2, uint64_t &c3, const uint // registers for the intermediate popcnts. 
template<> void popcount<4>( - uint64_t &c0, uint64_t &c1, uint64_t &c2, uint64_t &c3, - const uint64_t* buf) { - + uint64_t &c0, uint64_t &c1, uint64_t &c2, uint64_t &c3, + const uint64_t* buf) { uint64_t b0, b1, b2, b3; b0 = buf[0]; b1 = buf[1]; b2 = buf[2]; b3 = buf[3]; __asm__( @@ -50,17 +53,61 @@ void popcount<4>( "+r" (b0), "+r" (b1), "+r" (b2), "+r" (b3)); } -// "Assumes" WORDS_PER_POPCOUNT divides nwords +// Slow paths +// TODO: Assumes sizeof(long) == 8 +template<> +void popcount<3>( + uint64_t &c0, uint64_t &c1, uint64_t &c2, uint64_t &, + const uint64_t* buf) { + c0 = __builtin_popcountl(buf[0]); + c1 = __builtin_popcountl(buf[1]); + c2 = __builtin_popcountl(buf[2]); +} +template<> +void popcount<2>( + uint64_t &c0, uint64_t &c1, uint64_t &, uint64_t &, + const uint64_t* buf) { + c0 = __builtin_popcountl(buf[0]); + c1 = __builtin_popcountl(buf[1]); +} +template<> +void popcount<1>( + uint64_t &c0, uint64_t &, uint64_t &, uint64_t &, + const uint64_t* buf) { + c0 = __builtin_popcountl(buf[0]); +} + + static uint32_t _popcount_array(const uint64_t *array, int nwords) { uint64_t c0, c1, c2, c3; c0 = c1 = c2 = c3 = 0; - for (int i = 0; i < nwords; i += WORDS_PER_POPCOUNT) - popcount(c0, c1, c2, c3, array + i); + + while (nwords >= 16) { + popcount<16>(c0, c1, c2, c3, array += 16); + nwords -= 16; + } + // nwords < 16 + if (nwords >= 8) { + popcount<8>(c0, c1, c2, c3, array += 8); + nwords -= 8; + } + // nwords < 8 + if (nwords >= 4) { + popcount<4>(c0, c1, c2, c3, array += 4); + nwords -= 4; + } + // nwords < 4 + if (nwords >= 2) { + popcount<2>(c0, c1, c2, c3, array += 2); + nwords -= 2; + } + // nwords < 2 + if (nwords == 1) + popcount<1>(c0, c1, c2, c3, array + 1); return c0 + c1 + c2 + c3; } -// "Assumes" WORDS_PER_POPCOUNT divides nwords static uint32_t _popcount_combined_array( const uint64_t *array1, @@ -77,7 +124,6 @@ _popcount_combined_array( return c0 + c1 + c2 + c3; } -// "Assumes" WORDS_PER_POPCOUNT divides nwords // assumes u_popc or v_popc is 
nonzero. static inline double _dice_coeff( @@ -210,12 +256,11 @@ extern "C" int *indices, double *scores) { - const uint64_t *comp1 = (const uint64_t *) one; - const uint64_t *comp2 = (const uint64_t *) many; - if (keybytes % WORD_BYTES != 0) return -1; int keywords = keybytes / WORD_BYTES; + const uint64_t *comp1 = (const uint64_t *) one; + const uint64_t *comp2 = (const uint64_t *) many; // Here we create max_k_scores on the stack by providing it // with a vector in which to put its elements. We do this so diff --git a/anonlink/util.py b/anonlink/util.py index d8280c29..1fec0d96 100644 --- a/anonlink/util.py +++ b/anonlink/util.py @@ -41,10 +41,11 @@ def popcount_vector(bitarrays, use_python=True): # Use native code n = len(bitarrays) + arr_bytes = bitarrays[0].length() // 8 c_popcounts = ffi.new("uint32_t[{}]".format(n)) - many = ffi.new("char[{}]".format(128 * n), + many = ffi.new("char[{}]".format(arr_bytes * n), bytes([b for f in bitarrays for b in f.tobytes()])) - ms = lib.popcount_arrays(c_popcounts, many, n, 128) + ms = lib.popcount_arrays(c_popcounts, many, n, arr_bytes) return [c_popcounts[i] for i in range(n)], ms * 1e-3 From b2435f9b2065b70d7cacb6f3f43fc556cf78da7b Mon Sep 17 00:00:00 2001 From: Hamish Ivey-Law Date: Mon, 19 Feb 2018 11:09:40 +1100 Subject: [PATCH 29/49] Fix dumb mistakes in updating array pointer and popcounts. 
--- _cffi_build/dice_one_against_many.cpp | 28 ++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/_cffi_build/dice_one_against_many.cpp b/_cffi_build/dice_one_against_many.cpp index b8059da6..59368a12 100644 --- a/_cffi_build/dice_one_against_many.cpp +++ b/_cffi_build/dice_one_against_many.cpp @@ -59,22 +59,24 @@ template<> void popcount<3>( uint64_t &c0, uint64_t &c1, uint64_t &c2, uint64_t &, const uint64_t* buf) { - c0 = __builtin_popcountl(buf[0]); - c1 = __builtin_popcountl(buf[1]); - c2 = __builtin_popcountl(buf[2]); + c0 += __builtin_popcountl(buf[0]); + c1 += __builtin_popcountl(buf[1]); + c2 += __builtin_popcountl(buf[2]); } + template<> void popcount<2>( uint64_t &c0, uint64_t &c1, uint64_t &, uint64_t &, const uint64_t* buf) { - c0 = __builtin_popcountl(buf[0]); - c1 = __builtin_popcountl(buf[1]); + c0 += __builtin_popcountl(buf[0]); + c1 += __builtin_popcountl(buf[1]); } + template<> void popcount<1>( uint64_t &c0, uint64_t &, uint64_t &, uint64_t &, const uint64_t* buf) { - c0 = __builtin_popcountl(buf[0]); + c0 += __builtin_popcountl(buf[0]); } @@ -84,27 +86,31 @@ _popcount_array(const uint64_t *array, int nwords) { c0 = c1 = c2 = c3 = 0; while (nwords >= 16) { - popcount<16>(c0, c1, c2, c3, array += 16); + popcount<16>(c0, c1, c2, c3, array); + array += 16; nwords -= 16; } // nwords < 16 if (nwords >= 8) { - popcount<8>(c0, c1, c2, c3, array += 8); + popcount<8>(c0, c1, c2, c3, array); + array += 8; nwords -= 8; } // nwords < 8 if (nwords >= 4) { - popcount<4>(c0, c1, c2, c3, array += 4); + popcount<4>(c0, c1, c2, c3, array); + array += 4; nwords -= 4; } // nwords < 4 if (nwords >= 2) { - popcount<2>(c0, c1, c2, c3, array += 2); + popcount<2>(c0, c1, c2, c3, array); + array += 2; nwords -= 2; } // nwords < 2 if (nwords == 1) - popcount<1>(c0, c1, c2, c3, array + 1); + popcount<1>(c0, c1, c2, c3, array); return c0 + c1 + c2 + c3; } From 4acd62f02bb57c4420c723a125adce1619ec9d65 Mon Sep 17 00:00:00 2001 From: 
Hamish Ivey-Law Date: Mon, 19 Feb 2018 16:54:45 +1100 Subject: [PATCH 30/49] Tests for arbitrary length popcounts. --- tests/test_util.py | 39 ++++++++++++++++++++++++++++----------- 1 file changed, 28 insertions(+), 11 deletions(-) diff --git a/tests/test_util.py b/tests/test_util.py index 2dae61d5..059dab49 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -1,7 +1,9 @@ #!/usr/bin/env python3.4 import unittest +from itertools import combinations_with_replacement from anonlink import util +from bitarray import bitarray class TestUtilDataGeneration(unittest.TestCase): @@ -19,15 +21,30 @@ def test_generate_clks(self): self.assertEqual(len(clk[0]), 1024) self.assertEqual(clk[0].count(), clk[2]) - def test_popcount_vector(self): - bas = [util.generate_bitarray(1024) for i in range(100)] - popcounts, _ = util.popcount_vector(bas, use_python=True) - self.assertEquals(len(popcounts), 100) - for i, cnt in enumerate(popcounts): - self.assertEquals(cnt, bas[i].count()) - - popcounts, _ = util.popcount_vector(bas, use_python=False) - self.assertEquals(len(popcounts), 100) - for i, cnt in enumerate(popcounts): - self.assertEquals(cnt, bas[i].count()) +def concat_bitarrays(products): + for p in products: + yield sum(p, bitarray()) + +# Generate bit arrays that are combinations of words 0, 1, 2^63, 2^64 - 1 +# of various lengths between 1 and 65 words. 
+def test_generator(): + key_lengths = [1, 2, 3, 4, 8, 9, 10, 15, 16, 17, + 23, 24, 25, 30, 31, 32, 33, 63, 64, 65] + special_words = [64*bitarray('0'), + 63*bitarray('0') + bitarray('1'), + bitarray('1') + 63*bitarray('0'), + 64*bitarray('1')] + for L in key_lengths: + words = combinations_with_replacement(special_words, L) + # '+' on bitarrays is concatenation + bas = [sum(w, bitarray()) for w in words] + yield check_popcount_vector, bas + +def check_popcount_vector(bas): + bas_counts = [b.count() for b in bas] + + popcounts, _ = util.popcount_vector(bas, use_python=True) + assert(popcounts == bas_counts) + popcounts, _ = util.popcount_vector(bas, use_python=False) + assert(popcounts == bas_counts) From 38ca3ce4be6355b8229e4ca7a44a5b36923c58d3 Mon Sep 17 00:00:00 2001 From: Hamish Ivey-Law Date: Mon, 19 Feb 2018 17:26:51 +1100 Subject: [PATCH 31/49] Update some comments. --- _cffi_build/dice_one_against_many.cpp | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/_cffi_build/dice_one_against_many.cpp b/_cffi_build/dice_one_against_many.cpp index 59368a12..3a553b6d 100644 --- a/_cffi_build/dice_one_against_many.cpp +++ b/_cffi_build/dice_one_against_many.cpp @@ -192,13 +192,15 @@ extern "C" * Calculate population counts of an array of inputs; return how * long it took in milliseconds. * - * 'many' must point to n*KEYWORDS*sizeof(uint64_t) (== 128*n) bytes - * 'counts_many' must point to n*sizeof(uint32_t) bytes. - * For i = 0 to n - 1, the population count of the 1024 bits + * 'arrays' must point to narrays*array_bytes bytes + * 'counts' must point to narrays*sizeof(uint32_t) bytes. + * For i = 0 to n - 1, the population count of the array_bytes*8 bits * - * many[i * KEYWORDS] ... many[(i + 1) * KEYWORDS - 1] + * arrays[i * array_bytes] ... arrays[(i + 1) * array_bytes - 1] * - * is put in counts_many[i]. + * is put in counts[i]. + * + * ASSUMES: array_bytes is divisible by 8. 
*/ double popcount_arrays( @@ -216,7 +218,10 @@ extern "C" } /** - * Compute the Dice coefficient similarity measure of two arrays. + * Compute the Dice coefficient similarity measure of two arrays + * of length array_bytes. + * + * ASSUMES: array_bytes is divisible by 8. */ double dice_coeff( @@ -231,8 +236,6 @@ extern "C" u = reinterpret_cast(array1); v = reinterpret_cast(array2); - // assumes WORD_PER_POPCOUNT divides array_words - // If the popcount of one of the arrays is zero, then the // popcount of the "intersection" (logical AND) will be zero, // hence the whole Dice coefficient will be zero. From e8c77bc94fa1fbfcbd9b4f43a891ae3115d2e085 Mon Sep 17 00:00:00 2001 From: Hamish Ivey-Law Date: Mon, 19 Feb 2018 17:51:54 +1100 Subject: [PATCH 32/49] Arbitrary length Dice coefficient. --- _cffi_build/dice_one_against_many.cpp | 34 ++++++++++++++++++--------- 1 file changed, 23 insertions(+), 11 deletions(-) diff --git a/_cffi_build/dice_one_against_many.cpp b/_cffi_build/dice_one_against_many.cpp index 3a553b6d..375cbdc3 100644 --- a/_cffi_build/dice_one_against_many.cpp +++ b/_cffi_build/dice_one_against_many.cpp @@ -7,10 +7,6 @@ #include #include -// WORDS_PER_POPCOUNT determines how much we unroll the popcounting in -// each iteration of a loop. Currently 16, which corresponds to 16*64 -// = 1024 bits per loop. 
-static constexpr int WORDS_PER_POPCOUNT = 16; static constexpr int WORD_BYTES = sizeof(uint64_t); template @@ -114,20 +110,36 @@ _popcount_array(const uint64_t *array, int nwords) { return c0 + c1 + c2 + c3; } +static inline void +logand_array(uint64_t *out, const uint64_t *arr1, const uint64_t *arr2, int n) { + for (int j = 0; j < n; ++j) + out[j] = arr1[j] & arr2[j]; +} + static uint32_t _popcount_combined_array( const uint64_t *array1, const uint64_t *array2, int nwords) { - uint64_t combined[WORDS_PER_POPCOUNT]; - uint64_t c0, c1, c2, c3; + const uint64_t *arr1 = array1, *arr2 = array2; + int n = nwords; + static constexpr int BUF_WORDS = 16; + uint64_t combined[BUF_WORDS]; + uint64_t c0, c1, c2, c3, rest; + c0 = c1 = c2 = c3 = 0; - for (int i = 0; i < nwords; i += WORDS_PER_POPCOUNT) { - for (int j = 0; j < WORDS_PER_POPCOUNT; ++j) - combined[j] = array1[i + j] & array2[i + j]; - popcount(c0, c1, c2, c3, combined); + + while (n >= BUF_WORDS) { + logand_array(combined, arr1, arr2, BUF_WORDS); + popcount(c0, c1, c2, c3, combined); + arr1 += BUF_WORDS; + arr2 += BUF_WORDS; + n -= BUF_WORDS; } - return c0 + c1 + c2 + c3; + logand_array(combined, arr1, arr2, n); + rest = _popcount_array(combined, n); + + return c0 + c1 + c2 + c3 + rest; } // assumes u_popc or v_popc is nonzero. From 1febd65bfe3efab60740576f965e53f60f17178c Mon Sep 17 00:00:00 2001 From: Hamish Ivey-Law Date: Mon, 19 Feb 2018 17:56:09 +1100 Subject: [PATCH 33/49] Rename function. 
--- _cffi_build/dice_one_against_many.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/_cffi_build/dice_one_against_many.cpp b/_cffi_build/dice_one_against_many.cpp index 375cbdc3..2c1b1e61 100644 --- a/_cffi_build/dice_one_against_many.cpp +++ b/_cffi_build/dice_one_against_many.cpp @@ -117,7 +117,7 @@ logand_array(uint64_t *out, const uint64_t *arr1, const uint64_t *arr2, int n) { } static uint32_t -_popcount_combined_array( +_popcount_logand_array( const uint64_t *array1, const uint64_t *array2, int nwords) { @@ -148,7 +148,7 @@ _dice_coeff( const uint64_t *u, uint32_t u_popc, const uint64_t *v, uint32_t v_popc, int nwords) { - uint32_t uv_popc = _popcount_combined_array(u, v, nwords); + uint32_t uv_popc = _popcount_logand_array(u, v, nwords); return (2 * uv_popc) / (double) (u_popc + v_popc); } From 21390c4c0d81cc8f06391a19bca3d83bd878bbf7 Mon Sep 17 00:00:00 2001 From: Hamish Ivey-Law Date: Tue, 20 Feb 2018 15:21:49 +1100 Subject: [PATCH 34/49] Move native dicecoeff calculation into its own function. 
--- anonlink/bloommatcher.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/anonlink/bloommatcher.py b/anonlink/bloommatcher.py index d1a55acf..102c8653 100644 --- a/anonlink/bloommatcher.py +++ b/anonlink/bloommatcher.py @@ -22,6 +22,10 @@ def dicecoeff_pure_python(e1, e2): else: return 2.0 * overlap_count / combined_count +def dicecoeff_native(e1, e2): + e1array = ffi.new("char[]", e1.tobytes()) + e2array = ffi.new("char[]", e2.tobytes()) + return lib.dice_coeff(e1array, e2array, len(e1array)) def dicecoeff(e1, e2): """ @@ -29,11 +33,8 @@ def dicecoeff(e1, e2): :return: real 0-1 similarity measure """ - # TODO: Remove restriction to lengths divisible by 128 bytes - if e1.length() == e2.length() and (e1.length()/8) % (8*16) == 0: - e1array = ffi.new("char[]", e1.tobytes()) - e2array = ffi.new("char[]", e2.tobytes()) - return lib.dice_coeff(e1array, e2array, len(e1array)) + if e1.length() == e2.length() and (e1.length()/8) % 8 == 0: + return dicecoeff_native(e1, e2) else: return dicecoeff_pure_python(e1, e2) From 75cef8e19bdb8e0ec12f11ffe78a6ab02fcb30f0 Mon Sep 17 00:00:00 2001 From: Hamish Ivey-Law Date: Tue, 20 Feb 2018 15:22:18 +1100 Subject: [PATCH 35/49] Add tests for native Dice coefficient calculation. 
--- tests/test_util.py | 53 ++++++++++++++++++++++++++++++++++------------ 1 file changed, 40 insertions(+), 13 deletions(-) diff --git a/tests/test_util.py b/tests/test_util.py index ceb880e3..edfca8cb 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -1,9 +1,12 @@ #!/usr/bin/env python3.4 import unittest +import os from itertools import combinations_with_replacement +from collections import deque from anonlink import util from bitarray import bitarray +from anonlink import bloommatcher as bm class TestUtilDataGeneration(unittest.TestCase): @@ -21,24 +24,27 @@ def test_generate_clks(self): self.assertEqual(len(clk[0]), 1024) self.assertEqual(clk[0].count(), clk[2]) -def concat_bitarrays(products): - for p in products: - yield sum(p, bitarray()) - -# Generate bit arrays that are combinations of words 0, 1, 2^63, 2^64 - 1 -# of various lengths between 1 and 65 words. -def test_generator(): - key_lengths = [1, 2, 3, 4, 8, 9, 10, 15, 16, 17, - 23, 24, 25, 30, 31, 32, 33, 63, 64, 65] +# Return a bit array of length L*64 whose contents are combinations of +# the words 0, 2^64-1, 1 or 2^63 (ie. all zeros, all ones, or a one in +# the least or most significant position). +def bitarrays_of_length(L): special_words = [64*bitarray('0'), 63*bitarray('0') + bitarray('1'), bitarray('1') + 63*bitarray('0'), 64*bitarray('1')] + # '+' on bitarrays is concatenation + return [sum(word, bitarray()) + for word in combinations_with_replacement(special_words, L)] + +# Interesting key lengths (usually around 2^something +/-1). +key_lengths = [1, 2, 3, 4, 8, 9, 10, 15, 16, 17, + 23, 24, 25, 30, 31, 32, 33, 63, 64, 65] + +# Generate bit arrays that are combinations of words 0, 1, 2^63, 2^64 - 1 +# of various lengths between 1 and 65 words. 
+def test_popcount_vector(): for L in key_lengths: - words = combinations_with_replacement(special_words, L) - # '+' on bitarrays is concatenation - bas = [sum(w, bitarray()) for w in words] - yield check_popcount_vector, bas + yield check_popcount_vector, bitarrays_of_length(L) def check_popcount_vector(bas): bas_counts = [b.count() for b in bas] @@ -47,3 +53,24 @@ def check_popcount_vector(bas): assert(popcounts == bas_counts) popcounts, _ = util.popcount_vector(bas, use_python=False) assert(popcounts == bas_counts) + +def test_dicecoeff(): + for L in key_lengths: + yield check_dicecoeff, bitarrays_of_length(L) + +def check_dicecoeff(bas): + # Test the Dice coefficient of bitarrays in bas with other + # bitarrays of bas. rotations is the number of times we rotate + # bas to generate pairs to test the Dice coefficient; 10 takes + # around 10s, 100 around 60s. + rotations = 100 if "INCLUDE_10K" in os.environ else 10; + + # We check that the native code and Python versions of dicecoeff + # don't ever differ by more than 10^{-6}. + eps = 0.000001 + d = deque(bas) + for _ in range(min(rotations, len(bas))): + for a, b in zip(bas, d): + diff = bm.dicecoeff_pure_python(a, b) - bm.dicecoeff_native(a, b) + assert(abs(diff) < eps) + d.rotate(1) From c338c3295510c69db0e6a479062d60712d511a71 Mon Sep 17 00:00:00 2001 From: Hamish Ivey-Law Date: Tue, 20 Feb 2018 15:48:26 +1100 Subject: [PATCH 36/49] Move dicecoeff tests to bloommatcher tests; move common bitarray utilities to their own file. 
--- tests/bitarray_utils.py | 18 +++++++++++++++ tests/test_bloommatcher.py | 25 +++++++++++++++++++++ tests/test_util.py | 45 +++----------------------------------- 3 files changed, 46 insertions(+), 42 deletions(-) create mode 100644 tests/bitarray_utils.py diff --git a/tests/bitarray_utils.py b/tests/bitarray_utils.py new file mode 100644 index 00000000..4574cf2b --- /dev/null +++ b/tests/bitarray_utils.py @@ -0,0 +1,18 @@ +from bitarray import bitarray +from itertools import combinations_with_replacement + +# Return a bit array of length L*64 whose contents are combinations of +# the words 0, 2^64-1, 1 or 2^63 (ie. all zeros, all ones, or a one in +# the least or most significant position). +def bitarrays_of_length(L): + special_words = [64*bitarray('0'), + 63*bitarray('0') + bitarray('1'), + bitarray('1') + 63*bitarray('0'), + 64*bitarray('1')] + # '+' on bitarrays is concatenation + return [sum(word, bitarray()) + for word in combinations_with_replacement(special_words, L)] + +# Interesting key lengths (usually around 2^something +/-1). +key_lengths = [1, 2, 3, 4, 8, 9, 10, 15, 16, 17, + 23, 24, 25, 30, 31, 32, 33, 63, 64, 65] diff --git a/tests/test_bloommatcher.py b/tests/test_bloommatcher.py index 4c0f06fd..2eb0fa9e 100644 --- a/tests/test_bloommatcher.py +++ b/tests/test_bloommatcher.py @@ -1,8 +1,11 @@ import unittest import random +import os +from collections import deque from bitarray import bitarray from anonlink import bloommatcher as bm +from tests import bitarray_utils __author__ = 'shardy' @@ -70,6 +73,28 @@ def test_dice_4_c(self): self.assertEqual(result, 0.0) +# Generate bit arrays that are combinations of words 0, 1, 2^63, 2^64 - 1 +# of various lengths between 1 and 65 words. +def test_dicecoeff(): + for L in bitarray_utils.key_lengths: + yield check_dicecoeff, bitarray_utils.bitarrays_of_length(L) + +def check_dicecoeff(bas): + # Test the Dice coefficient of bitarrays in bas with other + # bitarrays of bas. 
rotations is the number of times we rotate + # bas to generate pairs to test the Dice coefficient; 10 takes + # around 10s, 100 around 60s. + rotations = 100 if "INCLUDE_10K" in os.environ else 10; + + # We check that the native code and Python versions of dicecoeff + # don't ever differ by more than 10^{-6}. + eps = 0.000001 + d = deque(bas) + for _ in range(min(rotations, len(bas))): + for a, b in zip(bas, d): + diff = bm.dicecoeff_pure_python(a, b) - bm.dicecoeff_native(a, b) + assert(abs(diff) < eps) + d.rotate(1) if __name__ == '__main__': unittest.main() diff --git a/tests/test_util.py b/tests/test_util.py index edfca8cb..3c8e4312 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -1,12 +1,9 @@ #!/usr/bin/env python3.4 import unittest -import os -from itertools import combinations_with_replacement -from collections import deque from anonlink import util -from bitarray import bitarray from anonlink import bloommatcher as bm +from tests import bitarray_utils class TestUtilDataGeneration(unittest.TestCase): @@ -24,27 +21,11 @@ def test_generate_clks(self): self.assertEqual(len(clk[0]), 1024) self.assertEqual(clk[0].count(), clk[2]) -# Return a bit array of length L*64 whose contents are combinations of -# the words 0, 2^64-1, 1 or 2^63 (ie. all zeros, all ones, or a one in -# the least or most significant position). -def bitarrays_of_length(L): - special_words = [64*bitarray('0'), - 63*bitarray('0') + bitarray('1'), - bitarray('1') + 63*bitarray('0'), - 64*bitarray('1')] - # '+' on bitarrays is concatenation - return [sum(word, bitarray()) - for word in combinations_with_replacement(special_words, L)] - -# Interesting key lengths (usually around 2^something +/-1). -key_lengths = [1, 2, 3, 4, 8, 9, 10, 15, 16, 17, - 23, 24, 25, 30, 31, 32, 33, 63, 64, 65] - # Generate bit arrays that are combinations of words 0, 1, 2^63, 2^64 - 1 # of various lengths between 1 and 65 words. 
def test_popcount_vector(): - for L in key_lengths: - yield check_popcount_vector, bitarrays_of_length(L) + for L in bitarray_utils.key_lengths: + yield check_popcount_vector, bitarray_utils.bitarrays_of_length(L) def check_popcount_vector(bas): bas_counts = [b.count() for b in bas] @@ -54,23 +35,3 @@ def check_popcount_vector(bas): popcounts, _ = util.popcount_vector(bas, use_python=False) assert(popcounts == bas_counts) -def test_dicecoeff(): - for L in key_lengths: - yield check_dicecoeff, bitarrays_of_length(L) - -def check_dicecoeff(bas): - # Test the Dice coefficient of bitarrays in bas with other - # bitarrays of bas. rotations is the number of times we rotate - # bas to generate pairs to test the Dice coefficient; 10 takes - # around 10s, 100 around 60s. - rotations = 100 if "INCLUDE_10K" in os.environ else 10; - - # We check that the native code and Python versions of dicecoeff - # don't ever differ by more than 10^{-6}. - eps = 0.000001 - d = deque(bas) - for _ in range(min(rotations, len(bas))): - for a, b in zip(bas, d): - diff = bm.dicecoeff_pure_python(a, b) - bm.dicecoeff_native(a, b) - assert(abs(diff) < eps) - d.rotate(1) From 4d74b1c2939cf66b53f8399090605c09cefed152 Mon Sep 17 00:00:00 2001 From: Hamish Ivey-Law Date: Tue, 20 Feb 2018 16:13:54 +1100 Subject: [PATCH 37/49] Simplify slow path / reduce branches in fast path. 
--- _cffi_build/dice_one_against_many.cpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/_cffi_build/dice_one_against_many.cpp b/_cffi_build/dice_one_against_many.cpp index 2c1b1e61..e30a8e60 100644 --- a/_cffi_build/dice_one_against_many.cpp +++ b/_cffi_build/dice_one_against_many.cpp @@ -125,7 +125,7 @@ _popcount_logand_array( int n = nwords; static constexpr int BUF_WORDS = 16; uint64_t combined[BUF_WORDS]; - uint64_t c0, c1, c2, c3, rest; + uint64_t c0, c1, c2, c3; c0 = c1 = c2 = c3 = 0; @@ -136,10 +136,12 @@ _popcount_logand_array( arr2 += BUF_WORDS; n -= BUF_WORDS; } - logand_array(combined, arr1, arr2, n); - rest = _popcount_array(combined, n); + if (n > 0) { + logand_array(combined, arr1, arr2, n); + c0 += _popcount_array(combined, n); + } - return c0 + c1 + c2 + c3 + rest; + return c0 + c1 + c2 + c3; } // assumes u_popc or v_popc is nonzero. From 2d6b5f7be1fa6a5557937d4813d3702b07df1e8e Mon Sep 17 00:00:00 2001 From: Hamish Ivey-Law Date: Tue, 20 Feb 2018 16:28:57 +1100 Subject: [PATCH 38/49] Adapt entitymatcher to arbitrary length CLK interface. --- anonlink/entitymatch.py | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/anonlink/entitymatch.py b/anonlink/entitymatch.py index ba03416d..cbcf1251 100644 --- a/anonlink/entitymatch.py +++ b/anonlink/entitymatch.py @@ -33,15 +33,26 @@ def python_filter_similarity(filters1, filters2): def cffi_filter_similarity_k(filters1, filters2, k, threshold): """Accelerated method for determining Bloom Filter similarity. + + Assumes all filters are the same length, being a multiple of 64 + bits. + """ length_f1 = len(filters1) length_f2 = len(filters2) - # We assume the length is a multple of 128 bits. + if length_f1 == 0: + return [] + + # Length must be a multple of 64 bits. 
+ assert(len(filters1[0][0]) % 8 == 0) + filter_bytes = len(filters1[0][0]) // 8 + assert(filter_bytes % 8 == 0) + match_one_against_many_dice_k_top = lib.match_one_against_many_dice_k_top # An array of the *one* filter - clist1 = [ffi.new("char[128]", bytes(f[0].tobytes())) + clist1 = [ffi.new("char[{}]".format(filter_bytes), bytes(f[0].tobytes())) for f in filters1] if sys.version_info < (3, 0): @@ -51,10 +62,10 @@ def cffi_filter_similarity_k(filters1, filters2, k, threshold): for b in f[0].tobytes(): data.append(b) - carr2 = ffi.new("char[{}]".format(128 * length_f2), ''.join(data)) + carr2 = ffi.new("char[{}]".format(filter_bytes * length_f2), ''.join(data)) else: # Works in Python 3+ - carr2 = ffi.new("char[{}]".format(128 * length_f2), + carr2 = ffi.new("char[{}]".format(filter_bytes * length_f2), bytes([b for f in filters2 for b in f[0].tobytes()])) c_popcounts = ffi.new("uint32_t[{}]".format(length_f2), [f[2] for f in filters2]) @@ -66,14 +77,13 @@ def cffi_filter_similarity_k(filters1, filters2, k, threshold): result = [] for i, f1 in enumerate(filters1): - assert len(clist1[i]) == 128 - assert len(carr2) % 64 == 0 + assert len(clist1[i]) == filter_bytes matches = match_one_against_many_dice_k_top( clist1[i], carr2, c_popcounts, length_f2, - 128, + filter_bytes, k, threshold, c_indices, From ab45ea86d474e39a1c871f6fab08aaf906a7636a Mon Sep 17 00:00:00 2001 From: Hamish Ivey-Law Date: Tue, 20 Feb 2018 16:31:04 +1100 Subject: [PATCH 39/49] Remove unused function. 
--- _cffi_build/build_matcher.py | 1 - _cffi_build/dice_one_against_many.cpp | 15 --------------- 2 files changed, 16 deletions(-) diff --git a/_cffi_build/build_matcher.py b/_cffi_build/build_matcher.py index d191cac0..6277f3df 100644 --- a/_cffi_build/build_matcher.py +++ b/_cffi_build/build_matcher.py @@ -20,7 +20,6 @@ ) ffibuilder.cdef(""" - int match_one_against_many_dice(const char * one, const char * many, int n, double * score); int match_one_against_many_dice_k_top(const char *one, const char *many, const uint32_t *counts_many, int n, int keybytes, uint32_t k, double threshold, int *indices, double *scores); double dice_coeff(const char *array1, const char *array2, int array_bytes); double popcount_arrays(uint32_t *counts, const char *arrays, int narrays, int array_bytes); diff --git a/_cffi_build/dice_one_against_many.cpp b/_cffi_build/dice_one_against_many.cpp index e30a8e60..f0a8b930 100644 --- a/_cffi_build/dice_one_against_many.cpp +++ b/_cffi_build/dice_one_against_many.cpp @@ -328,19 +328,4 @@ extern "C" } return i; } - - int match_one_against_many_dice(const char *one, const char *many, int n, double *score) { - - static const int array_bytes = 128; - static const double threshold = 0.0; - static const int k = 1; - int idx_unused; - uint32_t *counts_many = new uint32_t[n]; - popcount_arrays(counts_many, many, n, array_bytes); - int res = match_one_against_many_dice_k_top( - one, many, counts_many, n, array_bytes, k, threshold, &idx_unused, score); - delete[] counts_many; - - return res; - } } From 9ccaa8d13654eea8a057a9736d478cfdd2d3d947 Mon Sep 17 00:00:00 2001 From: Hamish Ivey-Law Date: Tue, 20 Feb 2018 16:47:57 +1100 Subject: [PATCH 40/49] Update README. 
--- README.rst | 2 -- 1 file changed, 2 deletions(-) diff --git a/README.rst b/README.rst index 483590ee..81c64d1b 100644 --- a/README.rst +++ b/README.rst @@ -135,8 +135,6 @@ Limitations - The linkage process has order n^2 time complexity - although algorithms exist to significantly speed this up. Several possible speedups are described in http://dbs.uni-leipzig.de/file/P4Join-BTW2015.pdf -- The C++ code makes an assumption of 1024 bit keys (although this would be easy - to change). License From e515b34008685c768afc62dc943d011f534d8fbd Mon Sep 17 00:00:00 2001 From: Hamish Ivey-Law Date: Thu, 22 Feb 2018 10:07:23 +1100 Subject: [PATCH 41/49] Address Brian's comments. --- _cffi_build/dice_one_against_many.cpp | 6 +++++- anonlink/bloommatcher.py | 12 +++++++++++- anonlink/entitymatch.py | 9 ++++----- tests/bitarray_utils.py | 8 +++++--- tests/test_bloommatcher.py | 21 ++++++++++----------- tests/test_util.py | 12 ++++-------- 6 files changed, 39 insertions(+), 29 deletions(-) diff --git a/_cffi_build/dice_one_against_many.cpp b/_cffi_build/dice_one_against_many.cpp index f0a8b930..b47441ec 100644 --- a/_cffi_build/dice_one_against_many.cpp +++ b/_cffi_build/dice_one_against_many.cpp @@ -51,6 +51,10 @@ void popcount<4>( // Slow paths // TODO: Assumes sizeof(long) == 8 +// +// NB: The specialisation to n=3 is not currently used but included +// for completeness (i.e. so that popcount is defined for all +// non-negative n) and in anticipation of its use in the near future. template<> void popcount<3>( uint64_t &c0, uint64_t &c1, uint64_t &c2, uint64_t &, @@ -285,7 +289,7 @@ extern "C" const uint64_t *comp1 = (const uint64_t *) one; const uint64_t *comp2 = (const uint64_t *) many; - // Here we create max_k_scores on the stack by providing it + // Here we create top_k_scores on the stack by providing it // with a vector in which to put its elements. 
We do this so // that we can reserve the amount of space needed for the // scores in advance and avoid potential memory reallocation diff --git a/anonlink/bloommatcher.py b/anonlink/bloommatcher.py index 102c8653..24675c3b 100644 --- a/anonlink/bloommatcher.py +++ b/anonlink/bloommatcher.py @@ -10,7 +10,9 @@ def dicecoeff_pure_python(e1, e2): """ Dice coefficient measures the similarity of two bit patterns. - :param e1,e2: bitset arrays of same length + Implemented exclusively in Python. + + :param e1, e2: bitarrays of same length :return: real 0-1 similarity measure """ count1 = e1.count() @@ -23,6 +25,14 @@ def dicecoeff_pure_python(e1, e2): return 2.0 * overlap_count / combined_count def dicecoeff_native(e1, e2): + """ + Dice coefficient measures the similarity of two bit patterns. + + Implemented via an external library. + + :param e1, e2: bitarrays of same length + :return: real 0-1 similarity measure + """ e1array = ffi.new("char[]", e1.tobytes()) e2array = ffi.new("char[]", e2.tobytes()) return lib.dice_coeff(e1array, e2array, len(e1array)) diff --git a/anonlink/entitymatch.py b/anonlink/entitymatch.py index cbcf1251..fae7b4f7 100644 --- a/anonlink/entitymatch.py +++ b/anonlink/entitymatch.py @@ -44,10 +44,9 @@ def cffi_filter_similarity_k(filters1, filters2, k, threshold): if length_f1 == 0: return [] - # Length must be a multple of 64 bits. 
- assert(len(filters1[0][0]) % 8 == 0) - filter_bytes = len(filters1[0][0]) // 8 - assert(filter_bytes % 8 == 0) + filter_bits = len(filters1[0][0]) + assert(filter_bits % 64 == 0, 'Filter length must be a multple of 64 bits.') + filter_bytes = filter_bits // 8 match_one_against_many_dice_k_top = lib.match_one_against_many_dice_k_top @@ -90,7 +89,7 @@ def cffi_filter_similarity_k(filters1, filters2, k, threshold): c_scores) if matches < 0: - raise Exception('Internel error: Bad key length') + raise ValueError('Internel error: Bad key length') for j in range(matches): ind = c_indices[j] assert ind < len(filters2) diff --git a/tests/bitarray_utils.py b/tests/bitarray_utils.py index 4574cf2b..76426911 100644 --- a/tests/bitarray_utils.py +++ b/tests/bitarray_utils.py @@ -1,10 +1,12 @@ from bitarray import bitarray from itertools import combinations_with_replacement -# Return a bit array of length L*64 whose contents are combinations of -# the words 0, 2^64-1, 1 or 2^63 (ie. all zeros, all ones, or a one in -# the least or most significant position). def bitarrays_of_length(L): + """ + Return a bit array of length L*64 whose contents are combinations of + the words 0, 2^64-1, 1 or 2^63 (ie. all zeros, all ones, or a one in + the least or most significant position). + """ special_words = [64*bitarray('0'), 63*bitarray('0') + bitarray('1'), bitarray('1') + 63*bitarray('0'), diff --git a/tests/test_bloommatcher.py b/tests/test_bloommatcher.py index 2eb0fa9e..8fff895e 100644 --- a/tests/test_bloommatcher.py +++ b/tests/test_bloommatcher.py @@ -1,4 +1,5 @@ import unittest +import pytest import random import os from collections import deque @@ -73,18 +74,16 @@ def test_dice_4_c(self): self.assertEqual(result, 0.0) -# Generate bit arrays that are combinations of words 0, 1, 2^63, 2^64 - 1 -# of various lengths between 1 and 65 words. 
-def test_dicecoeff(): - for L in bitarray_utils.key_lengths: - yield check_dicecoeff, bitarray_utils.bitarrays_of_length(L) - -def check_dicecoeff(bas): - # Test the Dice coefficient of bitarrays in bas with other - # bitarrays of bas. rotations is the number of times we rotate - # bas to generate pairs to test the Dice coefficient; 10 takes - # around 10s, 100 around 60s. +@pytest.mark.parametrize("L", bitarray_utils.key_lengths) +def test_dicecoeff(L): + """ + Test the Dice coefficient of bitarrays in bas with other + bitarrays of bas. rotations is the number of times we rotate + bas to generate pairs to test the Dice coefficient; 10 takes + around 10s, 100 around 60s. + """ rotations = 100 if "INCLUDE_10K" in os.environ else 10; + bas = bitarray_utils.bitarrays_of_length(L) # We check that the native code and Python versions of dicecoeff # don't ever differ by more than 10^{-6}. diff --git a/tests/test_util.py b/tests/test_util.py index 3c8e4312..2715de3d 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3.4 import unittest +import pytest from anonlink import util from anonlink import bloommatcher as bm from tests import bitarray_utils @@ -21,17 +22,12 @@ def test_generate_clks(self): self.assertEqual(len(clk[0]), 1024) self.assertEqual(clk[0].count(), clk[2]) -# Generate bit arrays that are combinations of words 0, 1, 2^63, 2^64 - 1 -# of various lengths between 1 and 65 words. 
-def test_popcount_vector(): - for L in bitarray_utils.key_lengths: - yield check_popcount_vector, bitarray_utils.bitarrays_of_length(L) - -def check_popcount_vector(bas): +@pytest.mark.parametrize("L", bitarray_utils.key_lengths) +def test_popcount_vector(L): + bas = bitarray_utils.bitarrays_of_length(L) bas_counts = [b.count() for b in bas] popcounts, _ = util.popcount_vector(bas, use_python=True) assert(popcounts == bas_counts) popcounts, _ = util.popcount_vector(bas, use_python=False) assert(popcounts == bas_counts) - From 446033f607edea2dde5a3ff104adced9f8fbcd6f Mon Sep 17 00:00:00 2001 From: Hamish Ivey-Law Date: Thu, 22 Feb 2018 10:27:52 +1100 Subject: [PATCH 42/49] Exit early if filter is zero. --- _cffi_build/dice_one_against_many.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/_cffi_build/dice_one_against_many.cpp b/_cffi_build/dice_one_against_many.cpp index b47441ec..b5a5af97 100644 --- a/_cffi_build/dice_one_against_many.cpp +++ b/_cffi_build/dice_one_against_many.cpp @@ -301,6 +301,9 @@ extern "C" node_queue top_k_scores(score_cmp(), std::move(vec)); uint32_t count_one = _popcount_array(comp1, keywords); + if (count_one == 0) + return 0; + uint32_t max_popcnt_delta = keybytes * CHAR_BIT; // = bits per key if(threshold > 0) { max_popcnt_delta = calculate_max_difference(count_one, threshold); From dea0a0d272326375c45fbc516c3bcd5a8df90cdf Mon Sep 17 00:00:00 2001 From: Hamish Ivey-Law Date: Fri, 23 Feb 2018 14:11:39 +1100 Subject: [PATCH 43/49] Specialise popcount arrays calls on array length. 
--- _cffi_build/dice_one_against_many.cpp | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/_cffi_build/dice_one_against_many.cpp b/_cffi_build/dice_one_against_many.cpp index b5a5af97..44943424 100644 --- a/_cffi_build/dice_one_against_many.cpp +++ b/_cffi_build/dice_one_against_many.cpp @@ -80,6 +80,16 @@ void popcount<1>( } +template +void _my_popcount_arrays(uint32_t *counts, const uint64_t *arrays, int narrays) { + uint64_t c0, c1, c2, c3; + for (int i = 0; i < narrays; ++i, arrays += nwords) { + c0 = c1 = c2 = c3 = 0; + popcount(c0, c1, c2, c3, arrays); + counts[i] = c0 + c1 + c2 + c3; + } +} + static uint32_t _popcount_array(const uint64_t *array, int nwords) { uint64_t c0, c1, c2, c3; @@ -230,8 +240,14 @@ extern "C" // assumes WORD_PER_POPCOUNT divides nwords clock_t t = clock(); - for (int i = 0; i < narrays; ++i, u += nwords) - counts[i] = _popcount_array(u, nwords); + switch (nwords) { + case 32: _my_popcount_arrays<32>(counts, u, narrays); break; + case 16: _my_popcount_arrays<16>(counts, u, narrays); break; + case 8: _my_popcount_arrays< 8>(counts, u, narrays); break; + default: + for (int i = 0; i < narrays; ++i, u += nwords) + counts[i] = _popcount_array(u, nwords); + } return to_millis(clock() - t); } From d3671a22bfd0bf8ddb4a8d250989c17cc1479355 Mon Sep 17 00:00:00 2001 From: Hamish Ivey-Law Date: Fri, 2 Mar 2018 12:03:47 +1100 Subject: [PATCH 44/49] Fix performance regression. 
--- _cffi_build/dice_one_against_many.cpp | 137 +++++++++++++++++--------- 1 file changed, 91 insertions(+), 46 deletions(-) diff --git a/_cffi_build/dice_one_against_many.cpp b/_cffi_build/dice_one_against_many.cpp index 44943424..3eccb2b7 100644 --- a/_cffi_build/dice_one_against_many.cpp +++ b/_cffi_build/dice_one_against_many.cpp @@ -10,7 +10,8 @@ static constexpr int WORD_BYTES = sizeof(uint64_t); template -void popcount( +static inline void +popcount( uint64_t &c0, uint64_t &c1, uint64_t &c2, uint64_t &c3, const uint64_t *buf) { popcount<4>(c0, c1, c2, c3, buf); @@ -31,7 +32,8 @@ void popcount( // loading the contents of buf into registers and using these same // registers for the intermediate popcnts. template<> -void popcount<4>( +static inline void +popcount<4>( uint64_t &c0, uint64_t &c1, uint64_t &c2, uint64_t &c3, const uint64_t* buf) { uint64_t b0, b1, b2, b3; @@ -56,7 +58,8 @@ void popcount<4>( // for completeness (i.e. so that popcount is defined for all // non-negative n) and in anticipation of its use in the near future. 
template<> -void popcount<3>( +static inline void +popcount<3>( uint64_t &c0, uint64_t &c1, uint64_t &c2, uint64_t &, const uint64_t* buf) { c0 += __builtin_popcountl(buf[0]); @@ -65,7 +68,8 @@ void popcount<3>( } template<> -void popcount<2>( +static inline void +popcount<2>( uint64_t &c0, uint64_t &c1, uint64_t &, uint64_t &, const uint64_t* buf) { c0 += __builtin_popcountl(buf[0]); @@ -73,7 +77,8 @@ void popcount<2>( } template<> -void popcount<1>( +static inline void +popcount<1>( uint64_t &c0, uint64_t &, uint64_t &, uint64_t &, const uint64_t* buf) { c0 += __builtin_popcountl(buf[0]); @@ -81,7 +86,8 @@ void popcount<1>( template -void _my_popcount_arrays(uint32_t *counts, const uint64_t *arrays, int narrays) { +static void +_popcount_arrays(uint32_t *counts, const uint64_t *arrays, int narrays) { uint64_t c0, c1, c2, c3; for (int i = 0; i < narrays; ++i, arrays += nwords) { c0 = c1 = c2 = c3 = 0; @@ -124,43 +130,57 @@ _popcount_array(const uint64_t *array, int nwords) { return c0 + c1 + c2 + c3; } +template static inline void -logand_array(uint64_t *out, const uint64_t *arr1, const uint64_t *arr2, int n) { - for (int j = 0; j < n; ++j) - out[j] = arr1[j] & arr2[j]; +popcount_logand( + uint64_t &c0, uint64_t &c1, uint64_t &c2, uint64_t &c3, + const uint64_t *buf1, const uint64_t *buf2) { + popcount_logand<4>(c0, c1, c2, c3, buf1, buf2); + popcount_logand(c0, c1, c2, c3, buf1 + 4, buf2 + 4); +} + +template<> +static inline void +popcount_logand<4>( + uint64_t &c0, uint64_t &c1, uint64_t &c2, uint64_t &c3, + const uint64_t* buf1, const uint64_t *buf2) { + uint64_t b[4]; + b[0] = buf1[0] & buf2[0]; + b[1] = buf1[1] & buf2[1]; + b[2] = buf1[2] & buf2[2]; + b[3] = buf1[3] & buf2[3]; + popcount<4>(c0, c1, c2, c3, b); } static uint32_t -_popcount_logand_array( - const uint64_t *array1, - const uint64_t *array2, - int nwords) { - const uint64_t *arr1 = array1, *arr2 = array2; - int n = nwords; - static constexpr int BUF_WORDS = 16; - uint64_t combined[BUF_WORDS]; 
+_popcount_logand_array(const uint64_t* u, const uint64_t* v, int len) { + // NB: The switch statement at the end of this function must have + // cases for all i = 1, ..., LOOP_LEN - 1. + static constexpr int LOOP_LEN = 4; uint64_t c0, c1, c2, c3; - c0 = c1 = c2 = c3 = 0; - while (n >= BUF_WORDS) { - logand_array(combined, arr1, arr2, BUF_WORDS); - popcount(c0, c1, c2, c3, combined); - arr1 += BUF_WORDS; - arr2 += BUF_WORDS; - n -= BUF_WORDS; + int i = 0; + for ( ; i + LOOP_LEN <= len; i += LOOP_LEN) { + popcount_logand(c0, c1, c2, c3, u, v); + u += LOOP_LEN; + v += LOOP_LEN; } - if (n > 0) { - logand_array(combined, arr1, arr2, n); - c0 += _popcount_array(combined, n); + + // NB: The "fall through" comments are necessary to tell GCC and + // Clang not to complain about the fact that the case clauses + // don't have break statements in them. + switch (len - i) { + case 3: c2 += __builtin_popcountl(u[2] & v[2]); /* fall through */ + case 2: c1 += __builtin_popcountl(u[1] & v[1]); /* fall through */ + case 1: c0 += __builtin_popcountl(u[0] & v[0]); /* fall through */ } return c0 + c1 + c2 + c3; } -// assumes u_popc or v_popc is nonzero. 
static inline double -_dice_coeff( +_dice_coeff_generic( const uint64_t *u, uint32_t u_popc, const uint64_t *v, uint32_t v_popc, int nwords) { @@ -168,6 +188,17 @@ _dice_coeff( return (2 * uv_popc) / (double) (u_popc + v_popc); } +template +static inline double +_dice_coeff( + const uint64_t *u, uint32_t u_popc, + const uint64_t *v, uint32_t v_popc) { + uint64_t c0, c1, c2, c3; + c0 = c1 = c2 = c3 = 0; + popcount_logand(c0, c1, c2, c3, u, v); + uint32_t uv_popc = c0 + c1 + c2 + c3; + return (2 * uv_popc) / (double) (u_popc + v_popc); +} class Node { public: @@ -241,9 +272,9 @@ extern "C" // assumes WORD_PER_POPCOUNT divides nwords clock_t t = clock(); switch (nwords) { - case 32: _my_popcount_arrays<32>(counts, u, narrays); break; - case 16: _my_popcount_arrays<16>(counts, u, narrays); break; - case 8: _my_popcount_arrays< 8>(counts, u, narrays); break; + case 32: _popcount_arrays<32>(counts, u, narrays); break; + case 16: _popcount_arrays<16>(counts, u, narrays); break; + case 8: _popcount_arrays< 8>(counts, u, narrays); break; default: for (int i = 0; i < narrays; ++i, u += nwords) counts[i] = _popcount_array(u, nwords); @@ -280,7 +311,7 @@ extern "C" if (v_popc == 0) return 0.0; - return _dice_coeff(u, u_popc, v, v_popc, nwords); + return _dice_coeff_generic(u, u_popc, v, v_popc, nwords); } /** @@ -302,8 +333,8 @@ extern "C" if (keybytes % WORD_BYTES != 0) return -1; int keywords = keybytes / WORD_BYTES; - const uint64_t *comp1 = (const uint64_t *) one; - const uint64_t *comp2 = (const uint64_t *) many; + const uint64_t *comp1 = reinterpret_cast(one); + const uint64_t *comp2 = reinterpret_cast(many); // Here we create top_k_scores on the stack by providing it // with a vector in which to put its elements. 
We do this so @@ -325,18 +356,31 @@ extern "C" max_popcnt_delta = calculate_max_difference(count_one, threshold); } + auto push_score = [&](double score, int idx) { + if (score >= threshold) { + top_k_scores.push(Node(idx, score)); + if (top_k_scores.size() > k) { + // Popping the top element is O(log(k))! + top_k_scores.pop(); + } + } + }; + const uint64_t *current = comp2; - for (int j = 0; j < n; j++, current += keywords) { - const uint32_t counts_many_j = counts_many[j]; - - if (abs_diff(count_one, counts_many_j) <= max_popcnt_delta) { - double score = _dice_coeff(comp1, count_one, current, counts_many_j, keywords); - if (score >= threshold) { - top_k_scores.push(Node(j, score)); - if (top_k_scores.size() > k) { - // Popping the top element is O(log(k))! - top_k_scores.pop(); - } + if (keywords == 16) { + for (int j = 0; j < n; j++, current += 16) { + const uint32_t counts_many_j = counts_many[j]; + if (abs_diff(count_one, counts_many_j) <= max_popcnt_delta) { + double score = _dice_coeff<16>(comp1, count_one, current, counts_many_j); + push_score(score, j); + } + } + } else { + for (int j = 0; j < n; j++, current += keywords) { + const uint32_t counts_many_j = counts_many[j]; + if (abs_diff(count_one, counts_many_j) <= max_popcnt_delta) { + double score = _dice_coeff_generic(comp1, count_one, current, counts_many_j, keywords); + push_score(score, j); } } } @@ -349,6 +393,7 @@ extern "C" top_k_scores.pop(); i += 1; } + return i; } } From 93abfae8f223727fb601bd6395865ea114cdb607 Mon Sep 17 00:00:00 2001 From: Hamish Ivey-Law Date: Fri, 2 Mar 2018 12:53:57 +1100 Subject: [PATCH 45/49] Remove storage class specifiers from explicit template specialisations. 
--- _cffi_build/dice_one_against_many.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/_cffi_build/dice_one_against_many.cpp b/_cffi_build/dice_one_against_many.cpp index 3eccb2b7..128cc4f4 100644 --- a/_cffi_build/dice_one_against_many.cpp +++ b/_cffi_build/dice_one_against_many.cpp @@ -32,7 +32,7 @@ popcount( // loading the contents of buf into registers and using these same // registers for the intermediate popcnts. template<> -static inline void +inline void popcount<4>( uint64_t &c0, uint64_t &c1, uint64_t &c2, uint64_t &c3, const uint64_t* buf) { @@ -58,7 +58,7 @@ popcount<4>( // for completeness (i.e. so that popcount is defined for all // non-negative n) and in anticipation of its use in the near future. template<> -static inline void +inline void popcount<3>( uint64_t &c0, uint64_t &c1, uint64_t &c2, uint64_t &, const uint64_t* buf) { @@ -68,7 +68,7 @@ popcount<3>( } template<> -static inline void +inline void popcount<2>( uint64_t &c0, uint64_t &c1, uint64_t &, uint64_t &, const uint64_t* buf) { @@ -77,7 +77,7 @@ popcount<2>( } template<> -static inline void +inline void popcount<1>( uint64_t &c0, uint64_t &, uint64_t &, uint64_t &, const uint64_t* buf) { @@ -140,7 +140,7 @@ popcount_logand( } template<> -static inline void +inline void popcount_logand<4>( uint64_t &c0, uint64_t &c1, uint64_t &c2, uint64_t &c3, const uint64_t* buf1, const uint64_t *buf2) { From e9706ff5e0f996cc4aa7db7639c485b9d1c9cfe0 Mon Sep 17 00:00:00 2001 From: Hamish Ivey-Law Date: Fri, 2 Mar 2018 14:05:17 +1100 Subject: [PATCH 46/49] Update README and requirements.txt files. --- README.rst | 20 ++++++++++++++------ requirements.txt | 4 ++-- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/README.rst b/README.rst index 483590ee..16e33001 100644 --- a/README.rst +++ b/README.rst @@ -113,16 +113,24 @@ matrix, which will be approximately `#comparisons * match% / 100`. 
Tests ===== -Run unit tests with nose +Run unit tests with `pytest`: :: - $ python -m nose - ......................SS.............................. - ---------------------------------------------------------------------- - Ran 54 tests in 6.615s + $ pytest + ====================================== test session starts ====================================== + platform linux -- Python 3.6.4, pytest-3.2.5, py-1.4.34, pluggy-0.4.0 + rootdir: /home/hlaw/src/n1-anonlink, inifile: + collected 71 items - OK (SKIP=2) + tests/test_benchmark.py ... + tests/test_bloommatcher.py .............. + tests/test_e2e.py .............ss.... + tests/test_matcher.py ..x.....x......x....x.. + tests/test_similarity.py ......... + tests/test_util.py ... + + ======================== 65 passed, 2 skipped, 4 xfailed in 4.01 seconds ======================== To enable slightly larger tests add the following environment variables: diff --git a/requirements.txt b/requirements.txt index f4a36d17..8ea3e2fa 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ bitarray==0.8.1 networkx==1.11 cffi>=1.7 -nose==1.3.7 -clkhash==0.8.0 \ No newline at end of file +pytest>=3.4 +clkhash==0.8.0 From ed0968787c1eb2dee8290add467bab3c2c942680 Mon Sep 17 00:00:00 2001 From: Hamish Ivey-Law Date: Thu, 8 Mar 2018 15:41:48 +1100 Subject: [PATCH 47/49] Disable unused function. --- _cffi_build/dice_one_against_many.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/_cffi_build/dice_one_against_many.cpp b/_cffi_build/dice_one_against_many.cpp index d288a2dc..54f315b6 100644 --- a/_cffi_build/dice_one_against_many.cpp +++ b/_cffi_build/dice_one_against_many.cpp @@ -57,6 +57,7 @@ popcount<4>( // NB: The specialisation to n=3 is not currently used but included // for completeness (i.e. so that popcount is defined for all // non-negative n) and in anticipation of its use in the near future. 
+#if 0 template<> inline void popcount<3>( @@ -66,6 +67,7 @@ popcount<3>( c1 += __builtin_popcountl(buf[1]); c2 += __builtin_popcountl(buf[2]); } +#endif template<> inline void From 63cc6e0af8e4c9b89f5b0ad25eb2a4d85a402a91 Mon Sep 17 00:00:00 2001 From: Hamish Ivey-Law Date: Fri, 9 Mar 2018 16:16:01 +1100 Subject: [PATCH 48/49] Put stars in their proper place. --- _cffi_build/dice_one_against_many.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/_cffi_build/dice_one_against_many.cpp b/_cffi_build/dice_one_against_many.cpp index 54f315b6..5285a208 100644 --- a/_cffi_build/dice_one_against_many.cpp +++ b/_cffi_build/dice_one_against_many.cpp @@ -35,7 +35,7 @@ template<> inline void popcount<4>( uint64_t &c0, uint64_t &c1, uint64_t &c2, uint64_t &c3, - const uint64_t* buf) { + const uint64_t *buf) { uint64_t b0, b1, b2, b3; b0 = buf[0]; b1 = buf[1]; b2 = buf[2]; b3 = buf[3]; __asm__( @@ -62,7 +62,7 @@ template<> inline void popcount<3>( uint64_t &c0, uint64_t &c1, uint64_t &c2, uint64_t &, - const uint64_t* buf) { + const uint64_t *buf) { c0 += __builtin_popcountl(buf[0]); c1 += __builtin_popcountl(buf[1]); c2 += __builtin_popcountl(buf[2]); @@ -73,7 +73,7 @@ template<> inline void popcount<2>( uint64_t &c0, uint64_t &c1, uint64_t &, uint64_t &, - const uint64_t* buf) { + const uint64_t *buf) { c0 += __builtin_popcountl(buf[0]); c1 += __builtin_popcountl(buf[1]); } @@ -82,7 +82,7 @@ template<> inline void popcount<1>( uint64_t &c0, uint64_t &, uint64_t &, uint64_t &, - const uint64_t* buf) { + const uint64_t *buf) { c0 += __builtin_popcountl(buf[0]); } @@ -145,7 +145,7 @@ template<> inline void popcount_logand<4>( uint64_t &c0, uint64_t &c1, uint64_t &c2, uint64_t &c3, - const uint64_t* buf1, const uint64_t *buf2) { + const uint64_t *buf1, const uint64_t *buf2) { uint64_t b[4]; b[0] = buf1[0] & buf2[0]; b[1] = buf1[1] & buf2[1]; @@ -155,7 +155,7 @@ popcount_logand<4>( } static uint32_t -_popcount_logand_array(const uint64_t* u, const 
uint64_t* v, int len) { +_popcount_logand_array(const uint64_t *u, const uint64_t *v, int len) { // NB: The switch statement at the end of this function must have // cases for all i = 1, ..., LOOP_LEN - 1. static constexpr int LOOP_LEN = 4; From ef82759e5e4be898a434ccf7b3cdd901b488fb30 Mon Sep 17 00:00:00 2001 From: Hamish Ivey-Law Date: Fri, 9 Mar 2018 16:35:27 +1100 Subject: [PATCH 49/49] Add documentation. --- _cffi_build/dice_one_against_many.cpp | 50 +++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/_cffi_build/dice_one_against_many.cpp b/_cffi_build/dice_one_against_many.cpp index 5285a208..c5093365 100644 --- a/_cffi_build/dice_one_against_many.cpp +++ b/_cffi_build/dice_one_against_many.cpp @@ -9,6 +9,9 @@ static constexpr int WORD_BYTES = sizeof(uint64_t); +/** + * The popcount of n elements of buf is the sum of c0, c1, c2, c3. + */ template static inline void popcount( @@ -31,6 +34,9 @@ popcount( // with Clang (3.6 and 3.8)). We fix the mistake by explicitly // loading the contents of buf into registers and using these same // registers for the intermediate popcnts. +/** + * The popcount of 4 elements of buf is the sum of c0, c1, c2, c3. + */ template<> inline void popcount<4>( @@ -69,6 +75,9 @@ popcount<3>( } #endif +/** + * The popcount of 2 elements of buf is the sum of c0, c1. + */ template<> inline void popcount<2>( @@ -78,6 +87,9 @@ popcount<2>( c1 += __builtin_popcountl(buf[1]); } +/** + * The popcount *buf is in c0. + */ template<> inline void popcount<1>( @@ -87,6 +99,17 @@ popcount<1>( } +/** + * Calculate population counts of an array of inputs of nwords elements. + * + * 'arrays' must point to narrays*nwords*WORD_BYTES bytes + * 'counts' must point to narrays*sizeof(uint32_t) bytes. + * For i = 0 to narrays - 1, the population count of the nwords elements + * + * arrays[i * nwords] ... arrays[(i + 1) * nwords - 1] + * + * is put in counts[i]. 
+ */ template static void _popcount_arrays(uint32_t *counts, const uint64_t *arrays, int narrays) { @@ -98,6 +121,9 @@ _popcount_arrays(uint32_t *counts, const uint64_t *arrays, int narrays) { } } +/** + * Return the popcount of the nwords elements starting at array. + */ static uint32_t _popcount_array(const uint64_t *array, int nwords) { uint64_t c0, c1, c2, c3; @@ -132,6 +158,10 @@ _popcount_array(const uint64_t *array, int nwords) { return c0 + c1 + c2 + c3; } +/** + * The popcount of the logical AND of n corresponding elements of buf1 + * and buf2 is the sum of c0, c1, c2, c3. + */ template static inline void popcount_logand( @@ -141,6 +171,10 @@ popcount_logand( popcount_logand(c0, c1, c2, c3, buf1 + 4, buf2 + 4); } +/** + * The popcount of the logical AND of 4 corresponding elements of buf1 + * and buf2 is the sum of c0, c1, c2, c3. + */ template<> inline void popcount_logand<4>( @@ -154,6 +188,10 @@ popcount_logand<4>( popcount<4>(c0, c1, c2, c3, b); } +/** + * Return the popcount of the logical AND of len corresponding + * elements of u and v. + */ static uint32_t _popcount_logand_array(const uint64_t *u, const uint64_t *v, int len) { // NB: The switch statement at the end of this function must have @@ -181,6 +219,10 @@ _popcount_logand_array(const uint64_t *u, const uint64_t *v, int len) { return c0 + c1 + c2 + c3; } +/** + * Return the Sorensen-Dice coefficient of nwords length arrays u and + * v, whose popcounts are given in u_popc and v_popc respectively. + */ static inline double _dice_coeff_generic( const uint64_t *u, uint32_t u_popc, @@ -190,6 +232,10 @@ _dice_coeff_generic( return (2 * uv_popc) / (double) (u_popc + v_popc); } +/** + * Return the Sorensen-Dice coefficient of nwords length arrays u and + * v, whose popcounts are given in u_popc and v_popc respectively. 
+ */ template static inline double _dice_coeff( @@ -369,6 +415,10 @@ extern "C" }; const uint64_t *current = comp2; + + // NB: For any key length that must run at maximum speed, we + // need to specialise a block in the following 'if' statement + // (which is an example of specialising to keywords == 16). if (keywords == 16) { for (int j = 0; j < n; j++, current += 16) { const uint32_t counts_many_j = counts_many[j];