Skip to content

Commit

Permalink
Function to compute Dice coefficients of bitarray pairs (#567)
Browse files Browse the repository at this point in the history
* Function in anonlink.similarities to compute the Dice coefficient on pairs of bitarrays

* Add changelog entry

cleanup PR

* Test with all zeros

* Remove ubuntu-18.04 unittests
  • Loading branch information
hardbyte committed May 1, 2023
1 parent cdca890 commit 5786439
Show file tree
Hide file tree
Showing 7 changed files with 109 additions and 3 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/unittests.yml
Expand Up @@ -15,7 +15,7 @@ jobs:
runs-on: ${{ matrix.os }}
strategy:
matrix:
os: [macos-latest, windows-latest, ubuntu-18.04, ubuntu-20.04]
os: [macos-latest, windows-latest, ubuntu-20.04]
python: ["3.8", "3.9", "3.10", "3.11"]

steps:
Expand Down
5 changes: 5 additions & 0 deletions CHANGELOG.rst
@@ -1,3 +1,8 @@
0.15.3
======

- Added function to compute Dice coefficients of bitarray pairs. #567

0.15.2
======

Expand Down
3 changes: 2 additions & 1 deletion anonlink/similarities/__init__.py
Expand Up @@ -10,7 +10,8 @@
functions are possible as well.
"""

from anonlink.similarities._dice_python import dice_coefficient_python
from anonlink.similarities._dice_python import (dice_coefficient_python,
dice_coefficient_pairs_python)
from anonlink.similarities._smc import (hamming_similarity,
simple_matching_coefficient)

Expand Down
41 changes: 41 additions & 0 deletions anonlink/similarities/_dice_python.py
Expand Up @@ -2,6 +2,7 @@
from itertools import repeat
from typing import Iterable, Optional, Sequence, Tuple

import numpy as np
from bitarray import bitarray

from anonlink.similarities._utils import (sort_similarities_inplace,
Expand Down Expand Up @@ -77,3 +78,43 @@ def dice_coefficient_python(
sort_similarities_inplace(result_sims, result_indices0, result_indices1)

return result_sims, (result_indices0, result_indices1)



def _dice_coefficient_of_pair(f0: bitarray, f1: bitarray) -> float:
    """Return the Dice coefficient of one pair of bitarrays.

    The coefficient is twice the number of bits set in both inputs
    divided by the total number of bits set across the two inputs.
    """
    combined_count = f0.count() + f1.count()
    if not combined_count:
        # Neither input has a set bit: report 0.0 instead of dividing
        # by zero.
        return 0.0
    return 2.0 * (f0 & f1).count() / combined_count


def dice_coefficient_pairs_python(
    datasets: Iterable[Tuple[bitarray, bitarray]]
) -> np.ndarray:
    """Find Dice coefficients of bitarray pairs.

    This version is written in Python, so it does not rely on
    architecture-specific instructions. It may be slower than an
    accelerated version.

    A similarity is computed for every pair of bitarrays in the input
    datasets; the similarity for each pair is returned as a
    floating-point value. A pair in which neither bitarray has any bit
    set is given a similarity of 0.0.

    :param datasets: The candidate pairs. Each pair is a tuple of two
        bitarrays. Any iterable is accepted: sequences as well as
        one-shot iterables such as generators.
    :return: Similarity scores for every input pair, in input order,
        as a one-dimensional ``numpy.float64`` array.
    """
    try:
        # A sized input lets NumPy allocate the whole result up front.
        candidate_pair_count = len(datasets)
    except TypeError:
        # Unsized iterable (e.g. a generator): -1 tells NumPy to
        # consume everything and grow the result as needed.
        candidate_pair_count = -1

    scores = (_dice_coefficient_of_pair(f0, f1) for f0, f1 in datasets)
    return np.fromiter(scores, dtype=np.float64, count=candidate_pair_count)
2 changes: 1 addition & 1 deletion setup.py
Expand Up @@ -68,7 +68,7 @@

setup(
name="anonlink",
version='0.15.2',
version='0.15.3',
description='Anonymous linkage using cryptographic hashes and bloom filters',
long_description=readme,
long_description_content_type='text/x-rst',
Expand Down
43 changes: 43 additions & 0 deletions tests/test_e2e.py
Expand Up @@ -273,5 +273,48 @@ def test_greedy_chunked_matching_works(self):
assert mapping == merged_mapping



class TestSimilarityStream(EntityHelperMixin, unittest.TestCase):
    """Compare pairwise Dice scores with the accelerated candidate search."""

    proportion = 0.8
    sample = 150

    def setUp(self):
        """Generate two name subsets and encode each one for comparison."""
        self.nl = randomnames.NameList(300)
        self.s1, self.s2 = self.nl.generate_subsets(self.sample, self.proportion)
        self.key_lists = generate_key_lists('secret', len(self.nl.schema_types))

        # Only the first element of every streamed item is kept.
        self.f1 = tuple(
            item[0]
            for item in bloomfilter.stream_bloom_filters(
                self.s1, self.key_lists, self.nl.SCHEMA)
        )
        self.f2 = tuple(
            item[0]
            for item in bloomfilter.stream_bloom_filters(
                self.s2, self.key_lists, self.nl.SCHEMA)
        )

    def test_similarity_stream(self):
        """Each score from the candidate search matches its pairwise score."""
        # Full cross product in row-major order: left index varies slowest.
        all_pairs = [(left, right) for left in self.f1 for right in self.f2]

        pair_sims = anonlink.similarities.dice_coefficient_pairs_python(
            all_pairs
        )

        right_count = len(self.f2)
        assert len(pair_sims) == len(self.f1) * right_count

        search_result = anonlink.candidate_generation.find_candidate_pairs(
            (self.f1, self.f2),
            anonlink.similarities.dice_coefficient_accelerated,
            threshold=0.0,
        )
        scores, _, (left_indices, right_indices) = search_result

        for score, left_index, right_index in zip(
                scores, left_indices, right_indices):
            # Position of this (left, right) pair in the flattened
            # cross product built above.
            flat_index = left_index * right_count + right_index
            assert pair_sims[flat_index] == score


if __name__ == '__main__':
unittest.main()
16 changes: 16 additions & 0 deletions tests/test_similarity_dice.py
Expand Up @@ -4,6 +4,7 @@
from clkhash.key_derivation import generate_key_lists
from hypothesis import given, strategies

import anonlink.similarities
from anonlink import similarities

FLOAT_ARRAY_TYPES = 'fd'
Expand Down Expand Up @@ -258,6 +259,21 @@ def test_all_low(self, sim_fun, k, threshold):
assert (rec_is0.typecode in UINT_ARRAY_TYPES
and rec_is1.typecode in UINT_ARRAY_TYPES)

def test_candidate_stream_right_low(self):
    """A pair whose right-hand bitarray is all zeros scores 0.0."""
    left = bitarray('01001011') * 8
    right = bitarray('00000000') * 8
    sims = anonlink.similarities.dice_coefficient_pairs_python(
        [(left, right)])
    assert len(sims) == 1
    assert all(s == 0.0 for s in sims)

def test_candidate_stream_all_low(self):
    """A pair of all-zero bitarrays scores 0.0 instead of failing."""
    left = bitarray('00000000') * 8
    right = bitarray('00000000') * 8
    sims = anonlink.similarities.dice_coefficient_pairs_python(
        [(left, right)])

    assert len(sims) == 1
    assert all(s == 0.0 for s in sims)

@pytest.mark.parametrize('sim_fun', SIM_FUNS)
def test_order(self, sim_fun):
similarity = sim_fun(
Expand Down

0 comments on commit 5786439

Please sign in to comment.