Merge pull request #186 from n1analytics/release-0.11.0
Release 0.11.0 into master
hardbyte committed Feb 28, 2019
2 parents fff50f0 + 75e7b3c commit fdcbd01
Showing 34 changed files with 2,109 additions and 215 deletions.
10 changes: 8 additions & 2 deletions .travis.yml
@@ -2,7 +2,6 @@ language: python
sudo: false

python:
- '3.6'
- '3.7-dev'
- 'nightly'
- 'pypy3'
@@ -16,6 +15,11 @@ matrix:
- python: 'nightly'
- python: 'pypy3'

include:
- python: '3.6'
env:
- MYPY_TYPING=1

jobs:
include:
- stage: Build wheels
@@ -30,11 +34,13 @@ jobs:
- ls wheelhouse/

install:
- if [ "${MYPY_TYPING}" == "1" ]; then travis_retry pip install mypy; fi
- travis_retry pip install pytest pytest-cov
- if [ -n "$USE_ASV" ]; then pip install asv; fi
- travis_retry pip install -r requirements.txt
- travis_retry python setup.py sdist bdist_wheel
- travis_retry pip install -e .

script:
- pytest --cov=anonlink -W ignore:DeprecationWarning
- if [ "${MYPY_TYPING}" == "1" ]; then mypy anonlink --ignore-missing-imports; fi
- pytest --cov=anonlink -W ignore::DeprecationWarning
19 changes: 19 additions & 0 deletions CHANGELOG.rst
@@ -1,3 +1,22 @@
0.11.0
======

Major changes:
--------------
- The greedy solver has been ported to C++, bringing performance improvements. The pure Python version remains in the package as `anonlink.solving.greedy_solve_python`.
- Candidate pair generation now supports blocking. Some blocking functions are defined in `anonlink.blocking`, but custom ones may be defined; a sketch follows this list.
- New utilities assist in analysis of similarity scores. They can help an analyst find a good threshold or determine the quality of the linkage, and can be found in `anonlink.stats`. Examples are located in `docs/examples/similarity-plots`.
- Adds a _probabilistic_ multiparty greedy solver. It generally yields more accurate results than the previous multiparty greedy solver. It is able to infer that some pairs match even when they are below the similarity threshold.
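
  The blocking support described above can be exercised through `anonlink.candidate_generation.find_candidate_pairs` (the file changed later in this commit). The sketch below is not part of the commit: the blocking function `first_nibble_block` and the record encodings are invented, and it assumes a blocking function receives `(dataset_index, record_index, record)` and returns an iterable of hashable block IDs, as the dummy blocking function in this diff does.

      from anonlink import candidate_generation, similarities

      # Two toy datasets of bytes encodings (bytes input is new in 0.11.0).
      dataset_a = [bytes([0b11001100, 0b11110000]), bytes([0b00110011, 0b00001111])]
      dataset_b = [bytes([0b11001101, 0b11110000]), bytes([0b01010101, 0b10101010])]

      def first_nibble_block(dataset_index, record_index, record):
          # Hypothetical blocking function: two records are compared only if
          # the high nibbles of their first bytes match.
          return (record[0] >> 4,)

      sims, dset_is, rec_is = candidate_generation.find_candidate_pairs(
          [dataset_a, dataset_b],
          similarities.dice_coefficient_python,
          0.7,
          blocking_f=first_nibble_block)

      # One candidate pair survives blocking and the 0.7 threshold.
      print(list(sims), [list(a) for a in dset_is], [list(a) for a in rec_is])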

Minor changes:
--------------
- The `hamming_similarity` in the `similarities` module is renamed to `simple_matching_coefficient`, which is the canonical name for this similarity measure. `hamming_similarity` is now a deprecated alias.
- `anonlink.similarities` is now imported whenever `anonlink` is imported. This means that `anonlink.similarities` no longer has to be imported separately.
- The new helper function `anonlink.solving.pairs_from_groups` turns the output of the greedy solver (a set of groups) into an iterable of pairs for bipartite problems.
- Dice similarity functions now support `bytes` as input. Previously the inputs had to be `bitarray`s; a sketch follows this list.
- Mypy typing is enforced in the automated tests.
- Adds a heuristic for estimating the quality of the linkage, `anonlink.stats.nonmatch_index_score`.
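
  A brief sketch of the `bytes` support mentioned above (not part of the commit; the encodings are invented, and `dice_coefficient_python` is used because it is imported explicitly in `anonlink/similarities/__init__.py` further down):

      from anonlink import similarities

      # Dice similarity functions now accept bytes encodings directly,
      # where previously a bitarray was required.
      records_a = [bytes([0b11001100, 0b11110000])]
      records_b = [bytes([0b11001101, 0b11110000])]
      sims, (rec_is0, rec_is1) = similarities.dice_coefficient_python(
          (records_a, records_b), 0.5)
      # The encodings have 8 set bits in common out of 17 total,
      # giving a Dice coefficient of 2*8/17 ≈ 0.94, above the 0.5 threshold.
      print(list(sims), list(rec_is0), list(rec_is1))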

0.10.0
======

2 changes: 1 addition & 1 deletion Jenkinsfile.groovy
@@ -45,7 +45,7 @@ def build(python_version, compiler, label, release = false) {

PythonVirtualEnvironment venv = prepareVirtualEnvironment(python_version, clkhashPackageName, compiler)
try {
venv.runChosenCommand("pytest --cov=anonlink --junit-xml=testoutput.xml --cov-report=xml:coverage.xml -W ignore:DeprecationWarning")
venv.runChosenCommand("pytest --cov=anonlink --junit-xml=testoutput.xml --cov-report=xml:coverage.xml -W ignore::DeprecationWarning")
if (release) {
// This will be the official release
archiveArtifacts artifacts: "dist/anonlink-*.whl"
2 changes: 2 additions & 0 deletions anonlink/__init__.py
@@ -7,7 +7,9 @@
from anonlink import entitymatch
from anonlink import network_flow
from anonlink import serialization
from anonlink import similarities
from anonlink import solving
from anonlink import stats
from anonlink import typechecking

__version__ = pkg_resources.get_distribution('anonlink').version
35 changes: 28 additions & 7 deletions anonlink/_deprecation.py
@@ -2,6 +2,8 @@
import typing as _typing
import warnings as _warnings

import mypy_extensions as _mypy_extensions


def _warn_deprecated(
module_name: str,
@@ -16,20 +18,39 @@ def _warn_deprecated(
_warnings.warn(msg, DeprecationWarning, stacklevel=3)


# Mypy typing this is too hard. ¯\_(ツ)_/¯
def make_decorator(module_name):
def deprecate_decorator(f=None, *, replacement=None):
def deprecate_decorator_inner(f):
_WrappedF = _typing.TypeVar('_WrappedF',
bound=_typing.Callable[..., _typing.Any])
def make_decorator(
module_name: str
) -> _typing.Callable[..., _typing.Any]:
@_typing.overload
def deprecate_decorator(
f: None = None,
*,
replacement: _typing.Optional[str] = None
) -> _typing.Callable[[_WrappedF], _WrappedF]:
...
@_typing.overload
def deprecate_decorator(
f: _WrappedF,
*,
replacement: _typing.Optional[str] = None
) -> _WrappedF:
...
def deprecate_decorator(
f: _typing.Optional[_WrappedF] = None,
*,
replacement: _typing.Optional[str] = None
) -> _typing.Union[_WrappedF, _typing.Callable[[_WrappedF], _WrappedF]]:
def deprecate_decorator_inner(f: _WrappedF) -> _WrappedF:
@_functools.wraps(f)
def f_inner(*args, **kwargs):
_warn_deprecated(module_name, f.__name__, replacement)
return f(*args, **kwargs)
return f_inner
return _typing.cast(_WrappedF, f_inner)
# Use decorator with or without arguments.
if f is None:
return deprecate_decorator_inner
else:
return deprecate_decorator_inner(f)
return deprecate_decorator


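The `deprecate_decorator` returned by `make_decorator` works both with and without arguments, which is what the final `if f is None` branch handles. A hypothetical usage sketch (module and function names below are made up, not taken from the codebase):

    from anonlink import _deprecation

    _deprecated = _deprecation.make_decorator('some_module')

    @_deprecated(replacement='new_name')
    def old_name(x):
        return x

    @_deprecated            # equally valid without arguments
    def older_name(x):
        return x

    old_name(1)   # emits a DeprecationWarning mentioning new_name
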
5 changes: 2 additions & 3 deletions anonlink/blocking.py
@@ -11,7 +11,6 @@
"""

import itertools as _itertools
import numbers as _numbers
import random as _random
import typing as _typing

@@ -183,8 +182,8 @@ def bit_blocking_inner(


def continuous_blocking(
radius: _numbers.Real,
source: _typing.Sequence[_typing.Sequence[_numbers.Real]]
radius: float,
source: _typing.Sequence[_typing.Sequence[float]]
) -> _typechecking.BlockingFunction[_typechecking.Record]:
"""Block on continuous variables.
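
`continuous_blocking` is one of the predefined blocking functions the changelog mentions. A hedged construction sketch, assuming `source[dataset_index][record_index]` holds the continuous value for each record (the values are invented, and this assumption is not spelled out in the hunk shown here):

    from anonlink import blocking

    # One continuous value (e.g. an age) per record, per dataset.
    source = [[23.0, 41.5, 37.2],   # dataset 0
              [24.1, 58.0]]         # dataset 1
    blocking_f = blocking.continuous_blocking(radius=5.0, source=source)
    # The result is a blocking function that can be passed to
    # find_candidate_pairs(..., blocking_f=blocking_f); records whose
    # continuous values lie close together share a block ID.
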
149 changes: 101 additions & 48 deletions anonlink/candidate_generation.py
@@ -1,61 +1,102 @@
"""Finding candidate pairs from multiple datasets."""

import array as _array
import collections as _collections
import heapq as _heapq
import itertools as _itertools
import numbers as _numbers
import typing as _typing

import anonlink.typechecking as _typechecking

_OnePairSimilarities = _typing.Tuple[
_typing.Tuple[int, int],
_typing.Tuple[_typechecking.FloatArrayType,
_typing.Sequence[_typechecking.IntArrayType]]]
_Block = _typing.Tuple[_typing.List[int], ...]
_CandidatePair = _typing.Tuple[float, int, int, int, int]
_CandidatePairIterable = _typing.Iterable[_CandidatePair]


def _to_candidate_pairs(
sims: _typing.Iterable[float],
rec_is0: _typing.Iterable[int],
rec_is1: _typing.Iterable[int],
i0: int,
i1: int,
block: _typing.Sequence[_typing.Sequence[int]]
) -> _CandidatePairIterable:
return ((sim, i0, i1, block[i0][rec_i0], block[i1][rec_i1])
for sim, rec_i0, rec_i1 in zip(sims, rec_is0, rec_is1))


def _block_similarities(
block: _Block,
datasets: _typing.Sequence[_typechecking.Dataset],
similarity_f: _typechecking.SimilarityFunction,
threshold: float,
k: _typing.Optional[int]
) -> _typing.Iterable[_CandidatePairIterable]:
for i0, i1 in _itertools.combinations(range(len(block)), 2):
recs_dset0 = tuple(map(datasets[i0].__getitem__, block[i0]))
recs_dset1 = tuple(map(datasets[i1].__getitem__, block[i1]))
sims, (rec_is0, rec_is1) = similarity_f(
(recs_dset0, recs_dset1), threshold, k=k)

yield _to_candidate_pairs(sims, rec_is0, rec_is1, i0, i1, block)


def _enforce_k(
similarities: _CandidatePairIterable,
k: int
) -> _CandidatePairIterable:
candidates_counter: _typing.Counter[_typing.Tuple[int, int, int]] \
= _collections.Counter()
for similarity in similarities:
_, dset_i0, dset_i1, rec_i0, rec_i1 = similarity
# At most k candidate pairs per record per dataset pair.
i0 = dset_i0, dset_i1, rec_i1
i1 = dset_i1, dset_i0, rec_i0
candidates_counter[i0] += 1
candidates_counter[i1] += 1
if candidates_counter[i0] <= k and candidates_counter[i1] <= k:
yield similarity


def _merge_similarities(
similarities: _typing.Sequence[_OnePairSimilarities]
similarities: _typing.Iterable[_CandidatePairIterable],
k: _typing.Optional[int]
) -> _typechecking.CandidatePairs:
# Merge similarities in sorted order.
# This is almost certainly an inefficient way of doing this, but
# it's hard to get anything more efficient in pure Python.
# Future: redo this in Cython, which has the ability to directly
# modify and resize array buffers.
if not similarities:
# Empty but correct type.
return (_array.array('d'),
(_array.array('I'), _array.array('I')),
(_array.array('I'), _array.array('I')))

similarities_iters = (
zip(sims, _itertools.repeat(dataset_is), zip(*record_is))
for dataset_is, (sims, record_is) in similarities)
merged_similarities = _heapq.merge(*similarities_iters,
# Merge multiple sorted sequences
sorted_similarities = _heapq.merge(*similarities,
key=lambda x: (-x[0], *x[1:]))

# One record can be in multiple blocks. Remove duplicates.
deduplicated_similarities = (
k for k, _ in _itertools.groupby(sorted_similarities))

if k is None:
k_enforced_similarities: _CandidatePairIterable \
= deduplicated_similarities
else:
k_enforced_similarities = _enforce_k(deduplicated_similarities, k)

# Assume all arrays are the same type.
# Future: this may require changing.
fst_datset_is, (fst_sims, fst_record_is) = similarities[0]
result_sims: _typechecking.FloatArrayType = _array.array(fst_sims.typecode)
result_dataset_is: _typing.Tuple[_typechecking.IntArrayType, ...] \
= tuple(_array.array('I') for _ in fst_datset_is)
result_record_is: _typing.Tuple[_typechecking.IntArrayType, ...] \
= tuple(_array.array(f.typecode) for f in fst_record_is)
for sim, dataset_is, record_is in merged_similarities:
result_sims.append(sim)
for result_dataset_i, dataset_i in zip(result_dataset_is, dataset_is):
result_dataset_i.append(dataset_i)
for result_record_i, record_i in zip(result_record_is, record_is):
result_record_i.append(record_i)
return result_sims, result_dataset_is, result_record_is
sims: _typechecking.FloatArrayType = _array.array('d')
dset_is0: _typechecking.IntArrayType = _array.array('I')
dset_is1: _typechecking.IntArrayType = _array.array('I')
rec_is0: _typechecking.IntArrayType = _array.array('I')
rec_is1: _typechecking.IntArrayType = _array.array('I')
for sim, dset_i0, dset_i1, rec_i0, rec_i1 in k_enforced_similarities:
sims.append(sim)
dset_is0.append(dset_i0)
dset_is1.append(dset_i1)
rec_is0.append(rec_i0)
rec_is1.append(rec_i1)
return sims, (dset_is0, dset_is1), (rec_is0, rec_is1)


def find_candidate_pairs(
datasets: _typing.Sequence[_typechecking.Dataset],
similarity_f: _typechecking.SimilarityFunction,
threshold: _numbers.Real,
k: _typing.Optional[_numbers.Integral] = None,
threshold: float,
k: _typing.Optional[int] = None,
blocking_f: _typing.Optional[_typechecking.BlockingFunction] = None
) -> _typechecking.CandidatePairs:
"""Find candidate pairs from multiple datasets. Optional blocking.
@@ -70,9 +111,9 @@ def find_candidate_pairs(
:param k: Only permit this many candidate pairs per dataset pair per
record. Set to `None` to permit all pairs above with similarity
at least `threshold`.
:param blocking_f: Not yet implemented. Future: A function returning
all block IDs for a record. Two records are compared iff they
have at least one block ID in common.
:param blocking_f: A function returning all block IDs for a record.
Two records are compared iff they have at least one block ID in
common. Support for this is experimental and subject to change.
:return: A 3-tuple `(similarity, dataset_i, record_i)`. `dataset_i`
and `record_i` are sequences of sequences. Every sequence in
@@ -87,14 +128,26 @@ def find_candidate_pairs(
second record in the pair; `record_[1][i]` is this record's
index in its dataset. `similarity[i]` is the pair's similarity;
this value will be greater than `threshold`.
"""
if blocking_f is not None:
raise NotImplementedError('blocking is not yet implemented')
"""
if blocking_f is None:
# Dummy blocking function.
def blocking_f(dataset_index, record_index, hash_):
return None,
assert blocking_f is not None # This is for Mypy.

blocks: _typing.DefaultDict[_typing.Hashable, _Block] \
= _collections.defaultdict(lambda: tuple([] for _ in datasets))
for i, dataset in enumerate(datasets):
for j, record in enumerate(dataset):
for block_id in blocking_f(i, j, record):
blocks[block_id][i].append(j)

similarities = []
for (i0, dataset0), (i1, dataset1) \
in _itertools.combinations(enumerate(datasets), 2):
similarity = similarity_f((dataset0, dataset1), threshold, k=k)
similarities.append(((i0, i1), similarity))
similarities = tuple(_itertools.chain.from_iterable(
map(_block_similarities,
blocks.values(),
_itertools.repeat(datasets),
_itertools.repeat(similarity_f),
_itertools.repeat(threshold),
_itertools.repeat(k))))

return _merge_similarities(similarities)
return _merge_similarities(similarities, k)
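
The return value documented in the docstring above is a set of parallel arrays. A small helper, purely illustrative and not part of this commit, that walks them as per-pair tuples might look like:

    def iter_candidate_pairs(candidate_pairs):
        # Unpack the 3-tuple returned by find_candidate_pairs: parallel arrays
        # of similarities, dataset indices and record indices.
        sims, (dset_is0, dset_is1), (rec_is0, rec_is1) = candidate_pairs
        # Element i of every array describes the same candidate pair.
        yield from zip(sims, dset_is0, dset_is1, rec_is0, rec_is1)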
9 changes: 4 additions & 5 deletions anonlink/concurrency.py
@@ -3,7 +3,6 @@
import array as _array
import itertools as _itertools
import math as _math
import numbers as _numbers
import typing as _typing

import numpy as _np
@@ -37,11 +36,11 @@ def _chunks_1d(


def split_to_chunks(
chunk_size_aim: _numbers.Real,
chunk_size_aim: float,
*,
# Keyword-only for forwards compatibility: this argument may not be
# needed once we do blocking
dataset_sizes: _typing.Sequence[_numbers.Integral]
dataset_sizes: _typing.Sequence[int]
) -> _typing.Iterable[_typechecking.ChunkInfo]:
"""Split datasets into chunks for parallel processing.
@@ -107,8 +106,8 @@ def process_chunk(
chunk: _typechecking.ChunkInfo,
datasets: _typing.Sequence[_typechecking.Dataset],
similarity_f: _typechecking.SimilarityFunction,
threshold: _numbers.Real,
k: _typing.Optional[_numbers.Integral] = None
threshold: float,
k: _typing.Optional[int] = None
) -> _typechecking.CandidatePairs:
"""Find candidate pairs for the chunk.
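
The signature change above only swaps `numbers.Real`/`numbers.Integral` for `float`/`int`. A hedged usage sketch of `split_to_chunks`, assuming `chunk_size_aim` is a target number of comparisons per chunk (the hunk shown here does not spell that out) and using invented dataset sizes:

    from anonlink import concurrency

    # Plan chunks for two datasets of 5,000 and 8,000 records.
    chunks = list(concurrency.split_to_chunks(100_000,
                                              dataset_sizes=[5000, 8000]))
    print(len(chunks))
    # In a parallel pipeline, each chunk would be handed to a worker that
    # calls process_chunk(...) on the corresponding slices of the datasets.
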
9 changes: 5 additions & 4 deletions anonlink/similarities/__init__.py
@@ -5,13 +5,14 @@
similarity of at least this threshold are returned. We call these the
candidate pairs.
Currently, the Dice Coefficient and Hamming Similarity are implemented.
These work on binary strings. However, other similarity functions are
possible as well.
Currently, the Dice Coefficient and the Simple Matching Coefficient are
implemented. These work on binary strings. However, other similarity
functions are possible as well.
"""

from anonlink.similarities._dice_python import dice_coefficient_python
from anonlink.similarities._hamming import hamming_similarity
from anonlink.similarities._smc import (hamming_similarity,
simple_matching_coefficient)

try:
from anonlink.similarities._dice_x86 import dice_coefficient_accelerated
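
A hedged illustration of the renaming described in the changelog: `simple_matching_coefficient` is the canonical name and `hamming_similarity` remains as a deprecated alias. The `(datasets, threshold)` call signature is inferred from how similarity functions are invoked in `candidate_generation.py` above; the records are invented.

    import warnings

    from bitarray import bitarray
    from anonlink import similarities

    bits_a = [bitarray('11001100')]
    bits_b = [bitarray('11001101')]

    # Canonical name: returns similarities plus the matching record indices.
    sims, (rec_is0, rec_is1) = similarities.simple_matching_coefficient(
        (bits_a, bits_b), 0.5)
    print(list(sims), list(rec_is0), list(rec_is1))

    # The old name still works but now emits a DeprecationWarning.
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter('always')
        similarities.hamming_similarity((bits_a, bits_b), 0.5)
    assert any(issubclass(w.category, DeprecationWarning) for w in caught)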
