Skip to content

Commit

Permalink
Move predicate filtering out of datamodel
Browse files Browse the repository at this point in the history
Part of the quest to move the implementation details
of predicates out of DataModel and into the things that
actually care about them.

This slightly changes the behavior in the test: it no longer
filters on canopies at all, so it uses ALL predicates from the
variable definitions.
  • Loading branch information
NickCrews committed Aug 17, 2022
1 parent 8474469 commit 7e26aaf
Show file tree
Hide file tree
Showing 3 changed files with 22 additions and 11 deletions.
11 changes: 3 additions & 8 deletions dedupe/datamodel.py
Expand Up @@ -69,17 +69,12 @@ def _field_comparators(
yield (var.field, comparator, start, stop)
start = stop

def predicates(self, canopies: bool = True) -> set[Predicate]:
@property
def predicates(self) -> set[Predicate]:
predicates = set()
for var in self.primary_variables:
for predicate in var.predicates:
if hasattr(predicate, "index"):
is_canopy = hasattr(predicate, "canopy")
if is_canopy == canopies:
predicates.add(predicate)
else:
predicates.add(predicate)

predicates.add(predicate)
return predicates

def distances(
Expand Down
20 changes: 18 additions & 2 deletions dedupe/labeler.py
Expand Up @@ -225,6 +225,20 @@ def _sample_indices(self, sample_size: int) -> Iterable[RecordIDPair]:
return sample_ids


def _filter_canopy_predicates(
    predicates: Iterable[Predicate], canopies: bool
) -> set[Predicate]:
    """Select the predicates that match the requested canopy mode.

    A predicate is treated as an index predicate when it has an ``index``
    attribute, and as a canopy predicate when it also has a ``canopy``
    attribute.

    Args:
        predicates: The candidate predicates to filter.
        canopies: If true, keep only the index predicates that have a
            ``canopy`` attribute; if false, keep only the index predicates
            that lack one.

    Returns:
        A new set containing every predicate without an ``index``
        attribute, plus the index predicates whose canopy status equals
        ``canopies``.
    """
    return {
        predicate
        for predicate in predicates
        # Predicates without an index are never filtered out; only index
        # predicates are checked against the requested canopy mode.
        if not hasattr(predicate, "index")
        or hasattr(predicate, "canopy") == canopies
    }


class DedupeBlockLearner(BlockLearner):
def __init__(
self,
Expand All @@ -239,7 +253,8 @@ def __init__(

index_data = sample_records(data, 50000)
sampled_records = sample_records(index_data, N_SAMPLED_RECORDS)
preds = self.data_model.predicates()
preds = self.data_model.predicates
preds = _filter_canopy_predicates(preds, canopies=True)

self.block_learner = training.DedupeBlockLearner(
preds, sampled_records, index_data
Expand Down Expand Up @@ -293,7 +308,8 @@ def __init__(
index_data = sample_records(data_2, 50000)
sampled_records_2 = sample_records(index_data, N_SAMPLED_RECORDS)

preds = self.data_model.predicates(canopies=False)
preds = self.data_model.predicates
preds = _filter_canopy_predicates(preds, canopies=False)

self.block_learner = training.RecordLinkBlockLearner(
preds, sampled_records_1, sampled_records_2, index_data
Expand Down
2 changes: 1 addition & 1 deletion tests/test_training.py
Expand Up @@ -34,7 +34,7 @@ def setUp(self):

self.block_learner = training.BlockLearner
self.block_learner.blocker = dedupe.blocking.Fingerprinter(
self.data_model.predicates()
self.data_model.predicates
)
self.block_learner.blocker.index_all(
{i: x for i, x in enumerate(self.training_records)}
Expand Down

0 comments on commit 7e26aaf

Please sign in to comment.