Skip to content

Commit

Permalink
Merge pull request #1079 from NickCrews/extract-predicate-filtering-from-datamodel
Browse files (browse the repository at this point in the history)

Extract predicate filtering from data model
  • Loading branch information
fgregg committed Aug 30, 2022
2 parents bf028e9 + 7e26aaf commit c595052
Show file tree
Hide file tree
Showing 3 changed files with 22 additions and 14 deletions.
14 changes: 3 additions & 11 deletions dedupe/datamodel.py
Expand Up @@ -69,20 +69,12 @@ def _field_comparators(
yield (var.field, comparator, start, stop)
start = stop

def predicates(self, canopies: bool = True) -> set[Predicate]:
@property
def predicates(self) -> set[Predicate]:
predicates = set()
for var in self.primary_variables:
for predicate in var.predicates:
if hasattr(predicate, "index"):
if hasattr(predicate, "canopy"):
if canopies:
predicates.add(predicate)
else:
if not canopies:
predicates.add(predicate)
else:
predicates.add(predicate)

predicates.add(predicate)
return predicates

def distances(
Expand Down
20 changes: 18 additions & 2 deletions dedupe/labeler.py
Expand Up @@ -225,6 +225,20 @@ def _sample_indices(self, sample_size: int) -> Iterable[RecordIDPair]:
return sample_ids


def _filter_canopy_predicates(
predicates: Iterable[Predicate], canopies: bool
) -> set[Predicate]:
result = set()
for predicate in predicates:
if hasattr(predicate, "index"):
is_canopy = hasattr(predicate, "canopy")
if is_canopy == canopies:
result.add(predicate)
else:
result.add(predicate)
return result


class DedupeBlockLearner(BlockLearner):
def __init__(
self,
Expand All @@ -239,7 +253,8 @@ def __init__(

index_data = sample_records(data, 50000)
sampled_records = sample_records(index_data, N_SAMPLED_RECORDS)
preds = self.data_model.predicates()
preds = self.data_model.predicates
preds = _filter_canopy_predicates(preds, canopies=True)

self.block_learner = training.DedupeBlockLearner(
preds, sampled_records, index_data
Expand Down Expand Up @@ -293,7 +308,8 @@ def __init__(
index_data = sample_records(data_2, 50000)
sampled_records_2 = sample_records(index_data, N_SAMPLED_RECORDS)

preds = self.data_model.predicates(canopies=False)
preds = self.data_model.predicates
preds = _filter_canopy_predicates(preds, canopies=False)

self.block_learner = training.RecordLinkBlockLearner(
preds, sampled_records_1, sampled_records_2, index_data
Expand Down
2 changes: 1 addition & 1 deletion tests/test_training.py
Expand Up @@ -34,7 +34,7 @@ def setUp(self):

self.block_learner = training.BlockLearner
self.block_learner.blocker = dedupe.blocking.Fingerprinter(
self.data_model.predicates()
self.data_model.predicates
)
self.block_learner.blocker.index_all(
{i: x for i, x in enumerate(self.training_records)}
Expand Down

0 comments on commit c595052

Please sign in to comment.