Merge pull request #1079 from NickCrews/extract-predicate-filtering-from-datamodel

Extract predicate filtering from data model
dedupeio · Aug 30, 2022 · c595052
2 parents bf028e9 + 7e26aaf
commit c595052
Show file tree

Hide file tree

Showing 3 changed files with 22 additions and 14 deletions.
diff --git a/dedupe/datamodel.py b/dedupe/datamodel.py
@@ -69,20 +69,12 @@ def _field_comparators(
             yield (var.field, comparator, start, stop)
             start = stop
 
-    def predicates(self, canopies: bool = True) -> set[Predicate]:
+    @property
+    def predicates(self) -> set[Predicate]:
         predicates = set()
         for var in self.primary_variables:
             for predicate in var.predicates:
-                if hasattr(predicate, "index"):
-                    if hasattr(predicate, "canopy"):
-                        if canopies:
-                            predicates.add(predicate)
-                    else:
-                        if not canopies:
-                            predicates.add(predicate)
-                else:
-                    predicates.add(predicate)
-
+                predicates.add(predicate)
         return predicates
 
     def distances(

diff --git a/dedupe/labeler.py b/dedupe/labeler.py
@@ -225,6 +225,20 @@ def _sample_indices(self, sample_size: int) -> Iterable[RecordIDPair]:
         return sample_ids
 
 
+def _filter_canopy_predicates(
+    predicates: Iterable[Predicate], canopies: bool
+) -> set[Predicate]:
+    result = set()
+    for predicate in predicates:
+        if hasattr(predicate, "index"):
+            is_canopy = hasattr(predicate, "canopy")
+            if is_canopy == canopies:
+                result.add(predicate)
+        else:
+            result.add(predicate)
+    return result
+
+
 class DedupeBlockLearner(BlockLearner):
     def __init__(
         self,
@@ -239,7 +253,8 @@ def __init__(
 
         index_data = sample_records(data, 50000)
         sampled_records = sample_records(index_data, N_SAMPLED_RECORDS)
-        preds = self.data_model.predicates()
+        preds = self.data_model.predicates
+        preds = _filter_canopy_predicates(preds, canopies=True)
 
         self.block_learner = training.DedupeBlockLearner(
             preds, sampled_records, index_data
@@ -293,7 +308,8 @@ def __init__(
         index_data = sample_records(data_2, 50000)
         sampled_records_2 = sample_records(index_data, N_SAMPLED_RECORDS)
 
-        preds = self.data_model.predicates(canopies=False)
+        preds = self.data_model.predicates
+        preds = _filter_canopy_predicates(preds, canopies=False)
 
         self.block_learner = training.RecordLinkBlockLearner(
             preds, sampled_records_1, sampled_records_2, index_data

diff --git a/tests/test_training.py b/tests/test_training.py
@@ -34,7 +34,7 @@ def setUp(self):
 
         self.block_learner = training.BlockLearner
         self.block_learner.blocker = dedupe.blocking.Fingerprinter(
-            self.data_model.predicates()
+            self.data_model.predicates
         )
         self.block_learner.blocker.index_all(
             {i: x for i, x in enumerate(self.training_records)}