thresholding moved to clustering

dedupeio · Mar 9, 2020 · 39f4379 · 39f4379
1 parent 4850987
commit 39f4379
Show file tree

Hide file tree

Showing 3 changed files with 44 additions and 17 deletions.
diff --git a/dedupe/api.py b/dedupe/api.py
@@ -253,7 +253,7 @@ def pairs(self, data):
 
     def cluster(self,
                 scores: numpy.ndarray,
-                threshold: float) -> Clusters:
+                threshold: float = 0.5) -> Clusters:
         """
         From the similarity scores of pairs of records, decide which groups
         of records are all referring to the same entity.
@@ -284,6 +284,8 @@ def cluster(self,
                        Lowering the number will increase recall,
                        raising it will increase precision
 
+                       Defaults to 0.5.
+
         .. code:: python
 
            > pairs = matcher.pairs(data)
@@ -458,14 +460,12 @@ def join(self,
         pairs = self.pairs(data_1, data_2)
         pair_scores = self.score(pairs)
 
-        pair_scores = pair_scores[pair_scores['score'] > threshold]
-
         if constraint == 'one-to-one':
-            links = self.one_to_one(pair_scores)
+            links = self.one_to_one(pair_scores, threshold)
         elif constraint == 'many-to-one':
-            links = self.many_to_one(pair_scores)
+            links = self.many_to_one(pair_scores, threshold)
         elif constraint == 'many-to-many':
-            links = pair_scores
+            links = pair_scores[pair_scores['score'] > threshold]
 
         links = list(links)
 
@@ -479,9 +479,9 @@ def join(self,
         return links
 
     def one_to_one(self,
-                   scores: numpy.ndarray) -> Links:
-        """
-        From the similarity scores of pairs of records, decide which
+                   scores: numpy.ndarray,
+                   threshold: float = 0.0) -> Links:
+        """From the similarity scores of pairs of records, decide which
         pairs refer to the same entity.
 
         Every record in data_1 can match at most one record from
@@ -508,6 +508,14 @@ def one_to_one(self,
                     should contains the similarity score for that
                     pair of records.
 
+            threshold: Number between 0 and 1 (default is 0.0). We
+                       will consider records as potential
+                       duplicates if the predicted probability of
+                       being a duplicate is above the threshold.
+
+                       Lowering the number will increase recall; raising it
+                       will increase precision.
+
 
         .. code:: python
 
@@ -520,13 +528,16 @@ def one_to_one(self,
             ((10, 11), 0.899)]
 
         """
+        if threshold:
+            scores = scores[scores['score'] > threshold]
 
         logger.debug("matching done, begin clustering")
 
         yield from clustering.greedyMatching(scores)
 
     def many_to_one(self,
-                    scores: numpy.ndarray) -> Links:
+                    scores: numpy.ndarray,
+                    threshold: float = 0.0) -> Links:
         """
         From the similarity scores of pairs of records, decide which
         pairs refer to the same entity.
@@ -553,6 +564,13 @@ def many_to_one(self,
                     should contains the similarity score for that
                     pair of records.
 
+            threshold: Number between 0 and 1 (default is 0.0). We
+                       will consider records as potential
+                       duplicates if the predicted probability of
+                       being a duplicate is above the threshold.
+
+                       Lowering the number will increase recall; raising it
+                       will increase precision.
 
         .. code:: python
 
@@ -569,7 +587,7 @@ def many_to_one(self,
 
         logger.debug("matching done, begin clustering")
 
-        yield from clustering.pair_gazette_matching(scores, 1)
+        yield from clustering.pair_gazette_matching(scores, threshold, 1)
 
 
 class GazetteerMatching(Matching):
@@ -742,7 +760,7 @@ def score(self,
 
     def many_to_n(self,
                   score_blocks: Iterable[numpy.ndarray],
-                  threshold: float = 0.,
+                  threshold: float = 0.0,
                   n_matches: int = 1) -> Links:
         """
         For each group of scored pairs, yield the highest scoring N pairs
@@ -757,6 +775,14 @@ def many_to_n(self,
                           should contains the similarity score for that
                           pair of records.
 
+            threshold: Number between 0 and 1 (default is 0.0). We
+                       will consider records as potential
+                       duplicates if the predicted probability of
+                       being a duplicate is above the threshold.
+
+                       Lowering the number will increase recall; raising it
+                       will increase precision.
+
             n_matches: How many top scoring pairs to select per group
 
         """

diff --git a/dedupe/clustering.py b/dedupe/clustering.py
@@ -245,6 +245,7 @@ def gazetteMatching(scored_blocks: Iterable[numpy.ndarray],
 
 
 def pair_gazette_matching(scored_pairs: numpy.ndarray,
+                          threshold: float = 0.0,
                           n_matches: int = 1) -> Links:
 
     scored_pairs.sort(order='pairs')
@@ -253,6 +254,6 @@ def pair_gazette_matching(scored_pairs: numpy.ndarray,
     change_points = numpy.where(numpy.roll(group_key, 1) != group_key)[0]
     scored_blocks = numpy.split(scored_pairs, change_points)
 
-    for match in gazetteMatching(scored_blocks, 0, n_matches):
+    for match in gazetteMatching(scored_blocks, threshold, n_matches):
         if match:
             yield from match
diff --git a/docs/API-documentation.rst b/docs/API-documentation.rst
@@ -160,7 +160,7 @@ Dedupe and StaticDedupe
 
        Same as :func:`dedupe.Dedupe.score`
 
-    .. method:: cluster(matches, threshold)
+    .. method:: cluster(matches, threshold=0.5)
 
        Same as :func:`dedupe.Dedupe.cluster`
 
@@ -194,11 +194,11 @@ RecordLink and StaticRecordLink
 
 	Same as :func:`dedupe.RecordLink.score`
 
-   .. method:: one_to_one(scores)
+   .. method:: one_to_one(scores, threshold=0.0)
 
         Same as :func:`dedupe.RecordLink.one_to_one`
 
-   .. method:: many_to_one(scores)
+   .. method:: many_to_one(scores, threshold=0.0)
 
 	Same as :func:`dedupe.RecordLink.many_to_one`
 
@@ -231,7 +231,7 @@ Gazetteer and StaticGazetteer
 
 	Same as :func:`dedupe.Gazetteer.score`
 
-    .. method:: many_to_n(score_blocks, n_matches=1)
+    .. method:: many_to_n(score_blocks, threshold=0.0, n_matches=1)
 
 	Same as :func:`dedupe.Gazetteer.many_to_n`