Skip to content

Commit

Permalink
thresholding moved to clustering
Browse files Browse the repository at this point in the history
  • Loading branch information
fgregg committed Mar 9, 2020
1 parent 4850987 commit 39f4379
Show file tree
Hide file tree
Showing 3 changed files with 44 additions and 17 deletions.
50 changes: 38 additions & 12 deletions dedupe/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -253,7 +253,7 @@ def pairs(self, data):

def cluster(self,
scores: numpy.ndarray,
threshold: float) -> Clusters:
threshold: float = 0.5) -> Clusters:
"""
From the similarity scores of pairs of records, decide which groups
of records are all referring to the same entity.
Expand Down Expand Up @@ -284,6 +284,8 @@ def cluster(self,
Lowering the number will increase recall,
raising it will increase precision
Defaults to 0.5.
.. code:: python
> pairs = matcher.pairs(data)
Expand Down Expand Up @@ -458,14 +460,12 @@ def join(self,
pairs = self.pairs(data_1, data_2)
pair_scores = self.score(pairs)

pair_scores = pair_scores[pair_scores['score'] > threshold]

if constraint == 'one-to-one':
links = self.one_to_one(pair_scores)
links = self.one_to_one(pair_scores, threshold)
elif constraint == 'many-to-one':
links = self.many_to_one(pair_scores)
links = self.many_to_one(pair_scores, threshold)
elif constraint == 'many-to-many':
links = pair_scores
links = pair_scores[pair_scores['score'] > threshold]

links = list(links)

Expand All @@ -479,9 +479,9 @@ def join(self,
return links

def one_to_one(self,
scores: numpy.ndarray) -> Links:
"""
From the similarity scores of pairs of records, decide which
scores: numpy.ndarray,
threshold: float = 0.0) -> Links:
"""From the similarity scores of pairs of records, decide which
pairs refer to the same entity.
Every record in data_1 can match at most one record from
Expand All @@ -508,6 +508,14 @@ def one_to_one(self,
should contain the similarity score for that
pair of records.
threshold: Number between 0 and 1 (default is 0.0). We
will consider records as potential
duplicates if the predicted probability of
being a duplicate is above the threshold.
Lowering the number will increase recall; raising it
will increase precision.
.. code:: python
Expand All @@ -520,13 +528,16 @@ def one_to_one(self,
((10, 11), 0.899)]
"""
if threshold:
scores = scores[scores['score'] > threshold]

logger.debug("matching done, begin clustering")

yield from clustering.greedyMatching(scores)

def many_to_one(self,
scores: numpy.ndarray) -> Links:
scores: numpy.ndarray,
threshold: float = 0.0) -> Links:
"""
From the similarity scores of pairs of records, decide which
pairs refer to the same entity.
Expand All @@ -553,6 +564,13 @@ def many_to_one(self,
should contain the similarity score for that
pair of records.
threshold: Number between 0 and 1 (default is 0.0). We
will consider records as potential
duplicates if the predicted probability of
being a duplicate is above the threshold.
Lowering the number will increase recall, raising it
will increase precision
.. code:: python
Expand All @@ -569,7 +587,7 @@ def many_to_one(self,

logger.debug("matching done, begin clustering")

yield from clustering.pair_gazette_matching(scores, 1)
yield from clustering.pair_gazette_matching(scores, threshold, 1)


class GazetteerMatching(Matching):
Expand Down Expand Up @@ -742,7 +760,7 @@ def score(self,

def many_to_n(self,
score_blocks: Iterable[numpy.ndarray],
threshold: float = 0.,
threshold: float = 0.0,
n_matches: int = 1) -> Links:
"""
For each group of scored pairs, yield the highest scoring N pairs
Expand All @@ -757,6 +775,14 @@ def many_to_n(self,
should contain the similarity score for that
pair of records.
threshold: Number between 0 and 1 (default is 0.0). We
will consider records as potential
duplicates if the predicted probability of
being a duplicate is above the threshold.
Lowering the number will increase recall, raising it
will increase precision
n_matches: How many top scoring pairs to select per group
"""
Expand Down
3 changes: 2 additions & 1 deletion dedupe/clustering.py
Original file line number Diff line number Diff line change
Expand Up @@ -245,6 +245,7 @@ def gazetteMatching(scored_blocks: Iterable[numpy.ndarray],


def pair_gazette_matching(scored_pairs: numpy.ndarray,
threshold: float = 0.0,
n_matches: int = 1) -> Links:

scored_pairs.sort(order='pairs')
Expand All @@ -253,6 +254,6 @@ def pair_gazette_matching(scored_pairs: numpy.ndarray,
change_points = numpy.where(numpy.roll(group_key, 1) != group_key)[0]
scored_blocks = numpy.split(scored_pairs, change_points)

for match in gazetteMatching(scored_blocks, 0, n_matches):
for match in gazetteMatching(scored_blocks, threshold, n_matches):
if match:
yield from match
8 changes: 4 additions & 4 deletions docs/API-documentation.rst
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,7 @@ Dedupe and StaticDedupe

Same as :func:`dedupe.Dedupe.score`

.. method:: cluster(matches, threshold)
.. method:: cluster(matches, threshold=0.5)

Same as :func:`dedupe.Dedupe.cluster`

Expand Down Expand Up @@ -194,11 +194,11 @@ RecordLink and StaticRecordLink

Same as :func:`dedupe.RecordLink.score`

.. method:: one_to_one(scores)
.. method:: one_to_one(scores, threshold=0.0)

Same as :func:`dedupe.RecordLink.one_to_one`

.. method:: many_to_one(scores)
.. method:: many_to_one(scores, threshold=0.0)

Same as :func:`dedupe.RecordLink.many_to_one`

Expand Down Expand Up @@ -231,7 +231,7 @@ Gazetteer and StaticGazetteer

Same as :func:`dedupe.Gazetteer.score`

.. method:: many_to_n(score_blocks, n_matches=1)
.. method:: many_to_n(score_blocks, threshold=0.0, n_matches=1)

Same as :func:`dedupe.Gazetteer.many_to_n`

Expand Down

0 comments on commit 39f4379

Please sign in to comment.