Skip to content

Commit

Permalink
improve typing
Browse files Browse the repository at this point in the history
  • Loading branch information
fgregg committed Jan 17, 2023
1 parent 9d527ac commit f0503e0
Show file tree
Hide file tree
Showing 3 changed files with 31 additions and 11 deletions.
15 changes: 14 additions & 1 deletion dedupe/_typing.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import os
import sys
from typing import (
TYPE_CHECKING,
Expand Down Expand Up @@ -42,10 +43,20 @@
Tuple[RecordID, ...], Union[numpy.typing.NDArray[numpy.float_], Tuple[float, ...]]
]
Clusters = Iterable[Cluster]

# This is not quite right. As written, Data is a mapping whose keys could
# be a mix of strings and integers, but Data really needs to be either a
# mapping with only string keys or a mapping with only integer keys.
#
# We might be able to express that with a lot of overloads instead, but
# I'm not sure it's worth it.
Data = Mapping[RecordID, RecordDict]

RecordDictPair = Tuple[RecordDict, RecordDict]
RecordDictPairs = List[RecordDictPair]
Links = Iterable[Union[numpy.ndarray, Tuple[Tuple[RecordID, RecordID], float]]]
ArrayLinks = Iterable[numpy.ndarray]
TupleLinks = Iterable[Tuple[Tuple[RecordID, RecordID], float]]
Links = Union[ArrayLinks, TupleLinks]
LookupResults = Iterable[Tuple[RecordID, Tuple[Tuple[RecordID, float], ...]]]
JoinConstraint = Literal["one-to-one", "many-to-one", "many-to-many"]
Comparator = Callable[[Any, Any], Union[Union[int, float], Sequence[Union[int, float]]]]
Expand Down Expand Up @@ -107,3 +118,5 @@ def join(self) -> None:


MapLike = Callable[[Callable[[Any], Any], Iterable], Iterable]

PathLike = Union[str, os.PathLike]
19 changes: 13 additions & 6 deletions dedupe/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
import numpy.typing

from dedupe._typing import (
ArrayLinks,
Blocks,
Classifier,
Clusters,
Expand All @@ -43,6 +44,7 @@
LabelsLike,
Links,
LookupResults,
PathLike,
RecordDict,
)
from dedupe._typing import RecordDictPair as TrainingExample
Expand All @@ -52,6 +54,7 @@
RecordPairs,
Scores,
TrainingData,
TupleLinks,
VariableDefinition,
)

Expand Down Expand Up @@ -512,18 +515,19 @@ def join(
pairs = self.pairs(data_1, data_2)
pair_scores = self.score(pairs)

links: Links
if constraint == "one-to-one":
links = self.one_to_one(pair_scores, threshold)
elif constraint == "many-to-one":
links = self.many_to_one(pair_scores, threshold)
else:
links = pair_scores[pair_scores["score"] > threshold]

links = list(links)
links_evaluated: Links = list(links) # type: ignore[assignment]
_cleanup_scores(pair_scores)
return links
return links_evaluated

def one_to_one(self, scores: Scores, threshold: float = 0.0) -> Links:
def one_to_one(self, scores: Scores, threshold: float = 0.0) -> TupleLinks:
"""From the similarity scores of pairs of records, decide which
pairs refer to the same entity.
Expand Down Expand Up @@ -578,7 +582,7 @@ def one_to_one(self, scores: Scores, threshold: float = 0.0) -> Links:

yield from clustering.greedyMatching(scores)

def many_to_one(self, scores: Scores, threshold: float = 0.0) -> Links:
def many_to_one(self, scores: Scores, threshold: float = 0.0) -> ArrayLinks:
"""
From the similarity scores of pairs of records, decide which
pairs refer to the same entity.
Expand Down Expand Up @@ -638,6 +642,7 @@ def __init__(

super().__init__(num_cores, in_memory, **kwargs)

self.db: PathLike
if self.in_memory:
self.db = ":memory:"
else:
Expand Down Expand Up @@ -834,7 +839,7 @@ def many_to_n(
score_blocks: Iterable[Scores],
threshold: float = 0.0,
n_matches: int = 1,
) -> Links:
) -> ArrayLinks:
"""
For each group of scored pairs, yield the highest scoring N pairs
Expand Down Expand Up @@ -920,7 +925,9 @@ def search(
else:
return list(results)

def _format_search_results(self, search_d: Data, results: Links) -> LookupResults:
def _format_search_results(
self, search_d: Data, results: ArrayLinks
) -> LookupResults:

seen: set[RecordID] = set()

Expand Down
8 changes: 4 additions & 4 deletions dedupe/clustering.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
import numpy.typing
import scipy.cluster.hierarchy

from dedupe._typing import Clusters, Links, RecordID, Scores
from dedupe._typing import ArrayLinks, Clusters, RecordID, Scores, TupleLinks

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -291,7 +291,7 @@ def confidences(
return scores


def greedyMatching(dupes: Scores) -> Links:
def greedyMatching(dupes: Scores) -> TupleLinks:
A: set[RecordID] = set()
B: set[RecordID] = set()

Expand All @@ -308,7 +308,7 @@ def greedyMatching(dupes: Scores) -> Links:

def gazetteMatching(
scored_blocks: Iterable[Scores], threshold: float = 0, n_matches: int = 1
) -> Links:
) -> ArrayLinks:

for block in scored_blocks:
block = block[block["score"] > threshold]
Expand All @@ -324,7 +324,7 @@ def gazetteMatching(

def pair_gazette_matching(
scored_pairs: Scores, threshold: float = 0.0, n_matches: int = 1
) -> Links:
) -> ArrayLinks:

scored_pairs.sort(order="pairs")

Expand Down

0 comments on commit f0503e0

Please sign in to comment.