improve typing

dedupeio · Jan 17, 2023 · f0503e0 · f0503e0
1 parent 9d527ac
commit f0503e0
Show file tree

Hide file tree

Showing 3 changed files with 31 additions and 11 deletions.
diff --git a/dedupe/_typing.py b/dedupe/_typing.py
@@ -1,3 +1,4 @@
+import os
 import sys
 from typing import (
     TYPE_CHECKING,
@@ -42,10 +43,20 @@
     Tuple[RecordID, ...], Union[numpy.typing.NDArray[numpy.float_], Tuple[float, ...]]
 ]
 Clusters = Iterable[Cluster]
+
+# This is not quite right. We are saying that Data is a mapping that
+# could have mixed string and integer keys, but really Data needs to be either
+# a mapping with only string keys or a mapping with only integer keys.
+#
+# We might be able to express that with a lot of overloads instead, but I'm
+# not sure it's worth it.
 Data = Mapping[RecordID, RecordDict]
+
 RecordDictPair = Tuple[RecordDict, RecordDict]
 RecordDictPairs = List[RecordDictPair]
-Links = Iterable[Union[numpy.ndarray, Tuple[Tuple[RecordID, RecordID], float]]]
+ArrayLinks = Iterable[numpy.ndarray]
+TupleLinks = Iterable[Tuple[Tuple[RecordID, RecordID], float]]
+Links = Union[ArrayLinks, TupleLinks]
 LookupResults = Iterable[Tuple[RecordID, Tuple[Tuple[RecordID, float], ...]]]
 JoinConstraint = Literal["one-to-one", "many-to-one", "many-to-many"]
 Comparator = Callable[[Any, Any], Union[Union[int, float], Sequence[Union[int, float]]]]
@@ -107,3 +118,5 @@ def join(self) -> None:
 
 
 MapLike = Callable[[Callable[[Any], Any], Iterable], Iterable]
+
+PathLike = Union[str, os.PathLike]
diff --git a/dedupe/api.py b/dedupe/api.py
@@ -35,6 +35,7 @@
     import numpy.typing
 
     from dedupe._typing import (
+        ArrayLinks,
         Blocks,
         Classifier,
         Clusters,
@@ -43,6 +44,7 @@
         LabelsLike,
         Links,
         LookupResults,
+        PathLike,
         RecordDict,
     )
     from dedupe._typing import RecordDictPair as TrainingExample
@@ -52,6 +54,7 @@
         RecordPairs,
         Scores,
         TrainingData,
+        TupleLinks,
         VariableDefinition,
     )
 
@@ -512,18 +515,19 @@ def join(
         pairs = self.pairs(data_1, data_2)
         pair_scores = self.score(pairs)
 
+        links: Links
         if constraint == "one-to-one":
             links = self.one_to_one(pair_scores, threshold)
         elif constraint == "many-to-one":
             links = self.many_to_one(pair_scores, threshold)
         else:
             links = pair_scores[pair_scores["score"] > threshold]
 
-        links = list(links)
+        links_evaluated: Links = list(links)  # type: ignore[assignment]
         _cleanup_scores(pair_scores)
-        return links
+        return links_evaluated
 
-    def one_to_one(self, scores: Scores, threshold: float = 0.0) -> Links:
+    def one_to_one(self, scores: Scores, threshold: float = 0.0) -> TupleLinks:
         """From the similarity scores of pairs of records, decide which
         pairs refer to the same entity.
 
@@ -578,7 +582,7 @@ def one_to_one(self, scores: Scores, threshold: float = 0.0) -> Links:
 
         yield from clustering.greedyMatching(scores)
 
-    def many_to_one(self, scores: Scores, threshold: float = 0.0) -> Links:
+    def many_to_one(self, scores: Scores, threshold: float = 0.0) -> ArrayLinks:
         """
         From the similarity scores of pairs of records, decide which
         pairs refer to the same entity.
@@ -638,6 +642,7 @@ def __init__(
 
         super().__init__(num_cores, in_memory, **kwargs)
 
+        self.db: PathLike
         if self.in_memory:
             self.db = ":memory:"
         else:
@@ -834,7 +839,7 @@ def many_to_n(
         score_blocks: Iterable[Scores],
         threshold: float = 0.0,
         n_matches: int = 1,
-    ) -> Links:
+    ) -> ArrayLinks:
         """
         For each group of scored pairs, yield the highest scoring N pairs
 
@@ -920,7 +925,9 @@ def search(
         else:
             return list(results)
 
-    def _format_search_results(self, search_d: Data, results: Links) -> LookupResults:
+    def _format_search_results(
+        self, search_d: Data, results: ArrayLinks
+    ) -> LookupResults:
 
         seen: set[RecordID] = set()
 

diff --git a/dedupe/clustering.py b/dedupe/clustering.py
@@ -13,7 +13,7 @@
 import numpy.typing
 import scipy.cluster.hierarchy
 
-from dedupe._typing import Clusters, Links, RecordID, Scores
+from dedupe._typing import ArrayLinks, Clusters, RecordID, Scores, TupleLinks
 
 logger = logging.getLogger(__name__)
 
@@ -291,7 +291,7 @@ def confidences(
     return scores
 
 
-def greedyMatching(dupes: Scores) -> Links:
+def greedyMatching(dupes: Scores) -> TupleLinks:
     A: set[RecordID] = set()
     B: set[RecordID] = set()
 
@@ -308,7 +308,7 @@ def greedyMatching(dupes: Scores) -> Links:
 
 def gazetteMatching(
     scored_blocks: Iterable[Scores], threshold: float = 0, n_matches: int = 1
-) -> Links:
+) -> ArrayLinks:
 
     for block in scored_blocks:
         block = block[block["score"] > threshold]
@@ -324,7 +324,7 @@ def gazetteMatching(
 
 def pair_gazette_matching(
     scored_pairs: Scores, threshold: float = 0.0, n_matches: int = 1
-) -> Links:
+) -> ArrayLinks:
 
     scored_pairs.sort(order="pairs")