
Merge pull request #1138 from dedupeio/pure_id_types
purify id types
fgregg committed Jan 17, 2023
2 parents f0503e0 + b9b1768 commit 2bccbb0
Showing 7 changed files with 256 additions and 57 deletions.
52 changes: 33 additions & 19 deletions dedupe/_typing.py
@@ -33,38 +33,52 @@
RecordDict = Mapping[str, Any]
RecordID = Union[int, str]
RecordIDDType = Union[Type[int], Tuple[Type[str], Literal[256]]]
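# RecordIDDType mirrors numpy's field-dtype syntax: an id column is either
# plain int or a fixed-width string; (str, 256) is numpy's spec for a
# 256-character unicode column, i.e. dtype("<U256").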
-RecordIDPair = Tuple[RecordID, RecordID]
-Record = Tuple[RecordID, RecordDict]
-RecordPair = Tuple[Record, Record]
-RecordPairs = Iterator[RecordPair]
-Block = List[RecordPair]
-Blocks = Iterator[Block]
-Cluster = Tuple[
-Tuple[RecordID, ...], Union[numpy.typing.NDArray[numpy.float_], Tuple[float, ...]]
RecordIDPair = Union[Tuple[int, int], Tuple[str, str]]
RecordInt = Tuple[int, RecordDict]
RecordStr = Tuple[str, RecordDict]
Record = Union[RecordInt, RecordStr]
RecordPairInt = Tuple[RecordInt, RecordInt]
RecordPairStr = Tuple[RecordStr, RecordStr]
RecordPairs = Union[Iterator[RecordPairInt], Iterator[RecordPairStr]]
BlockInt = List[RecordPairInt]
BlockStr = List[RecordPairStr]
Block = Union[RecordPairInt, RecordPairStr]
BlocksInt = Iterator[BlockInt]
BlocksStr = Iterator[BlockStr]
Blocks = Union[BlocksInt, BlocksStr]
ClusterInt = Tuple[
Tuple[int, ...], Union[numpy.typing.NDArray[numpy.float_], Tuple[float, ...]]
]
-Clusters = Iterable[Cluster]
ClusterStr = Tuple[
Tuple[str, ...], Union[numpy.typing.NDArray[numpy.float_], Tuple[float, ...]]
]
ClustersInt = Iterable[ClusterInt]
ClustersStr = Iterable[ClusterStr]
Clusters = Union[ClustersInt, ClustersStr]

-# this is not quite right. we are saying that Data is a dictionary that
-# could have mixed string or integer keys, but really Data needs to be either
-# a dictionary with only string keys, or a dictionary with only integer keys.
-#
-# we might be able to express that instead with a lot of overloads, but i'm
-# not sure it's worth it.
-Data = Mapping[RecordID, RecordDict]
DataInt = Mapping[int, RecordDict]
DataStr = Mapping[str, RecordDict]
Data = Union[DataInt, DataStr]
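This union is the pattern the commit leans on throughout: paired Int/Str aliases plus typing.overload let a caller that passes an all-int mapping get int ids back, and an all-str mapping get str ids, instead of Union[int, str] everywhere. A minimal sketch of the idea (first_id is a hypothetical helper, not part of this commit):

from typing import overload

@overload
def first_id(data: DataInt) -> int: ...
@overload
def first_id(data: DataStr) -> str: ...
def first_id(data):
    return next(iter(data))

reveal_type(first_id({1: {"name": "a"}}))    # mypy: builtins.int
reveal_type(first_id({"x": {"name": "a"}}))  # mypy: builtins.str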

RecordDictPair = Tuple[RecordDict, RecordDict]
RecordDictPairs = List[RecordDictPair]
ArrayLinks = Iterable[numpy.ndarray]
-TupleLinks = Iterable[Tuple[Tuple[RecordID, RecordID], float]]
TupleLinksInt = Iterable[Tuple[Tuple[int, int], float]]
TupleLinksStr = Iterable[Tuple[Tuple[str, str], float]]
TupleLinks = Union[TupleLinksInt, TupleLinksStr]
Links = Union[ArrayLinks, TupleLinks]
-LookupResults = Iterable[Tuple[RecordID, Tuple[Tuple[RecordID, float], ...]]]
LookupResultsInt = Iterable[Tuple[int, Tuple[Tuple[int, float], ...]]]
LookupResultsStr = Iterable[Tuple[str, Tuple[Tuple[str, float], ...]]]
LookupResults = Union[LookupResultsInt, LookupResultsStr]
JoinConstraint = Literal["one-to-one", "many-to-one", "many-to-many"]
Comparator = Callable[[Any, Any], Union[Union[int, float], Sequence[Union[int, float]]]]
Scores = Union[numpy.memmap, numpy.ndarray]
Labels = List[Literal[0, 1]]
LabelsLike = Iterable[Literal[0, 1]]
Cover = Dict["Predicate", FrozenSet[int]]
-ComparisonCover = Dict["Predicate", FrozenSet[Tuple[RecordID, RecordID]]]
ComparisonCoverInt = Dict["Predicate", FrozenSet[Tuple[int, int]]]
ComparisonCoverStr = Dict["Predicate", FrozenSet[Tuple[str, str]]]
ComparisonCover = Union[ComparisonCoverInt, ComparisonCoverStr]
PredicateFunction = Callable[[Any], Iterable[str]]

VariableDefinition = TypedDict(
108 changes: 92 additions & 16 deletions dedupe/api.py
@@ -14,7 +14,7 @@
import sqlite3
import tempfile
import warnings
-from typing import TYPE_CHECKING, cast
from typing import TYPE_CHECKING, cast, overload

import numpy
import sklearn.linear_model
@@ -30,20 +30,36 @@
from dedupe._typing import Literal

if TYPE_CHECKING:
-from typing import BinaryIO, Collection, Generator, Iterable, MutableMapping, TextIO
from typing import (
BinaryIO,
Collection,
Generator,
Iterable,
MutableMapping,
TextIO,
Union,
)

import numpy.typing

from dedupe._typing import (
ArrayLinks,
Blocks,
BlocksInt,
BlocksStr,
Classifier,
Clusters,
ClustersInt,
ClustersStr,
Data,
DataInt,
DataStr,
JoinConstraint,
LabelsLike,
Links,
LookupResults,
LookupResultsInt,
LookupResultsStr,
PathLike,
RecordDict,
)
@@ -133,9 +149,19 @@ class DedupeMatching(IntegralMatching):
"""

@overload
def partition(
self, data: DataInt, threshold: float = 0.5
) -> ClustersInt: # pragma: no cover
...

@overload
def partition(
-self, data: Data, threshold: float = 0.5
-) -> Clusters: # pragma: no cover
self, data: DataStr, threshold: float = 0.5
) -> ClustersStr: # pragma: no cover
...

def partition(self, data, threshold=0.5): # pragma: no cover
"""
Identifies records that all refer to the same entity, returns
tuples containing a sequence of record ids and corresponding
@@ -177,9 +203,9 @@ def partition(
pair_scores = self.score(pairs)
clusters = self.cluster(pair_scores, threshold)
clusters = self._add_singletons(data.keys(), clusters)
-clusters = list(clusters)
clusters_eval = list(clusters)
_cleanup_scores(pair_scores)
-return clusters
return clusters_eval
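With the overloads above, the id type of the returned clusters follows the key type of the input mapping. A hedged usage sketch (records invented; the matcher is assumed to be already trained):

matcher: dedupe.Dedupe = ...  # trained elsewhere
data: DataInt = {1: {"name": "jane"}, 2: {"name": "jan"}}
for record_ids, scores in matcher.partition(data, threshold=0.5):
    print(record_ids)  # checked as Tuple[int, ...], not Tuple[Union[int, str], ...]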

@staticmethod
def _add_singletons(all_ids: Iterable[RecordID], clusters: Clusters) -> Clusters:
@@ -649,7 +675,10 @@ def __init__(
self.temp_dir = tempfile.TemporaryDirectory()
self.db = self.temp_dir.name + "/blocks.db"

-self.indexed_data: MutableMapping[RecordID, RecordDict] = {}
self.indexed_data: Union[
MutableMapping[int, RecordDict], MutableMapping[str, RecordDict]
]
self.indexed_data = {} # type: ignore[assignment]
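# The attribute is declared as a Union of two homogeneous mappings; the
# bare {} literal presumably satisfies neither arm on its own for mypy,
# hence the narrowly scoped ignore on the assignment, while later reads
# and .update() calls are still checked against the declared Union.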

def _close(self) -> None:
if not self.in_memory:
@@ -658,7 +687,15 @@ def _close(self) -> None:
def __del__(self) -> None:
self._close()

-def index(self, data: Data) -> None: # pragma: no cover
@overload
def index(self, data: DataInt) -> None:
...

@overload
def index(self, data: DataStr) -> None:
...

def index(self, data): # pragma: no cover
"""
Add records to the index of records to match against. If a record in
`canonical_data` has the same key as a previously indexed record, the
@@ -705,7 +742,15 @@ def index(self, data: Data) -> None: # pragma: no cover

self.indexed_data.update(data)

-def unindex(self, data: Data) -> None: # pragma: no cover
@overload
def unindex(self, data: DataInt) -> None: # pragma: no cover
...

@overload
def unindex(self, data: DataStr) -> None: # pragma: no cover
...

def unindex(self, data): # pragma: no cover
"""
Remove records from the index of records to match against.
@@ -734,7 +779,15 @@ def unindex(self, data: Data) -> None: # pragma: no cover
for k in data:
del self.indexed_data[k]

-def blocks(self, data: Data) -> Blocks:
@overload
def blocks(self, data: DataInt) -> BlocksInt:
...

@overload
def blocks(self, data: DataStr) -> BlocksStr:
...

def blocks(self, data):
"""
Yield groups of pairs of records that share fingerprints.
@@ -800,9 +853,12 @@ def blocks(self, data: Data) -> Blocks:
ORDER BY a.record_id"""
)

-pair_blocks: Iterable[
-tuple[RecordID, Iterable[tuple[RecordID, RecordID]]]
-] = itertools.groupby(pairs, lambda x: x[0])
pair_blocks: Union[
Iterable[tuple[int, Iterable[tuple[int, int]]]],
Iterable[tuple[str, Iterable[tuple[str, str]]]],
]

pair_blocks = itertools.groupby(pairs, lambda x: x[0])
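# itertools.groupby only merges adjacent equal keys, so the ORDER BY
# a.record_id in the query above is what guarantees exactly one group
# per record id here.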

for _, pair_block in pair_blocks:

@@ -866,13 +922,33 @@ def many_to_n(

yield from clustering.gazetteMatching(score_blocks, threshold, n_matches)

@overload
def search(
self,
-data: Data,
data: DataInt,
threshold: float = 0.0,
n_matches: int = 1,
generator: bool = False,
) -> LookupResultsInt: # pragma: no cover
...

@overload
def search(
self,
data: DataStr,
threshold: float = 0.0,
n_matches: int = 1,
generator: bool = False,
-) -> LookupResults: # pragma: no cover
) -> LookupResultsStr: # pragma: no cover
...

def search(
self,
data,
threshold=0.0,
n_matches=1,
generator=False,
): # pragma: no cover
"""
Identifies pairs of records that could refer to the same entity,
returns tuples containing tuples of possible matches, with a
@@ -936,7 +1012,7 @@ def _format_search_results(
b: RecordID
score: float
prepared_result: list[tuple[RecordID, float]] = []
-for (a, b), score in result: # type: ignore
for (a, b), score in result:
prepared_result.append((b, score))

assert a is not None
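Taken together, the index/blocks/search overloads carry one id type through the whole gazetteer pipeline. A hedged sketch of the str-keyed path (names and fields invented; the gazetteer is assumed to be already trained):

gazetteer: dedupe.Gazetteer = ...  # trained elsewhere
canonical: DataStr = {"c1": {"name": "Acme Corporation"}}
messy: DataStr = {"m1": {"name": "ACME corp"}}
gazetteer.index(canonical)
for record_id, matches in gazetteer.search(messy, n_matches=1):
    for match_id, score in matches:
        ...  # record_id and match_id are both str for the type checker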
2 changes: 1 addition & 1 deletion dedupe/canonical.py
@@ -28,7 +28,7 @@ def getCentroid(attribute_variants: Sequence[str], comparator: Comparator) -> str:

# there can be ties for minimum, average distance string
min_dist_indices: numpy.typing.NDArray[numpy.int_]
-min_dist_indices = numpy.where(average_distance == average_distance.min())[0] # type: ignore
min_dist_indices = numpy.where(average_distance == average_distance.min())[0]
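# numpy.where returns every index attaining the minimum, e.g. for
# average_distance == [0.3, 0.1, 0.1] it gives [1, 2]; that is what makes
# the tie-break branch below necessary.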

if len(min_dist_indices) > 1:
centroid = breakCentroidTie(attribute_variants, min_dist_indices)
37 changes: 33 additions & 4 deletions dedupe/convenience.py
@@ -7,13 +7,14 @@
import random
import sys
import warnings
-from typing import Iterator, Tuple
from typing import Iterator, Tuple, overload

import numpy

import dedupe
from dedupe._typing import (
-Data,
DataInt,
DataStr,
Literal,
RecordDict,
RecordDictPair,
@@ -203,8 +204,22 @@ def console_label(deduper: dedupe.api.ActiveMatching) -> None: # pragma: no cover
_mark_pair(deduper, labeled_pair)


@overload
def training_data_link(
-data_1: Data, data_2: Data, common_key: str, training_size: int = 50000
data_1: DataInt, data_2: DataInt, common_key: str, training_size: int = 50000
) -> TrainingData: # pragma: nocover
...


@overload
def training_data_link(
data_1: DataStr, data_2: DataStr, common_key: str, training_size: int = 50000
) -> TrainingData: # pragma: nocover
...


def training_data_link(
data_1, data_2, common_key, training_size=50000
) -> TrainingData: # pragma: nocover
"""
Construct training data for consumption by the func:`mark_pairs`
@@ -265,8 +280,22 @@ def training_data_link(
return training_pairs
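A hedged usage sketch of the overloads above (records and the unique_id field are invented; both mappings must share one key type):

data_1: DataInt = {0: {"name": "jane doe", "unique_id": "u1"}}
data_2: DataInt = {0: {"name": "Jane Doe", "unique_id": "u1"}}
pairs = training_data_link(data_1, data_2, common_key="unique_id")
# pairs["match"] holds record-dict pairs that agree on unique_id;
# pairs["distinct"] holds a sample of pairs that do not.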


@overload
def training_data_dedupe(
data: DataInt, common_key: str, training_size: int = 50000
) -> TrainingData: # pragma: nocover
...


@overload
def training_data_dedupe(
data: DataStr, common_key: str, training_size: int = 50000
) -> TrainingData: # pragma: nocover
...


def training_data_dedupe(
-data: Data, common_key: str, training_size: int = 50000
data, common_key, training_size=50000
) -> TrainingData: # pragma: nocover
"""
Construct training data for consumption by the func:`mark_pairs`
4 changes: 2 additions & 2 deletions dedupe/datamodel.py
@@ -240,12 +240,12 @@ def interaction_indices(variables: list[Variable]) -> list[list[int]]:
indices = []
for var in variables:
if hasattr(var, "interaction_fields"):
-interaction_indices = [var_names.index(f) for f in var.interaction_fields] # type: ignore
interaction_indices = [var_names.index(f) for f in var.interaction_fields]
indices.append(interaction_indices)
return indices
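# e.g. with var_names == ["name", "address"] and a single variable whose
# interaction_fields == ["name", "address"], this returns [[0, 1]].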


-def reduce_method(m): # type: ignore[no-untyped-def]
def reduce_method(m):
return (getattr, (m.__self__, m.__func__.__name__))
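# A bound method normally cannot be pickled; this reducer tells pickle to
# rebuild it as getattr(instance, method_name) on load. Presumably it is
# registered along these lines (hedged sketch, not shown in this diff):
#
#     import copyreg, types
#     copyreg.pickle(types.MethodType, reduce_method)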


