
Merge pull request #1138 from dedupeio/pure_id_types
purify id types
fgregg committed Jan 17, 2023
2 parents f0503e0 + b9b1768 commit 2bccbb0
Showing 7 changed files with 256 additions and 57 deletions.
52 changes: 33 additions & 19 deletions dedupe/_typing.py
@@ -33,38 +33,52 @@
RecordDict = Mapping[str, Any]
RecordID = Union[int, str]
RecordIDDType = Union[Type[int], Tuple[Type[str], Literal[256]]]
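# RecordIDDType mirrors numpy's field-dtype syntax: an id column is either
# plain int or a fixed-width string; (str, 256) is numpy's spec for a
# 256-character unicode column, i.e. dtype("<U256").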
-RecordIDPair = Tuple[RecordID, RecordID]
-Record = Tuple[RecordID, RecordDict]
-RecordPair = Tuple[Record, Record]
-RecordPairs = Iterator[RecordPair]
-Block = List[RecordPair]
-Blocks = Iterator[Block]
-Cluster = Tuple[
-Tuple[RecordID, ...], Union[numpy.typing.NDArray[numpy.float_], Tuple[float, ...]]
RecordIDPair = Union[Tuple[int, int], Tuple[str, str]]
RecordInt = Tuple[int, RecordDict]
RecordStr = Tuple[str, RecordDict]
Record = Union[RecordInt, RecordStr]
RecordPairInt = Tuple[RecordInt, RecordInt]
RecordPairStr = Tuple[RecordStr, RecordStr]
RecordPairs = Union[Iterator[RecordPairInt], Iterator[RecordPairStr]]
BlockInt = List[RecordPairInt]
BlockStr = List[RecordPairStr]
Block = Union[RecordPairInt, RecordPairStr]
BlocksInt = Iterator[BlockInt]
BlocksStr = Iterator[BlockStr]
Blocks = Union[BlocksInt, BlocksStr]
ClusterInt = Tuple[
Tuple[int, ...], Union[numpy.typing.NDArray[numpy.float_], Tuple[float, ...]]
]
-Clusters = Iterable[Cluster]
ClusterStr = Tuple[
Tuple[str, ...], Union[numpy.typing.NDArray[numpy.float_], Tuple[float, ...]]
]
ClustersInt = Iterable[ClusterInt]
ClustersStr = Iterable[ClusterStr]
Clusters = Union[ClustersInt, ClustersStr]

-# this is not quite right. we are saying that Data is a dictionary that
-# could have mixed string or integer keys, but really Data needs to be either
-# a dictionary with only string keys, or a dictionary with only integer keys.
-#
-# we might be able to express that instead with a lot of overloads, but i'm
-# not sure it's worth it.
-Data = Mapping[RecordID, RecordDict]
DataInt = Mapping[int, RecordDict]
DataStr = Mapping[str, RecordDict]
Data = Union[DataInt, DataStr]
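This union is the pattern the commit leans on throughout: paired Int/Str aliases plus typing.overload let a caller that passes an all-int mapping get int ids back, and an all-str mapping get str ids, instead of Union[int, str] everywhere. A minimal sketch of the idea (first_id is a hypothetical helper, not part of this commit):

from typing import overload

@overload
def first_id(data: DataInt) -> int: ...
@overload
def first_id(data: DataStr) -> str: ...
def first_id(data):
    return next(iter(data))

reveal_type(first_id({1: {"name": "a"}}))    # mypy: builtins.int
reveal_type(first_id({"x": {"name": "a"}}))  # mypy: builtins.str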

RecordDictPair = Tuple[RecordDict, RecordDict]
RecordDictPairs = List[RecordDictPair]
ArrayLinks = Iterable[numpy.ndarray]
-TupleLinks = Iterable[Tuple[Tuple[RecordID, RecordID], float]]
TupleLinksInt = Iterable[Tuple[Tuple[int, int], float]]
TupleLinksStr = Iterable[Tuple[Tuple[str, str], float]]
TupleLinks = Union[TupleLinksInt, TupleLinksStr]
Links = Union[ArrayLinks, TupleLinks]
-LookupResults = Iterable[Tuple[RecordID, Tuple[Tuple[RecordID, float], ...]]]
LookupResultsInt = Iterable[Tuple[int, Tuple[Tuple[int, float], ...]]]
LookupResultsStr = Iterable[Tuple[str, Tuple[Tuple[str, float], ...]]]
LookupResults = Union[LookupResultsInt, LookupResultsStr]
JoinConstraint = Literal["one-to-one", "many-to-one", "many-to-many"]
Comparator = Callable[[Any, Any], Union[Union[int, float], Sequence[Union[int, float]]]]
Scores = Union[numpy.memmap, numpy.ndarray]
Labels = List[Literal[0, 1]]
LabelsLike = Iterable[Literal[0, 1]]
Cover = Dict["Predicate", FrozenSet[int]]
-ComparisonCover = Dict["Predicate", FrozenSet[Tuple[RecordID, RecordID]]]
ComparisonCoverInt = Dict["Predicate", FrozenSet[Tuple[int, int]]]
ComparisonCoverStr = Dict["Predicate", FrozenSet[Tuple[str, str]]]
ComparisonCover = Union[ComparisonCoverInt, ComparisonCoverStr]
PredicateFunction = Callable[[Any], Iterable[str]]

VariableDefinition = TypedDict(
108 changes: 92 additions & 16 deletions dedupe/api.py
@@ -14,7 +14,7 @@
import sqlite3
import tempfile
import warnings
-from typing import TYPE_CHECKING, cast
from typing import TYPE_CHECKING, cast, overload

import numpy
import sklearn.linear_model
@@ -30,20 +30,36 @@
from dedupe._typing import Literal

if TYPE_CHECKING:
-from typing import BinaryIO, Collection, Generator, Iterable, MutableMapping, TextIO
from typing import (
BinaryIO,
Collection,
Generator,
Iterable,
MutableMapping,
TextIO,
Union,
)

import numpy.typing

from dedupe._typing import (
ArrayLinks,
Blocks,
BlocksInt,
BlocksStr,
Classifier,
Clusters,
ClustersInt,
ClustersStr,
Data,
DataInt,
DataStr,
JoinConstraint,
LabelsLike,
Links,
LookupResults,
LookupResultsInt,
LookupResultsStr,
PathLike,
RecordDict,
)
@@ -133,9 +149,19 @@ class DedupeMatching(IntegralMatching):
"""

@overload
def partition(
self, data: DataInt, threshold: float = 0.5
) -> ClustersInt: # pragma: no cover
...

@overload
def partition(
-self, data: Data, threshold: float = 0.5
-) -> Clusters: # pragma: no cover
self, data: DataStr, threshold: float = 0.5
) -> ClustersStr: # pragma: no cover
...

def partition(self, data, threshold=0.5): # pragma: no cover
"""
Identifies records that all refer to the same entity, returns
tuples containing a sequence of record ids and corresponding
@@ -177,9 +203,9 @@ def partition(
pair_scores = self.score(pairs)
clusters = self.cluster(pair_scores, threshold)
clusters = self._add_singletons(data.keys(), clusters)
-clusters = list(clusters)
clusters_eval = list(clusters)
_cleanup_scores(pair_scores)
-return clusters
return clusters_eval
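With the overloads above, the id type of the returned clusters follows the key type of the input mapping. A hedged usage sketch (records invented; the matcher is assumed to be already trained):

matcher: dedupe.Dedupe = ...  # trained elsewhere
data: DataInt = {1: {"name": "jane"}, 2: {"name": "jan"}}
for record_ids, scores in matcher.partition(data, threshold=0.5):
    print(record_ids)  # checked as Tuple[int, ...], not Tuple[Union[int, str], ...]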

@staticmethod
def _add_singletons(all_ids: Iterable[RecordID], clusters: Clusters) -> Clusters:
@@ -649,7 +675,10 @@ def __init__(
self.temp_dir = tempfile.TemporaryDirectory()
self.db = self.temp_dir.name + "/blocks.db"

-self.indexed_data: MutableMapping[RecordID, RecordDict] = {}
self.indexed_data: Union[
MutableMapping[int, RecordDict], MutableMapping[str, RecordDict]
]
self.indexed_data = {} # type: ignore[assignment]
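# The attribute is declared as a Union of two homogeneous mappings; the
# bare {} literal presumably satisfies neither arm on its own for mypy,
# hence the narrowly scoped ignore on the assignment, while later reads
# and .update() calls are still checked against the declared Union.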

def _close(self) -> None:
if not self.in_memory:
@@ -658,7 +687,15 @@ def _close(self) -> None:
def __del__(self) -> None:
self._close()

-def index(self, data: Data) -> None: # pragma: no cover
@overload
def index(self, data: DataInt) -> None:
...

@overload
def index(self, data: DataStr) -> None:
...

def index(self, data): # pragma: no cover
"""
Add records to the index of records to match against. If a record in
`canonical_data` has the same key as a previously indexed record, the
@@ -705,7 +742,15 @@ def index(self, data: Data) -> None: # pragma: no cover

self.indexed_data.update(data)

-def unindex(self, data: Data) -> None: # pragma: no cover
@overload
def unindex(self, data: DataInt) -> None: # pragma: no cover
...

@overload
def unindex(self, data: DataStr) -> None: # pragma: no cover
...

def unindex(self, data): # pragma: no cover
"""
Remove records from the index of records to match against.
@@ -734,7 +779,15 @@ def unindex(self, data: Data) -> None: # pragma: no cover
for k in data:
del self.indexed_data[k]

-def blocks(self, data: Data) -> Blocks:
@overload
def blocks(self, data: DataInt) -> BlocksInt:
...

@overload
def blocks(self, data: DataStr) -> BlocksStr:
...

def blocks(self, data):
"""
Yield groups of pairs of records that share fingerprints.
@@ -800,9 +853,12 @@ def blocks(self, data: Data) -> Blocks:
ORDER BY a.record_id"""
)

-pair_blocks: Iterable[
-tuple[RecordID, Iterable[tuple[RecordID, RecordID]]]
-] = itertools.groupby(pairs, lambda x: x[0])
pair_blocks: Union[
Iterable[tuple[int, Iterable[tuple[int, int]]]],
Iterable[tuple[str, Iterable[tuple[str, str]]]],
]

pair_blocks = itertools.groupby(pairs, lambda x: x[0])
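# itertools.groupby only merges adjacent equal keys, so the ORDER BY
# a.record_id in the query above is what guarantees exactly one group
# per record id here.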

for _, pair_block in pair_blocks:

@@ -866,13 +922,33 @@ def many_to_n(

yield from clustering.gazetteMatching(score_blocks, threshold, n_matches)

@overload
def search(
self,
-data: Data,
data: DataInt,
threshold: float = 0.0,
n_matches: int = 1,
generator: bool = False,
) -> LookupResultsInt: # pragma: no cover
...

@overload
def search(
self,
data: DataStr,
threshold: float = 0.0,
n_matches: int = 1,
generator: bool = False,
-) -> LookupResults: # pragma: no cover
) -> LookupResultsStr: # pragma: no cover
...

def search(
self,
data,
threshold=0.0,
n_matches=1,
generator=False,
): # pragma: no cover
"""
Identifies pairs of records that could refer to the same entity,
returns tuples containing tuples of possible matches, with a
@@ -936,7 +1012,7 @@ def _format_search_results(
b: RecordID
score: float
prepared_result: list[tuple[RecordID, float]] = []
-for (a, b), score in result: # type: ignore
for (a, b), score in result:
prepared_result.append((b, score))

assert a is not None
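Taken together, the index/blocks/search overloads carry one id type through the whole gazetteer pipeline. A hedged sketch of the str-keyed path (names and fields invented; the gazetteer is assumed to be already trained):

gazetteer: dedupe.Gazetteer = ...  # trained elsewhere
canonical: DataStr = {"c1": {"name": "Acme Corporation"}}
messy: DataStr = {"m1": {"name": "ACME corp"}}
gazetteer.index(canonical)
for record_id, matches in gazetteer.search(messy, n_matches=1):
    for match_id, score in matches:
        ...  # record_id and match_id are both str for the type checker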
2 changes: 1 addition & 1 deletion dedupe/canonical.py
@@ -28,7 +28,7 @@ def getCentroid(attribute_variants: Sequence[str], comparator: Comparator) -> str:

# there can be ties for minimum, average distance string
min_dist_indices: numpy.typing.NDArray[numpy.int_]
-min_dist_indices = numpy.where(average_distance == average_distance.min())[0] # type: ignore
min_dist_indices = numpy.where(average_distance == average_distance.min())[0]
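# numpy.where returns every index attaining the minimum, e.g. for
# average_distance == [0.3, 0.1, 0.1] it gives [1, 2]; that is what makes
# the tie-break branch below necessary.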

if len(min_dist_indices) > 1:
centroid = breakCentroidTie(attribute_variants, min_dist_indices)
37 changes: 33 additions & 4 deletions dedupe/convenience.py
@@ -7,13 +7,14 @@
import random
import sys
import warnings
-from typing import Iterator, Tuple
from typing import Iterator, Tuple, overload

import numpy

import dedupe
from dedupe._typing import (
-Data,
DataInt,
DataStr,
Literal,
RecordDict,
RecordDictPair,
@@ -203,8 +204,22 @@ def console_label(deduper: dedupe.api.ActiveMatching) -> None: # pragma: no cover
_mark_pair(deduper, labeled_pair)


@overload
def training_data_link(
-data_1: Data, data_2: Data, common_key: str, training_size: int = 50000
data_1: DataInt, data_2: DataInt, common_key: str, training_size: int = 50000
) -> TrainingData: # pragma: nocover
...


@overload
def training_data_link(
data_1: DataStr, data_2: DataStr, common_key: str, training_size: int = 50000
) -> TrainingData: # pragma: nocover
...


def training_data_link(
data_1, data_2, common_key, training_size=50000
) -> TrainingData: # pragma: nocover
"""
Construct training data for consumption by the func:`mark_pairs`
@@ -265,8 +280,22 @@ def training_data_link(
return training_pairs
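A hedged usage sketch of the overloads above (records and the unique_id field are invented; both mappings must share one key type):

data_1: DataInt = {0: {"name": "jane doe", "unique_id": "u1"}}
data_2: DataInt = {0: {"name": "Jane Doe", "unique_id": "u1"}}
pairs = training_data_link(data_1, data_2, common_key="unique_id")
# pairs["match"] holds record-dict pairs that agree on unique_id;
# pairs["distinct"] holds a sample of pairs that do not.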


@overload
def training_data_dedupe(
data: DataInt, common_key: str, training_size: int = 50000
) -> TrainingData: # pragma: nocover
...


@overload
def training_data_dedupe(
data: DataStr, common_key: str, training_size: int = 50000
) -> TrainingData: # pragma: nocover
...


def training_data_dedupe(
-data: Data, common_key: str, training_size: int = 50000
data, common_key, training_size=50000
) -> TrainingData: # pragma: nocover
"""
Construct training data for consumption by the func:`mark_pairs`
4 changes: 2 additions & 2 deletions dedupe/datamodel.py
@@ -240,12 +240,12 @@ def interaction_indices(variables: list[Variable]) -> list[list[int]]:
indices = []
for var in variables:
if hasattr(var, "interaction_fields"):
-interaction_indices = [var_names.index(f) for f in var.interaction_fields] # type: ignore
interaction_indices = [var_names.index(f) for f in var.interaction_fields]
indices.append(interaction_indices)
return indices
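# e.g. with var_names == ["name", "address"] and a single variable whose
# interaction_fields == ["name", "address"], this returns [[0, 1]].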


-def reduce_method(m): # type: ignore[no-untyped-def]
def reduce_method(m):
return (getattr, (m.__self__, m.__func__.__name__))
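# A bound method normally cannot be pickled; this reducer tells pickle to
# rebuild it as getattr(instance, method_name) on load. Presumably it is
# registered along these lines (hedged sketch, not shown in this diff):
#
#     import copyreg, types
#     copyreg.pickle(types.MethodType, reduce_method)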


