new blackening
fgregg committed Feb 17, 2023
1 parent 1f5dfbc commit cd8a4d3
Showing 17 changed files with 0 additions and 49 deletions.
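
All 49 deletions are blank lines sitting directly under a def signature or other block opener. That is exactly the pattern removed by Black's 2023 stable style (shipped in Black 23.1.0, January 2023), which would explain a formatting-only "blackening" commit at this date; treat the version attribution as an assumption, since the commit itself does not pin Black. A minimal sketch of the before/after:

    # Before: a blank line directly after the signature (older Black tolerated this).
    def __init__(self, num_cores):

        self.num_cores = num_cores

    # After: Black >= 23.1 strips the leading blank line from the body.
    def __init__(self, num_cores):
        self.num_cores = num_cores

A sweep like this is typically reproduced by re-running the formatter over the package and test directories, e.g. `black dedupe tests` (hypothetical invocation; the repository's actual lint target may differ).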
5 changes: 0 additions & 5 deletions dedupe/api.py
@@ -85,7 +85,6 @@ class Matching(object):
     def __init__(
         self, num_cores: int | None, in_memory: bool = False, **kwargs
     ) -> None:
-
         if num_cores is None:
             self.num_cores = multiprocessing.cpu_count()
         else:
@@ -99,7 +98,6 @@ def __init__(

     @property
     def fingerprinter(self) -> blocking.Fingerprinter:
-
         if self._fingerprinter is None:
             raise ValueError(
                 "the record fingerprinter is not intialized, "
@@ -665,7 +663,6 @@ class GazetteerMatching(Matching):
     def __init__(
         self, num_cores: int | None, in_memory: bool = False, **kwargs
     ) -> None:
-
         super().__init__(num_cores, in_memory, **kwargs)

         self.db: PathLike
@@ -861,7 +858,6 @@ def blocks(self, data):
         pair_blocks = itertools.groupby(pairs, lambda x: x[0])

         for _, pair_block in pair_blocks:
-
             yield [
                 (
                     (a_record_id, data[a_record_id]),
@@ -1004,7 +1000,6 @@ def search(
     def _format_search_results(
         self, search_d: Data, results: ArrayLinks
     ) -> LookupResults:
-
         seen: set[RecordID] = set()

         for result in results:
4 changes: 0 additions & 4 deletions dedupe/blocking.py
@@ -38,7 +38,6 @@ class Fingerprinter(object):
     """Takes in a record and returns all blocks that record belongs to"""

     def __init__(self, predicates: Iterable[dedupe.predicates.Predicate]) -> None:
-
         self.predicates = predicates

         self.index_fields: dict[str, IndexList]
@@ -151,7 +150,6 @@ def index(self, docs: Docs, field: str) -> None:
                     index.index(preprocess(doc))

         for index_type, index, _ in indices:
-
             index.initSearch()

             for predicate in self.index_fields[field][index_type]:
@@ -182,7 +180,6 @@ def unindex(self, docs: Docs, field: str) -> None:
                         pass

         for index_type, index, _ in indices:
-
             index.initSearch()

             for predicate in self.index_fields[field][index_type]:
@@ -199,7 +196,6 @@ def index_all(self, data: Data) -> None:
 def extractIndices(
     index_fields: IndexList,
 ) -> Sequence[tuple[str, Index, Callable[[Any], Any]]]:
-
     indices = []
     for index_type, predicates in index_fields.items():
         predicate = predicates[0]
7 changes: 0 additions & 7 deletions dedupe/clustering.py
@@ -21,7 +21,6 @@
 def connected_components(
     edgelist: Scores, max_components: int
 ) -> Generator[Scores, None, None]:
-
     if len(edgelist) == 0:
         raise StopIteration()

@@ -60,7 +59,6 @@ def _connected_components(

     start = 0
     for stop in component_stops:
-
         sub_graph = edgelist[start:stop]
         n_edges = stop - start
         start = stop
@@ -77,7 +75,6 @@ def _connected_components(
             needs_filtering = True

         if needs_filtering:
-
             min_score = numpy.min(sub_graph["score"])
             min_score_logit = numpy.log(min_score) - numpy.log(1 - min_score)
             threshold = 1 / (1 + numpy.exp(-min_score_logit - 1))
@@ -103,7 +100,6 @@ def _connected_components(


 def union_find(scored_pairs: Scores) -> numpy.typing.NDArray[numpy.int_]:
-
     root: dict[RecordID, int] = {}

     components = {}
@@ -237,7 +233,6 @@ def cluster(

     for sub_graph in dupe_sub_graphs:
         if len(sub_graph) > 1:
-
             i_to_id, condensed_distances, N = condensedDistance(sub_graph)

             linkage = scipy.cluster.hierarchy.linkage(
@@ -309,7 +304,6 @@ def greedyMatching(dupes: Scores) -> TupleLinks:
 def gazetteMatching(
     scored_blocks: Iterable[Scores], threshold: float = 0, n_matches: int = 1
 ) -> ArrayLinks:
-
     for block in scored_blocks:
         block = block[block["score"] > threshold]
         block.sort(order="score")
@@ -325,7 +319,6 @@ def gazetteMatching(
 def pair_gazette_matching(
     scored_pairs: Scores, threshold: float = 0.0, n_matches: int = 1
 ) -> ArrayLinks:
-
     scored_pairs.sort(order="pairs")

     group_key = scored_pairs["pairs"][:, 0]
4 changes: 0 additions & 4 deletions dedupe/core.py
@@ -70,7 +70,6 @@ def __init__(
         self.offset = offset

     def __call__(self) -> None:
-
         while True:
             record_pairs: Optional[RecordPairs] = self.records_queue.get()
             if record_pairs is None:
@@ -231,7 +230,6 @@ def scoreGazette(
     classifier: Classifier,
     num_cores: int = 1,
 ) -> Generator[Scores, None, None]:
-
     first, record_pairs = peek(record_pairs)
     if first is None:
         return  # terminate iteration
@@ -259,7 +257,6 @@ def join(self) -> None:


 def appropriate_imap(num_cores: int) -> tuple[MapLike, ClosableJoinable]:
-
     if num_cores < 2:
         imap: MapLike = map

@@ -335,7 +332,6 @@ def sniff_id_type(ids: Sequence[tuple[RecordID, RecordID]]) -> RecordIDDType:


 def sqlite_id_type(data: Data) -> Literal["text", "integer"]:
-
     example = next(iter(data.keys()))
     python_type = type(example)

2 changes: 0 additions & 2 deletions dedupe/datamodel.py
@@ -85,7 +85,6 @@ def distances(
         distances = numpy.empty((num_records, len(self)), "f4")

         for i, (record_1, record_2) in enumerate(record_pairs):
-
             for field, compare, start, stop in self._field_comparators:
                 if record_1[field] is not None and record_2[field] is not None:
                     distances[i, start:stop] = compare(record_1[field], record_2[field])
@@ -133,7 +132,6 @@ def __getstate__(self):
         return d

     def __setstate__(self, d):
-
         version = d.pop("version", None)
         if version is None and "_variables" in d:
             d["_len"] = len(d.pop("_variables"))
6 changes: 0 additions & 6 deletions dedupe/labeler.py
@@ -142,7 +142,6 @@ def learn_predicates(
     def _predict(self, pairs: TrainingExamples) -> Labels:
         labels: Labels = []
         for record_1, record_2 in pairs:
-
             for predicate in self.current_predicates:
                 keys = predicate(record_2, target=True)
                 if keys:
@@ -162,7 +161,6 @@ def remove(self, index: int) -> None:
     def _sample_indices(
         self, sample_size: int, max_cover: int
     ) -> Iterable[RecordIDPair]:
-
         weights: Dict[RecordIDPair, float] = {}
         for predicate, covered in self.block_learner.comparison_cover.items():
             # each predicate gets to vote for every record pair it covers. the
@@ -240,7 +238,6 @@ def __init__(
         self._index_predicates(examples_to_index)

     def _index_predicates(self, candidates: TrainingExamples) -> None:
-
         blocker = self.block_learner.blocker

         records = core.unique((record for pair in candidates for record in pair))
@@ -261,7 +258,6 @@ def _sample(self, data: DataStr, sample_size: int) -> TrainingExamples:
         ...

     def _sample(self, data, sample_size):
-
         sample_indices = self._sample_indices(
             sample_size, len(data) * (len(data) - 1) // 2
         )
@@ -303,7 +299,6 @@ def __init__(
         self._index_predicates(examples_to_index)

     def _index_predicates(self, candidates: TrainingExamples) -> None:
-
         blocker = self.block_learner.blocker

         A_full, B_full = zip(*candidates)
@@ -330,7 +325,6 @@ def _sample(
         ...

     def _sample(self, data_1, data_2, sample_size):
-
         sample_indices = self._sample_indices(sample_size, len(data_1) * len(data_2))

         sample = [(data_1[id_1], data_2[id_2]) for id_1, id_2 in sample_indices]
4 changes: 0 additions & 4 deletions dedupe/predicates.py
@@ -75,7 +75,6 @@ def __call__(self, record: RecordDict, **kwargs) -> Iterable[str]:
         pass

     def __add__(self, other: "Predicate") -> "CompoundPredicate":
-
         if isinstance(other, CompoundPredicate):
             return CompoundPredicate((self,) + tuple(other))
         elif isinstance(other, Predicate):
@@ -188,7 +187,6 @@ def reset(self) -> None:
         self.index = None

     def __call__(self, record: RecordDict, **kwargs) -> list[str]:
-
         block_key = None
         column = record[self.field]

@@ -252,7 +250,6 @@ def reset(self) -> None:
         self.index = None

     def __call__(self, record: RecordDict, target: bool = False, **kwargs) -> list[str]:
-
         column = record[self.field]
         if column:
             if (column, target) in self._cache:
@@ -377,7 +374,6 @@ def __call__(self, record: RecordDict, **kwargs) -> list[str]:
         ]

     def __add__(self, other: Predicate) -> "CompoundPredicate":  # type: ignore
-
         if isinstance(other, CompoundPredicate):
             return CompoundPredicate(tuple(self) + tuple(other))
         elif isinstance(other, Predicate):
5 changes: 0 additions & 5 deletions dedupe/training.py
@@ -175,7 +175,6 @@ class DedupeBlockLearner(BlockLearner):
     def __init__(
         self, predicates: Iterable[Predicate], sampled_records: Data, data: Data
     ):
-
         self.blocker = blocking.Fingerprinter(predicates)
         self.blocker.index_all(data)

@@ -255,7 +254,6 @@ def __init__(
         sampled_records_2,
         data_2,
     ):
-
         self.blocker = blocking.Fingerprinter(predicates)
         self.blocker.index_all(data_2)

@@ -351,7 +349,6 @@ def search(
         reachable = self.reachable(candidates) + covered

         if candidates and reachable >= self.target:
-
             order_by = functools.partial(self.order_by, candidates)

             best = max(candidates, key=order_by)
@@ -431,7 +428,6 @@ def __rand__(self, item):

 class Resampler(object):
     def __init__(self, sequence: Sequence[int]):
-
         sampled = random.choices(sequence, k=len(sequence))

         c = collections.Counter(sampled)
@@ -447,7 +443,6 @@ def __init__(self, sequence: Sequence[int]):

     @functools.lru_cache()
     def __call__(self, iterable: Iterable[int]) -> frozenset[int]:
-
         result = itertools.chain.from_iterable(
             self.replacements[k] for k in iterable if k in self.replacements
         )
2 changes: 0 additions & 2 deletions dedupe/variables/base.py
@@ -30,7 +30,6 @@ def __eq__(self, other: Any) -> bool:
         return self.name == other_name

     def __init__(self, definition: VariableDefinition):
-
         if definition.get("has missing", False):
             self.has_missing = True
             try:
@@ -69,7 +68,6 @@ class MissingDataType(Variable):
     type = "MissingData"

     def __init__(self, name: str):
-
         self.name = "(%s: Not Missing)" % name

         self.has_missing = False
1 change: 0 additions & 1 deletion dedupe/variables/categorical_type.py
@@ -20,7 +20,6 @@ def _categories(self, definition: VariableDefinition) -> list[str]:
         return categories

     def __init__(self, definition: VariableDefinition):
-
         super(CategoricalType, self).__init__(definition)

         categories = self._categories(definition)
1 change: 0 additions & 1 deletion dedupe/variables/exists.py
@@ -14,7 +14,6 @@ class ExistsType(CategoricalType):
     _predicate_functions: list[PredicateFunction] = []

     def __init__(self, definition: VariableDefinition):
-
         super(CategoricalType, self).__init__(definition)

         self.cat_comparator = CategoricalComparator([0, 1])
2 changes: 0 additions & 2 deletions dedupe/variables/interaction.py
@@ -13,7 +13,6 @@ class InteractionType(Variable):
     higher_vars: list["InteractionType"]

     def __init__(self, definition: VariableDefinition):
-
         self.interactions = definition["interaction variables"]

         self.name = "(Interaction: %s)" % str(self.interactions)
@@ -22,7 +21,6 @@ def __init__(self, definition: VariableDefinition):
         super().__init__(definition)

     def expandInteractions(self, field_model: Mapping[str, FieldVariable]) -> None:
-
         self.interaction_fields = self.atomicInteractions(
             self.interactions, field_model
         )
2 changes: 0 additions & 2 deletions tests/test_blocking.py
@@ -8,7 +8,6 @@

 class BlockingTest(unittest.TestCase):
     def setUp(self):
-
         field_definition = [{"field": "name", "type": "String"}]
         self.data_model = dedupe.Dedupe(field_definition).data_model
         self.training_pairs = {
@@ -51,7 +50,6 @@ def setUp(self):
         }

     def test_unconstrained_inverted_index(self):
-
         blocker = dedupe.blocking.Fingerprinter(
             [dedupe.predicates.TfidfTextSearchPredicate(0.0, "name")]
         )
1 change: 0 additions & 1 deletion tests/test_core.py
@@ -9,7 +9,6 @@

 class MockClassifier:
     def __init__(self):
-
         self.weight = 0
         self.bias = 0

1 change: 0 additions & 1 deletion tests/test_dedupe.py
@@ -223,7 +223,6 @@ def test_greedy_matching(self):
         ]

     def test_gazette_matching(self):
-
         gazetteMatch = dedupe.clustering.gazetteMatching
         blocked_dupes = itertools.groupby(self.bipartite_dupes, key=lambda x: x[0][0])

1 change: 0 additions & 1 deletion tests/test_predicates.py
@@ -70,7 +70,6 @@ def test_precise_latlong(self):

 class TestAlpaNumeric(unittest.TestCase):
     def test_alphanumeric(self):
-
         assert predicates.alphaNumericPredicate("a1") == set(["a1"])
         assert predicates.alphaNumericPredicate("1a") == set(["1a"])
         assert predicates.alphaNumericPredicate("a1b") == set(["a1b"])
1 change: 0 additions & 1 deletion tests/test_training.py
@@ -6,7 +6,6 @@

 class TrainingTest(unittest.TestCase):
     def setUp(self):
-
         field_definition = [{"field": "name", "type": "String"}]
         self.data_model = dedupe.Dedupe(field_definition).data_model
         self.training_pairs = {
