new blackening
fgregg committed Feb 17, 2023
1 parent 1f5dfbc commit cd8a4d3
Showing 17 changed files with 0 additions and 49 deletions.
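
All 49 deletions are blank lines sitting directly under a def signature or other block opener. That is exactly the pattern removed by Black's 2023 stable style (shipped in Black 23.1.0, January 2023), which would explain a formatting-only "blackening" commit at this date; treat the version attribution as an assumption, since the commit itself does not pin Black. A minimal sketch of the before/after:

    # Before: a blank line directly after the signature (older Black tolerated this).
    def __init__(self, num_cores):

        self.num_cores = num_cores

    # After: Black >= 23.1 strips the leading blank line from the body.
    def __init__(self, num_cores):
        self.num_cores = num_cores

A sweep like this is typically reproduced by re-running the formatter over the package and test directories, e.g. `black dedupe tests` (hypothetical invocation; the repository's actual lint target may differ).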
5 changes: 0 additions & 5 deletions dedupe/api.py
@@ -85,7 +85,6 @@ class Matching(object):
     def __init__(
         self, num_cores: int | None, in_memory: bool = False, **kwargs
     ) -> None:
-
         if num_cores is None:
             self.num_cores = multiprocessing.cpu_count()
         else:
@@ -99,7 +98,6 @@ def __init__(

     @property
     def fingerprinter(self) -> blocking.Fingerprinter:
-
         if self._fingerprinter is None:
             raise ValueError(
                 "the record fingerprinter is not intialized, "
@@ -665,7 +663,6 @@ class GazetteerMatching(Matching):
     def __init__(
         self, num_cores: int | None, in_memory: bool = False, **kwargs
     ) -> None:
-
         super().__init__(num_cores, in_memory, **kwargs)

         self.db: PathLike
@@ -861,7 +858,6 @@ def blocks(self, data):
         pair_blocks = itertools.groupby(pairs, lambda x: x[0])

         for _, pair_block in pair_blocks:
-
             yield [
                 (
                     (a_record_id, data[a_record_id]),
@@ -1004,7 +1000,6 @@ def search(
     def _format_search_results(
         self, search_d: Data, results: ArrayLinks
     ) -> LookupResults:
-
         seen: set[RecordID] = set()

         for result in results:
4 changes: 0 additions & 4 deletions dedupe/blocking.py
@@ -38,7 +38,6 @@ class Fingerprinter(object):
     """Takes in a record and returns all blocks that record belongs to"""

     def __init__(self, predicates: Iterable[dedupe.predicates.Predicate]) -> None:
-
         self.predicates = predicates

         self.index_fields: dict[str, IndexList]
@@ -151,7 +150,6 @@ def index(self, docs: Docs, field: str) -> None:
                     index.index(preprocess(doc))

         for index_type, index, _ in indices:
-
             index.initSearch()

             for predicate in self.index_fields[field][index_type]:
@@ -182,7 +180,6 @@ def unindex(self, docs: Docs, field: str) -> None:
                         pass

         for index_type, index, _ in indices:
-
             index.initSearch()

             for predicate in self.index_fields[field][index_type]:
@@ -199,7 +196,6 @@ def index_all(self, data: Data) -> None:
 def extractIndices(
     index_fields: IndexList,
 ) -> Sequence[tuple[str, Index, Callable[[Any], Any]]]:
-
     indices = []
     for index_type, predicates in index_fields.items():
         predicate = predicates[0]
7 changes: 0 additions & 7 deletions dedupe/clustering.py
@@ -21,7 +21,6 @@
 def connected_components(
     edgelist: Scores, max_components: int
 ) -> Generator[Scores, None, None]:
-
     if len(edgelist) == 0:
         raise StopIteration()

@@ -60,7 +59,6 @@ def _connected_components(

     start = 0
     for stop in component_stops:
-
         sub_graph = edgelist[start:stop]
         n_edges = stop - start
         start = stop
@@ -77,7 +75,6 @@ def _connected_components(
             needs_filtering = True

         if needs_filtering:
-
             min_score = numpy.min(sub_graph["score"])
             min_score_logit = numpy.log(min_score) - numpy.log(1 - min_score)
             threshold = 1 / (1 + numpy.exp(-min_score_logit - 1))
@@ -103,7 +100,6 @@ def _connected_components(


 def union_find(scored_pairs: Scores) -> numpy.typing.NDArray[numpy.int_]:
-
     root: dict[RecordID, int] = {}

     components = {}
@@ -237,7 +233,6 @@ def cluster(

     for sub_graph in dupe_sub_graphs:
         if len(sub_graph) > 1:
-
             i_to_id, condensed_distances, N = condensedDistance(sub_graph)

             linkage = scipy.cluster.hierarchy.linkage(
@@ -309,7 +304,6 @@ def greedyMatching(dupes: Scores) -> TupleLinks:
 def gazetteMatching(
     scored_blocks: Iterable[Scores], threshold: float = 0, n_matches: int = 1
 ) -> ArrayLinks:
-
     for block in scored_blocks:
         block = block[block["score"] > threshold]
         block.sort(order="score")
@@ -325,7 +319,6 @@ def gazetteMatching(
 def pair_gazette_matching(
     scored_pairs: Scores, threshold: float = 0.0, n_matches: int = 1
 ) -> ArrayLinks:
-
     scored_pairs.sort(order="pairs")

     group_key = scored_pairs["pairs"][:, 0]
4 changes: 0 additions & 4 deletions dedupe/core.py
@@ -70,7 +70,6 @@ def __init__(
         self.offset = offset

     def __call__(self) -> None:
-
         while True:
             record_pairs: Optional[RecordPairs] = self.records_queue.get()
             if record_pairs is None:
@@ -231,7 +230,6 @@ def scoreGazette(
     classifier: Classifier,
     num_cores: int = 1,
 ) -> Generator[Scores, None, None]:
-
     first, record_pairs = peek(record_pairs)
     if first is None:
         return  # terminate iteration
@@ -259,7 +257,6 @@ def join(self) -> None:


 def appropriate_imap(num_cores: int) -> tuple[MapLike, ClosableJoinable]:
-
     if num_cores < 2:
         imap: MapLike = map

@@ -335,7 +332,6 @@ def sniff_id_type(ids: Sequence[tuple[RecordID, RecordID]]) -> RecordIDDType:


 def sqlite_id_type(data: Data) -> Literal["text", "integer"]:
-
     example = next(iter(data.keys()))
     python_type = type(example)

2 changes: 0 additions & 2 deletions dedupe/datamodel.py
@@ -85,7 +85,6 @@ def distances(
         distances = numpy.empty((num_records, len(self)), "f4")

         for i, (record_1, record_2) in enumerate(record_pairs):
-
             for field, compare, start, stop in self._field_comparators:
                 if record_1[field] is not None and record_2[field] is not None:
                     distances[i, start:stop] = compare(record_1[field], record_2[field])
@@ -133,7 +132,6 @@ def __getstate__(self):
         return d

     def __setstate__(self, d):
-
         version = d.pop("version", None)
         if version is None and "_variables" in d:
             d["_len"] = len(d.pop("_variables"))
6 changes: 0 additions & 6 deletions dedupe/labeler.py
@@ -142,7 +142,6 @@ def learn_predicates(
     def _predict(self, pairs: TrainingExamples) -> Labels:
         labels: Labels = []
         for record_1, record_2 in pairs:
-
             for predicate in self.current_predicates:
                 keys = predicate(record_2, target=True)
                 if keys:
@@ -162,7 +161,6 @@ def remove(self, index: int) -> None:
     def _sample_indices(
         self, sample_size: int, max_cover: int
     ) -> Iterable[RecordIDPair]:
-
         weights: Dict[RecordIDPair, float] = {}
         for predicate, covered in self.block_learner.comparison_cover.items():
             # each predicate gets to vote for every record pair it covers. the
@@ -240,7 +238,6 @@ def __init__(
         self._index_predicates(examples_to_index)

     def _index_predicates(self, candidates: TrainingExamples) -> None:
-
         blocker = self.block_learner.blocker

         records = core.unique((record for pair in candidates for record in pair))
@@ -261,7 +258,6 @@ def _sample(self, data: DataStr, sample_size: int) -> TrainingExamples:
         ...

     def _sample(self, data, sample_size):
-
         sample_indices = self._sample_indices(
             sample_size, len(data) * (len(data) - 1) // 2
         )
@@ -303,7 +299,6 @@ def __init__(
         self._index_predicates(examples_to_index)

     def _index_predicates(self, candidates: TrainingExamples) -> None:
-
         blocker = self.block_learner.blocker

         A_full, B_full = zip(*candidates)
@@ -330,7 +325,6 @@ def _sample(
         ...

     def _sample(self, data_1, data_2, sample_size):
-
         sample_indices = self._sample_indices(sample_size, len(data_1) * len(data_2))

         sample = [(data_1[id_1], data_2[id_2]) for id_1, id_2 in sample_indices]
4 changes: 0 additions & 4 deletions dedupe/predicates.py
@@ -75,7 +75,6 @@ def __call__(self, record: RecordDict, **kwargs) -> Iterable[str]:
         pass

     def __add__(self, other: "Predicate") -> "CompoundPredicate":
-
         if isinstance(other, CompoundPredicate):
             return CompoundPredicate((self,) + tuple(other))
         elif isinstance(other, Predicate):
@@ -188,7 +187,6 @@ def reset(self) -> None:
         self.index = None

     def __call__(self, record: RecordDict, **kwargs) -> list[str]:
-
         block_key = None
         column = record[self.field]

@@ -252,7 +250,6 @@ def reset(self) -> None:
         self.index = None

     def __call__(self, record: RecordDict, target: bool = False, **kwargs) -> list[str]:
-
         column = record[self.field]
         if column:
             if (column, target) in self._cache:
@@ -377,7 +374,6 @@ def __call__(self, record: RecordDict, **kwargs) -> list[str]:
         ]

     def __add__(self, other: Predicate) -> "CompoundPredicate":  # type: ignore
-
         if isinstance(other, CompoundPredicate):
             return CompoundPredicate(tuple(self) + tuple(other))
         elif isinstance(other, Predicate):
5 changes: 0 additions & 5 deletions dedupe/training.py
@@ -175,7 +175,6 @@ class DedupeBlockLearner(BlockLearner):
     def __init__(
         self, predicates: Iterable[Predicate], sampled_records: Data, data: Data
     ):
-
         self.blocker = blocking.Fingerprinter(predicates)
         self.blocker.index_all(data)

@@ -255,7 +254,6 @@ def __init__(
         sampled_records_2,
         data_2,
     ):
-
         self.blocker = blocking.Fingerprinter(predicates)
         self.blocker.index_all(data_2)

@@ -351,7 +349,6 @@ def search(
         reachable = self.reachable(candidates) + covered

         if candidates and reachable >= self.target:
-
             order_by = functools.partial(self.order_by, candidates)

             best = max(candidates, key=order_by)
@@ -431,7 +428,6 @@ def __rand__(self, item):

 class Resampler(object):
     def __init__(self, sequence: Sequence[int]):
-
         sampled = random.choices(sequence, k=len(sequence))

         c = collections.Counter(sampled)
@@ -447,7 +443,6 @@ def __init__(self, sequence: Sequence[int]):

     @functools.lru_cache()
     def __call__(self, iterable: Iterable[int]) -> frozenset[int]:
-
         result = itertools.chain.from_iterable(
             self.replacements[k] for k in iterable if k in self.replacements
         )
2 changes: 0 additions & 2 deletions dedupe/variables/base.py
@@ -30,7 +30,6 @@ def __eq__(self, other: Any) -> bool:
         return self.name == other_name

     def __init__(self, definition: VariableDefinition):
-
         if definition.get("has missing", False):
             self.has_missing = True
             try:
@@ -69,7 +68,6 @@ class MissingDataType(Variable):
     type = "MissingData"

     def __init__(self, name: str):
-
         self.name = "(%s: Not Missing)" % name

         self.has_missing = False
1 change: 0 additions & 1 deletion dedupe/variables/categorical_type.py
@@ -20,7 +20,6 @@ def _categories(self, definition: VariableDefinition) -> list[str]:
         return categories

     def __init__(self, definition: VariableDefinition):
-
         super(CategoricalType, self).__init__(definition)

         categories = self._categories(definition)
1 change: 0 additions & 1 deletion dedupe/variables/exists.py
@@ -14,7 +14,6 @@ class ExistsType(CategoricalType):
     _predicate_functions: list[PredicateFunction] = []

     def __init__(self, definition: VariableDefinition):
-
         super(CategoricalType, self).__init__(definition)

         self.cat_comparator = CategoricalComparator([0, 1])
2 changes: 0 additions & 2 deletions dedupe/variables/interaction.py
@@ -13,7 +13,6 @@ class InteractionType(Variable):
     higher_vars: list["InteractionType"]

     def __init__(self, definition: VariableDefinition):
-
         self.interactions = definition["interaction variables"]

         self.name = "(Interaction: %s)" % str(self.interactions)
@@ -22,7 +21,6 @@ def __init__(self, definition: VariableDefinition):
         super().__init__(definition)

     def expandInteractions(self, field_model: Mapping[str, FieldVariable]) -> None:
-
         self.interaction_fields = self.atomicInteractions(
             self.interactions, field_model
         )
2 changes: 0 additions & 2 deletions tests/test_blocking.py
@@ -8,7 +8,6 @@

 class BlockingTest(unittest.TestCase):
     def setUp(self):
-
         field_definition = [{"field": "name", "type": "String"}]
         self.data_model = dedupe.Dedupe(field_definition).data_model
         self.training_pairs = {
@@ -51,7 +50,6 @@ def setUp(self):
         }

     def test_unconstrained_inverted_index(self):
-
         blocker = dedupe.blocking.Fingerprinter(
             [dedupe.predicates.TfidfTextSearchPredicate(0.0, "name")]
         )
1 change: 0 additions & 1 deletion tests/test_core.py
@@ -9,7 +9,6 @@

 class MockClassifier:
     def __init__(self):
-
         self.weight = 0
         self.bias = 0

1 change: 0 additions & 1 deletion tests/test_dedupe.py
@@ -223,7 +223,6 @@ def test_greedy_matching(self):
         ]

     def test_gazette_matching(self):
-
         gazetteMatch = dedupe.clustering.gazetteMatching
         blocked_dupes = itertools.groupby(self.bipartite_dupes, key=lambda x: x[0][0])

1 change: 0 additions & 1 deletion tests/test_predicates.py
@@ -70,7 +70,6 @@ def test_precise_latlong(self):

 class TestAlpaNumeric(unittest.TestCase):
     def test_alphanumeric(self):
-
         assert predicates.alphaNumericPredicate("a1") == set(["a1"])
         assert predicates.alphaNumericPredicate("1a") == set(["1a"])
         assert predicates.alphaNumericPredicate("a1b") == set(["a1b"])
1 change: 0 additions & 1 deletion tests/test_training.py
@@ -6,7 +6,6 @@

 class TrainingTest(unittest.TestCase):
     def setUp(self):
-
         field_definition = [{"field": "name", "type": "String"}]
         self.data_model = dedupe.Dedupe(field_definition).data_model
         self.training_pairs = {
