Skip to content

Commit

Permalink
staged learning is not worth it
Browse files Browse the repository at this point in the history
  • Loading branch information
fgregg committed Sep 6, 2020
1 parent c65e2e3 commit 3f36272
Showing 1 changed file with 5 additions and 8 deletions.
13 changes: 5 additions & 8 deletions dedupe/training.py
Expand Up @@ -110,13 +110,15 @@ def generate_candidates_rf(self,
# the base for the k-conjunctions
candidate = CompoundPredicate()
covered_comparisons = InfiniteSet()
covered_matches = frozenset(matches)
covered_matches = InfiniteSet()
covered_sample_matches = InfiniteSet()

def score(predicate: Predicate) -> float:
    """Rate ``predicate`` as sample matches covered per estimated comparison.

    The numerator is the number of elements shared by
    ``covered_sample_matches`` and ``sample_match_cover[predicate]``.  The
    denominator is ``self.estimate`` applied to the intersection of
    ``covered_comparisons`` and ``comparison_cover[predicate]``.

    None of those names are defined here; they are read from the
    surrounding scope at call time.

    Returns ``0.`` when the division raises ``ZeroDivisionError``.
    """
    try:
        # A single return: the expression used to appear twice in a row,
        # which left the second copy unreachable.
        return (len(covered_sample_matches &
                    sample_match_cover[predicate]) /
                self.estimate(covered_comparisons &
                              comparison_cover[predicate]))
    except ZeroDivisionError:
        return 0.

Expand Down Expand Up @@ -391,10 +393,5 @@ def __call__(self, iterable: Iterable) -> frozenset:
if k in self.replacements)
return frozenset(result)

@property
def pairs(self) -> frozenset:
    """Return every element of every ``self.replacements`` value, merged.

    Each value of ``self.replacements`` is iterated and all of the
    elements are collected into a single ``frozenset``, so duplicates
    across values collapse to one entry.
    """
    return frozenset(
        member
        for group in self.replacements.values()
        for member in group
    )


# Warning text for the "ran out of predicates" case.  Written as adjacent
# string literals so that each source line stays short; the resulting
# value is a single string with no added line breaks.
OUT_OF_PREDICATES_WARNING = (
    "Ran out of predicates: Dedupe tries to find blocking rules that will "
    "work well with your data. Sometimes it can't find great ones, and "
    "you'll get this warning. It means that there are some pairs of true "
    "records that dedupe may never compare. If you are getting bad "
    "results, try increasing the `max_comparison` argument to the train "
    "method"
)

0 comments on commit 3f36272

Please sign in to comment.