add index_predicates arg to train, closes #362
fgregg committed Feb 23, 2015
1 parent 530ddc3 commit 4da149f
Showing 3 changed files with 60 additions and 33 deletions.
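In practice, the new keyword is passed straight to `train`. The sketch below is illustrative only and is not part of this commit's diff; the field definitions, the `data` dict, and the labeling step are assumed placeholders, and only the final `train` call exercises the new argument:

```python
import dedupe

# Illustrative field definitions; any variable definition works the same way.
fields = [{'field': 'name', 'type': 'String'},
          {'field': 'address', 'type': 'String'}]

deduper = dedupe.Dedupe(fields)

# 'data' is assumed to be a dict of record_id -> record, prepared elsewhere.
deduper.sample(data)

# Label some candidate pairs interactively.
dedupe.consoleLabel(deduper)

# New in this commit: skip index predicates, which can be slow to build
# and memory-hungry, when learning blocking rules.
deduper.train(ppc=0.1, uncovered_dupes=1, index_predicates=False)
```

With `index_predicates=True` (the default), behaviour is unchanged from earlier releases.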
5 changes: 4 additions & 1 deletion CHANGELOG.md
@@ -1,3 +1,6 @@
## Unreleased
Features
- train method has an argument for not considering index predicates

## 0.7.5
Features
@@ -12,7 +15,7 @@ Bug Fixes
## 0.7.4
Features
- Sampling methods now use blocked sampling

## 0.7.0
Version 0.7.0 is backwards compatible, except for the match method of Gazetteer class

39 changes: 27 additions & 12 deletions dedupe/api.py
@@ -654,9 +654,8 @@ def readTraining(self, training_file) :

self._trainClassifier()

def train(self, ppc=.1, uncovered_dupes=1) : # pragma : no cover
"""
Keyword arguments:
def train(self, ppc=.1, uncovered_dupes=1, index_predicates=True) : # pragma : no cover
"""Keyword arguments:
ppc -- Limits the Proportion of Pairs Covered that we allow a
predicate to cover. If a predicate puts together a fraction
of possible pairs greater than the ppc, that predicate will
@@ -677,6 +676,13 @@ def train(self, ppc=.1, uncovered_dupes=1) : # pragma : no cover
true dupe pair may mean that we have to use
blocks that put together many, many distinct pairs
that we'll have to expensively compare as well.
index_predicates -- Should dedupe consider predicates that
rely upon indexing the data? Index predicates can
be slower and take substantial memory.
Defaults to True.
"""
n_folds = min(numpy.sum(self.training_data['label']==u'match')/3,
20)
@@ -693,7 +699,7 @@ def train(self, ppc=.1, uncovered_dupes=1) : # pragma : no cover


self._trainClassifier(alpha)
self._trainBlocker(ppc, uncovered_dupes)
self._trainBlocker(ppc, uncovered_dupes, index_predicates)


def _trainClassifier(self, alpha=.1) : # pragma : no cover
@@ -705,7 +711,7 @@ def _trainClassifier(self, alpha=.1) : # pragma : no cover
self._logLearnedWeights()


def _trainBlocker(self, ppc=1, uncovered_dupes=1) : # pragma : no cover
def _trainBlocker(self, ppc=1, uncovered_dupes=1, index_predicates=True) : # pragma : no cover
training_pairs = copy.deepcopy(self.training_pairs)

confident_nonduplicates = training.semiSupervisedNonDuplicates(self.data_sample,
@@ -714,7 +720,8 @@ def _trainBlocker(self, ppc=1, uncovered_dupes=1) : # pragma : no cover

training_pairs[u'distinct'].extend(confident_nonduplicates)

predicate_set = predicateGenerator(self.data_model)
predicate_set = predicateGenerator(self.data_model,
index_predicates)

(self.predicates,
self.stop_words) = dedupe.training.blockTraining(training_pairs,
@@ -912,8 +919,9 @@ def sample(self, data, sample_size=15000,
data = core.index(data)

blocked_sample_size = int(blocked_proportion * sample_size)
predicates = [pred for pred in predicateGenerator(self.data_model)
if pred.type == 'SimplePredicate']
predicates = list(predicateGenerator(self.data_model,
index_predicates=False))


data = sampling.randomDeque(data)
blocked_sample_keys = sampling.dedupeBlockedSample(blocked_sample_size,
@@ -973,8 +981,8 @@ def sample(self, data_1, data_2, sample_size=150000,
data_2 = core.index(data_2, offset)

blocked_sample_size = int(blocked_proportion * sample_size)
predicates = [pred for pred in predicateGenerator(self.data_model)
if pred.type == 'SimplePredicate']
predicates = list(predicateGenerator(self.data_model,
index_predicates=False))

data_1 = sampling.randomDeque(data_1)
data_2 = sampling.randomDeque(data_2)
@@ -1083,10 +1091,17 @@ class Gazetteer(RecordLink, GazetteerMatching):
class StaticGazetteer(StaticRecordLink, GazetteerMatching):
pass

def predicateGenerator(data_model) :
def predicateGenerator(data_model, index_predicates) :
predicates = set([])
for definition in data_model.primary_fields :
predicates.update(definition.predicates)
if not index_predicates :
filtered_predicates = []
for predicate in definition.predicates :
if not hasattr(predicate, 'index') :
filtered_predicates.append(predicate)
predicates.update(filtered_predicates)
else :
predicates.update(definition.predicates)

return predicates
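For readers skimming the diff, the new `predicateGenerator` filter keys off whether a predicate object exposes an `index` attribute. A compact sketch of the equivalent logic, for illustration only (not the committed code):

```python
def predicate_generator(data_model, index_predicates):
    # Collect predicates from every primary field; when index_predicates is
    # False, drop any predicate that needs an index (detected via an
    # 'index' attribute on the predicate object).
    predicates = set()
    for definition in data_model.primary_fields:
        for predicate in definition.predicates:
            if index_predicates or not hasattr(predicate, 'index'):
                predicates.add(predicate)
    return predicates
```

This is also why `sample` can now ask for `index_predicates=False` instead of filtering on `pred.type == 'SimplePredicate'`: the attribute check keeps any predicate that does not require an index, rather than only those of type 'SimplePredicate'.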

49 changes: 29 additions & 20 deletions docs/common_learning_methods.rst
@@ -35,35 +35,44 @@
deduper.markPairs(labeled_examples)
.. py:method:: train([ppc=1.0, [uncovered_dupes=1]])
.. py:method:: train([ppc=0.1, [uncovered_dupes=1, [index_predicates=True]]])
Learn final pairwise classifier and blocking rules. Requires that
adequate training data has already been provided.

:param float ppc: Limits the Proportion of Pairs Covered that we
allow a predicate to cover. If a predicate puts
together a fraction of possible pairs greater
than the ppc, that predicate will be removed
from consideration.
allow a predicate to cover. If a predicate puts
together a fraction of possible pairs greater
than the ppc, that predicate will be removed from
consideration.

As the size of the data increases, the user will
generally want to reduce ppc.
As the size of the data increases, the user will
generally want to reduce ppc.

ppc should be a value between 0.0 and 1.0
ppc should be a value between 0.0 and
1.0. Defaults to 0.1

:param int uncovered_dupes: The number of true dupe pairs in our
training data that we can accept will
not be put into any block. If true
duplicates are never in the same
block, we will never compare them, and
may never declare them to be
duplicates.

However, requiring that we cover every
single true dupe pair may mean that we
have to use blocks that put together
many, many distinct pairs that we'll
have to expensively compare as well.
training data that we can accept will
not be put into any block. If true
duplicates are never in the same block,
we will never compare them, and may
never declare them to be duplicates.

However, requiring that we cover every
single true dupe pair may mean that we
have to use blocks that put together
many, many distinct pairs that we'll
have to expensively compare as well.

Defaults to 1

:param bool index_predicates: Should dedupe consider predicates
that rely upon indexing the
data? Index predicates can be slower
and take substantial memory.

Defaults to True

.. code:: python
