add index_predicates arg to train, closes #362
fgregg committed Feb 23, 2015
1 parent 530ddc3 commit 4da149f
Showing 3 changed files with 60 additions and 33 deletions.
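In practice, the new keyword is passed straight to `train`. The sketch below is illustrative only and is not part of this commit's diff; the field definitions, the `data` dict, and the labeling step are assumed placeholders, and only the final `train` call exercises the new argument:

```python
import dedupe

# Illustrative field definitions; any variable definition works the same way.
fields = [{'field': 'name', 'type': 'String'},
          {'field': 'address', 'type': 'String'}]

deduper = dedupe.Dedupe(fields)

# 'data' is assumed to be a dict of record_id -> record, prepared elsewhere.
deduper.sample(data)

# Label some candidate pairs interactively.
dedupe.consoleLabel(deduper)

# New in this commit: skip index predicates, which can be slow to build
# and memory-hungry, when learning blocking rules.
deduper.train(ppc=0.1, uncovered_dupes=1, index_predicates=False)
```

With `index_predicates=True` (the default), behaviour is unchanged from earlier releases.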
5 changes: 4 additions & 1 deletion CHANGELOG.md
@@ -1,3 +1,6 @@
## Unreleased
Features
- train method has an argument for not considering index predicates

## 0.7.5
Features
@@ -12,7 +15,7 @@ Bug Fixes
## 0.7.4
Features
- Sampling methods now use blocked sampling

## 0.7.0
Version 0.7.0 is backwards compatible, except for the match method of Gazetteer class

39 changes: 27 additions & 12 deletions dedupe/api.py
@@ -654,9 +654,8 @@ def readTraining(self, training_file) :

self._trainClassifier()

def train(self, ppc=.1, uncovered_dupes=1) : # pragma : no cover
"""
Keyword arguments:
def train(self, ppc=.1, uncovered_dupes=1, index_predicates=True) : # pragma : no cover
"""Keyword arguments:
ppc -- Limits the Proportion of Pairs Covered that we allow a
predicate to cover. If a predicate puts together a fraction
of possible pairs greater than the ppc, that predicate will
@@ -677,6 +676,13 @@ def train(self, ppc=.1, uncovered_dupes=1) : # pragma : no cover
true dupe pair may mean that we have to use
blocks that put together many, many distinct pairs
that we'll have to expensively compare as well.
index_predicates -- Should dedupe consider predicates that
rely upon indexing the data? Index predicates can
be slower and take substantial memory.
Defaults to True.
"""
n_folds = min(numpy.sum(self.training_data['label']==u'match')/3,
20)
@@ -693,7 +699,7 @@ def train(self, ppc=.1, uncovered_dupes=1) : # pragma : no cover


self._trainClassifier(alpha)
self._trainBlocker(ppc, uncovered_dupes)
self._trainBlocker(ppc, uncovered_dupes, index_predicates)


def _trainClassifier(self, alpha=.1) : # pragma : no cover
@@ -705,7 +711,7 @@ def _trainClassifier(self, alpha=.1) : # pragma : no cover
self._logLearnedWeights()


def _trainBlocker(self, ppc=1, uncovered_dupes=1) : # pragma : no cover
def _trainBlocker(self, ppc=1, uncovered_dupes=1, index_predicates=True) : # pragma : no cover
training_pairs = copy.deepcopy(self.training_pairs)

confident_nonduplicates = training.semiSupervisedNonDuplicates(self.data_sample,
@@ -714,7 +720,8 @@ def _trainBlocker(self, ppc=1, uncovered_dupes=1) : # pragma : no cover

training_pairs[u'distinct'].extend(confident_nonduplicates)

predicate_set = predicateGenerator(self.data_model)
predicate_set = predicateGenerator(self.data_model,
index_predicates)

(self.predicates,
self.stop_words) = dedupe.training.blockTraining(training_pairs,
@@ -912,8 +919,9 @@ def sample(self, data, sample_size=15000,
data = core.index(data)

blocked_sample_size = int(blocked_proportion * sample_size)
predicates = [pred for pred in predicateGenerator(self.data_model)
if pred.type == 'SimplePredicate']
predicates = list(predicateGenerator(self.data_model,
index_predicates=False))


data = sampling.randomDeque(data)
blocked_sample_keys = sampling.dedupeBlockedSample(blocked_sample_size,
@@ -973,8 +981,8 @@ def sample(self, data_1, data_2, sample_size=150000,
data_2 = core.index(data_2, offset)

blocked_sample_size = int(blocked_proportion * sample_size)
predicates = [pred for pred in predicateGenerator(self.data_model)
if pred.type == 'SimplePredicate']
predicates = list(predicateGenerator(self.data_model,
index_predicates=False))

data_1 = sampling.randomDeque(data_1)
data_2 = sampling.randomDeque(data_2)
@@ -1083,10 +1091,17 @@ class Gazetteer(RecordLink, GazetteerMatching):
class StaticGazetteer(StaticRecordLink, GazetteerMatching):
pass

def predicateGenerator(data_model) :
def predicateGenerator(data_model, index_predicates) :
predicates = set([])
for definition in data_model.primary_fields :
predicates.update(definition.predicates)
if not index_predicates :
filtered_predicates = []
for predicate in definition.predicates :
if not hasattr(predicate, 'index') :
filtered_predicates.append(predicate)
predicates.update(filtered_predicates)
else :
predicates.update(definition.predicates)

return predicates
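For readers skimming the diff, the new `predicateGenerator` filter keys off whether a predicate object exposes an `index` attribute. A compact sketch of the equivalent logic, for illustration only (not the committed code):

```python
def predicate_generator(data_model, index_predicates):
    # Collect predicates from every primary field; when index_predicates is
    # False, drop any predicate that needs an index (detected via an
    # 'index' attribute on the predicate object).
    predicates = set()
    for definition in data_model.primary_fields:
        for predicate in definition.predicates:
            if index_predicates or not hasattr(predicate, 'index'):
                predicates.add(predicate)
    return predicates
```

This is also why `sample` can now ask for `index_predicates=False` instead of filtering on `pred.type == 'SimplePredicate'`: the attribute check keeps any predicate that does not require an index, rather than only those of type 'SimplePredicate'.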

49 changes: 29 additions & 20 deletions docs/common_learning_methods.rst
@@ -35,35 +35,44 @@
deduper.markPairs(labeled_examples)
.. py:method:: train([ppc=1.0, [uncovered_dupes=1]])
.. py:method:: train([ppc=0.1, [uncovered_dupes=1, [index_predicates=True]]])
Learn final pairwise classifier and blocking rules. Requires that
adequate training data has already been provided.

:param float ppc: Limits the Proportion of Pairs Covered that we
allow a predicate to cover. If a predicate puts
together a fraction of possible pairs greater
than the ppc, that predicate will be removed
from consideration.
allow a predicate to cover. If a predicate puts
together a fraction of possible pairs greater
than the ppc, that predicate will be removed from
consideration.

As the size of the data increases, the user will
generally want to reduce ppc.
As the size of the data increases, the user will
generally want to reduce ppc.

ppc should be a value between 0.0 and 1.0
ppc should be a value between 0.0 and
1.0. Defaults to 0.1

:param int uncovered_dupes: The number of true dupe pairs in our
training data that we can accept will
not be put into any block. If true
duplicates are never in the same
block, we will never compare them, and
may never declare them to be
duplicates.

However, requiring that we cover every
single true dupe pair may mean that we
have to use blocks that put together
many, many distinct pairs that we'll
have to expensively compare as well.
training data that we can accept will
not be put into any block. If true
duplicates are never in the same block,
we will never compare them, and may
never declare them to be duplicates.

However, requiring that we cover every
single true dupe pair may mean that we
have to use blocks that put together
many, many distinct pairs that we'll
have to expensively compare as well.

Defaults to 1

:param bool index_predicates: Should dedupe consider predicates
that rely upon indexing the
data? Index predicates can be slower
and take substantial memory.

Defaults to True

.. code:: python
