Skip to content

Commit

Permalink
merge
Browse files Browse the repository at this point in the history
  • Loading branch information
fgregg committed Feb 15, 2015
1 parent 567eaef commit 5bdda6d
Show file tree
Hide file tree
Showing 10 changed files with 79 additions and 89 deletions.
10 changes: 5 additions & 5 deletions dedupe/blocking.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,10 +68,10 @@ def index(self, data, field):
self.stop_words[field])

for doc in data :
for _, index in indices :
index.index(doc)
for _, index, preprocess in indices :
index.index(preprocess(doc))

for index_type, index in indices :
for index_type, index, _ in indices :

index.initSearch()

Expand Down Expand Up @@ -102,10 +102,10 @@ def extractIndices(index_fields, stop_words=None) :
for index_type, predicates in index_fields.items() :
predicate = next(iter(predicates))
index = predicate.index
preprocess = predicate.preprocess
if predicate.index is None :
index = predicate.initIndex(stop_words)

indices.append((index_type, index))
indices.append((index_type, index, preprocess))

return indices

28 changes: 5 additions & 23 deletions dedupe/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,13 +57,12 @@ def apply(self, query_list, threshold, start=0, count=None):
class CanopyLexicon(Lexicon) : # pragma : no cover
def __init__(self, stop_words) :
super(CanopyLexicon, self).__init__()
self._pipeline = [Splitter(),
CustomStopWordRemover(stop_words)]
self._pipeline = [CustomStopWordRemover(stop_words)]

def sourceToWordIds(self, doc):
if doc is None:
doc = ''
last = stringify(doc) # this is changed line
def sourceToWordIds(self, last):
if last is None:
last = []
#last = stringify(doc) # this is changed line
for element in self._pipeline:
last = element.process(last)
if not isinstance(self.wordCount, Length):
Expand All @@ -80,20 +79,3 @@ def process(self, lst):
return [w for w in lst if not w in self.stop_words]


def stringify(doc) :
    """Normalize *doc* into a single-element list containing one string.

    If *doc* is already a string it is wrapped as-is. Otherwise *doc* is
    treated as an iterable of strings: each element's internal whitespace
    is collapsed to underscores and the pieces are space-joined, so a
    multi-valued field becomes one searchable token string.

    NOTE(review): `basestring` is a Python 2 builtin; this code predates
    Python 3 support.
    """
    if not isinstance(doc, basestring) :
        # e.g. ("new york", "ny") -> u"new_york ny"
        doc = u' '.join(u'_'.join(each.split()) for each in doc)

    return [doc]



class Splitter(object):
    """Tokenizer: extracts word-like tokens from every string in a list."""

    # unicode-aware pattern for word tokens (with optional '*'/'?' wildcards)
    rx = re.compile(r"(?u)\w+[\w*?]*")

    def process(self, lst):
        """Return a flat list of all tokens found across the strings in *lst*."""
        tokens = []
        for text in lst:
            tokens.extend(self.rx.findall(text))

        return tokens
20 changes: 0 additions & 20 deletions dedupe/metric_tree.py

This file was deleted.

65 changes: 45 additions & 20 deletions dedupe/predicates.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,38 +59,54 @@ def __setstate__(self, d) :
self.index = None

class TfidfPredicate(IndexPredicate):
    """Base class for canopy blocking predicates backed by a TF-IDF index."""

    type = "TfidfPredicate"

    def initIndex(self, stop_words) :
        # Build the TF-IDF index shared by all predicates on this field,
        # excluding the supplied stop words.
        return tfidf.TfIdfIndex(stop_words)

class TfidfTextPredicate(TfidfPredicate) :
    """TF-IDF canopy predicate over free text, tokenized into words.

    The diff residue interleaved the old one-argument ``index.search``
    call and a duplicate ``initIndex`` (now defined once on the parent
    ``TfidfPredicate``); this keeps only the post-commit behavior.
    """

    type = "TfidfTextPredicate"

    # unicode-aware word tokens (with optional '*'/'?' wildcards)
    rx = re.compile(r"(?u)\w+[\w*?]*")

    def preprocess(self, doc) :
        """Tokenize free text into a tuple of word tokens."""
        return tuple(self.rx.findall(doc))

    def __call__(self, record) :
        # Block keys are the canopy centers within `threshold` of the
        # tokenized field value.
        centers = self.index.search(self.preprocess(record[self.field]),
                                    self.threshold)

        l_unicode = unicode  # NOTE(review): Python 2 builtin
        return [l_unicode(center) for center in centers]

class LevenshteinPredicate(IndexPredicate) :
type = "LevenshteinPredicate"

def __init__(self, threshold, field):
super(LevenshteinPredicate, self).__init__(threshold, field)
class TfidfSetPredicate(TfidfPredicate) :
type = "TfidfSetPredicate"

self.transitions = metric_tree.transitions(threshold)
def preprocess(self, doc) :
return doc

def __call__(self, record) :

centers = self.index.search(record[self.field],
self.transitions,
self.threshold)

l_unicode = unicode
return [l_unicode(center) for center in centers]

class TfidfNGramPredicate(TfidfPredicate) :
    """TF-IDF canopy predicate over character bigrams of the field."""

    type = "TfidfNGramPredicate"

    def preprocess(self, doc) :
        # Strip spaces, then index on the field's character 2-grams.
        return tuple(ngrams(doc.replace(' ', ''), 2))

    def __call__(self, record) :
        # Block keys are the canopy centers within `threshold` of the
        # bigram representation of the field value.
        centers = self.index.search(self.preprocess(record[self.field]),
                                    self.threshold)

        l_unicode = unicode  # NOTE(review): Python 2 builtin
        return [l_unicode(center) for center in centers]

def initIndex(self, *args) :
return metric_tree.LevenshteinIndex()

class CompoundPredicate(Predicate) :
type = "CompoundPredicate"
Expand Down Expand Up @@ -171,22 +187,31 @@ def commonThreeTokens(field) :
return ngramsTokens(field.split(), 3)

def fingerprint(field) :
    """Return a 1-tuple block key: the field's words, sorted and concatenated.

    An empty (falsy) field yields () so it produces no block key at all;
    the stale unconditional return from the pre-commit diff line is dropped.
    """
    if field :
        return (u''.join(sorted(field.split())).strip(),)
    else :
        return ()

def oneGramFingerprint(field) :
    """Return a 1-tuple block key: the field's unique characters, sorted.

    Spaces are removed first; `set(...)` deduplicates the 1-grams so
    repeated characters contribute once. An empty field yields () so it
    produces no block key; the stale pre-commit return line is dropped.
    """
    if field :
        return (u''.join(sorted(set(ngrams(field.replace(' ', ''), 1)))).strip(),)
    else :
        return ()

def twoGramFingerprint(field) :
    """Return a 1-tuple block key: the field's unique character bigrams, sorted.

    Spaces are removed first; `set(...)` deduplicates the 2-grams. A field
    shorter than two characters has no bigrams and yields () so it produces
    no block key; the stale pre-commit return lines are dropped.
    """
    if len(field) > 1 :
        return (u''.join(sorted(gram.strip() for gram
                                in set(ngrams(field.replace(' ', ''), 2)))),)
    else :
        return ()

def commonFourGram(field):
    """Return the set of unique character 4-grams of *field*, spaces removed.

    The stale pre-commit return (plain ngrams, no `set`) is dropped.
    """
    return set(ngrams(field.replace(' ', ''), 4))

def commonSixGram(field):
    """Return the set of unique character 6-grams of *field*, spaces removed.

    The stale pre-commit return (plain ngrams, no `set`) is dropped.
    """
    return set(ngrams(field.replace(' ', ''), 6))

def sameThreeCharStartPredicate(field):
"""return first three characters"""
Expand Down
13 changes: 6 additions & 7 deletions dedupe/training.py
Original file line number Diff line number Diff line change
Expand Up @@ -324,19 +324,18 @@ def predicateCoverage(self,

def compoundPredicates(self) :
    """Add pairwise conjunctions of simple predicates to ``self.overlap``.

    A compound predicate covers exactly the record pairs covered by every
    one of its component predicates, i.e. the intersection of their
    coverage sets. The diff residue interleaved removed lines
    (``product = itertools.product``, a commented-out 3-way chain, the old
    two-argument intersection, and a stray ``i = 0``); this keeps only the
    post-commit body.
    """
    intersection = set.intersection

    compound_predicates = itertools.combinations(self.overlap, 2)

    for compound_predicate in compound_predicates :
        compound_predicate = predicates.CompoundPredicate(compound_predicate)

        self.overlap[compound_predicate] = \
            intersection(*[self.overlap[pred]
                           for pred in compound_predicate])


class DedupeCoverage(Coverage) :
Expand Down
2 changes: 1 addition & 1 deletion dedupe/variables/set.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ class SetType(FieldType) :
def __init__(self, definition) :
super(SetType, self).__init__(definition)

canopy_predicates = [predicates.TfidfPredicate(threshold,
canopy_predicates = [predicates.TfidfSetPredicate(threshold,
self.field)
for threshold in self._canopy_thresholds]

Expand Down
12 changes: 8 additions & 4 deletions dedupe/variables/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,11 +35,15 @@ class StringType(ShortStringType) :
def __init__(self, definition) :
    """Set up string-field predicates, adding TF-IDF canopy predicates.

    Post-commit version: the generic ``TfidfPredicate`` canopy list (and
    its ``self.predicates += canopy_predicates``) interleaved here by the
    diff residue was replaced by word-token and character-bigram variants,
    one per canopy threshold.
    """
    super(StringType, self).__init__(definition)

    # word-token TF-IDF canopies
    self.predicates += [predicates.TfidfTextPredicate(threshold,
                                                      self.field)
                        for threshold in self._canopy_thresholds]

    # character-bigram TF-IDF canopies
    self.predicates += [predicates.TfidfNGramPredicate(threshold,
                                                       self.field)
                        for threshold in self._canopy_thresholds]

class TextType(StringType) :
type = "Text"
Expand Down
4 changes: 0 additions & 4 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,8 @@
'categorical-distance',
'rlr',
'affinegap',
<<<<<<< HEAD
'canonicalize',
'simplecosine',
=======
'finenight',
>>>>>>> metric_tree
'haversine',
'BTrees==4.0.8',
'zope.interface',
Expand Down
6 changes: 3 additions & 3 deletions src/cpredicates.pyx
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# cython: c_string_type=unicode, c_string_encoding=utf8

cpdef set ngrams(basestring field, int n):
cpdef list ngrams(basestring field, int n):
"""ngrams returns all unique, contiguous sequences of n characters
of a given field.
Expand All @@ -14,12 +14,12 @@ cpdef set ngrams(basestring field, int n):
"""
cdef unicode ufield = _ustring(field)

cdef set grams = set([])
cdef list grams = []
cdef int i, j
cdef int n_char = len(ufield)
for i in range(n_char):
for j in range(i+n, min(n_char, i+n)+1):
grams.add(ufield[i:j])
grams.append(ufield[i:j])

return grams

Expand Down
8 changes: 6 additions & 2 deletions tests/test_dedupe.py
Original file line number Diff line number Diff line change
Expand Up @@ -301,10 +301,14 @@ def test_predicates_correctness(self):
assert dedupe.predicates.commonThreeElementsPredicate((1,)) == set([])

assert dedupe.predicates.fingerprint('time sandwich') == (u'sandwichtime',)
assert dedupe.predicates.fingerprint('') == ()
assert dedupe.predicates.oneGramFingerprint('sandwich time') == (u'acdehimnstw',)
assert dedupe.predicates.oneGramFingerprint('') == ()
assert dedupe.predicates.twoGramFingerprint('sandwich time') == (u'anchdwhticimmendsatiwi',)


assert dedupe.predicates.twoGramFingerprint('1') == ()
assert dedupe.predicates.commonTwoTokens('foo bar') == set([u'foo bar'])
assert dedupe.predicates.commonTwoTokens('foo') == set([])



if __name__ == "__main__":
Expand Down

0 comments on commit 5bdda6d

Please sign in to comment.