Skip to content

Commit

Permalink
merge
Browse files Browse the repository at this point in the history
  • Loading branch information
fgregg committed Feb 15, 2015
1 parent 567eaef commit 5bdda6d
Show file tree
Hide file tree
Showing 10 changed files with 79 additions and 89 deletions.
10 changes: 5 additions & 5 deletions dedupe/blocking.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,10 +68,10 @@ def index(self, data, field):
self.stop_words[field])

for doc in data :
for _, index in indices :
index.index(doc)
for _, index, preprocess in indices :
index.index(preprocess(doc))

for index_type, index in indices :
for index_type, index, _ in indices :

index.initSearch()

Expand Down Expand Up @@ -102,10 +102,10 @@ def extractIndices(index_fields, stop_words=None) :
for index_type, predicates in index_fields.items() :
predicate = next(iter(predicates))
index = predicate.index
preprocess = predicate.preprocess
if predicate.index is None :
index = predicate.initIndex(stop_words)

indices.append((index_type, index))
indices.append((index_type, index, preprocess))

return indices

28 changes: 5 additions & 23 deletions dedupe/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,13 +57,12 @@ def apply(self, query_list, threshold, start=0, count=None):
class CanopyLexicon(Lexicon) : # pragma : no cover
def __init__(self, stop_words) :
super(CanopyLexicon, self).__init__()
self._pipeline = [Splitter(),
CustomStopWordRemover(stop_words)]
self._pipeline = [CustomStopWordRemover(stop_words)]

def sourceToWordIds(self, doc):
if doc is None:
doc = ''
last = stringify(doc) # this is changed line
def sourceToWordIds(self, last):
if last is None:
last = []
#last = stringify(doc) # this is changed line
for element in self._pipeline:
last = element.process(last)
if not isinstance(self.wordCount, Length):
Expand All @@ -80,20 +79,3 @@ def process(self, lst):
return [w for w in lst if not w in self.stop_words]


def stringify(doc) :
    """Normalize *doc* into a single-element list containing one string.

    If *doc* is already a string it is wrapped as-is. Otherwise *doc* is
    treated as an iterable of strings: each element's internal whitespace
    is collapsed to underscores and the pieces are space-joined, so a
    multi-valued field becomes one searchable token string.

    NOTE(review): `basestring` is a Python 2 builtin; this code predates
    Python 3 support.
    """
    if not isinstance(doc, basestring) :
        # e.g. ("new york", "ny") -> u"new_york ny"
        doc = u' '.join(u'_'.join(each.split()) for each in doc)

    return [doc]



class Splitter(object):
    """Tokenizer: extracts word-like tokens from every string in a list."""

    # unicode-aware pattern for word tokens (with optional '*'/'?' wildcards)
    rx = re.compile(r"(?u)\w+[\w*?]*")

    def process(self, lst):
        """Return a flat list of all tokens found across the strings in *lst*."""
        tokens = []
        for text in lst:
            tokens.extend(self.rx.findall(text))

        return tokens
20 changes: 0 additions & 20 deletions dedupe/metric_tree.py

This file was deleted.

65 changes: 45 additions & 20 deletions dedupe/predicates.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,38 +59,54 @@ def __setstate__(self, d) :
self.index = None

class TfidfPredicate(IndexPredicate):
    """Base class for canopy blocking predicates backed by a TF-IDF index."""

    type = "TfidfPredicate"

    def initIndex(self, stop_words) :
        # Build the TF-IDF index shared by all predicates on this field,
        # excluding the supplied stop words.
        return tfidf.TfIdfIndex(stop_words)

class TfidfTextPredicate(TfidfPredicate) :
    """TF-IDF canopy predicate over free text, tokenized into words.

    The diff residue interleaved the old one-argument ``index.search``
    call and a duplicate ``initIndex`` (now defined once on the parent
    ``TfidfPredicate``); this keeps only the post-commit behavior.
    """

    type = "TfidfTextPredicate"

    # unicode-aware word tokens (with optional '*'/'?' wildcards)
    rx = re.compile(r"(?u)\w+[\w*?]*")

    def preprocess(self, doc) :
        """Tokenize free text into a tuple of word tokens."""
        return tuple(self.rx.findall(doc))

    def __call__(self, record) :
        # Block keys are the canopy centers within `threshold` of the
        # tokenized field value.
        centers = self.index.search(self.preprocess(record[self.field]),
                                    self.threshold)

        l_unicode = unicode  # NOTE(review): Python 2 builtin
        return [l_unicode(center) for center in centers]

class LevenshteinPredicate(IndexPredicate) :
type = "LevenshteinPredicate"

def __init__(self, threshold, field):
super(LevenshteinPredicate, self).__init__(threshold, field)
class TfidfSetPredicate(TfidfPredicate) :
type = "TfidfSetPredicate"

self.transitions = metric_tree.transitions(threshold)
def preprocess(self, doc) :
return doc

def __call__(self, record) :

centers = self.index.search(record[self.field],
self.transitions,
self.threshold)

l_unicode = unicode
return [l_unicode(center) for center in centers]

class TfidfNGramPredicate(TfidfPredicate) :
    """TF-IDF canopy predicate over character bigrams of the field."""

    type = "TfidfNGramPredicate"

    def preprocess(self, doc) :
        # Strip spaces, then index on the field's character 2-grams.
        return tuple(ngrams(doc.replace(' ', ''), 2))

    def __call__(self, record) :
        # Block keys are the canopy centers within `threshold` of the
        # bigram representation of the field value.
        centers = self.index.search(self.preprocess(record[self.field]),
                                    self.threshold)

        l_unicode = unicode  # NOTE(review): Python 2 builtin
        return [l_unicode(center) for center in centers]

def initIndex(self, *args) :
return metric_tree.LevenshteinIndex()

class CompoundPredicate(Predicate) :
type = "CompoundPredicate"
Expand Down Expand Up @@ -171,22 +187,31 @@ def commonThreeTokens(field) :
return ngramsTokens(field.split(), 3)

def fingerprint(field) :
    """Return a 1-tuple block key: the field's words, sorted and concatenated.

    An empty (falsy) field yields () so it produces no block key at all;
    the stale unconditional return from the pre-commit diff line is dropped.
    """
    if field :
        return (u''.join(sorted(field.split())).strip(),)
    else :
        return ()

def oneGramFingerprint(field) :
    """Return a 1-tuple block key: the field's unique characters, sorted.

    Spaces are removed first; `set(...)` deduplicates the 1-grams so
    repeated characters contribute once. An empty field yields () so it
    produces no block key; the stale pre-commit return line is dropped.
    """
    if field :
        return (u''.join(sorted(set(ngrams(field.replace(' ', ''), 1)))).strip(),)
    else :
        return ()

def twoGramFingerprint(field) :
    """Return a 1-tuple block key: the field's unique character bigrams, sorted.

    Spaces are removed first; `set(...)` deduplicates the 2-grams. A field
    shorter than two characters has no bigrams and yields () so it produces
    no block key; the stale pre-commit return lines are dropped.
    """
    if len(field) > 1 :
        return (u''.join(sorted(gram.strip() for gram
                                in set(ngrams(field.replace(' ', ''), 2)))),)
    else :
        return ()

def commonFourGram(field):
    """Return the set of unique character 4-grams of *field*, spaces removed.

    The stale pre-commit return (plain ngrams, no `set`) is dropped.
    """
    return set(ngrams(field.replace(' ', ''), 4))

def commonSixGram(field):
    """Return the set of unique character 6-grams of *field*, spaces removed.

    The stale pre-commit return (plain ngrams, no `set`) is dropped.
    """
    return set(ngrams(field.replace(' ', ''), 6))

def sameThreeCharStartPredicate(field):
"""return first three characters"""
Expand Down
13 changes: 6 additions & 7 deletions dedupe/training.py
Original file line number Diff line number Diff line change
Expand Up @@ -324,19 +324,18 @@ def predicateCoverage(self,

def compoundPredicates(self) :
    """Add pairwise conjunctions of simple predicates to ``self.overlap``.

    A compound predicate covers exactly the record pairs covered by every
    one of its component predicates, i.e. the intersection of their
    coverage sets. The diff residue interleaved removed lines
    (``product = itertools.product``, a commented-out 3-way chain, the old
    two-argument intersection, and a stray ``i = 0``); this keeps only the
    post-commit body.
    """
    intersection = set.intersection

    compound_predicates = itertools.combinations(self.overlap, 2)

    for compound_predicate in compound_predicates :
        compound_predicate = predicates.CompoundPredicate(compound_predicate)

        self.overlap[compound_predicate] = \
            intersection(*[self.overlap[pred]
                           for pred in compound_predicate])


class DedupeCoverage(Coverage) :
Expand Down
2 changes: 1 addition & 1 deletion dedupe/variables/set.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ class SetType(FieldType) :
def __init__(self, definition) :
super(SetType, self).__init__(definition)

canopy_predicates = [predicates.TfidfPredicate(threshold,
canopy_predicates = [predicates.TfidfSetPredicate(threshold,
self.field)
for threshold in self._canopy_thresholds]

Expand Down
12 changes: 8 additions & 4 deletions dedupe/variables/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,11 +35,15 @@ class StringType(ShortStringType) :
def __init__(self, definition) :
    """Set up string-field predicates, adding TF-IDF canopy predicates.

    Post-commit version: the generic ``TfidfPredicate`` canopy list (and
    its ``self.predicates += canopy_predicates``) interleaved here by the
    diff residue was replaced by word-token and character-bigram variants,
    one per canopy threshold.
    """
    super(StringType, self).__init__(definition)

    # word-token TF-IDF canopies
    self.predicates += [predicates.TfidfTextPredicate(threshold,
                                                      self.field)
                        for threshold in self._canopy_thresholds]

    # character-bigram TF-IDF canopies
    self.predicates += [predicates.TfidfNGramPredicate(threshold,
                                                       self.field)
                        for threshold in self._canopy_thresholds]

class TextType(StringType) :
type = "Text"
Expand Down
4 changes: 0 additions & 4 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,8 @@
'categorical-distance',
'rlr',
'affinegap',
<<<<<<< HEAD
'canonicalize',
'simplecosine',
=======
'finenight',
>>>>>>> metric_tree
'haversine',
'BTrees==4.0.8',
'zope.interface',
Expand Down
6 changes: 3 additions & 3 deletions src/cpredicates.pyx
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# cython: c_string_type=unicode, c_string_encoding=utf8

cpdef set ngrams(basestring field, int n):
cpdef list ngrams(basestring field, int n):
"""ngrams returns all unique, contiguous sequences of n characters
of a given field.
Expand All @@ -14,12 +14,12 @@ cpdef set ngrams(basestring field, int n):
"""
cdef unicode ufield = _ustring(field)

cdef set grams = set([])
cdef list grams = []
cdef int i, j
cdef int n_char = len(ufield)
for i in range(n_char):
for j in range(i+n, min(n_char, i+n)+1):
grams.add(ufield[i:j])
grams.append(ufield[i:j])

return grams

Expand Down
8 changes: 6 additions & 2 deletions tests/test_dedupe.py
Original file line number Diff line number Diff line change
Expand Up @@ -301,10 +301,14 @@ def test_predicates_correctness(self):
assert dedupe.predicates.commonThreeElementsPredicate((1,)) == set([])

assert dedupe.predicates.fingerprint('time sandwich') == (u'sandwichtime',)
assert dedupe.predicates.fingerprint('') == ()
assert dedupe.predicates.oneGramFingerprint('sandwich time') == (u'acdehimnstw',)
assert dedupe.predicates.oneGramFingerprint('') == ()
assert dedupe.predicates.twoGramFingerprint('sandwich time') == (u'anchdwhticimmendsatiwi',)


assert dedupe.predicates.twoGramFingerprint('1') == ()
assert dedupe.predicates.commonTwoTokens('foo bar') == set([u'foo bar'])
assert dedupe.predicates.commonTwoTokens('foo') == set([])



if __name__ == "__main__":
Expand Down

0 comments on commit 5bdda6d

Please sign in to comment.