Skip to content

Commit

Permalink
Merge pull request #369 from datamade/none-check
Browse files (browse the repository at this point in the history)
None check
  • Loading branch information
fgregg committed Mar 3, 2015
2 parents d36e11f + bc5ae92 commit acd081c
Show file tree
Hide file tree
Showing 6 changed files with 40 additions and 66 deletions.
84 changes: 36 additions & 48 deletions dedupe/predicates.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,17 @@ def __init__(self, func, field) :

def __call__(self, record) :
column = record[self.field]
return self.func(column)
if column :
return self.func(column)
else :
return ()

class ExistsPredicate(SimplePredicate) :
def __call__(self, record) :
if record[self.field] :
return ('1',)
else :
return ('0',)

class IndexPredicate(Predicate) :
def __init__(self, threshold, field):
Expand All @@ -63,16 +73,20 @@ def initIndex(self, stop_words) :
return tfidf.TfIdfIndex(stop_words)

def __call__(self, record) :
column = record[self.field]
if column :
try :
centers = self.index.search(self.preprocess(column),
self.threshold)

try :
centers = self.index.search(self.preprocess(record[self.field]),
self.threshold)
l_unicode = unicode
return [l_unicode(center) for center in centers]
except :
raise AttributeError("Attempting to block with an index "
"predicate without indexing records")

l_unicode = unicode
return [l_unicode(center) for center in centers]
except :
raise AttributeError("Attempting to block with an index "
"predicate without indexing records")
else :
return ()

class TfidfTextPredicate(TfidfPredicate) :
type = "TfidfTextPredicate"
Expand Down Expand Up @@ -117,11 +131,7 @@ def __call__(self, record) :

def wholeFieldPredicate(field):
"""return the whole field"""

if field:
return (unicode(field), )
else:
return ()
return (unicode(field), )

def tokenFieldPredicate(field):
"""returns the tokens"""
Expand Down Expand Up @@ -173,16 +183,10 @@ def commonThreeTokens(field) :
return ngramsTokens(field.split(), 3)

def fingerprint(field) :
if field :
return (u''.join(sorted(field.split())).strip(),)
else :
return ()
return (u''.join(sorted(field.split())).strip(),)

def oneGramFingerprint(field) :
if field :
return (u''.join(sorted(set(ngrams(field.replace(' ', ''), 1)))).strip(),)
else :
return ()
return (u''.join(sorted(set(ngrams(field.replace(' ', ''), 1)))).strip(),)

def twoGramFingerprint(field) :
if len(field) > 1 :
Expand Down Expand Up @@ -243,17 +247,11 @@ def existsPredicate(field) :
return (u'0',)

def wholeSetPredicate(field_set):
if field_set :
return (unicode(field_set),)
else :
return ()
return (unicode(field_set),)

def commonSetElementPredicate(field_set):
"""return set as individual elements"""
if field_set :
return tuple([unicode(each) for each in field_set])
else :
return ()
return tuple([unicode(each) for each in field_set])

def commonTwoElementsPredicate(field) :
l = sorted(field)
Expand All @@ -264,20 +262,13 @@ def commonThreeElementsPredicate(field) :
return ngramsTokens(l, 3)

def lastSetElementPredicate(field_set) :
if field_set :
return (unicode(max(field_set)), )
return ()
return (unicode(max(field_set)), )

def firstSetElementPredicate(field_set) :
if field_set :
return (unicode(min(field_set)), )
return ()
return (unicode(min(field_set)), )

def magnitudeOfCardinality(field_set) :
if field_set :
return orderOfMagnitude(len(field_set))
else :
return ()
return orderOfMagnitude(len(field_set))

def latLongGridPredicate(field, digits=1):
"""
Expand All @@ -295,17 +286,14 @@ def latLongGridPredicate(field, digits=1):
return ()

def orderOfMagnitude(field) :
if field and field > 0 :
if field > 0 :
return (unicode(int(round(math.log10(field)))), )
else :
return ()

def roundTo1(field) : # thanks http://stackoverflow.com/questions/3410976/how-to-round-a-number-to-significant-figures-in-python
if field :
abs_num = abs(field)
order = int(math.floor(math.log10(abs_num)))
rounded = round(abs_num, -order)
return (unicode(int(math.copysign(rounded, field))),)
else :
return ()
abs_num = abs(field)
order = int(math.floor(math.log10(abs_num)))
rounded = round(abs_num, -order)
return (unicode(int(math.copysign(rounded, field))),)

4 changes: 1 addition & 3 deletions dedupe/variables/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ def __init__(self, definition) :
if definition.get('has missing', False) :
self.has_missing = True
try :
self._predicate_functions += (predicates.existsPredicate,)
self.predicates += [predicates.ExistsPredicate]
except AttributeError :
pass
else :
Expand Down Expand Up @@ -83,8 +83,6 @@ def __init__(self, definition) :
self.comparator.__name__)




def allSubclasses(cls) :
field_classes = {}
for q in cls.__subclasses__() :
Expand Down
2 changes: 1 addition & 1 deletion dedupe/variables/exists.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

class ExistsType(CategoricalType) :
type = "Exists"
_predicate_functions = [predicates.existsPredicate]
_predicate_functions = []

def __init__(self, definition) :

Expand Down
5 changes: 2 additions & 3 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

install_requires=['numpy>=1.9',
'fastcluster',
'hcluster>=0.3.0',
'dedupe-hcluster',
'categorical-distance',
'rlr',
'metafone',
Expand All @@ -30,12 +30,11 @@
setup(
name='dedupe',
url='https://github.com/datamade/dedupe',
version='0.7.7.1.3',
version='0.7.7.1.6',
description='A python library for accurate and scaleable data deduplication and entity-resolution',
packages=['dedupe', 'dedupe.variables'],
ext_modules=[Extension('dedupe.cpredicates', ['src/cpredicates.c'])],
license='The MIT License: http://www.opensource.org/licenses/mit-license.php',
dependency_links = ['http://github.com/datamade/hcluster/tarball/master#egg=hcluster-0.3.0'],
install_requires=install_requires,
classifiers=[
'Development Status :: 3 - Alpha',
Expand Down
3 changes: 0 additions & 3 deletions tests/test_dedupe.py
Original file line number Diff line number Diff line change
Expand Up @@ -272,7 +272,6 @@ def test_predicates_correctness(self):
assert dedupe.predicates.existsPredicate(1) == ('1',)
assert dedupe.predicates.existsPredicate(0) == ('0',)
assert dedupe.predicates.sortedAcronym(field) == ('11s',)
assert dedupe.predicates.wholeFieldPredicate('') == ()
assert dedupe.predicates.wholeFieldPredicate(field) == ('123 16th st',)
assert dedupe.predicates.firstTokenPredicate(field) == ('123',)
assert dedupe.predicates.firstTokenPredicate('') == ()
Expand Down Expand Up @@ -301,9 +300,7 @@ def test_predicates_correctness(self):
assert dedupe.predicates.commonThreeElementsPredicate((1,)) == set([])

assert dedupe.predicates.fingerprint('time sandwich') == (u'sandwichtime',)
assert dedupe.predicates.fingerprint('') == ()
assert dedupe.predicates.oneGramFingerprint('sandwich time') == (u'acdehimnstw',)
assert dedupe.predicates.oneGramFingerprint('') == ()
assert dedupe.predicates.twoGramFingerprint('sandwich time') == (u'anchdwhticimmendsatiwi',)
assert dedupe.predicates.twoGramFingerprint('1') == ()
assert dedupe.predicates.commonTwoTokens('foo bar') == set([u'foo bar'])
Expand Down
8 changes: 0 additions & 8 deletions tests/test_predicates.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,6 @@ def test_full_set(self):
block_val = predicates.wholeSetPredicate(self.s1)
self.assertEqual(block_val, (unicode(self.s1),))

def test_empty_set(self):
block_val = predicates.wholeSetPredicate(set())
self.assertEqual(block_val, tuple())

class TestSetElement(unittest.TestCase):
def setUp(self):
self.s1 = set(['red', 'blue', 'green'])
Expand All @@ -35,10 +31,6 @@ def test_first_last(self) :
assert block_val == ('red',)
block_val = predicates.firstSetElementPredicate(self.s1)
assert block_val == ('blue',)
block_val = predicates.firstSetElementPredicate(set([]))
assert block_val == ()
block_val = predicates.lastSetElementPredicate(set([]))
assert block_val == ()

def test_magnitude(self) :
block_val = predicates.magnitudeOfCardinality(self.s1)
Expand Down

0 comments on commit acd081c

Please sign in to comment.