Merge pull request #370 from datamade/python3
initial pass at python3 compatibility close #368
fgregg committed Mar 10, 2015
2 parents d478d84 + a88b04b commit 385807f
Showing 32 changed files with 268 additions and 224 deletions.
.travis.yml (2 changes: 1 addition & 1 deletion)
@@ -10,7 +10,7 @@ notifications:
   irc: chat.freenode.net#dedupe
 python:
 - '2.7'
-- '2.6'
+- '3.4'
 install:
 - pip install -r requirements.txt
 - pip install coveralls
CHANGELOG.md (17 changes: 16 additions & 1 deletion)
@@ -1,6 +1,21 @@
-## Unreleased
+## 0.8.0
+Support for Python 3.4 added. Support for Python 2.6 dropped.
+
+Features
+- Windows OS supported
+- train method has argument for not considering index predicates
+- TfIDFNGram Index Predicate added (for shorter strings)
+- SuffixArray Predicate
+- Double Metaphone Predicates
+- Predicates for numbers, OrderOfMagnitude, Round (see the sketch below)
+- Set Predicate OrderOfCardinality
+- Final, learned predicates list will now often be smaller without loss of coverage
+- Variables refactored to support external extensions like https://github.com/datamade/dedupe-variable-address
+- Categorical distance, regularized logistic regression, affine gap distance, and canonicalization have been turned into separate libraries
+- Simplejson is now a dependency
+
 ## 0.7.5
 Features
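To make the new numeric predicates concrete, here is a minimal sketch of what order-of-magnitude and rounding block keys might compute. The function names and the tuple-of-strings return convention are illustrative assumptions, not dedupe's actual implementation:

```python
import math

def order_of_magnitude(value):
    # Illustrative: records whose values share a power of ten
    # (e.g. 4200 and 9100 are both ~10**3) get the same block key.
    value = float(value)
    if value == 0:
        return ('0',)
    return (str(int(math.floor(math.log10(abs(value))))),)

def rounded(value):
    # Illustrative: block on the value rounded to the nearest ten.
    return (str(round(float(value), -1)),)

print(order_of_magnitude(4200), order_of_magnitude(9100))  # ('3',) ('3',)
print(rounded(4192))  # ('4190.0',)
```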
dedupe/__init__.py (10 changes: 5 additions & 5 deletions)
@@ -8,8 +8,8 @@
     'crossvalidation',
 ]

-from api import StaticDedupe, Dedupe
-from api import StaticRecordLink, RecordLink
-from api import StaticGazetteer, Gazetteer
-from core import randomPairs, randomPairsMatch, frozendict
-from convenience import consoleLabel, trainingDataDedupe, trainingDataLink, canonicalize
+from .api import StaticDedupe, Dedupe
+from .api import StaticRecordLink, RecordLink
+from .api import StaticGazetteer, Gazetteer
+from .core import randomPairs, randomPairsMatch, frozendict
+from .convenience import consoleLabel, trainingDataDedupe, trainingDataLink, canonicalize
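Python 3 removed implicit relative imports (PEP 328), so `from api import StaticDedupe` inside the package no longer resolves to the sibling module `dedupe/api.py`. The explicit `from .api import ...` form works on both 2.7 and 3.x. A sketch of the idiom in a hypothetical two-file package:

```python
# pkg/util.py -- hypothetical sibling module
def helper():
    return 42

# pkg/__init__.py
# Python 2 only; on Python 3 this raises ImportError because the bare
# name is looked up on sys.path, not inside the package:
#   from util import helper
# Explicit relative import, valid on Python 2.7 and 3.x alike:
from .util import helper
```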
dedupe/api.py (33 changes: 17 additions & 16 deletions)
@@ -4,6 +4,8 @@
 dedupe provides the main user interface for the library the
 Dedupe class
 """
+from __future__ import print_function
+from future.utils import viewitems, viewvalues

 import itertools
 import logging
@@ -14,11 +16,9 @@
 import warnings
 import copy
 import os
-from collections import defaultdict
+from collections import defaultdict, OrderedDict
 import simplejson as json

-from dedupe.backport import OrderedDict
-
 import dedupe
 import dedupe.sampling as sampling
 import dedupe.core as core
@@ -32,6 +32,7 @@

 logger = logging.getLogger(__name__)

+
 class Matching(object):
     """
     Base Class for Record Matching Classes
@@ -233,7 +234,7 @@ def _blockedPairs(self, blocks) :
         block, blocks = core.peek(blocks)
         self._checkBlock(block)

-        combinations = itertools.combinations
+        combinations = itertools.combinations

         pairs = (combinations(block, 2) for block in blocks)

@@ -267,12 +268,12 @@ def _blockData(self, data_d):
         for field in self.blocker.index_fields :
             unique_fields = set(record[field]
                                 for record
-                                in data_d.itervalues()
+                                in viewvalues(data_d)
                                 if record[field])

             self.blocker.index(unique_fields, field)

-        for block_key, record_id in self.blocker(data_d.iteritems()) :
+        for block_key, record_id in self.blocker(viewitems(data_d)) :
             blocks[block_key][record_id] = data_d[record_id]

         self.blocker.resetIndices()
@@ -296,15 +297,15 @@ def _redundantFree(self, blocks) :

         for block_id, records in enumerate(blocks) :

-            for record_id, record in records.iteritems() :
+            for record_id, record in viewitems(records) :
                 coverage[record_id].append(block_id)

         for block_id, records in enumerate(blocks) :
             if block_id % 10000 == 0 :
                 logger.info("%s blocks" % block_id)

             marked_records = []
-            for record_id, record in records.iteritems() :
+            for record_id, record in viewitems(records) :
                 smaller_ids = set([covered_id for covered_id
                                    in coverage[record_id]
                                    if covered_id < block_id])
@@ -381,7 +382,7 @@ def threshold(self, data_1, data_2, recall_weight = 1.5) : # pragma : no cover
         recall. I.e. if you care twice as much about
         recall as you do precision, set recall_weight
         to 2.
-        """
+        """

         blocked_pairs = self._blockData(data_1, data_2)
         return self.thresholdBlocks(blocked_pairs, recall_weight)
@@ -398,7 +399,7 @@ def _blockedPairs(self, blocks) :
         block, blocks = core.peek(blocks)
         self._checkBlock(block)

-        product = itertools.product
+        product = itertools.product

         pairs = (product(base, target) for base, target in blocks)

@@ -424,7 +425,7 @@ def _checkBlock(self, block) :
             self._checkRecordType(target[0][1])

     def _blockGenerator(self, messy_data, blocked_records) :
-        block_groups = itertools.groupby(self.blocker(messy_data.iteritems()),
+        block_groups = itertools.groupby(self.blocker(viewitems(messy_data)),
                                          lambda x : x[1])

         for i, (record_id, block_keys) in enumerate(block_groups) :
@@ -454,7 +455,7 @@ def _blockData(self, data_1, data_2) :
         for field in self.blocker.index_fields :
             fields_2 = (record[field]
                         for record
-                        in data_2.itervalues())
+                        in viewvalues(data_2))

             self.blocker.index(set(fields_2), field)

@@ -512,7 +513,7 @@ def __init__(self,
                              "the current version of dedupe. This can happen "
                              "if you have recently upgraded dedupe.")
         except :
-            print "Something has gone wrong with loading the settings file"
+            print("Something has gone wrong with loading the settings file")
             raise


@@ -1033,7 +1034,7 @@ def index(self, data) : # pragma : no cover
         for field in self.blocker.index_fields :
             self.blocker.index((record[field]
                                 for record
-                                in data.itervalues()),
+                                in viewvalues(data)),
                                field)

         for block_key, record_id in self.blocker(data.items()) :
@@ -1046,10 +1047,10 @@ def unindex(self, data) : # pragma : no cover
         for field in self.blocker.index_fields :
             self.blocker.unindex((record[field]
                                   for record
-                                  in data.itervalues()),
+                                  in viewvalues(data)),
                                  field)

-        for block_key, record_id in self.blocker(data.iteritems()) :
+        for block_key, record_id in self.blocker(viewitems(data)) :
             try :
                 del self.blocked_records[block_key][record_id]
             except KeyError :
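The recurring substitution in this file is `future.utils.viewitems`/`viewvalues`, which dispatch to `dict.viewitems()`/`viewvalues()` on Python 2 and to plain `dict.items()`/`values()` on Python 3, so iteration never materializes an intermediate list on either version. A small illustration (the `future` package must be installed):

```python
from future.utils import viewitems, viewvalues

data_d = {1: {'name': 'Bob'}, 2: {'name': 'Sue'}}

# On Python 2 this is data_d.viewitems(); on Python 3, data_d.items().
# Either way it is a lazy view, not a copied list like Py2's .items().
for record_id, record in viewitems(data_d):
    print(record_id, record['name'])

names = set(record['name'] for record in viewvalues(data_d))
print(names)  # {'Bob', 'Sue'}
```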
dedupe/backport.py (19 changes: 10 additions & 9 deletions)
@@ -3,11 +3,15 @@
 import threading
 import warnings
 import platform
+import sys

+from future.utils import viewitems
+from future.builtins import range
+
 MULTIPROCESSING = True
 # Deal with Mac OS X issue
 config_info = str([value for key, value in
-                   numpy.__config__.__dict__.iteritems()
+                   viewitems(numpy.__config__.__dict__)
                    if key.endswith("_info")]).lower()

 if "accelerate" in config_info or "veclib" in config_info :
@@ -21,19 +25,16 @@

 if MULTIPROCESSING :
     from multiprocessing import Process, Pool, Queue
-    from multiprocessing.queues import SimpleQueue
+    if sys.version < '3':
+        from multiprocessing.queues import SimpleQueue
+    else :
+        from multiprocessing import SimpleQueue
 else :
     if not hasattr(threading.current_thread(), "_children"):
         threading.current_thread()._children = weakref.WeakKeyDictionary()
     from multiprocessing.dummy import Process, Pool, Queue
     SimpleQueue = Queue

-try :
-    from collections import OrderedDict
-except ImportError :
-    from ordereddict import OrderedDict
-

 def cartesian(arrays, out=None):
     """Generate a cartesian product of input arrays.
@@ -82,7 +83,7 @@ def cartesian(arrays, out=None):
     out[:, 0] = numpy.repeat(arrays[0], m)
     if arrays[1:]:
         cartesian(arrays[1:], out=out[0:m, 1:])
-        for j in xrange(1, arrays[0].size):
+        for j in range(1, arrays[0].size):
             out[j * m:(j + 1) * m, 1:] = out[0:m, 1:]
     return out
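Two things happen in backport.py: the `ordereddict` fallback can go because `collections.OrderedDict` ships with Python 2.7+ (and 2.6 support was dropped), and `SimpleQueue` moved from `multiprocessing.queues` into the top-level `multiprocessing` namespace in Python 3. Note that `sys.version < '3'` is a string comparison that happens to work here but is fragile in general; a sketch of the same dispatch with the more robust `sys.version_info` tuple:

```python
import sys

# SimpleQueue lives in multiprocessing.queues on Python 2 but is
# exposed directly from multiprocessing on Python 3.
if sys.version_info[0] < 3:
    from multiprocessing.queues import SimpleQueue
else:
    from multiprocessing import SimpleQueue

q = SimpleQueue()
q.put('ping')
print(q.get())  # ping
```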

dedupe/blocking.py (2 changes: 1 addition & 1 deletion)
@@ -59,7 +59,7 @@ def __call__(self, records):
     def resetIndices(self) :
         # clear canopies to reduce memory usage
         for index_type in self.index_fields.values() :
-            for predicate in index_type.values()[0] :
+            for predicate in list(index_type.values())[0] :
                 predicate.index = None

     def index(self, data, field):
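The `list(...)` wrapper is needed because `dict.values()` returns an indexable list on Python 2 but a non-indexable view on Python 3. A quick illustration with hypothetical field and predicate names:

```python
index_fields = {'name': {'TfidfIndex': ['pred_a', 'pred_b']}}

for index_type in index_fields.values():
    # On Python 2, index_type.values() is a list and supports [0];
    # on Python 3 it is a view, so materialize it before indexing.
    first_predicates = list(index_type.values())[0]
    print(first_predicates)  # ['pred_a', 'pred_b']
```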
dedupe/clustering.py (15 changes: 8 additions & 7 deletions)
@@ -1,5 +1,6 @@
 #!/usr/bin/python
 # -*- coding: utf-8 -*-
+from future.utils import viewitems

 import itertools

@@ -54,10 +55,10 @@ def connected_components(edgelist, max_components) :
             indices[root_a].append(i)

     for root in component :
-        n_components = len(component[root])
-        sub_graph = edgelist[indices[root]]
-
-        if n_components > max_components :
+        n_components = len(component[root])
+        sub_graph = edgelist[indices[root]]
+        if n_components > max_components :
             threshold = numpy.min(sub_graph['score'])
             threshold *= 1.1
             warnings.warn('A component contained %s elements. '
@@ -109,8 +110,8 @@ def condensedDistance(dupes):
     index = matrix_length - row_step + col - row - 1

     condensed_distances = numpy.ones(matrix_length, 'f4')
-    condensed_distances[index] = 1 - dupes['score']
+    condensed_distances[index.astype(int)] = 1 - dupes['score']


     return i_to_id, condensed_distances, N

@@ -150,7 +151,7 @@ def cluster(dupes, threshold=.5, max_components=30000):
     for (i, sub_cluster_id) in enumerate(partition):
         clusters.setdefault(cluster_id + sub_cluster_id, []).append(i)

-    for cluster_id, items in clusters.iteritems() :
+    for cluster_id, items in viewitems(clusters) :
         if len(items) > 1 :
             scores = confidences(items, condensed_distances, N)
             clustering[cluster_id] =\
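The `.astype(int)` cast guards against Python 3's true division: `/` between integer arrays now yields floats, and numpy rejects float arrays as indices. A sketch of the failure mode with illustrative numbers (`N` and `row` stand in for the values computed in `condensedDistance`):

```python
import numpy as np

N = 4
row = np.array([0, 1])
row_step = (N - row) * (N - row - 1) / 2  # float64 under Python 3
print(row_step)  # [ 6.  3.]

condensed = np.ones(10, 'f4')
# condensed[row_step] = 0.5  # IndexError: arrays used as indices
#                            # must be of integer (or boolean) type
condensed[row_step.astype(int)] = 0.5
print(condensed)
```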
dedupe/convenience.py (15 changes: 9 additions & 6 deletions)
@@ -1,5 +1,8 @@
 #!/usr/bin/python
 # -*- coding: utf-8 -*-
+from __future__ import print_function
+from builtins import input
+
 import collections
 import itertools
 import random
@@ -30,13 +33,13 @@ def consoleLabel(deduper): # pragma : no cover
         for field in set(field[0] for field
                          in deduper.data_model.field_comparators) :
             line = "%s : %s" % (field, pair[field])
-            print line
-        print
+            print(line)
+        print()

-        print 'Do these records refer to the same thing?'
+        print('Do these records refer to the same thing?')
         valid_response = False
         while not valid_response:
-            label = raw_input('(y)es / (n)o / (u)nsure / (f)inished\n')
+            label = input('(y)es / (n)o / (u)nsure / (f)inished\n')
             if label in ['y', 'n', 'u', 'f']:
                 valid_response = True
@@ -47,10 +50,10 @@
                 labels['distinct'].append(record_pair)
                 labeled = True
             elif label == 'f':
-                print 'Finished labeling'
+                print('Finished labeling')
                 finished = True
             elif label != 'u':
-                print 'Nonvalid response'
+                print('Nonvalid response')
                 raise

         if labeled :
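`raw_input` was renamed to `input` in Python 3 (Python 2's own `input()` dangerously eval'd whatever the user typed). The `builtins.input` import, provided by the future package on Python 2, gives both versions the safe Python 3 semantics. A minimal sketch:

```python
from __future__ import print_function
from builtins import input  # raw_input on Py2, the builtin input on Py3

label = input('(y)es / (n)o / (u)nsure / (f)inished\n')  # always a string
if label == 'y':
    print('recorded as a match')
```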
