Skip to content

Commit

Permalink
use numpy to do filtering
Browse files (browse the repository at this point in the history)
  • Loading branch information
fgregg committed Apr 16, 2014
1 parent f33a748 commit 37a006f
Showing 1 changed file with 10 additions and 26 deletions.
36 changes: 10 additions & 26 deletions dedupe/core.py
Expand Up @@ -181,39 +181,23 @@ def __call__(self, chunk_queue, scored_pairs_queue) :
scored_pairs_queue.put(scored_pairs)

# NOTE(review): this span is a rendered commit diff whose +/- markers and
# indentation were lost in extraction. Lines from the old and the new version
# of scoreRecords are interleaved, so it is not runnable as written. Which
# lines are "old" vs "new" below is inferred from the commit summary
# ("10 additions and 26 deletions") -- confirm against the real diff.
#
# Purpose (both versions): score every (record_1, record_2) pair in
# record_pairs and return a structured numpy array of dtype self.dtype
# holding the pair ids and their scores, filtered by self.threshold.
def scoreRecords(self, record_pairs) :
# presumably old version: ids collected in a Python list for the izip below
ids = []

num_records = len(record_pairs)

# presumably new version: preallocate the structured result array.
# numpy.empty does not initialise its contents, so every slot must be
# written before it is read.
scored_pairs = numpy.empty(num_records,
dtype = self.dtype)

# Generator that yields the two record bodies (index 1 of each element) for
# distance computation while recording the two record ids (index 0) as a
# side effect.
def split_records() :
# presumably old loop header
for pair in record_pairs :
# presumably new loop header: the index i is needed to write into scored_pairs
for i, pair in enumerate(record_pairs) :
record_1, record_2 = pair
# presumably old: remember the id pair in the list
ids.append((record_1[0], record_2[0]))
# presumably new: write the id pair straight into the result array.
# NOTE(review): this happens only as the generator is consumed, so the
# 'pairs' field is complete only if the consumer exhausts split_records().
scored_pairs['pairs'][i] = (record_1[0], record_2[0])
yield (record_1[1], record_2[1])

# presumably old: scores kept as a separate sequence, zipped with ids below
scores = scorePairs(fieldDistances(split_records(),
self.data_model,
num_records),
self.data_model)

# presumably old filtering: a truthy threshold keeps pairs with
# score > threshold; a falsy threshold (e.g. 0) keeps all num_records pairs.
if self.threshold :
threshold = self.threshold
filtered_scores = ((pair_id, score)
for pair_id, score
in itertools.izip(ids, scores)
if score > threshold)
scored_pairs = numpy.fromiter(filtered_scores,
dtype=self.dtype)

else :
filtered_scores = ((pair_id, score)
for pair_id, score
in itertools.izip(ids, scores))

scored_pairs = numpy.fromiter(filtered_scores,
dtype=self.dtype,
count=num_records)
# presumably new: assign all scores into the 'score' field in one step
scored_pairs['score'] = scorePairs(fieldDistances(split_records(),
self.data_model,
num_records),
self.data_model)

# presumably new filtering: a boolean mask applied unconditionally with a
# strict '>'.
# NOTE(review): unlike the if/else above, a falsy threshold no longer keeps
# every pair -- with threshold 0, pairs scoring exactly 0 (or below) are
# dropped. Confirm that this behaviour change is intended.
scored_pairs = scored_pairs[scored_pairs['score'] > self.threshold]
return scored_pairs

def scoreDuplicates(records, data_model, num_processes, threshold=0):
Expand Down

0 comments on commit 37a006f

Please sign in to comment.