Skip to content

Commit

Permalink
Merge branch 'remove_coverage_estimator'
Browse files Browse the repository at this point in the history
  • Loading branch information
fgregg committed Jun 9, 2021
2 parents b8f48c7 + b2e6b5b commit e22c269
Show file tree
Hide file tree
Showing 2 changed files with 12 additions and 16 deletions.
10 changes: 5 additions & 5 deletions dedupe/clustering.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,12 +182,11 @@ def condensedDistance(dupes: numpy.ndarray) -> Tuple[Dict[int, RecordID],
col = ids[:, 1]

N = len(candidate_set)
matrix_length = N * (N - 1) // 2

row_step = (N - row) * (N - row - 1) // 2
index = matrix_length - row_step + col - row - 1
# alternate form thanks to Wolfram Alpha
index = row * (2 * N - row - 3) // 2 + col - 1

condensed_distances = numpy.ones(matrix_length, 'f4')
condensed_distances = numpy.ones(N * (N-1) // 2, 'f4')
condensed_distances[index] = 1 - dupes['score']

return i_to_id, condensed_distances, N
Expand Down Expand Up @@ -249,8 +248,9 @@ def confidences(cluster: Sequence[int],
'''
scores_d = dict.fromkeys(cluster, 0.0)
squared_distances = condensed_distances ** 2
C = 2 * d - 3
for i, j in itertools.combinations(cluster, 2):
index = d * (d - 1) // 2 - (d - i) * (d - i - 1) // 2 + j - i - 1
index = i * (C - i) // 2 + j - 1
squared_dist = squared_distances[index]
scores_d[i] += squared_dist
scores_d[j] += squared_dist
Expand Down
18 changes: 7 additions & 11 deletions dedupe/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,20 +54,19 @@ def randomPairs(n_records: int, sample_size: int) -> IndicesIterator:
n: int = n_records * (n_records - 1) // 2

if sample_size >= n:
random_pairs = numpy.arange(n, dtype='uint')
random_pairs = numpy.arange(n)
else:
try:
random_pairs = numpy.array(random.sample(range(n), sample_size),
dtype='uint')
random_pairs = numpy.array(random.sample(range(n), sample_size))
except OverflowError:
return randomPairsWithReplacement(n_records, sample_size)

b: int = 1 - 2 * n_records

root = (-b - 2 * numpy.sqrt(2 * (n - random_pairs) + 0.25)) // 2
i = (-b - 2 * numpy.sqrt(2 * (n - random_pairs) + 0.25)) // 2
i = i.astype(int)

i = root.astype('uint')
j = (random_pairs + i * (b + i + 2) // 2 + 1).astype('uint')
j = random_pairs + i * (b + i + 2) // 2 + 1

return zip(i, j)

Expand Down Expand Up @@ -402,14 +401,11 @@ def Enumerator(start: int = 0, initial: tuple = ()) -> collections.defaultdict:

class DiagonalEnumerator(object):
    """Map a pair ``(x, y)`` with ``x < y`` to a flat integer index.

    The pairs of ``range(N)`` are numbered row by row over the strict
    upper triangle of an ``N x N`` grid: ``(0, 1) -> 0``,
    ``(0, 2) -> 1``, ..., ``(N - 2, N - 1) -> N * (N - 1) // 2 - 1``.

    No validation is performed; the result is only meaningful when
    ``0 <= x < y < N``.
    """

    def __init__(self, N: int):
        # Kept so existing readers of ``.N`` continue to work.
        self.N = N
        # Constant term of the closed-form index, hoisted out of lookups.
        self.C = 2 * N - 3

    def __getitem__(self, pair: Tuple[int, int]) -> int:
        """Return the flat index of ``pair`` in the upper-triangle order."""
        x, y = pair
        # Row x starts at offset x * (2N - x - 1) / 2 and holds columns
        # x + 1 .. N - 1; simplifying gives x * (C - x) / 2 + y - 1.
        # x * (C - x) is always even (one factor is even), so floor
        # division is exact.
        return x * (self.C - x) // 2 + y - 1


class FullEnumerator(object):
Expand Down

0 comments on commit e22c269

Please sign in to comment.