From 545433b980c92d40e1e35781f25433df626c4a02 Mon Sep 17 00:00:00 2001 From: Jeff Hendricks Date: Thu, 19 Aug 2021 20:15:49 -0600 Subject: [PATCH 1/2] pre-compute squared_distances --- dedupe/clustering.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dedupe/clustering.py b/dedupe/clustering.py index 4986d403f..0108dc1a7 100644 --- a/dedupe/clustering.py +++ b/dedupe/clustering.py @@ -212,6 +212,7 @@ def cluster(dupes: numpy.ndarray, if len(sub_graph) > 1: i_to_id, condensed_distances, N = condensedDistance(sub_graph) + squared_distances = condensed_distances**2 linkage = fastcluster.linkage(condensed_distances, method='centroid', @@ -228,7 +229,7 @@ def cluster(dupes: numpy.ndarray, for cluster in clusters.values(): if len(cluster) > 1: - scores = confidences(cluster, condensed_distances, N) + scores = confidences(cluster, squared_distances, N) yield tuple(i_to_id[i] for i in cluster), scores else: @@ -238,7 +239,7 @@ def cluster(dupes: numpy.ndarray, def confidences(cluster: Sequence[int], - condensed_distances: numpy.ndarray, + squared_distances: numpy.ndarray, d: int) -> numpy.ndarray: ''' We calculate a per record score that is similar to a standard @@ -247,7 +248,6 @@ def confidences(cluster: Sequence[int], which is a reasonable metric for clusters. ''' scores_d = dict.fromkeys(cluster, 0.0) - squared_distances = condensed_distances ** 2 C = 2 * d - 3 for i, j in itertools.combinations(cluster, 2): index = i * (C - i) // 2 + j - 1 From 476536bbf6fbfcb3fa437673b050b63ead8df5e0 Mon Sep 17 00:00:00 2001 From: Jeff Hendricks Date: Mon, 23 Aug 2021 12:25:17 -0600 Subject: [PATCH 2/2] Move squared_distances closer to where it's used --- dedupe/clustering.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dedupe/clustering.py b/dedupe/clustering.py index 0108dc1a7..33d6ae61e 100644 --- a/dedupe/clustering.py +++ b/dedupe/clustering.py @@ -212,7 +212,6 @@ def cluster(dupes: numpy.ndarray, if len(sub_graph) > 1: i_to_id, condensed_distances, N = condensedDistance(sub_graph) - squared_distances = condensed_distances**2 linkage = fastcluster.linkage(condensed_distances, method='centroid', @@ -227,6 +226,7 @@ def cluster(dupes: numpy.ndarray, for i, cluster_id in enumerate(partition): clusters[cluster_id].append(i) + squared_distances = condensed_distances**2 for cluster in clusters.values(): if len(cluster) > 1: scores = confidences(cluster, squared_distances, N)