-
Notifications
You must be signed in to change notification settings - Fork 215
Expand file tree
/
Copy pathcsv_evaluation.py
More file actions
46 lines (31 loc) · 1.17 KB
/
csv_evaluation.py
File metadata and controls
46 lines (31 loc) · 1.17 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
import collections
import csv
import itertools
def evaluateDuplicates(found_dupes, true_dupes):
    """Print precision and recall of predicted duplicate pairs.

    Writes six lines to stdout: the label ``found duplicate`` followed by the
    number of predicted pairs, the label ``precision`` followed by its value,
    and the label ``recall`` followed by its value.

    Args:
        found_dupes: Set of predicted duplicate pairs.
        true_dupes: Set of reference duplicate pairs, using the same element
            representation as ``found_dupes`` so the two can be intersected.

    Returns:
        None. The results are only printed.

    Note:
        When ``found_dupes`` (for precision) or ``true_dupes`` (for recall)
        is empty the ratio is undefined; it is reported as ``0.0`` rather
        than raising ``ZeroDivisionError``.
    """
    true_positives = found_dupes.intersection(true_dupes)
    false_positives = found_dupes.difference(true_dupes)

    # Precision keeps the original "1 - false-positive rate" form so that
    # printed values stay bit-for-bit the same for non-empty input.
    if found_dupes:
        precision = 1 - len(false_positives) / float(len(found_dupes))
    else:
        precision = 0.0

    if true_dupes:
        recall = len(true_positives) / float(len(true_dupes))
    else:
        recall = 0.0

    print("found duplicate")
    print(len(found_dupes))

    print("precision")
    print(precision)

    print("recall")
    print(recall)
def dupePairs(filename, rowname):
    """Return every pair of record Ids that share a value in one CSV column.

    The file is read with ``csv.DictReader``. Rows are grouped by the value
    found in column ``rowname``, and each group contributes all 2-element
    combinations of its ``"Id"`` values.

    Args:
        filename: Path of the CSV file to read. It must have a header row
            containing an ``Id`` column and the ``rowname`` column.
        rowname: Name of the column whose value identifies a cluster.

    Returns:
        A set of ``frozenset`` objects, each built from two ``Id`` strings
        that belong to the same cluster. Groups with a single row contribute
        nothing.

    Raises:
        KeyError: If a row lacks the ``rowname`` or ``Id`` column.
        OSError: If the file cannot be opened.
    """
    clusters = collections.defaultdict(list)

    # newline="" hands line-ending handling to the csv module, as its
    # documentation requires; otherwise line breaks embedded in quoted
    # fields are altered before the parser sees them.
    with open(filename, newline="") as f:
        reader = csv.DictReader(f, delimiter=",", quotechar='"')
        for row in reader:
            clusters[row[rowname]].append(row["Id"])

    # Rows whose cluster value is the literal "x" are excluded from pairing.
    # NOTE(review): presumably "x" marks records without a label - confirm
    # against the input data.
    clusters.pop("x", None)

    # combinations() yields nothing for a one-element group, so no explicit
    # size check is needed.
    return {
        frozenset(pair)
        for members in clusters.values()
        for pair in itertools.combinations(members, 2)
    }
# Script section: runs when the module is executed or imported.
# Both paths are relative, so they resolve against the current working
# directory.

# CSV whose "True Id" column is read as the reference grouping.
manual_clusters = "csv_example_input_with_true_ids.csv"
# CSV whose "Cluster ID" column is read as the grouping being evaluated.
dedupe_clusters = "csv_example_output.csv"

# Turn each file's grouping column into a set of Id pairs.
true_dupes = dupePairs(manual_clusters, "True Id")
test_dupes = dupePairs(dedupe_clusters, "Cluster ID")

# Print the pair count, precision and recall of the evaluated pairs
# against the reference pairs.
evaluateDuplicates(test_dupes, true_dupes)