In [111]:
import json
from pathlib import Path
from typing import NamedTuple
import itertools

In [112]:
datapath = Path("raw_outputs.json")
data = json.loads(datapath.read_text())

In [113]:
class CheckLocation(NamedTuple):
    filename: str
    start_row: int
    end_row:int

profiles: dict[str,set[NamedTuple]] = {}
for datum in data:
    code = datum['code']
    if code not in profiles:
        profiles[code] = set()
    profiles[code].add(
        CheckLocation(
            filename=datum['filename'],
            start_row=datum['location']['row'],
            end_row=datum['end_location']['row']
        )
    )

In [114]:
def iou(set0:set,set1:set)->float:
    ints = set0.intersection(set1)
    un = set0.union(set1)
    if un:
        return round(len(ints)/len(un),3)*100
    else:
        return 0.0

In [115]:
ious_by_pair = {
    (x,y):iou(profiles[x],profiles[y])
    for x,y in itertools.combinations(profiles,2)
}

In [116]:
candidates = sorted([
    (a,v) for a,v in ious_by_pair.items()
], key=lambda x:x[1], reverse=True)

In [118]:
matches = []
for code in profiles:
    for other_code in profiles:
        if code!=other_code and profiles[code] == profiles[other_code]:
            matches.append((code,other_code))

In [119]:
matches

[]

In [120]:
candidates[:10]

[(('D400', 'D415'), 94.19999999999999),
 (('INP001', 'D100'), 73.1),
 (('TD003', 'FIX002'), 71.8),
 (('F509', 'PLE1300'), 66.7),
 (('TD001', 'FIX001'), 64.7),
 (('ANN201', 'D103'), 63.9),
 (('PTH101', 'S103'), 63.6),
 (('TD003', 'TD002'), 60.3),
 (('G004', 'TRY401'), 53.300000000000004),
 (('FIX002', 'TD002'), 50.7)]