In [17]:
import polars as pl
import jellyfish as jf
from collections import defaultdict
from itertools import combinations

In [7]:
def get_soundex_encoding(w):
    if w is None or w == "":
        return ""
    return jf.soundex(w.replace(r'\d+', ''))

In [8]:
ds = pl.read_csv("../dataset.csv", infer_schema_length=1000000)
ds.head(10)

_id,given_name,surname,suburb,postcode
str,str,str,str,str
"""0#0""","""annie""","""johnson""","""warrenton""","""27589"""
"""0#1""","""erin""","""pugh""","""asheboro""","""27205"""
"""0#2""","""lauren""","""toledo""","""chapel hill""","""27514"""
"""0#3""","""georgetta""","""atkinson""","""goldsboro""","""27530"""
"""0#4""","""lynn""","""kitchens""","""greensboro""","""27408"""
"""0#5""","""jacqueline""","""lacewell""","""riegelwood""","""28456"""
"""0#6""","""mary""","""martin""","""charlotte""","""28215"""
"""0#7""","""gerald""","""clontz""","""midland""","""28107"""
"""0#8""","""marie""","""roach""","""reidsville""","""27320"""
"""0#9""","""jennifer""","""hall""","""wilson""","""27896"""


In [9]:
ds = ds.with_columns(
    pl.col("given_name").apply(get_soundex_encoding).alias("given_name_soundex"),
    pl.col("surname").apply(get_soundex_encoding).alias("surname_soundex"),
    pl.col("suburb").apply(get_soundex_encoding).alias("suburb_soundex")
)
ds.head(10)

_id,given_name,surname,suburb,postcode,given_name_soundex,surname_soundex,suburb_soundex
str,str,str,str,str,str,str,str
"""0#0""","""annie""","""johnson""","""warrenton""","""27589""","""A500""","""J525""","""W653"""
"""0#1""","""erin""","""pugh""","""asheboro""","""27205""","""E650""","""P200""","""A216"""
"""0#2""","""lauren""","""toledo""","""chapel hill""","""27514""","""L650""","""T430""","""C144"""
"""0#3""","""georgetta""","""atkinson""","""goldsboro""","""27530""","""G623""","""A325""","""G432"""
"""0#4""","""lynn""","""kitchens""","""greensboro""","""27408""","""L500""","""K325""","""G652"""
"""0#5""","""jacqueline""","""lacewell""","""riegelwood""","""28456""","""J245""","""L240""","""R243"""
"""0#6""","""mary""","""martin""","""charlotte""","""28215""","""M600""","""M635""","""C643"""
"""0#7""","""gerald""","""clontz""","""midland""","""28107""","""G643""","""C453""","""M345"""
"""0#8""","""marie""","""roach""","""reidsville""","""27320""","""M600""","""R200""","""R321"""
"""0#9""","""jennifer""","""hall""","""wilson""","""27896""","""J516""","""H400""","""W425"""


In [11]:
ds = ds.with_columns((ds["given_name_soundex"] + ds["surname_soundex"] + ds["suburb_soundex"] ).alias("name_soundex"))
ds.head(10)

_id,given_name,surname,suburb,postcode,given_name_soundex,surname_soundex,suburb_soundex,name_soundex
str,str,str,str,str,str,str,str,str
"""0#0""","""annie""","""johnson""","""warrenton""","""27589""","""A500""","""J525""","""W653""","""A500J525W653"""
"""0#1""","""erin""","""pugh""","""asheboro""","""27205""","""E650""","""P200""","""A216""","""E650P200A216"""
"""0#2""","""lauren""","""toledo""","""chapel hill""","""27514""","""L650""","""T430""","""C144""","""L650T430C144"""
"""0#3""","""georgetta""","""atkinson""","""goldsboro""","""27530""","""G623""","""A325""","""G432""","""G623A325G432"""
"""0#4""","""lynn""","""kitchens""","""greensboro""","""27408""","""L500""","""K325""","""G652""","""L500K325G652"""
"""0#5""","""jacqueline""","""lacewell""","""riegelwood""","""28456""","""J245""","""L240""","""R243""","""J245L240R243"""
"""0#6""","""mary""","""martin""","""charlotte""","""28215""","""M600""","""M635""","""C643""","""M600M635C643"""
"""0#7""","""gerald""","""clontz""","""midland""","""28107""","""G643""","""C453""","""M345""","""G643C453M345"""
"""0#8""","""marie""","""roach""","""reidsville""","""27320""","""M600""","""R200""","""R321""","""M600R200R321"""
"""0#9""","""jennifer""","""hall""","""wilson""","""27896""","""J516""","""H400""","""W425""","""J516H400W425"""


In [12]:
records = ds.select(["_id", "name_soundex"]).rows()
print(len(records))
records[0]

10000000


('0#0', 'A500J525W653')

In [15]:
grouped_dict = defaultdict(list)
for _id, name_soundex in records:
    grouped_dict[name_soundex].append(_id)
grouped_dict = dict(grouped_dict)
len(grouped_dict)

4995075

In [16]:
clusters = list(grouped_dict.values())
len(clusters)

4995075

In [18]:
record_pairs = [list(combinations(c, 2)) if len(c) > 1 else [] for c in clusters]
len(record_pairs)

4995075

In [19]:
candidates = {p if p[0] <= p[1] else (p[1], p[0]) for c in record_pairs for p in c if p[0][0] != p[1][0]}
len(candidates)

26977252

In [21]:
matches = pl.read_csv("../matches.csv")
match_tuples = [tuple(row) for row in matches.to_numpy()]
match_set = {t if t[0] < t[1] else (t[1], t[0]) for t in match_tuples}
len(match_set)

14995973

In [22]:
found = candidates.intersection(match_set)
tp = len(found)
tp

14995973

In [23]:
not_found = match_set.difference(candidates)
fn = len(not_found)
fn

0

In [24]:
recall = tp / len(match_set)
recall

1.0

In [25]:
cand_df = pl.DataFrame(list(candidates), schema=["l_id", "r_id"])
cand_df.head(10)

l_id,r_id
str,str
"""5#58321""","""9#747623"""
"""5#386059""","""9#941570"""
"""2#353659""","""4#161705"""
"""3#430671""","""9#430195"""
"""3#59672""","""6#232391"""
"""1#964027""","""3#126407"""
"""0#214429""","""4#557650"""
"""2#223462""","""8#69191"""
"""7#289547""","""9#114177"""
"""2#725324""","""3#796902"""


In [26]:
cand_df.write_csv("../blockers/candidates_soundex.csv")