In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# Open the WeRelate similar-names file.
# For each line, and for each (name, variant) pair on it, find the more-frequent name.
# Map each more-frequent name to its less-frequent variants.

In [6]:
from collections import defaultdict

In [7]:
def load_name_freqs(filename):
    """Load a name -> frequency table from a tab-separated text file.

    Each non-blank line must hold at least two tab-separated fields: the name
    and an integer count. Any further fields are ignored. Blank or
    whitespace-only lines (e.g. a trailing empty line) are skipped instead of
    raising ``IndexError``.

    Args:
        filename: Path of the UTF-8 encoded file to read.

    Returns:
        A ``defaultdict(int)`` mapping each name to its count, so looking up
        a name that is not in the file yields 0.

    Raises:
        IndexError: If a non-blank line has fewer than two fields.
        ValueError: If the second field is not an integer.
    """
    name_freqs = defaultdict(int)
    with open(filename, mode="r", encoding="utf-8") as f:
        for line in f:
            stripped = line.rstrip()
            if not stripped:
                # Nothing on this line; without this guard fields[1] would fail.
                continue
            fields = stripped.split("\t")
            # If a name occurs more than once, the last occurrence wins.
            name_freqs[fields[0]] = int(fields[1])
    return name_freqs

In [8]:
# Map each name to its frequency, using load_name_freqs on tab-separated
# "name<TAB>count" files: one table for given names and one for surnames.
given_name_freqs = load_name_freqs('../data/given-final.normal.txt')
surname_freqs = load_name_freqs('../data/surname-final.normal.txt')

In [39]:
# Minimum frequency a name or variant needs in order to be kept.
# load_werelate_pairs reads this as a global and drops anything below it.
min_freq = 5

In [40]:
def load_werelate_pairs(filename, name_freqs):
    """Read a WeRelate similar-names TSV and group variants by the more frequent name.

    The first line of the file is treated as a header and skipped. Every other
    line is split on tabs: field 1 is a name, field 2 is a space-separated
    list of variants (called "confirmed" in this code) and, when the line has
    exactly three fields, field 3 is another space-separated list of variants
    (called "computer" here).

    A name or variant whose frequency is below the module-level ``min_freq``
    is dropped. For each remaining (name, variant) pair, the strictly more
    frequent of the two becomes the key and the other is added to its set; on
    a tie the variant becomes the key.

    Frequencies are read with ``name_freqs.get(..., 0)``, so unknown names
    count as 0 without inserting entries into a ``defaultdict`` passed by the
    caller, and a plain ``dict`` works as well.

    Args:
        filename: Path of the UTF-8 encoded TSV file to read.
        name_freqs: Mapping from name to integer frequency.

    Returns:
        A ``defaultdict(set)`` mapping each more-frequent name to the set of
        its less-frequent (or equally frequent) variants.
    """
    tree_variants = defaultdict(set)
    with open(filename, mode="r", encoding="utf-8") as f:
        # Skip the header row; the default keeps an empty file from raising.
        next(f, None)
        for line in f:
            fields = line.rstrip().split("\t")
            name = fields[0].strip()
            if not name or len(fields) < 2:
                continue
            name_freq = name_freqs.get(name, 0)
            if name_freq < min_freq:
                continue
            confirmed_variants = fields[1].strip().split(" ")
            computer_variants = fields[2].strip().split(" ") if len(fields) == 3 else []
            for variant in confirmed_variants + computer_variants:
                if not variant:
                    # Consecutive spaces produce empty strings; ignore them.
                    continue
                variant_freq = name_freqs.get(variant, 0)
                if variant_freq < min_freq:
                    continue
                # The strictly more frequent side is the key; ties go to the variant.
                if name_freq > variant_freq:
                    tree_name, record_name = name, variant
                else:
                    tree_name, record_name = variant, name
                tree_variants[tree_name].add(record_name)
    return tree_variants

In [41]:
# Build the given-name and surname variant maps, reporting the size of each.
given_variants = load_werelate_pairs('../data/givenname_similar_names.werelate.20210414.tsv', given_name_freqs)
print(len(given_variants))
surname_variants = load_werelate_pairs('../data/surname_similar_names.werelate.20210414.tsv', surname_freqs)
print(len(surname_variants))

# Merge both maps; a name present in both gets the union of its variant sets.
all_variants = defaultdict(set)
for variant_map in (given_variants, surname_variants):
    for tree_name, record_names in variant_map.items():
        all_variants[tree_name].update(record_names)
print(len(all_variants))

37252
126889
147255


In [42]:
def save_werelate_pairs(filename, variants):
    """Write every (name, variant) pair as one row of a CSV file.

    The file starts with the header ``,name1,name2,co_occurrence,count1,count2``.
    Each row holds a running 0-based index, the name, one of its variants, and
    three literal ``0`` columns. Names are emitted in the mapping's iteration
    order and each name's variants in sorted order, so the same input always
    produces the same file (iterating a set of strings directly can change
    order from one interpreter run to the next).

    Values are written verbatim with no CSV quoting.
    NOTE(review): this assumes names never contain commas or newlines; confirm
    against the input data.

    Args:
        filename: Path of the CSV file to create or overwrite (UTF-8).
        variants: Mapping from a name to an iterable of its variant names.
    """
    with open(filename, mode="w", encoding="utf-8") as f:
        f.write(",name1,name2,co_occurrence,count1,count2\n")
        pairs = (
            (name, variant)
            for name, name_variants in variants.items()
            for variant in sorted(name_variants)
        )
        for ix, (name, variant) in enumerate(pairs):
            f.write(f'{ix},{name},{variant},0,0,0\n')

In [43]:
# Write the merged given-name and surname variant pairs to a CSV file.
save_werelate_pairs('../data/werelate_pairs.csv', all_variants)