In [None]:
from tqdm.auto import tqdm
import itertools
import random
import pickle as pkl
import pandas as pd

## Load Dutch UMLS

In [None]:
# Load the filtered Dutch UMLS export; later cells read its 'cui' and 'name' columns.
df = pd.read_csv(
    filepath_or_buffer='umls-dutch_v1.11_with_drugs_filtered-categories.csv',
)

In [None]:
# Serialise every row as "<cui>||<name>", the line format that later cells split
# apart again.
# Iterating over the two columns directly avoids DataFrame.iterrows(), which
# builds a full Series object for every row just to read two fields.
# NOTE(review): a missing name (NaN) is formatted as the literal text "nan";
# confirm the CSV contains no empty names.
cleaned = [f"{cui}||{name}" for cui, name in zip(df['cui'], df['name'])]

In [None]:
# Preview the first 30 "cui||name" lines as a quick sanity check.
cleaned[:30]

## Create mappings: idx2cui and cui2string dicts

In [None]:
# idx2cui:    position of a line in `cleaned` -> the CUI on that line.
# cui2string: CUI -> name. A CUI that occurs on several lines keeps only the name
#             from its last line, because later assignments overwrite earlier ones.
idx2cui = {}
cui2string = {}

for i, line in enumerate(cleaned):
    splitted = line.split('||')
    idx2cui[i] = splitted[0]
    cui2string[splitted[0]] = splitted[1]

# Persist both mappings. The context managers guarantee each file is flushed and
# closed as soon as it is written, instead of leaving the handle open until the
# garbage collector gets to it.
with open('idx2cui', "wb") as idx2cui_file:
    pkl.dump(idx2cui, idx2cui_file)
with open('cui2string', "wb") as cui2string_file:
    pkl.dump(cui2string, cui2string_file)

## Positive pair generation

In [None]:
# Group the names by concept: {cui: [name, name, ...]}, in the order the lines
# appear in `cleaned`.
umls_dict = {}
for line in tqdm(cleaned):
    cui, name = line.split("||")
    # setdefault creates the list the first time a CUI is seen, then appends to it.
    umls_dict.setdefault(cui, []).append(name)

### Generate the pairs

In [None]:
def gen_pairs(input_list):
    """Return every 2-element combination of ``input_list`` as a list of tuples.

    The pairs follow the order produced by ``itertools.combinations``: each item
    is paired with every item that comes after it. Fewer than two items gives an
    empty list.
    """
    pairs = []
    pairs.extend(itertools.combinations(input_list, 2))
    return pairs

In [None]:
# Build the positive training lines "<cui>||<name_a>||<name_b>" from every pair of
# names that share a CUI.
MAX_PAIRS_PER_CUI = 50   # at most this many pairs are kept per CUI
PAIR_SAMPLING_SEED = 42  # fixed seed so a fresh "Restart & Run All" reproduces the same file

# A dedicated generator makes the down-sampling reproducible without touching
# the global `random` state.
pair_rng = random.Random(PAIR_SAMPLING_SEED)

pos_pairs = []
for k, v in tqdm(umls_dict.items()):
    pairs = gen_pairs(v)
    if len(pairs) > MAX_PAIRS_PER_CUI:
        # Too many pairs for this CUI: keep a random subset of the allowed size.
        pairs = pair_rng.sample(pairs, MAX_PAIRS_PER_CUI)
    for p in pairs:
        pos_pairs.append(str(k) + "||" + p[0] + "||" + p[1])

In [None]:
# Preview the first 100 "cui||name||name" lines as a quick sanity check.
pos_pairs[:100]

### save the pairwise positive training file

In [None]:
# Write one positive pair per line. The encoding is pinned to UTF-8 so names with
# non-ASCII characters are written identically on every platform instead of
# depending on the locale's default encoding.
with open('./training_file_umls2023aa_nl_no_dup_pairwise_pair_th50_filtered-categories.txt', 'w', encoding='utf-8') as f:
    for line in pos_pairs:
        f.write("%s\n" % line)