In [2]:
import ast
import itertools
import pickle as pkl
import random

import pandas as pd
from tqdm.auto import tqdm

## Load Dutch UMLS

In [3]:
# Read dataset.csv (path relative to the working directory) into `df`.
df = pd.read_csv('dataset.csv')

In [4]:
def _last_cui(raw_cuis):
    """Return the last CUI of one `cui` cell, or None when the cell holds no CUI.

    A cell may be a stringified list such as "['C0027066', 'C1854302']" (what
    `read_csv` yields for this column) or an actual list/tuple. Strings are
    parsed with `ast.literal_eval` rather than sliced, so no stray quote
    characters leak into the result.
    """
    if isinstance(raw_cuis, str):
        raw_cuis = ast.literal_eval(raw_cuis)
    if isinstance(raw_cuis, (list, tuple)) and raw_cuis:
        return raw_cuis[-1]
    return None


# `Series.apply` requires a callable; calling it with no arguments raises TypeError.
# NOTE(review): the saved output keeps the last CUI for multi-CUI mentions
# (shown there with a leftover quote, "'C1854302") -- confirm "last" is the intended pick.
df['uniq_cui'] = df['cui'].apply(_last_cui)

Unnamed: 0,id,mention,cui,start_index,end_index,sentence,uniq_cui
0,0,aminozuren,['C0002520'],57,67,Uit simpele organische stoffen ontstonden mate...,C0002520
1,1,kopersulfaat,['C0056301'],52,64,Onder strikte voorwaarden is een bladbespuitin...,C0056301
2,2,kopen,['C0520949'],171,176,Individuele armoede kan het gevolg zijn van ee...,C0520949
3,3,ziekte van Van Leeuwenhoek,"['C0027066', 'C1854302']",168,194,Volgens de verhalen dicteerde hij op zijn ster...,'C1854302
4,4,retina,['C0035298'],16,22,Het netvlies of retina is beweeglijk zodat het...,C0035298
...,...,...,...,...,...,...,...
7872,8348,atomair,['C0567415'],65,72,Zij geloven dat het mogelijk is om de trilling...,C0567415
7873,8349,spaanplaatgas,['C0016564'],37,50,Vanwege jarenlange blootstelling aan spaanplaa...,C0016564
7874,8350,Hartcatheterisatie,['C0085532'],77,95,Chamuleaus enthousiasme voor de cardiologie we...,C0085532
7875,8351,anxiolytische,['C0040616'],14,27,"De stof bezit anxiolytische, anticonvulsieve, ...",C0040616


In [3]:
# Serialise every concept row as a "CUI||name" record.
# NOTE(review): this cell needs `df` to expose `cui` and `name` columns; the frame
# displayed earlier in the saved output has no `name` column, so confirm which
# frame this cell is meant to run against.
# The two columns are iterated directly instead of going through `iterrows()`,
# which materialises a Series for every row and is slow on large frames.
cleaned = [f"{cui}||{name}" for cui, name in zip(df['cui'], df['name'])]

In [13]:
# Show the first 30 "CUI||name" records as a quick format check.
cleaned[:30]

['C0000039||1,2-dipalmitoylphosphatidylcholine',
 'C0000052||1,4-alfa-glucaan vertakkend enzym',
 'C0000097||methyl-fenyltetrahydropyridine',
 'C0000097||1-methyl-4-fenyl-1,2,3,6-tetrahydropyridine',
 'C0000097||MPTP',
 'C0000172||18-hydroxycorticosteron',
 'C0000215||2,4,5-trichloorfenoxyazijnzuur',
 'C0000215||2,4,5-t',
 'C0000220||2,4-dichloorfenoxyazijnzuur',
 'C0000220||2,4-d',
 'C0000266||parlodel',
 'C0000294||mesna',
 'C0000294||mercapto-ethaansulfonzuur',
 'C0000294||natrium-2-mercapto-ethaansulfonaat',
 'C0000378||DOPS',
 'C0000378||droxidopa',
 'C0000378||l-DOPS',
 'C0000378||l-dihydroxyphenylserine',
 'C0000378||l-threo-dihydroxyphenylserine',
 'C0000379||3,4-methylenedioxyamphetamine',
 'C0000379||MDA',
 'C0000379||methylenedioxyamphetamine',
 'C0000379||tenamfetamina',
 'C0000379||tenamfetamine',
 'C0000379||tenamfetaminum',
 'C0000392||beta-alanine',
 'C0000402||meglutol',
 'C0000464||docosahexaenoate',
 'C0000473||para-aminobenzoëzuur',
 'C0000473||1-amino-4-carboxybenz

## Create mappings: idx2cui and cui2string dicts

In [14]:
# idx2cui: position of a record in `cleaned` -> its CUI.
# cui2string: CUI -> one name. A CUI that occurs in several records keeps the
# name of its last record, because later assignments overwrite earlier ones.
idx2cui = {}
cui2string = {}

for i, line in enumerate(cleaned):
    splitted = line.split('||')
    idx2cui[i] = splitted[0]
    cui2string[splitted[0]] = splitted[1]

# Context managers close the files deterministically, so both pickles are fully
# flushed to disk instead of depending on garbage collection of the file objects.
with open('idx2cui', "wb") as idx2cui_file:
    pkl.dump(idx2cui, idx2cui_file)
with open('cui2string', "wb") as cui2string_file:
    pkl.dump(cui2string, cui2string_file)

## Positive pair generation

In [18]:
# Group the records by CUI: {cui: [name, name, ...]}, names in record order.
umls_dict = {}
for record in tqdm(cleaned):
    cui, name = record.split("||")
    # setdefault creates the list on first sight of a CUI, then appends to it.
    umls_dict.setdefault(cui, []).append(name)

  0%|          | 0/752536 [00:00<?, ?it/s]

### Generate the pairs

In [19]:
def gen_pairs(input_list):
    """Return every unordered pair of items from `input_list` as a list of 2-tuples.

    Pairs follow the input order: each item is paired with every item that
    comes after it. Fewer than two items yields an empty list.
    """
    items = tuple(input_list)
    return [
        (first, second)
        for position, first in enumerate(items)
        for second in items[position + 1:]
    ]

In [21]:
MAX_PAIRS_PER_CUI = 50   # maximum number of pairs kept for a single CUI
PAIR_SAMPLING_SEED = 42  # fixed seed so re-running the cell yields the same sample

# A dedicated generator makes the down-sampling reproducible without touching
# the global `random` state.
pair_rng = random.Random(PAIR_SAMPLING_SEED)

# Build "CUI||name_a||name_b" records from all name pairs of each CUI.
pos_pairs = []
for k, v in tqdm(umls_dict.items()):
    pairs = gen_pairs(v)
    if len(pairs) > MAX_PAIRS_PER_CUI:  # too many pairs: keep a random subset
        pairs = pair_rng.sample(pairs, MAX_PAIRS_PER_CUI)
    for first, second in pairs:
        pos_pairs.append("||".join((str(k), first, second)))

  0%|          | 0/366071 [00:00<?, ?it/s]

In [23]:
# Show the first 100 "CUI||name_a||name_b" records as a quick format check.
pos_pairs[:100]

['C0000097||methyl-fenyltetrahydropyridine||1-methyl-4-fenyl-1,2,3,6-tetrahydropyridine',
 'C0000097||methyl-fenyltetrahydropyridine||MPTP',
 'C0000097||1-methyl-4-fenyl-1,2,3,6-tetrahydropyridine||MPTP',
 'C0000215||2,4,5-trichloorfenoxyazijnzuur||2,4,5-t',
 'C0000220||2,4-dichloorfenoxyazijnzuur||2,4-d',
 'C0000294||mesna||mercapto-ethaansulfonzuur',
 'C0000294||mesna||natrium-2-mercapto-ethaansulfonaat',
 'C0000294||mercapto-ethaansulfonzuur||natrium-2-mercapto-ethaansulfonaat',
 'C0000378||DOPS||droxidopa',
 'C0000378||DOPS||l-DOPS',
 'C0000378||DOPS||l-dihydroxyphenylserine',
 'C0000378||DOPS||l-threo-dihydroxyphenylserine',
 'C0000378||droxidopa||l-DOPS',
 'C0000378||droxidopa||l-dihydroxyphenylserine',
 'C0000378||droxidopa||l-threo-dihydroxyphenylserine',
 'C0000378||l-DOPS||l-dihydroxyphenylserine',
 'C0000378||l-DOPS||l-threo-dihydroxyphenylserine',
 'C0000378||l-dihydroxyphenylserine||l-threo-dihydroxyphenylserine',
 'C0000379||3,4-methylenedioxyamphetamine||MDA',
 'C0000379

### save the pairwise positive training file

In [25]:
# Write one "CUI||name_a||name_b" record per line.
# The names contain non-ASCII characters (e.g. "para-aminobenzoëzuur"), so the
# encoding is pinned to UTF-8 instead of being left to the platform default.
with open('./training_file_umls2023aa_nl_no_dup_pairwise_pair_th50_filtered-categories.txt', 'w', encoding='utf-8') as f:
    for line in pos_pairs:
        f.write("%s\n" % line)