In [84]:
import pandas as pd
import numpy as np
import sys
import os

In [85]:
kg_file = '/home/dc925/project/data/graphmimic/UMLS/icd_relations.txt'
concepts_file = '/home/dc925/project/data/graphmimic/UMLS/icd_concepts_all.txt'
relations = pd.read_csv(kg_file, sep='\t', header=None)

In [86]:
relations.columns = ['head', 'rel', 'tail']

In [87]:
relations = relations.drop_duplicates(subset=['head', 'tail'])

In [88]:
concepts = pd.read_csv(concepts_file, sep='\t', header=None)

In [89]:
concepts.columns = ['CUI', 'ICD9', 'DESC']

In [90]:
concepts = concepts.drop_duplicates(subset=['CUI'])

In [91]:
cuis_in_relations = set(relations['head'].values) & set(relations['tail'].values)

In [92]:
len(cuis_in_relations)

20996

In [95]:
unique_cuis = set(concepts['CUI'].values)

In [96]:
len(unique_cuis)

20995

In [104]:
'C1137112' in unique_cuis

False

In [105]:
filtered_relations = relations[(relations['head'].isin(unique_cuis)) & (relations['tail'].isin(unique_cuis))]

In [106]:
filtered_relations

Unnamed: 0,head,rel,tail
0,C0000737,SY,C0000737
2,C0000737,CHD,C0232493
3,C0000737,CHD,C0235299
4,C0000737,CHD,C0238551
5,C0000737,CHD,C0238552
...,...,...,...
219829,C5235233,SIB,C0154922
219830,C5235233,SIB,C0154923
219831,C5235233,SIB,C0154924
219832,C5235233,SIB,C0154925


In [107]:
cui2icd = pd.Series(concepts.ICD9.values, index=concepts.CUI).to_dict()

In [108]:
len(cui2icd)

20995

In [120]:
'C0003615' in cui2icd

True

In [121]:
cui2icd['C0003615']

'540-543.99'

In [109]:
filtered_relations['head'] = filtered_relations['head'].map(lambda x: cui2icd[x])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [110]:
filtered_relations['tail'] = filtered_relations['tail'].map(lambda x: cui2icd[x])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [112]:
filtered_relations

Unnamed: 0,head,rel,tail
0,789.0,SY,789.0
2,789.0,CHD,789.06
3,789.0,CHD,789.01
4,789.0,CHD,789.04
5,789.0,CHD,789.02
...,...,...,...
219829,364.53,SIB,364.55
219830,364.53,SIB,364.56
219831,364.53,SIB,364.57
219832,364.53,SIB,364.59


In [113]:
unique_icds = set(filtered_relations['head'].values) & set(filtered_relations['tail'].values)

In [114]:
len(unique_icds)

20995

In [115]:
# Have to make sure broad codes from the MIMIC datasets are included in KG
icd_codes_mortality = pd.read_csv(
    '/home/dc925/project/data/graphmimic/mortality/icd_codes_mortality.txt', header=None)
icd_codes_readmission = pd.read_csv(
    '/home/dc925/project/data/graphmimic/readmission/icd_codes_readmission.txt', header=None)
icd_codes_mortality = icd_codes_mortality[0].tolist()
icd_codes_readmission = icd_codes_readmission[0].tolist()
icd_codes = set(icd_codes_mortality + icd_codes_readmission)

In [128]:
icd_fix = {}
for c in icd_codes:
    if c in unique_icds:
        continue
    else:
        icd_fix['{}.0'.format(c)] = c
        icd_fix['{}.00'.format(c)] = c
icd_fix['540-543.99'] = '541'
icd_fix['E880-E888.9'] = 'E887'
icd_fix['042-042.99'] = '042'

    

In [129]:
len(icd_fix)

193

In [131]:
filtered_relations = filtered_relations.applymap(lambda x: x if x not in icd_fix else icd_fix[x])

In [132]:
filtered_relations

Unnamed: 0,head,rel,tail
0,789.0,SY,789.0
2,789.0,CHD,789.06
3,789.0,CHD,789.01
4,789.0,CHD,789.04
5,789.0,CHD,789.02
...,...,...,...
219829,364.53,SIB,364.55
219830,364.53,SIB,364.56
219831,364.53,SIB,364.57
219832,364.53,SIB,364.59


In [138]:
triples = filtered_relations.values.tolist()
num_triples = len(triples)

In [139]:
seed = np.arange(num_triples)

In [140]:
np.random.shuffle(seed)

In [141]:
data_dir = '/home/dc925/project/data/graphmimic/UMLS/ICD_KG'

In [142]:
train_count = int(num_triples * 0.9)
valid_count = int(num_triples * 0.05)

train_set = seed[:train_count].tolist()
valid_set = seed[train_count: train_count+valid_count].tolist()
test_set = seed[train_count+valid_count:].tolist()

with open(os.path.join(data_dir, 'train.tsv'), 'w+') as f:
    for idx in train_set:
        f.writelines("{}\t{}\t{}\n".format(triples[idx][0], triples[idx][1], triples[idx][2]))
with open(os.path.join(data_dir, 'valid.tsv'), 'w+') as f:
    for idx in valid_set:
        f.writelines("{}\t{}\t{}\n".format(triples[idx][0], triples[idx][1], triples[idx][2]))
with open(os.path.join(data_dir, 'test.tsv'), 'w+') as f:
    for idx in test_set:
        f.writelines("{}\t{}\t{}\n".format(triples[idx][0], triples[idx][1], triples[idx][2]))
