In [1]:
import pandas as pd
import numpy as np

#directory for MIMIC-III data
base_dir = './'

admission_file = base_dir+'ADMISSIONS.csv'
procedure_file = base_dir+'PROCEDURES_ICD.csv'
prescriptions_file = base_dir+'PRESCRIPTIONS.csv'
diagnoses_file = base_dir+'DIAGNOSES_ICD.csv'

In [2]:

admission_df = pd.read_csv(admission_file)
# Convert column names to uppercase
admission_df.columns = admission_df.columns.str.upper()

# Convert time columns to datetime format
admission_df.ADMITTIME = pd.to_datetime(admission_df.ADMITTIME,format='%Y-%m-%d %H:%M:%S',errors='coerce')
admission_df.DISCHTIME = pd.to_datetime(admission_df.DISCHTIME, format = '%Y-%m-%d %H:%M:%S', errors = 'coerce')
admission_df.DEATHTIME = pd.to_datetime(admission_df.DEATHTIME, format = '%Y-%m-%d %H:%M:%S', errors = 'coerce')

# Sort the DataFrame by SUBJECT_ID and ADMITTIME
admission_df = admission_df.sort_values(['SUBJECT_ID','ADMITTIME'])
admission_df = admission_df.reset_index(drop=True)

# Add columns for the next admission time and type for each subject
admission_df['NEXT_ADMITTIME'] = admission_df.groupby('SUBJECT_ID').ADMITTIME.shift(-1)
admission_df['NEXT_ADMISSION_TYPE'] = admission_df.groupby('SUBJECT_ID').ADMISSION_TYPE.shift(-1)

# Handle cases where the next admission type is 'ELECTIVE'
rows = admission_df.NEXT_ADMISSION_TYPE=='ELECTIVE'
admission_df.loc[rows,'NEXT_ADMITTIME'] = pd.NaT
admission_df.loc[rows,'NEXT_ADMISSION_TYPE'] = np.NaN

admission_df = admission_df.sort_values(['SUBJECT_ID','ADMITTIME'])

admission_df[['NEXT_ADMITTIME','NEXT_ADMISSION_TYPE']] = admission_df.groupby(['SUBJECT_ID'])[['NEXT_ADMITTIME','NEXT_ADMISSION_TYPE']].fillna(method='bfill')
admission_df['DAYS_NEXT_ADMIT'] = (admission_df.NEXT_ADMITTIME-admission_df.DISCHTIME).dt.total_seconds()/(24*60*60)
admission_df['OUTPUT_LABEL'] = (admission_df.DAYS_NEXT_ADMIT<30).astype('int')

# Remove rows with ADMISSION_TYPE 'NEWBORN' and those with non-null DEATHTIME
admission_df = admission_df[admission_df['ADMISSION_TYPE']!='NEWBORN']
admission_df = admission_df[admission_df.DEATHTIME.isnull()]
admission_df['DURATION'] = (admission_df['DISCHTIME']-admission_df['ADMITTIME']).dt.total_seconds()/(24*60*60)
print('done')


done


In [3]:
"""
ICD-9
"""


def expand_level2():
    level2 = ['001-009', '010-018', '020-027', '030-041', '042', '045-049', '050-059', '060-066', '070-079', '080-088',
              '090-099', '100-104', '110-118', '120-129', '130-136', '137-139', '140-149', '150-159', '160-165',
              '170-176',
              '176', '179-189', '190-199', '200-208', '209', '210-229', '230-234', '235-238', '239', '240-246',
              '249-259',
              '260-269', '270-279', '280-289', '290-294', '295-299', '300-316', '317-319', '320-327', '330-337', '338',
              '339', '340-349', '350-359', '360-379', '380-389', '390-392', '393-398', '401-405', '410-414', '415-417',
              '420-429', '430-438', '440-449', '451-459', '460-466', '470-478', '480-488', '490-496', '500-508',
              '510-519',
              '520-529', '530-539', '540-543', '550-553', '555-558', '560-569', '570-579', '580-589', '590-599',
              '600-608',
              '610-611', '614-616', '617-629', '630-639', '640-649', '650-659', '660-669', '670-677', '678-679',
              '680-686',
              '690-698', '700-709', '710-719', '720-724', '725-729', '730-739', '740-759', '760-763', '764-779',
              '780-789',
              '790-796', '797-799', '800-804', '805-809', '810-819', '820-829', '830-839', '840-848', '850-854',
              '860-869',
              '870-879', '880-887', '890-897', '900-904', '905-909', '910-919', '920-924', '925-929', '930-939',
              '940-949',
              '950-957', '958-959', '960-979', '980-989', '990-995', '996-999', 'V01-V91', 'V01-V09', 'V10-V19',
              'V20-V29',
              'V30-V39', 'V40-V49', 'V50-V59', 'V60-V69', 'V70-V82', 'V83-V84', 'V85', 'V86', 'V87', 'V88', 'V89',
              'V90',
              'V91', 'E000-E899', 'E000', 'E001-E030', 'E800-E807', 'E810-E819', 'E820-E825', 'E826-E829', 'E830-E838',
              'E840-E845', 'E846-E849', 'E850-E858', 'E860-E869', 'E870-E876', 'E878-E879', 'E880-E888', 'E890-E899',
              'E900-E909', 'E910-E915', 'E916-E928', 'E929', 'E930-E949', 'E950-E959', 'E960-E969', 'E970-E978',
              'E980-E989', 'E990-E999']

    # Create a dictionary to map level 3 codes to their corresponding level 2 code
    level2_expand = {}
    for i in level2:
        tokens = i.split('-')
        if i[0] == 'V':
            if len(tokens) == 1:
                level2_expand[i] = i
            else:
                for j in range(int(tokens[0][1:]), int(tokens[1][1:]) + 1):
                    level2_expand["V%02d" % j] = i
        elif i[0] == 'E':
            if len(tokens) == 1:
                level2_expand[i] = i
            else:
                for j in range(int(tokens[0][1:]), int(tokens[1][1:]) + 1):
                    level2_expand["E%03d" % j] = i
        else:
            if len(tokens) == 1:
                level2_expand[i] = i
            else:
                for j in range(int(tokens[0]), int(tokens[1]) + 1):
                    level2_expand["%03d" % j] = i
    return level2_expand


level3_dict = expand_level2()

#transform codes to level 2 and level 3
def transform_code(unique_code):
    level2 = unique_code[:4] if unique_code[0]=='E' else unique_code[:3]
    level3 = level3_dict[level2]
    
    return [level2,level3]


In [4]:
import pandas as pd

diag_file = base_dir+'DIAGNOSES_ICD.csv'
# Filter the dataframe to include only main diagnoses (SEQ_NUM == 1)
diagnose_df = pd.read_csv(diag_file)
main_diag_df = diagnose_df[diagnose_df['SEQ_NUM']==1]
main_diag_df['diagnose_level2'],main_diag_df['diagnose_level3'] = zip(*main_diag_df['ICD9_CODE'].apply(transform_code))


V30    6339
414    3576
038    3389
410    3307
424    1721
       ... 
919       1
219       1
597       1
671       1
259       1
Name: diagnose_level2, Length: 651, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  main_diag_df['diagnose_level2'],main_diag_df['diagnose_level3'] = zip(*main_diag_df['ICD9_CODE'].apply(transform_code))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  main_diag_df['diagnose_level2'],main_diag_df['diagnose_level3'] = zip(*main_diag_df['ICD9_CODE'].apply(transform_code))


In [5]:

adm_df = pd.read_csv(admission_file)
proce_df = pd.read_csv(procedure_file)
pres_df = pd.read_csv(prescriptions_file,dtype={'NDC':'category'})
diag_df = pd.read_csv(diagnoses_file)

MIN_CODE_THRESHOLD = 50
MEDIUM_CODE_THRESHOLD = 100
LARGE_CODE_THEESHOLD = 500


def construct_valid_subset(raw_df,column='ICD9_CODE',threshold=MIN_CODE_THRESHOLD,desc='filter desc:'):
    base_df = raw_df[column].value_counts()

    valid_code = base_df[base_df>=threshold].index.values
    filtered_df = raw_df[raw_df[column].isin(valid_code)]
    filtered_admission_ids = set(filtered_df['HADM_ID'].tolist())

    return valid_code,filtered_admission_ids,filtered_df

diag_codes,diag_admission_ids,diag_df = construct_valid_subset(diag_df,desc='valid diagnoses code base/num: ')
proce_codes,proce_admission_ids,proce_df = construct_valid_subset(proce_df,desc='valid procedure code base/num: ')
pres_codes,pres_admission_ids,pres_df = construct_valid_subset(pres_df,column='NDC',desc='valid prescription code base/num: ')

common_admission_ids = diag_admission_ids & proce_admission_ids & pres_admission_ids

diag_df = diag_df.groupby(['SUBJECT_ID','HADM_ID']).agg({'ICD9_CODE':lambda x:','.join(x)}).reset_index().rename(columns={'ICD9_CODE':'ICD9_DIAG'})
common_diag_df = diag_df[diag_df['HADM_ID'].isin(common_admission_ids)]

proce_df.ICD9_CODE = proce_df.ICD9_CODE.astype(str)
proce_df = proce_df.groupby(['SUBJECT_ID','HADM_ID']).agg({'ICD9_CODE':lambda x:','.join(x)}).reset_index().rename(columns={'ICD9_CODE':'ICD9_PROCE'})

pres_df.drop(index=pres_df[pres_df['NDC'] == '0'].index, axis=0, inplace=True)
pres_df = pres_df.groupby(['SUBJECT_ID','HADM_ID']).agg({'NDC':lambda x:','.join(x)}).reset_index()


common_df = pd.merge(common_diag_df,proce_df,on=['SUBJECT_ID','HADM_ID'])
common_df = pd.merge(common_df,pres_df,on=['SUBJECT_ID','HADM_ID'])
print('done')


  pres_df = pd.read_csv(prescriptions_file,dtype={'NDC':'category'})


done


In [6]:
'''
    for each admission, category it to only single visit or the visit can be formulated in a visit sequence
'''
print('patient statics: ')
print(len(common_df.SUBJECT_ID.unique()))

visit_num_df = common_df[['SUBJECT_ID','HADM_ID']].groupby('SUBJECT_ID').HADM_ID.unique().reset_index()
visit_num_df['HADM_ID_LEN'] = visit_num_df['HADM_ID'].apply(lambda x:len(x))
multi_subjects = visit_num_df[visit_num_df['HADM_ID_LEN']>1].SUBJECT_ID.unique()
print(len(multi_subjects),' patient has multi visits')
single_subjects = visit_num_df[visit_num_df['HADM_ID_LEN']==1].SUBJECT_ID.unique()
print(len(single_subjects),' patient has single visits')

print('admission statics:')
print(len(common_df.HADM_ID.unique()))
common_multi_df = common_df[common_df['SUBJECT_ID'].isin(multi_subjects)]
print(len(common_multi_df),' admission can be formulate as a visit sequence')
common_single_df = common_df[common_df['SUBJECT_ID'].isin(single_subjects)]
print(len(common_single_df),' admission are single visits')


patient statics: 
35241
5330  patient has multi visits
29911  patient has single visits
admission statics:
43738
13827  admission can be formulate as a visit sequence
29911  admission are single visits


In [7]:
common_single_df['disease'] = common_single_df['ICD9_DIAG'].apply(lambda x:x.split(',')[0])
aa = list(common_single_df['disease'].value_counts())

disease_cohorts = list(common_single_df['disease'].value_counts().index.values[0:6])
common_single_df = common_single_df[common_single_df['disease'].isin(disease_cohorts)]

print(disease_cohorts)


['41401', 'V3001', 'V3000', '41071', '0389', '4241']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  common_single_df['disease'] = common_single_df['ICD9_DIAG'].apply(lambda x:x.split(',')[0])


In [8]:
'''
    split all the visits into the train, valid, test dataset respectively
'''
import random,math

single_admission_nums = len(common_single_df)

#split all admissions into train,valid and test with ratio 0.6,0.2,0.2
train_bound,valid_bound = math.floor(0.6*single_admission_nums),math.floor(0.8*single_admission_nums)
all_admission_ids = common_single_df.HADM_ID.unique()

random.shuffle(all_admission_ids)
train_admission_ids = all_admission_ids[:train_bound]
valid_admission_ids = all_admission_ids[train_bound:valid_bound]
test_admission_ids = all_admission_ids[valid_bound:]
assert single_admission_nums==len(train_admission_ids)+len(valid_admission_ids)+len(test_admission_ids)

train_admission_df = common_single_df[common_single_df['HADM_ID'].isin(train_admission_ids)]
valid_admission_df = common_single_df[common_single_df['HADM_ID'].isin(valid_admission_ids)]
test_admission_df = common_single_df[common_single_df['HADM_ID'].isin(test_admission_ids)]

train_admission_df['disease'] = train_admission_df['ICD9_DIAG'].apply(lambda x:x.split(',')[0])
valid_admission_df['disease'] = valid_admission_df['ICD9_DIAG'].apply(lambda x:x.split(',')[0])
test_admission_df['disease'] = test_admission_df['ICD9_DIAG'].apply(lambda x:x.split(',')[0])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_admission_df['disease'] = train_admission_df['ICD9_DIAG'].apply(lambda x:x.split(',')[0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_admission_df['disease'] = valid_admission_df['ICD9_DIAG'].apply(lambda x:x.split(',')[0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_admissio

In [9]:

print(len(train_admission_df))
train_admission_df = train_admission_df[train_admission_df['disease'].isin(disease_cohorts)]
print(len(train_admission_df))
test_admission_df = test_admission_df[test_admission_df['disease'].isin(disease_cohorts)]
print(len(test_admission_df))
valid_admission_df = valid_admission_df[valid_admission_df['disease'].isin(disease_cohorts)]
print(len(valid_admission_df))

4771
4771
1591
1590


In [10]:
ndc2rxnorm_file = '../ndc_atc/ndc2rxnorm_mapping.txt'

def ndc2atc(pres_df):
    with open(ndc2rxnorm_file,'r') as f:
        ndc2rxnorm = eval(f.read())
    pres_df['ATC'] = pres_df['NDC'].map(lambda x:','.join([ndc2rxnorm[ndc] for ndc in x.split(',')]))
    return pres_df

train_admission_df = ndc2atc(train_admission_df)
valid_admission_df = ndc2atc(valid_admission_df)
test_admission_df = ndc2atc(test_admission_df)

train_admission_file = './train_admissions.csv'
valid_admission_file = './valid_admissions.csv'
test_admission_file = './test_admissions.csv'
train_admission_df.to_csv(train_admission_file,index=False)
valid_admission_df.to_csv(valid_admission_file,index=False)
test_admission_df.to_csv(test_admission_file,index=False)

In [12]:
import random
def construct_labels(test_admission_df,label_file):
    similar_pairs,dis_similar_pairs= [],[]
    by_disease_df = test_admission_df.groupby('disease')['HADM_ID'].unique().reset_index()
    all_hadm_ids = test_admission_df['HADM_ID'].tolist()


    for disease,hadm_ids in  zip(by_disease_df['disease'],by_disease_df['HADM_ID']):
        if len(hadm_ids)==1:continue
        hadm_ids = hadm_ids.tolist()

        for admission_id in hadm_ids:
            random_sample = random.sample(set(hadm_ids)-set([admission_id]), 5)
            similar_pairs.extend([(admission_id,sample) for sample in random_sample])
            other_hadm_ids = random.sample(list(set(all_hadm_ids)-set(hadm_ids)),5)
            for o_admission_id in other_hadm_ids:
                dis_similar_pairs.append((admission_id,o_admission_id))

    print('construct labels with similar and dissimilar counts: ',len(similar_pairs),len(dis_similar_pairs))
    with open(label_file,'w',encoding='utf-8') as writer:
        writer.write('hadm_id\t'+'hadm_id\t'+'label\n')
        for similar_pair,dis_similar_pair in zip(similar_pairs,dis_similar_pairs):
            writer.write(str(similar_pair[0])+'\t'+str(similar_pair[1])+'\t'+'1\n')
            writer.write(str(dis_similar_pair[0])+'\t'+str(dis_similar_pair[1])+'\t'+'0\n')
    return similar_pairs,dis_similar_pairs

train_label_file = './train_label.csv'
valid_label_file = './valid_label.csv'
construct_labels(train_admission_df,train_label_file)
construct_labels(valid_admission_df,valid_label_file)
print('done')

since Python 3.9 and will be removed in a subsequent version.
  random_sample = random.sample(set(hadm_ids)-set([admission_id]), 5)


construct labels with similar and dissimilar counts:  23855 23855


since Python 3.9 and will be removed in a subsequent version.
  random_sample = random.sample(set(hadm_ids)-set([admission_id]), 5)


construct labels with similar and dissimilar counts:  7950 7950
done


In [13]:
import pickle
all_admission_df = pd.concat([train_admission_df,valid_admission_df,test_admission_df])
all_diag_codes = []
all_admission_df['ICD9_DIAG'].apply(lambda x:all_diag_codes.extend(x.split(',')))
all_diag_codes = list(set(all_diag_codes))
print(len(all_diag_codes))
all_proce_codes = []
all_admission_df['ICD9_PROCE'].apply(lambda x:all_proce_codes.extend(x.split(',')))
all_proce_codes = list(set(all_proce_codes))
print(len(all_proce_codes))
all_atc_codes = []
all_admission_df['ATC'].apply(lambda x:all_atc_codes.extend(x.split(',')))
all_atc_codes = list(set(all_atc_codes))
print(len(all_atc_codes))

pickle.dump({'diag_codes':all_diag_codes,'proce_codes':all_proce_codes,'atc_codes':all_atc_codes},open('./vocab.pkl','wb'))

1211
336
1479


In [14]:
'''
    construct all the knowledge graph with PMI value
    the entity type: diagnose, procedure, prescription
    the relation type: diagnose-procedure, diagnose-prescription, procedure-presciption
'''
from math import log

def construct_ent_pairs(x,head_col,tail_col,all_pairs):
    for head_ent in x[head_col].split(','):
        for tail_ent in x[tail_col].split(','):
            all_pairs.append(head_ent+','+tail_ent)


'''
    based on the valid pmi value, construct the relation
'''
def construct_relation(common_df,head_col,tail_col):
    all_pairs = []
    common_df.apply(construct_ent_pairs,axis=1,args=(head_col,tail_col,all_pairs))
#     print(len(all_pairs))
    entity_freq = {}
    rel_pair_count = {}
    for rel_pair in all_pairs:
        head_ent,tail_ent = rel_pair.split(',')
        if rel_pair not in rel_pair_count:
            rel_pair_count[rel_pair] = 1
        else:
            rel_pair_count[rel_pair]+=1
        if head_ent not in entity_freq:
            entity_freq[head_ent] = 1
        else:
            entity_freq[head_ent]+=1
        if tail_ent not in entity_freq:
            entity_freq[tail_ent] = 1
        else:
            entity_freq[tail_ent]+=1

    num_windows = len(all_pairs)
    pmi_result = []
    for rel_pair in rel_pair_count:
        entities = rel_pair.split(',')
        pmi = log((1.0*rel_pair_count[rel_pair]/num_windows)/(1.0*entity_freq[entities[0]]*entity_freq[entities[1]]/(num_windows*num_windows)))
        if pmi<0:continue
        pmi_result.append([entities[0],entities[1],pmi])
    return pmi_result

def write_relation(pmi_result,output_file):
    with open(output_file,'x',encoding='utf-8') as writer:
        writer.write('head ent'+'\t'+'tail ent'+'\t'+'pmi\n')
        for key in pmi_result:
            writer.write(key[0]+'\t'+key[1]+'\t'+str(key[2])+'\n')
    print('relation file writing done...')


diag_proce_rel = construct_relation(common_single_df,'ICD9_DIAG','ICD9_PROCE')
print('diagnose and procedure relation num: ',len(diag_proce_rel))

diag_pres_rel = construct_relation(common_single_df,'ICD9_DIAG','NDC')
print('diagnose and prescription relation num: ',len(diag_pres_rel))

proce_pres_rel = construct_relation(common_single_df,'ICD9_PROCE','NDC')
print('procedure and presciption relation num: ',len(proce_pres_rel))

write_relation(diag_proce_rel,'./diag_proce_rel.csv')
write_relation(diag_pres_rel,'./diag_pres_rel.csv')
write_relation(proce_pres_rel,'./proce_pres_rel.csv')


diagnose and procedure relation num:  41536
diagnose and prescription relation num:  294670
procedure and presciption relation num:  88962
