In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load data
# Code -> Phecode dictionaries (UMLS CUI, ICD-10, ICD-9); Phecode is forced to str.
umls_icdall_phe_CUI = pd.read_csv('umls_icdall_phe_CUI.csv', dtype={'Phecode': str})
icd10_df = pd.read_csv('ICD10_phecode.csv', dtype={'Phecode': str})
icd9_df = pd.read_csv('ICD9_phecode.csv', dtype={'Phecode': str})

# Processed entity and relationship tables.
entity_df = pd.read_csv('processed_entities.csv')
relation_df = pd.read_csv('processed_relationships.csv')

# Logistic regression results (provides the Pvalue / Beta / Phecode columns used later).
HS_data = pd.read_csv('logistic_regression_results.csv')

# Disease and symptom vocabularies.
dis_df = pd.read_csv('disease_vocab.csv')
sym_df = pd.read_csv('symptom_vocab.csv')

In [None]:
# From the vocabulary provided by iBKH, keep the identifier columns together
# with the UMLS CUI and ICD mappings (the symptom vocabulary is only read for
# its CUI mapping here).
dis_df = dis_df.loc[:, ['primary', 'name', 'umls_cui', 'icd_10', 'icd_9']]
sym_df = sym_df.loc[:, ['primary', 'name', 'umls_cui']]

# Stack all disease-related codes and upper-case the primary identifier.
dis_all = pd.concat([dis_df, sym_df]).assign(
    primary=lambda frame: frame['primary'].str.upper()
)

In [None]:
# Select the vocabulary rows that carry an ICD-10 code.
# .copy() makes the filtered frame independent of dis_all, so adding the ICD
# column below does not raise SettingWithCopyWarning.
dis_all_10 = dis_all[dis_all['icd_10'].notna()].copy()
dis_all_10['ICD'] = dis_all_10['icd_10']  # common join key holding the ICD-10 value

# Select the vocabulary rows that carry an ICD-9 code.
dis_all_9 = dis_all[dis_all['icd_9'].notna()].copy()
dis_all_9['ICD'] = dis_all_9['icd_9']  # common join key holding the ICD-9 value

# Map the ICD -> Phecode dictionaries onto each node ID.
# Left joins keep vocabulary rows whose ICD code has no Phecode (Phecode = NaN).
dis_all_10_phe = dis_all_10.merge(icd10_df, on='ICD', how='left')
dis_all_9_phe = dis_all_9.merge(icd9_df, on='ICD', how='left')

# Concatenate both mappings to get the full ICD-based result.
dis_all_10_9_phe = pd.concat([dis_all_9_phe, dis_all_10_phe])
dis_all_10_9_phe = dis_all_10_9_phe.sort_values(by='primary')

# Rows sharing the same (primary, Phecode) pair. This expression is evaluated
# but not displayed because it is not the last statement of the cell.
dis_all_10_9_phe[dis_all_10_9_phe.duplicated(subset=['primary', 'Phecode'], keep=False)]

# Keep one row per (primary, Phecode) pair and show the number of distinct Phecodes.
dis_all_10_9_phe_drop = dis_all_10_9_phe.drop_duplicates(subset=['primary', 'Phecode'])
dis_all_10_9_phe_drop['Phecode'].nunique()

In [None]:
# --- Get Phecode - DOID mapping by ICD PHECODE dictionary ---
# NOTE(review): this cell rebuilds the same frames, with the same names, as the
# preceding ICD-mapping cell; one of the two is likely redundant.

# Select ICD-10 codes from the disease vocabulary (only non-null values).
# .copy() detaches the filtered frame from dis_all so the column assignment
# below does not raise SettingWithCopyWarning.
dis_all_10 = dis_all[dis_all['icd_10'].notna()].copy()
dis_all_10['ICD'] = dis_all_10['icd_10']

# Select ICD-9 codes from the disease vocabulary (only non-null values).
dis_all_9 = dis_all[dis_all['icd_9'].notna()].copy()
dis_all_9['ICD'] = dis_all_9['icd_9']

# Map ICD codes to Phecodes using the ICD-Phecode dictionaries.
# Left joins keep vocabulary rows whose ICD code has no Phecode (Phecode = NaN).
dis_all_10_phe = dis_all_10.merge(icd10_df, on='ICD', how='left')
dis_all_9_phe = dis_all_9.merge(icd9_df, on='ICD', how='left')

# Combine the ICD-9 and ICD-10 mapping results.
dis_all_10_9_phe = pd.concat([dis_all_9_phe, dis_all_10_phe])
dis_all_10_9_phe = dis_all_10_9_phe.sort_values(by='primary')

# Check for duplicates in the mapping. This expression is evaluated but not
# displayed because it is not the last statement of the cell.
dis_all_10_9_phe[dis_all_10_9_phe.duplicated(subset=['primary', 'Phecode'], keep=False)]

# Keep one row per (primary, Phecode) pair and show the number of distinct Phecodes.
dis_all_10_9_phe_drop = dis_all_10_9_phe.drop_duplicates(subset=['primary', 'Phecode'])
dis_all_10_9_phe_drop['Phecode'].nunique()

In [None]:
# --- Get Phecode - DOID mapping by UMLS_CUI dictionary ---
# pandas matches NaN join keys with each other, so dictionary rows without a
# CUI are filtered out first; otherwise vocabulary entries lacking a CUI would
# pick up the Phecodes of those rows.
dis_all_cui = dis_all.merge(
    umls_icdall_phe_CUI[umls_icdall_phe_CUI['umls_cui'].notna()],
    on='umls_cui',
    how='left',
)

print(dis_all_cui.shape)
print(dis_all_cui['primary'].nunique())
print(dis_all_cui['Phecode'].nunique())

# Stack the ICD-based and CUI-based mappings and drop rows identical in every column.
dis_all_phe = pd.concat([dis_all_10_9_phe_drop, dis_all_cui])
dis_all_phe = dis_all_phe.sort_values(by='primary')
dis_all_phe = dis_all_phe.drop_duplicates()
dis_all_phe['Phecode'].nunique()

# --- Handle duplicates ---
# Split the rows by whether their primary ID occurs once or several times.
duplicated_ids = dis_all_phe[dis_all_phe['primary'].duplicated()]
duplicated_ids = duplicated_ids['primary'].drop_duplicates()
non_duplicates = dis_all_phe[~dis_all_phe['primary'].isin(duplicated_ids)]
non_duplicates['primary'].nunique()

duplicates = dis_all_phe[dis_all_phe['primary'].isin(duplicated_ids)]

# --- All NaN duplicates ---
# IDs whose every row has a missing Phecode collapse to a single NaN row.
result = duplicates.groupby('primary')['Phecode'].apply(lambda x: x.isna().all())
all_nan_groups = result[result == True]
duplicates_na = duplicates[duplicates['primary'].isin(all_nan_groups.index)]
duplicates_na = duplicates_na.drop_duplicates(subset=['primary', 'Phecode'])

# --- Value duplicates ---
# Remaining IDs keep one row per distinct Phecode (a NaN row may remain next to
# valued rows for the same ID).
duplicates_value = duplicates[~duplicates['primary'].isin(duplicates_na['primary'])]
duplicates_value = duplicates_value.drop_duplicates(subset=['primary', 'Phecode'])

# --- Final combination ---
dis_all_phe_1 = pd.concat([non_duplicates, duplicates_na, duplicates_value])
dis_all_phe_1 = dis_all_phe_1[['primary', 'Phecode']].rename(columns={'primary': 'node_id'})

# --- Coverage diagnostics ---
# The diagnostics need the significant, positively associated Phecodes. They
# are derived here without modifying HS_data, because `disease_list` is
# otherwise only assigned by a later cell and a fresh top-to-bottom run would
# fail with NameError. The rule mirrors that later cell: Pvalue below 0.05
# divided by the number of rows, and Beta > 0.
if 'Pvalue' in HS_data.columns:
    _hs_significant = HS_data[
        (HS_data['Pvalue'] < 0.05 / len(HS_data)) & (HS_data['Beta'] > 0)
    ]
else:
    # HS_data has already been reduced to its significant rows by the later cell.
    _hs_significant = HS_data
disease_list = _hs_significant['Phecode'].astype(str).str.lstrip('X')

print('USCUI:',dis_all_cui[dis_all_cui['Phecode'].isin(disease_list)]['Phecode'].nunique())
print('ICD:',dis_all_10_9_phe[dis_all_10_9_phe['Phecode'].isin(disease_list)]['Phecode'].nunique())
print('USCUI:',dis_all_cui[dis_all_cui['Phecode'].isin(disease_list)]['primary'].nunique())
print('ICD:',dis_all_10_9_phe[dis_all_10_9_phe['Phecode'].isin(disease_list)]['primary'].nunique())
print('shape of all dis df:',dis_all.shape)
print('shape of all icd result:',dis_all_10_9_phe.shape)
print('shape of all uscui result:',dis_all_cui.shape)

In [None]:
# Add beta from logistic regression result
# --- calculate significance ---
# NOTE(review): this cell overwrites HS_data with its filtered form, so it
# cannot be re-run without reloading the CSV (the Pvalue column is gone
# afterwards).
N = len(HS_data)
# Significance threshold is 0.05 divided by the number of rows in the results table.
HS_data['is_significant'] = np.where(HS_data['Pvalue'] < 0.05/N, 1, 0)
# Keep significant rows with a positive Beta. .copy() detaches the filtered
# frame so the Phecode assignment below does not raise SettingWithCopyWarning.
HS_data = HS_data[(HS_data['is_significant'] == 1) & (HS_data['Beta'] > 0)].copy()
# Strip the leading 'X' characters from the Phecode and store it as str.
HS_data['Phecode'] = HS_data['Phecode'].str.lstrip('X').astype('str')
print(f"HS data after filtering: {HS_data.shape}")

disease_list = HS_data['Phecode']
HS_data = HS_data[['Phecode','Beta']]

# --- Merge phecode and beta values ---
# NOTE(review): entity_disease_df is not defined anywhere in the visible
# notebook; it must exist in the kernel before this cell runs - confirm where
# it is built.
entity_disease_df_phe = entity_disease_df.merge(dis_all_phe_1, how='left', on='node_id')
entity_disease_df_phe_HS = entity_disease_df_phe.merge(HS_data, how='left', on='Phecode')
print(entity_disease_df_phe_HS.shape)

# --- Handle duplicates ---
# Collect the node IDs that occur on several rows AND have a Beta on at least
# one of those rows.
duplicate_id = entity_disease_df_phe_HS[entity_disease_df_phe_HS.duplicated(subset=['node_id'], keep=False)]
valid_beta = duplicate_id[~duplicate_id['Beta'].isna()]
result = entity_disease_df_phe_HS[entity_disease_df_phe_HS['node_id'].isin(valid_beta['node_id'])]
dul_df_id = result['node_id'].drop_duplicates()

In [None]:
# --- FIRST CONDITION: node IDs outside dul_df_id ---
# dul_df_id holds the node IDs that are duplicated and have at least one Beta.
# Every other node ID is either unique or has no Beta on any of its duplicated
# rows, so keeping the first row per node ID cannot discard a Beta value.
a = entity_disease_df_phe_HS[~entity_disease_df_phe_HS['node_id'].isin(dul_df_id)]
a = a.drop_duplicates(subset='node_id')

# --- SECOND CONDITION: duplicated node IDs with Beta values ---
# Keep rows whose node ID is in dul_df_id, whose Beta is present, and whose
# node ID is duplicated (some duplicates are a Beta-missing row plus a
# Beta-present row). Within each node ID keep the row with the maximum Beta.
b = entity_disease_df_phe_HS[entity_disease_df_phe_HS['node_id'].isin(dul_df_id)
    & entity_disease_df_phe_HS['Beta'].notna()
    & entity_disease_df_phe_HS['node_id'].duplicated(keep=False)]
# The string alias 'max' replaces the builtin max, which pandas deprecates as a
# transform argument.
b = b.loc[b.groupby('node_id')['Beta'].transform('max') == b['Beta']]
# Several rows can tie on the maximum Beta; keep the first so each node ID
# appears once, matching the one-row-per-node rule applied to `a`.
b = b.drop_duplicates(subset='node_id')

In [None]:
# --- Final combination ---
# Disease rows come from the two selections `a` and `b`; the non-disease
# entities get an all-NaN Beta column before being appended.
entity_disease_no_dul_df = pd.concat([a, b])
entity_non_disease_df['Beta'] = np.nan
entity_phe_df = pd.concat(
    [entity_disease_no_dul_df, entity_non_disease_df]
).drop_duplicates()

# --- Summary statistics ---
print('Number of significant phecode:', HS_data['Phecode'].nunique())
print(
    'Number of node with significant phecode:',
    entity_phe_df.loc[entity_phe_df['Beta'].notna(), 'node_id'].nunique(),
)
print(
    'Number of Beta in entity df:',
    len(entity_phe_df.loc[entity_phe_df['Beta'].notna()]),
)

# Keep only the exported columns and write the result to disk.
entity_phe_df = entity_phe_df.loc[:, ['node_id', 'category', 'Beta', 'Phecode']]
entity_phe_df.to_csv('entity_weight.csv', index=False)