In [216]:
from xml.dom import minidom
from pronto import Ontology
import pandas as pd

In [230]:
# Build a lookup from ICD-10 code to description out of the ICD-10-CM tabular
# XML file.
mydoc = minidom.parse('data/REPO-TRIAL-comorbiditome/icd10cm_tabular_2021.xml')
items = mydoc.getElementsByTagName('diag')


def _first_child_text(node, tag):
    """Return the text data of the first ``tag`` element found under ``node``."""
    return node.getElementsByTagName(tag)[0].firstChild.data


icd10_encoder = {}
for diag in items:
    code = _first_child_text(diag, 'name')
    # Codes containing a dot are sub-diseases; they are not used in the
    # comorbidity file from Repo-Trial-DB, so only dot-free codes are kept.
    if '.' not in code:
        icd10_encoder[code] = _first_child_text(diag, 'desc')
    
# Cross-reference tables between MONDO ids (the part after "MONDO_") and ICD-10
# codes cut at the first dot. Both are plain dicts holding one value per key:
# when a key occurs in several cross-references, the last one read wins.
mondo_to_icd = {}
icd_to_mondo = {}

with open('data/REPO-TRIAL-comorbiditome/mondo.owl', 'r', encoding="utf8") as reader:
    # MONDO id of the annotation axiom currently being read. It is None until
    # the first MONDO source is seen and whenever the current axiom's source is
    # not a MONDO term, so a cross-reference can never be attributed to a
    # stale id left over from an earlier axiom.
    mondo_id = None

    # Scan the file line by line; nothing is parsed as XML here.
    for line in reader:
        if 'owl:annotatedSource' in line:
            if 'owl:annotatedSource rdf:resource="http://purl.obolibrary.org/obo/MONDO' in line:
                # "MONDO_0007280" (13 characters) -> "0007280"
                mondo_id = line.split('http://purl.obolibrary.org/obo/')[1][:13]
                mondo_id = mondo_id.split('_')[1]
            else:
                mondo_id = None

        elif 'owl:annotatedTarget' in line and 'ICD10:' in line:
            if mondo_id is None:
                continue

            # Text content of the annotatedTarget element, whatever attributes
            # (e.g. rdf:datatype) the opening tag carries.
            target = line.partition('<owl:annotatedTarget')[2]
            target = target.partition('>')[2].partition('<')[0].strip()

            # Only accept values that are an ICD10 cross-reference themselves,
            # not longer texts that merely mention "ICD10:" somewhere.
            if not target.startswith('ICD10:'):
                continue

            # "ICD10:Q12.0" -> "Q12" (sub-codes are folded into the main code)
            icd10 = target.split(':')[1].split('.')[0]
            mondo_to_icd[mondo_id] = icd10
            icd_to_mondo[icd10] = mondo_id
    
# Annotate the comorbidity table with ICD-10 disease names, a min-max scaled
# relative risk and MONDO ids, then export it as CSV.
df = pd.read_csv(
    "data/REPO-TRIAL-comorbiditome/Comorbidity_MAINCODE_all_APR_2020_withDelivery_noCutoff.txt",
    sep="\t",
)

# Bounds of relative_risk used for the min-max scaling below.
min_value = df['relative_risk'].min()
max_value = df['relative_risk'].max()

df = df.assign(
    # ICD-10 code -> description; codes missing from icd10_encoder become NaN.
    disease1_name=df['disease1'].map(icd10_encoder),
    disease2_name=df['disease2'].map(icd10_encoder),
    relative_risk_norm=(df['relative_risk'] - min_value) / (max_value - min_value),
    # ICD-10 code -> MONDO id; codes missing from icd_to_mondo become NaN.
    disease1_Mondo=df['disease1'].map(icd_to_mondo),
    disease2_Mondo=df['disease2'].map(icd_to_mondo),
).rename(columns={'disease1': 'disease1_ICD10', 'disease2': 'disease2_ICD10'})

df.to_csv('data/REPO-TRIAL-comorbiditome/Comorbidity_ALL_with_names_edited.csv')

### load gene information about comorbidities

In [229]:
# Load the gene-disease association table and give its two key columns the
# names used in the rest of this cell.
df = pd.read_csv(
    'data/REPO-TRIAL-comorbiditome/gene-disease-associations-OMIM&DisGeNet.txt',
    sep="\t",
).rename(columns={'Gene': 'EntrezID', 'Disease': 'Mondo'})

# Keep only the second dot-separated field of every gene identifier.
df['EntrezID'] = df['EntrezID'].map(lambda gene: gene.split('.')[1])

def __format_mondo(x):
    """Return the second dot-separated field of ``x``.

    ``x`` is presumably a prefixed disease identifier of the form
    ``<prefix>.<id>`` -- TODO confirm against the input file.

    Raises:
        IndexError: if ``x`` contains no dot.
    """
    return x.split('.', 2)[1]

# Reduce every disease identifier to its second dot-separated field.
df['Mondo'] = df['Mondo'].map(__format_mondo)

# Ontology used to translate MONDO ids into disease names.
file = Ontology('data/REPO-TRIAL-comorbiditome/mondo_codes_to_names.obo')


def _mondo_term_name(x):
    """Return the name of the ontology term ``MONDO:<x>``."""
    return file[f'MONDO:{x}'].name


df['mondo_disease_name'] = df['Mondo'].map(_mondo_term_name)
# mondo_to_icd comes from the MONDO/ICD-10 cross-reference cell; ids that are
# not a key of that dict get NaN.
df['icd10'] = df['Mondo'].map(mondo_to_icd)

df.to_csv('data/REPO-TRIAL-comorbiditome/comorbidity_disease_gene_interactions.csv')
df.head()

Unnamed: 0,EntrezID,Mondo,database_assertedBy,score_DisGeNet,mondo_disease_name,icd10
0,171514,11727,omim,,"anorexia nervosa, susceptibility to, 1",
1,100307118,11556,omim,,BCC1,
2,1243,7963,omim,,"melanoma, cutaneous malignant, susceptibility ...",
3,7792,7280,omim,,cataract 8 multiple types,Q12
4,406874,12168,omim,,"dyslexia, susceptibility to, 8",
