In [1]:
import polars as pl

# Data processing

## BioSNAP

In [2]:
dch = pl.read_csv('./data/snap/DCh-Miner_miner-disease-chemical.tsv', separator = '\t')
chg = pl.read_csv('./data/snap/ChG-Miner_miner-chem-gene.tsv', separator = '\t')
dg = pl.read_csv('./data/snap/DG-AssocMiner_miner-disease-gene.tsv', separator = '\t')
ppi = pl.read_csv('./data/snap/PP-Decagon_ppi.csv', has_header = False)

NameError: name 'pl' is not defined

dch:
  - Disease ID -- MESH/OMIM
  - Chemical ID -- DrugBank ??
  
chg:
  - Drug ID -- DrugBank ??
  - Gene ID -- UniProt
  
dg:
  - Disease ID -- Concept ID NCBI
  - Gene ID -- NCBI
  
ppi:
  - Both are NCBI

In [3]:
for df in [dch, chg, dg, ppi]:
  print(df.head(5))

NameError: name 'dch' is not defined

## CTD

In [2]:
ctd_chg = pl.read_csv(
    './data/ctd/CTD_chem_gene_ixns.tsv',
    separator = '\t',
    columns = [0, 1, 4, 9]
).rename({
    'ChemicalName': 'CHEM_NAME',
    'ChemicalID': 'CHEM_MESH_ID',
    'GeneID': 'GENE_NCBI_ID',
    'InteractionActions': 'INTERACTIONS'
})
ctd_dch_curr = pl.read_csv(
    './data/ctd/CTD_curated_chemicals_diseases.tsv',
    separator = '\t',
    columns = [0, 1, 3, 4]
).rename({
    'ChemicalName': 'CHEM_NAME',
    'ChemicalID': 'CHEM_MESH_ID',
    'DiseaseName': 'DS_NAME',
    'DiseaseID': 'DS_OMIM_MESH_ID'
})
ctd_dch = pl.read_csv(
    './data/ctd/CTD_chemicals_diseases.tsv',
    separator = '\t',
    columns = [0, 1, 3, 4]
).rename({
    'ChemicalName': 'CHEM_NAME',
    'ChemicalID': 'CHEM_MESH_ID',
    'DiseaseName': 'DS_NAME',
    'DiseaseID': 'DS_OMIM_MESH_ID'
})
ctd_dg = pl.read_csv(
    './data/ctd/CTD_curated_genes_diseases.tsv',
    separator = '\t',
    columns = [1, 2, 3, 5],
    schema_overrides = {
        'GeneID': pl.Int64,
        'DiseaseName': pl.String,
        'DiseaseID': pl.String,
        'OmimIDs': pl.String
    }
).rename({
    'GeneID': 'GENE_NCBI_ID',
    'DiseaseName': 'DS_NAME',
    'DiseaseID': 'DS_OMIM_MESH_ID',
    'OmimIDs': 'DS_OMIM_IDS'
})
ppi = pl.read_csv(
    './data/snap/PP-Decagon_ppi.csv',
    has_header = False,
    new_columns = [
        'GENE_NCBI_ID_1',
        'GENE_NCBI_ID_2'
    ]
)

In [None]:
ctd_dch_curr.join(
    ctd_dg,
    left_on = 'DS_OMIM_MESH_ID',
    right_on = 'DS_OMIM_MESH_ID',
    how = 'left'
).select([
    'CHEM_MESH_ID',
    'DS_OMIM_MESH_ID',
    'GENE_NCBI_ID'
]).rename({'GENE_NCBI_ID': 'DRUG_GENE_ID'}).join(
    ctd_chg,
    left_on = 'CHEM_MESH_ID',
    right_on = 'CHEM_MESH_ID',
    how = 'left'
).rename({'GENE_NCBI_ID': 'CHEM_GENE_ID'}).select([
    'CHEM_MESH_ID',
    'DS_OMIM_MESH_ID',
    'DRUG_GENE_ID',
    'CHEM_MESH_ID'
])

CHEM_MESH_ID,DS_OMIM_MESH_ID,GENE_NCBI_ID
str,str,i64
"""C046983""","""MESH:D054198""",5243
"""C046983""","""MESH:D054198""",25
"""C046983""","""MESH:D054198""",84159
"""C046983""","""MESH:D054198""",405
"""C046983""","""MESH:D054198""",581
…,…,…
"""D015054""","""MESH:D014605""",5949
"""D015054""","""MESH:D014605""",6295
"""D015054""","""MESH:D014605""",5176
"""D015054""","""MESH:D014605""",6648


ctd_dch / ctd_dch_curr:
  - Disease ID -- MESH/OMIM
  - Chem ID -- MESH
  
ctd_chg:
  - Chem ID -- MESH
  - Gene ID -- NCBI
  
ctd_dg:
  - Disease ID -- MESH/OMIM
  - Gene ID -- NCBI
  
ppi:
  - Both are NCBI

# Disease ID Standartizing

In [4]:
print(f'Initial num of rows: {dch.shape[0]}')

Initial num of rows: 466657


In [5]:
dch.filter(
  (pl.col('# Disease(MESH)') + pl.col('Chemical')).is_unique()
)

# Disease(MESH),Chemical
str,str
"""MESH:D005923""","""DB00564"""
"""MESH:D009503""","""DB01072"""
"""MESH:D016115""","""DB01759"""
"""MESH:D018476""","""DB00451"""
"""MESH:C567059""","""DB00641"""
…,…
"""MESH:C565545""","""DB00482"""
"""MESH:D009164""","""DB00977"""
"""MESH:D010518""","""DB04216"""
"""MESH:D002653""","""DB02701"""


In [6]:
ioc = dch.filter(pl.col('# Disease(MESH)').str.starts_with('OMIM')).shape[0]
imc = dch.filter(pl.col('# Disease(MESH)').str.starts_with('MESH')).shape[0]
print(f'OMIM IDs: {ioc}')
print(f'MESH IDs: {imc}')

OMIM IDs: 9470
MESH IDs: 457186


In [7]:
map = pl.read_csv('./data/MedGenIDMappings.txt', separator='|').select([
  '#CUI_or_CN_id',
  'pref_name',
  'source_id',
  'source'
])

FileNotFoundError: No such file or directory (os error 2): ./data/MedGenIDMappings.txt

In [8]:
map.filter(
  pl.col('source').is_in([
    'OMIM',
    #'OMIM included',
  ])
).drop(['pref_name', 'source']).rename({
    '#CUI_or_CN_id': 'cui',
    'source_id': 'omim_id'
    }).write_csv('./omim_cui.csv')

map.filter(
  pl.col('source') == 'MeSH'
).drop(['pref_name', 'source']).rename({
    '#CUI_or_CN_id': 'cui',
    'source_id': 'mesh_id'
    }).write_csv('./mesh_cui.csv')

AttributeError: type object 'map' has no attribute 'filter'

In [113]:
mc = pl.read_csv('./mesh_cui.csv')
oc = pl.read_csv('./omim_cui.csv')

In [114]:
dch_omim_cui = dch.filter(
    pl.col('# Disease(MESH)').str.starts_with('OMIM')
  ).with_columns([
    pl.col('# Disease(MESH)').str.split(':').list.get(1).cast(pl.Int64).alias('omim_id')
  ]).join(
      oc,
      on='omim_id',
      how = 'left'
  )

In [115]:
dch_mesh_cui = dch.filter(
  pl.col('# Disease(MESH)').str.starts_with('MESH')
).with_columns(
  pl.col('# Disease(MESH)').str.split(':').list.get(1).alias('mesh_id')
).join(
  mc,
  on='mesh_id',
  how = 'left'
)

In [116]:
mc

cui,mesh_id
str,str
"""C0000727""","""D000006"""
"""C0000772""","""D000015"""
"""C0000771""","""D000014"""
"""C0000778""","""D000017"""
"""C0000744""","""D000012"""
…,…
"""C6016529""","""D006031"""
"""C6016530""","""D006031"""
"""C6016573""","""D000083202"""
"""C6016610""","""D000099136"""


In [120]:
mc.filter(
  pl.col('mesh_id') == 'D005923'
)

cui,mesh_id
str,str
"""C0017668""","""D005923"""
"""C0086432""","""D005923"""


In [125]:
dch_omim_cui

# Disease(MESH),Chemical,omim_id,cui
str,str,i64,str
"""OMIM:613721""","""DB00252""",613721,"""C3150987"""
"""OMIM:168600""","""DB01213""",168600,"""C3160718"""
"""OMIM:256600""","""DB04557""",256600,"""C0270724"""
"""OMIM:606771""","""DB00999""",606771,"""C1847529"""
"""OMIM:613382""","""DB00624""",613382,"""C3150644"""
…,…,…,…
"""OMIM:168600""","""DB00158""",168600,"""C3160718"""
"""OMIM:604370""","""DB01174""",604370,"""C2676676"""
"""OMIM:613454""","""DB00530""",613454,"""C3150705"""
"""OMIM:610251""","""DB01563""",610251,"""C2674838"""


In [124]:
dch_mesh_cui.filter(
  pl.col('cui').is_not_null(),
  (pl.col('mesh_id') + pl.col('Chemical')).is_unique()
)

# Disease(MESH),Chemical,mesh_id,cui
str,str,str,str
"""MESH:D009503""","""DB01072""","""D009503""","""C0027947"""
"""MESH:C567059""","""DB00641""","""C567059""","""C1970712"""
"""MESH:D010198""","""DB00481""","""D010198""","""C0030312"""
"""MESH:D007898""","""DB04173""","""D007898""","""C0023290"""
"""MESH:D001249""","""DB00814""","""D001249""","""C0004096"""
…,…,…,…
"""MESH:C563513""","""DB00755""","""C563513""","""C1834144"""
"""MESH:D006317""","""DB01696""","""D006317""","""C0018781"""
"""MESH:D015430""","""DB01592""","""D015430""","""C0043094"""
"""MESH:D009164""","""DB00977""","""D009164""","""C0026918"""


In [119]:
print(f'OMIM IDs: {dch_omim_cui.shape[0]} (initially: {ioc}, diff = {ioc - dch_omim_cui.shape[0]})')
print(f'MESH IDs: {dch_mesh_cui.shape[0]} (initially: {imc}, diff = {imc - dch_mesh_cui.shape[0]})')

OMIM IDs: 9470 (initially: 9470, diff = 0)
MESH IDs: 957814 (initially: 457186, diff = -500628)


In [126]:
map

#CUI_or_CN_id,pref_name,source_id,source
str,str,str,str
"""C0000727""","""Acute abdomen""","""9209005""","""SNOMEDCT_US"""
"""C0000727""","""Acute abdomen""","""HP:0033400""","""HPO"""
"""C0000731""","""Abdominal distention""","""34""","""MedGen"""
"""C0000734""","""Abdominal mass""","""271860004""","""SNOMEDCT_US"""
"""C0000735""","""Neoplasm of abdomen""","""128050000""","""SNOMEDCT_US"""
…,…,…,…
"""CN970821""","""Kleefstra syndrome due to 9q34…","""1610338""","""MedGen"""
"""CN970821""","""Kleefstra syndrome due to 9q34…","""16846""","""GARD"""
"""CN971896""","""Inflammatory myopathy""","""HP:0009071""","""HPO"""
"""CN971896""","""Inflammatory myopathy""","""1611049""","""MedGen"""
