In [2]:
from pathlib import Path

import polars as pl

# Data processing

In [188]:
def _scan_ctd(path, columns):
    """Lazily scan a tab-separated file, keeping and renaming selected columns.

    `columns` maps each source column name to a `(dtype, new_name)` pair.
    The dtypes are passed to the reader as schema overrides; the resulting
    LazyFrame holds only these columns, in mapping order, under the new names.
    """
    dtypes = pl.Schema({src: dtype for src, (dtype, _) in columns.items()})
    renames = {src: dst for src, (_, dst) in columns.items()}
    return (
        pl.scan_csv(path, separator='\t', schema_overrides=dtypes)
        .select(list(columns))
        .rename(renames)
    )


# Chemical-gene interactions.
ctd_chg = _scan_ctd(
    './data/ctd/CTD_chem_gene_ixns.tsv',
    {
        'ChemicalName': (pl.String, 'CHEM_NAME'),
        'ChemicalID': (pl.String, 'CHEM_MESH_ID'),
        'GeneID': (pl.Int64, 'GENE_NCBI_ID'),
        'InteractionActions': (pl.String, 'INTERACTIONS'),
    },
)

# Chemical-disease associations.
ctd_chd = _scan_ctd(
    './data/ctd/CTD_curated_chemicals_diseases.tsv',
    {
        'ChemicalName': (pl.String, 'CHEM_NAME'),
        'ChemicalID': (pl.String, 'CHEM_MESH_ID'),
        'DiseaseName': (pl.String, 'DS_NAME'),
        'DiseaseID': (pl.String, 'DS_OMIM_MESH_ID'),
    },
)

# Gene-disease associations.
ctd_dg = _scan_ctd(
    './data/ctd/CTD_curated_genes_diseases.tsv',
    {
        'GeneID': (pl.Int64, 'GENE_NCBI_ID'),
        'DiseaseName': (pl.String, 'DS_NAME'),
        'DiseaseID': (pl.String, 'DS_OMIM_MESH_ID'),
        'OmimIDs': (pl.String, 'DS_OMIM_IDS'),
    },
)

# Protein-protein interaction pairs: a header-less two-column CSV, so the
# column names are supplied here.
ppi = pl.scan_csv(
    './data/snap/PP-Decagon_ppi.csv',
    has_header=False,
    new_columns=['GENE_NCBI_ID_1', 'GENE_NCBI_ID_2'],
)

ctd_chd:
  - Disease ID - MESH/OMIM
  - Chem ID - MESH
  
ctd_chg:
  - Chem ID - MESH
  - Gene ID - NCBI
  
ctd_dg:
  - Disease ID - MESH/OMIM
  - Gene ID - NCBI
  
ppi:
  - Gene ID (both columns) - NCBI

In [189]:
# Gene node table: genes that appear in the chemical-gene table, the
# gene-disease table AND at least one side of a PPI pair.
genes_core = (
  ctd_chg.select(pl.col('GENE_NCBI_ID').unique())
  .join(
    ctd_dg.select(pl.col('GENE_NCBI_ID').unique()),
    on = 'GENE_NCBI_ID',
    how = 'inner'
  )
  .join(
    # Stack both PPI columns into one so a gene on either side counts.
    pl.concat([
      ppi.select('GENE_NCBI_ID_1').rename({'GENE_NCBI_ID_1': 'GENE_NCBI_ID'}),
      ppi.select('GENE_NCBI_ID_2').rename({'GENE_NCBI_ID_2': 'GENE_NCBI_ID'})
    ],
      how = 'vertical'
    ).unique(),
    on = 'GENE_NCBI_ID',
    how = 'inner'
  )
  .with_columns(
    pl.col('GENE_NCBI_ID').cast(pl.UInt32)
  )
  # unique() and joins do not guarantee row order, so sort before assigning
  # the row index; otherwise GENE_ID can map to different genes on each run.
  .sort('GENE_NCBI_ID')
  .with_row_index('GENE_ID')
  .collect()
)

# Disease node table: diseases present in both the gene-disease and the
# chemical-disease tables. DS_NAME is taken from the gene-disease side.
ds_core = (
  ctd_dg.select(['DS_OMIM_MESH_ID', 'DS_NAME']).unique('DS_OMIM_MESH_ID')
  .join(
    # Only the key is needed from this side; it just restricts the ID set.
    ctd_chd.select('DS_OMIM_MESH_ID').unique(),
    on = 'DS_OMIM_MESH_ID',
    how = 'inner'
  )
  .select(['DS_OMIM_MESH_ID', 'DS_NAME'])
  # unique() and joins do not guarantee row order, so sort before assigning
  # the row index; otherwise DS_ID can map to different diseases on each run.
  .sort('DS_OMIM_MESH_ID')
  .with_row_index('DS_ID')
  .collect()
)

# Chemical node table: chemicals present in both the chemical-gene and the
# chemical-disease tables. CHEM_NAME is taken from the chemical-gene side.
chems_core = (
  ctd_chg.select(['CHEM_MESH_ID', 'CHEM_NAME']).unique('CHEM_MESH_ID')
  .join(
    # Only the key is needed from this side; it just restricts the ID set.
    ctd_chd.select('CHEM_MESH_ID').unique(),
    on = 'CHEM_MESH_ID',
    how = 'inner'
  )
  .select(['CHEM_MESH_ID', 'CHEM_NAME'])
  # unique() and joins do not guarantee row order, so sort before assigning
  # the row index; otherwise CHEM_ID can map to different chemicals on each run.
  .sort('CHEM_MESH_ID')
  .with_row_index('CHEM_ID')
  .collect()
)

In [190]:
# Row count of each node table, in the order genes, diseases, chemicals.
r = [table.height for table in (genes_core, ds_core, chems_core)]
print(
  f'Genes: {r[0]}\n'
  f'Diseases: {r[1]}\n'
  f'Chemicals: {r[2]}\n\n'
  f'All nodes: {sum(r)}'
)

Genes: 8095
Diseases: 1885
Chemicals: 7906

All nodes: 17886


In [191]:
# Chemical-gene edges restricted to core nodes.
# For every (chemical, gene) pair one INTERACTIONS string is kept -- the one
# ranked highest by: contains a '|' list, number of '|', character length --
# and is then split into one row per `ACTION_TYPE^ACTION_SUBJECT` item.
ctd_chg_final = (
  ctd_chg.collect()
  .join(genes_core, on='GENE_NCBI_ID', how='inner')
  .join(chems_core, on='CHEM_MESH_ID', how='inner')
  # Fill nulls in a step of its own. Expressions inside one with_columns call
  # all read the input frame, so computing the ranking columns alongside the
  # fill would derive them from the unfilled column and leave them null.
  # Null ranking keys could then let an empty string win `.first()` below,
  # which would drop the whole pair.
  .with_columns(
    pl.col('INTERACTIONS').cast(pl.Utf8).fill_null('')
  )
  .with_columns([
    pl.col('INTERACTIONS').str.contains(r'\|').alias('has_list'),
    pl.col('INTERACTIONS').str.count_matches(r'\|').alias('n_pipes'),
    pl.col('INTERACTIONS').str.len_chars().alias('n_chars'),
  ])
  .group_by(['CHEM_MESH_ID', 'GENE_NCBI_ID'])
  .agg([
    pl.col('INTERACTIONS')
      # The string itself is the last key so ties on the three ranking
      # columns are broken the same way on every run.
      .sort_by(['has_list', 'n_pipes', 'n_chars', 'INTERACTIONS'], descending=True)
      .first()
      .alias('INTERACTIONS'),
    pl.first('CHEM_ID').alias('CHEM_ID'),
    pl.first('GENE_ID').alias('GENE_ID'),
  ])
  .with_columns(
    # Empty strings become a null list so the pair is removed by drop_nulls.
    pl.when(pl.col('INTERACTIONS') == '')
      .then(pl.lit(None, dtype=pl.List(pl.Utf8)))
      .otherwise(pl.col('INTERACTIONS').str.split('|'))
      .alias('INTERACTION_ITEM')
  )
  .explode('INTERACTION_ITEM')
  .drop_nulls('INTERACTION_ITEM')
  .with_columns(
    # Split each item once on '^' into a two-field struct.
    pl.col('INTERACTION_ITEM')
      .str.split_exact('^', 1)
      .struct.rename_fields(['ACTION_TYPE', 'ACTION_SUBJECT'])
      .alias('parts')
  )
  .unnest('parts')
  .select([
    'CHEM_ID',
    'GENE_ID',
    'ACTION_TYPE',
    'ACTION_SUBJECT',
  ])
  .unique()
  # group_by/unique leave the row order unspecified; sort so the edge index
  # is determined by the edge contents.
  .sort(['CHEM_ID', 'GENE_ID', 'ACTION_TYPE', 'ACTION_SUBJECT'])
  .with_row_index('CHEM_GENE_IDX')
)

# Gene-disease edges restricted to core nodes, one row per (gene, disease).
ctd_dg_final = (
  ctd_dg.collect()
  .join(genes_core, on = 'GENE_NCBI_ID', how = 'inner')
  .join(ds_core, on = 'DS_OMIM_MESH_ID', how = 'inner')
  .unique(['GENE_NCBI_ID', 'DS_OMIM_MESH_ID'])
  .select([
    'GENE_ID',
    'DS_ID'
  ])
  # unique() leaves the row order unspecified; sort so the edge index is
  # determined by the edge contents.
  .sort(['GENE_ID', 'DS_ID'])
  .with_row_index('GENE_DS_IDX')
)

# Chemical-disease edges restricted to core nodes, one row per
# (chemical, disease).
ctd_chd_final = (
  ctd_chd.collect()
  .join(chems_core, on = 'CHEM_MESH_ID', how = 'inner')
  .join(ds_core, on = 'DS_OMIM_MESH_ID', how = 'inner')
  .unique(['CHEM_MESH_ID', 'DS_OMIM_MESH_ID'])
  .select([
    'CHEM_ID',
    'DS_ID'
  ])
  # unique() leaves the row order unspecified; sort so the edge index is
  # determined by the edge contents.
  .sort(['CHEM_ID', 'DS_ID'])
  .with_row_index('CHEM_DS_IDX')
)

# Undirected PPI edges between core genes. Each pair is stored once as
# (smaller GENE_ID, larger GENE_ID); self-loops are removed.
ppi_final = (
  ppi.collect()
  # Map each endpoint's NCBI ID to its GENE_ID; the inner joins drop pairs
  # where either endpoint is not a core gene.
  .join(
    genes_core.select([
      pl.col('GENE_NCBI_ID').alias('GENE_NCBI_ID_1'),
      pl.col('GENE_ID').alias('GENE_ID_1'),
    ]),
    on='GENE_NCBI_ID_1',
    how='inner',
  )
  .join(
    genes_core.select([
      pl.col('GENE_NCBI_ID').alias('GENE_NCBI_ID_2'),
      pl.col('GENE_ID').alias('GENE_ID_2'),
    ]),
    on='GENE_NCBI_ID_2',
    how='inner',
  )
  .filter(pl.col('GENE_ID_1') != pl.col('GENE_ID_2'))
  # Canonical orientation so (a, b) and (b, a) collapse in unique().
  .with_columns([
    pl.min_horizontal('GENE_ID_1', 'GENE_ID_2').alias('GENE_ID_SRC'),
    pl.max_horizontal('GENE_ID_1', 'GENE_ID_2').alias('GENE_ID_DST'),
  ])
  .select(['GENE_ID_SRC', 'GENE_ID_DST'])
  .unique()
  # unique() leaves the row order unspecified; sort so the edge index is
  # determined by the edge contents.
  .sort(['GENE_ID_SRC', 'GENE_ID_DST'])
  .with_row_index('PPI_IDX')
)

# Directed view of the PPI edges: all SRC -> DST rows first, then the same
# pairs reversed (DST -> SRC), re-indexed from zero.
ppi_directed = pl.concat(
  [
    ppi_final.select(
      GENE_ID_1=pl.col(head),
      GENE_ID_2=pl.col(tail),
    )
    for head, tail in (
      ('GENE_ID_SRC', 'GENE_ID_DST'),
      ('GENE_ID_DST', 'GENE_ID_SRC'),
    )
  ],
  how='vertical',
).with_row_index('PPI_DIR_IDX')


In [194]:
# Per-table edge counts, then totals with PPI counted once per pair
# (undirected) and once per direction (directed).
print(
  f'Chemical-Gene edges: {ctd_chg_final.height}\n'
  f'Disease-Gene edges: {ctd_dg_final.height}\n'
  f'Chemical-Disease edges: {ctd_chd_final.height}\n'
  f'PPI edges: {ppi_final.height}\n'
  f'PPI directed edges: {ppi_directed.height}\n\n'
  f'All edges (PPI undirected): {ctd_chg_final.height + ctd_dg_final.height + ctd_chd_final.height + ppi_final.height}\n'
  f'All edges (PPI directed): {ctd_chg_final.height + ctd_dg_final.height + ctd_chd_final.height + ppi_directed.height}'
)

Chemical-Gene edges: 1327372
Disease-Gene edges: 27182
Chemical-Disease edges: 87959
PPI edges: 264560
PPI directed edges: 529120

All edges (PPI undirected): 1707073
All edges (PPI directed): 1971633


In [195]:
# Persist the node and edge tables as parquet.
# Create the output directory first: by default write_parquet does not create
# missing parent directories, so a fresh checkout would fail here.
Path('./data/processed').mkdir(parents=True, exist_ok=True)

# nodes
genes_core.write_parquet('./data/processed/genes_nodes.parquet')
ds_core.write_parquet('./data/processed/diseases_nodes.parquet')
chems_core.write_parquet('./data/processed/chemicals_nodes.parquet')

# edges
ctd_chg_final.write_parquet('./data/processed/chem_gene_edges.parquet')
ctd_dg_final.write_parquet('./data/processed/disease_gene_edges.parquet')
ctd_chd_final.write_parquet('./data/processed/chem_disease_edges.parquet')
ppi_final.write_parquet('./data/processed/ppi_edges.parquet')
ppi_directed.write_parquet('./data/processed/ppi_directed_edges.parquet')

# Modelling

In [196]:
# Reload the processed edge tables from disk.
# NOTE: `ppi` is rebound here from the raw PPI LazyFrame to the processed
# undirected edge table, and `ppi_directed` is replaced by its on-disk copy.
chd, chg, dg, ppi, ppi_directed = map(
  pl.read_parquet,
  (
    './data/processed/chem_disease_edges.parquet',
    './data/processed/chem_gene_edges.parquet',
    './data/processed/disease_gene_edges.parquet',
    './data/processed/ppi_edges.parquet',
    './data/processed/ppi_directed_edges.parquet',
  ),
)