# Export Pathway Info from Reactome

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import re
from collections import defaultdict

from bioservices.kegg import KEGG

In [17]:
import sys

sys.path.append('..')

from pals.common import *
from pals.reactome import *

## Export Data from Reactome

In [4]:
# Fetch the list of species names through the project's get_species_list()
# helper. The result is kept in `all_species` because the "Export Reactome data
# for all species" loop further down iterates over it.
all_species = get_species_list()
# Display the list as the cell output.
all_species

['Alphapapillomavirus 9',
 'Arabidopsis thaliana',
 'Arenicola marina',
 'Bacillus anthracis',
 'Bos taurus',
 'Caenorhabditis elegans',
 'Candida albicans',
 'Canis familiaris',
 'Cavia porcellus',
 'Cercopithecus aethiops',
 'Chlamydia trachomatis',
 'Chlorocebus sabaeus',
 'Clostridium botulinum',
 'Clostridium tetani',
 'Corynephage beta',
 'Cowpox virus',
 'Cricetulus griseus',
 'Crithidia fasciculata',
 'Danio rerio',
 'Dictyostelium discoideum',
 'Drosophila melanogaster',
 'Escherichia coli',
 'Felis catus',
 'Gallus gallus',
 'Hepatitis B virus',
 'Hepatitis C Virus',
 'Hepatitis C virus genotype 2a',
 'Hepatitis C virus subtype 1a',
 'Homarus americanus',
 'Homo sapiens',
 'Human alphaherpesvirus 2',
 'Human cytomegalovirus',
 'Human gammaherpesvirus 4',
 'Human herpesvirus 1',
 'Human herpesvirus 8',
 'Human immunodeficiency virus 1',
 'Human papillomavirus type 16',
 'Infectious bronchitis virus',
 'Influenza A virus',
 'Legionella pneumophila',
 'Listeria monocytogenes',
 

### Export Compound Data

Get KEGG compound dict from exported PiMP data

In [None]:
# Load the compressed JSON export of the PiMP KEGG data and keep only its
# 'entity_dict' entry, which is used as the KEGG compound dictionary below.
json_file = '../pals/data/PiMP_KEGG.json.zip'
all_kegg_compounds = load_json(json_file, compressed=True)
kegg_entity_dict = all_kegg_compounds['entity_dict']

In [None]:
# Save the KEGG compound dictionary on its own as a compressed JSON file.
out_file = '../pals/data/COMPOUND.json.zip'
save_json(kegg_entity_dict, out_file, compressed=True)

Build the ChEBI compound dict by parsing the ontology file (`chebi.owl`) downloaded from https://www.ebi.ac.uk/chebi/downloadsForward.do

In [None]:
# Path to the ChEBI ontology file (chebi.owl). Set the CHEBI_OWL_FILE
# environment variable to point at your own download; when it is unset the
# original author's local path is used, so existing behaviour is unchanged.
owl_file = os.environ.get('CHEBI_OWL_FILE', 'C:/Users/joewa/Downloads/chebi.owl')
# Parse the ontology into the ChEBI compound dictionary used by the export loop.
chebi_entity_dict = parse_chebi_entity_dict(owl_file)

In [None]:
# Save the parsed ChEBI compound dictionary as a compressed JSON file.
out_file = '../pals/data/ChEBI.json.zip'
save_json(chebi_entity_dict, out_file, compressed=True)

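Export Reactome data for all species: for each species, write one database per compound identifier type (KEGG, ChEBI) and per pathway scope (metabolic pathways only, all pathways).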

In [None]:
# Every export written for a species, in the order they are produced.
# Each entry is:
#   (database_name, compound entity dict, metabolic_pathway_only, output sub-directory)
export_configs = [
    (DATABASE_REACTOME_KEGG, kegg_entity_dict, True, 'metabolic_pathways'),
    (DATABASE_REACTOME_KEGG, kegg_entity_dict, False, 'all_pathways'),
    (DATABASE_REACTOME_CHEBI, chebi_entity_dict, True, 'metabolic_pathways'),
    (DATABASE_REACTOME_CHEBI, chebi_entity_dict, False, 'all_pathways'),
]

for species in all_species:
    for database_name, compound_dict, metabolic_pathway_only, sub_dir in export_configs:
        print('Writing %s - %s metabolic_pathway_only=%s' % (species, database_name, metabolic_pathway_only))

        # Query the pathways and the compound-to-pathway mapping once per export.
        pathway_dict = get_pathway_dict(species, metabolic_pathway_only)
        mapping_dict = get_compound_mapping_dict(species, database_name, metabolic_pathway_only)

        # Output layout: ../pals/data/reactome/<scope>/<database>/<species>.json.zip
        json_file = '../pals/data/reactome/%s/%s/%s.json.zip' % (sub_dir, database_name, species)
        write_database(pathway_dict, compound_dict, mapping_dict, json_file)

### Export UniProt Data

In [7]:
# Species used by the UniProt queries in the cells below.
species = 'Homo sapiens'

In [8]:
# Select the UniProt database name (reused by the protein entity query in the
# next cell) and fetch the pathways for `species`, restricted to metabolic
# pathways via metabolic_pathway_only=True.
database_name = DATABASE_REACTOME_UNIPROT
metabolic_pathway_only = True
pathway_dict = get_pathway_dict(species, metabolic_pathway_only)
# Show the number of pathways followed by the dict itself.
# NOTE(review): this displays the whole dict; consider showing only a sample.
len(pathway_dict), pathway_dict

(268,
 {'R-HSA-1475029': {'display_name': 'Reversible hydration of carbon dioxide'},
  'R-HSA-1855196': {'display_name': 'IP3 and IP4 transport between cytosol and nucleus'},
  'R-HSA-1855229': {'display_name': 'IP6 and IP7 transport between cytosol and nucleus'},
  'R-HSA-1855167': {'display_name': 'Synthesis of pyrophosphates in the cytosol'},
  'R-HSA-1855231': {'display_name': 'Synthesis of IPs in the ER lumen'},
  'R-HSA-1855215': {'display_name': 'IPs transport between ER lumen and cytosol'},
  'R-HSA-1855183': {'display_name': 'Synthesis of IP2, IP, and Ins in the cytosol'},
  'R-HSA-1855192': {'display_name': 'IPs transport between nucleus and ER lumen'},
  'R-HSA-1855204': {'display_name': 'Synthesis of IP3 and IP4 in the cytosol'},
  'R-HSA-1855156': {'display_name': 'IPs transport between ER lumen and nucleus'},
  'R-HSA-1855191': {'display_name': 'Synthesis of IPs in the nucleus'},
  'R-HSA-1855184': {'display_name': 'IPs transport between cytosol and ER lumen'},
  'R-HSA-1

In [16]:
# Fetch the protein entities for `species` from the database selected above.
# Judging by the displayed output, keys are protein identifiers and each value
# holds a 'display_name'.
entity_dict = get_protein_entity_dict(species, database_name)
# Show the number of entities followed by the dict itself.
len(entity_dict), entity_dict


        MATCH
            (rg:ReferenceGeneProduct)-[:referenceGene]->
            (rs:ReferenceSequence)-[:species]->(s:Species)
        WHERE
            rg.databaseName = {database_name} AND            
            s.displayName = {species}
        RETURN DISTINCT 
            rg.identifier AS entity_id, 
            rg.description AS display_name
        


(10743,
 {'P49815': {'display_name': 'Tuberin'},
  'P25054': {'display_name': 'Adenomatous polyposis coli protein shortName'},
  'Q96PL1': {'display_name': 'Secretoglobin family 3A member 2'},
  'Q9UEW3': {'display_name': 'Macrophage receptor MARCO'},
  'Q6ZMJ2': {'display_name': 'fullName evidence="1"Scavenger receptor class A member 5'},
  'P21757': {'display_name': 'Macrophage scavenger receptor types I and II'},
  'Q9NY15': {'display_name': 'Stabilin-1'},
  'A1L4H1': {'display_name': 'Soluble scavenger receptor cysteine-rich domain-containing protein SSC5D'},
  'Q14162': {'display_name': 'Scavenger receptor class F member 1'},
  'P00739': {'display_name': 'Haptoglobin-related protein'},
  'Q86VB7': {'display_name': 'Scavenger receptor cysteine-rich type 1 protein M130'},
  'P02760': {'display_name': 'Protein AMBP  component recommendedName'},
  'P02790': {'display_name': 'Hemopexin'},
  'Q9BRG1': {'display_name': 'Vacuolar protein-sorting-associated protein 25 shortName'},
  'Q86VN

In [7]:
# Fetch the protein-to-pathway mapping for `species` using the UniProt database.
# Judging by the displayed output, keys are protein identifiers and each value
# is a list of pathway ids.
mapping_dict = get_protein_mapping_dict(species, DATABASE_REACTOME_UNIPROT)
# Show the number of mapped proteins followed by the dict itself.
len(mapping_dict), mapping_dict

MATCH (tp:TopLevelPathway)-[:hasEvent*]->
              (p:Pathway)-[:hasEvent*]->(rle:ReactionLikeEvent),
              (rle)-[:input|output|catalystActivity|physicalEntity|regulatedBy|regulator|hasComponent
              |hasMember|hasCandidate*]->(pe:PhysicalEntity),
              (pe:PhysicalEntity)-[:referenceEntity]->
              (re:ReferenceEntity)-[:referenceDatabase]->
              (rd:ReferenceDatabase)
        WHERE
              rle.speciesName IN {species} AND
              rd.displayName = {database_name} AND
         (p)-[:hasEvent]->(rle) AND  tp.displayName = 'Metabolism' 
            RETURN DISTINCT
                p.stId AS pathway_id,
                re.identifier AS entity_id
        


(2095,
 {'P35218': ['R-HSA-1475029'],
  'Q9Y2D0': ['R-HSA-1475029'],
  'P07451': ['R-HSA-1475029'],
  'P00915': ['R-HSA-1475029'],
  'Q8N1Q1': ['R-HSA-1475029'],
  'P00918': ['R-HSA-1475029'],
  'P43166': ['R-HSA-1475029'],
  'Q16790': ['R-HSA-1475029'],
  'O43570': ['R-HSA-1475029'],
  'P22748': ['R-HSA-1475029'],
  'Q9ULX7': ['R-HSA-1475029'],
  'P23280': ['R-HSA-1475029'],
  'Q96G61': ['R-HSA-1855167'],
  'O95989': ['R-HSA-1855167'],
  'Q8NFP7': ['R-HSA-1855167'],
  'Q9NZJ9': ['R-HSA-1855167'],
  'Q9H8X2': ['R-HSA-1855167', 'R-HSA-1855191'],
  'Q6PFW1': ['R-HSA-1855167'],
  'O43314': ['R-HSA-1855167'],
  'Q13572': ['R-HSA-1855167', 'R-HSA-1855204'],
  'Q92551': ['R-HSA-1855167', 'R-HSA-1855191'],
  'Q96PC2': ['R-HSA-1855167'],
  'Q9UNW1': ['R-HSA-1855231'],
  'Q9UGB7': ['R-HSA-1855183'],
  'P32019': ['R-HSA-1855183', 'R-HSA-1855204'],
  'Q14642': ['R-HSA-1855183'],
  'O14732': ['R-HSA-1855183'],
  'P29218': ['R-HSA-1855183'],
  'Q96PE3': ['R-HSA-1855183', 'R-HSA-1660516', 'R-HSA-166

### Export ENSEMBL Data

In [19]:
# Species and database name used by the ENSEMBL queries in the cells below.
species = 'Homo sapiens'
database_name = DATABASE_REACTOME_ENSEMBL

In [20]:
# Fetch the gene entities for `species` from the ENSEMBL database.
# Judging by the displayed output, keys are gene identifiers and each value
# holds a 'display_name'.
entity_dict = get_gene_entity_dict(species, database_name)
# Show the number of entities followed by the dict itself.
len(entity_dict), entity_dict


        MATCH
            (rg:ReferenceGeneProduct)-[:referenceGene]->
            (rs:ReferenceSequence)-[:species]->(s:Species)
        WHERE
            rs.databaseName = {database_name} AND            
            s.displayName = {species}
        RETURN DISTINCT 
            rs.identifier AS entity_id, 
            rs.geneName[0] AS display_name
        


(11863,
 {'ENSG00000103197': {'display_name': 'TSC2'},
  'ENSG00000134982': {'display_name': 'APC'},
  'ENSG00000164265': {'display_name': 'SCGB3A2'},
  'ENSG00000019169': {'display_name': 'MARCO'},
  'ENSG00000168079': {'display_name': 'SCARA5'},
  'ENSG00000038945': {'display_name': 'MSR1'},
  'ENSG00000010327': {'display_name': 'STAB1'},
  'ENSG00000179954': {'display_name': 'SSC5D'},
  'ENSG00000276336': {'display_name': 'SCARF1'},
  'ENSG00000074660': {'display_name': 'SCARF1'},
  'ENSG00000261701': {'display_name': 'HPR'},
  'ENSG00000177575': {'display_name': 'CD163'},
  'ENSG00000106927': {'display_name': 'AMBP'},
  'ENSG00000110169': {'display_name': 'HPX'},
  'ENSG00000131475': {'display_name': 'VPS25'},
  'ENSG00000136100': {'display_name': 'VPS36'},
  'ENSG00000159210': {'display_name': 'SNF8'},
  'ENSG00000136827': {'display_name': 'TOR1A'},
  'ENSG00000136816': {'display_name': 'TOR1B'},
  'ENSG00000135018': {'display_name': 'UBQLN1'},
  'ENSG00000127946': {'display_name'

In [8]:
# Fetch the gene-to-pathway mapping for `species` from the ENSEMBL database.
# Judging by the displayed output, keys are gene identifiers and each value
# is a list of pathway ids.
mapping_dict = get_gene_mapping_dict(species, database_name)
# Show the number of mapped genes followed by the dict itself.
len(mapping_dict), mapping_dict

MATCH (tp:TopLevelPathway)-[:hasEvent*]->
              (p:Pathway)-[:hasEvent*]->(rle:ReactionLikeEvent),
              (rle)-[:input|output|catalystActivity|physicalEntity|regulatedBy|regulator|hasComponent
              |hasMember|hasCandidate*]->(pe:PhysicalEntity),
              (pe:PhysicalEntity)-[:referenceEntity]->
              (rg:ReferenceGeneProduct)-[:referenceGene]->
              (rs:ReferenceSequence)-[:species]->(s:Species)
        WHERE
              rs.databaseName = {database_name} AND            
              s.displayName IN {species} AND
         (p)-[:hasEvent]->(rle) AND  tp.displayName = 'Metabolism' 
            RETURN DISTINCT
                p.stId AS pathway_id,
                rs.identifier AS entity_id
        


(2235,
 {'ENSG00000174990': ['R-HSA-1475029'],
  'ENSG00000169239': ['R-HSA-1475029'],
  'ENSG00000164879': ['R-HSA-1475029'],
  'ENSG00000133742': ['R-HSA-1475029'],
  'ENSG00000185015': ['R-HSA-1475029'],
  'ENSG00000104267': ['R-HSA-1475029'],
  'ENSG00000168748': ['R-HSA-1475029'],
  'ENSG00000107159': ['R-HSA-1475029'],
  'ENSG00000074410': ['R-HSA-1475029'],
  'ENSG00000167434': ['R-HSA-1475029'],
  'ENSG00000118298': ['R-HSA-1475029'],
  'ENSG00000131686': ['R-HSA-1475029'],
  'ENSG00000196368': ['R-HSA-1855167'],
  'ENSG00000272325': ['R-HSA-1855167'],
  'ENSG00000122824': ['R-HSA-1855167'],
  'ENSG00000173598': ['R-HSA-1855167'],
  'ENSG00000127080': ['R-HSA-1855167', 'R-HSA-1855191'],
  'ENSG00000168781': ['R-HSA-1855167'],
  'ENSG00000145725': ['R-HSA-1855167'],
  'ENSG00000274958': ['R-HSA-1855167', 'R-HSA-1855204'],
  'ENSG00000100605': ['R-HSA-1855167', 'R-HSA-1855204'],
  'ENSG00000176095': ['R-HSA-1855167', 'R-HSA-1855191'],
  'ENSG00000161896': ['R-HSA-1855167'],
  'EN