# Export Pathway Info from Reactome

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import re
from collections import defaultdict

from bioservices.kegg import KEGG

In [3]:
import sys

sys.path.append('../pals')

from common import *

## Common methods

In [4]:
from neo4j import GraphDatabase, basic_auth
# Connection settings for the Reactome Neo4j instance. Each value can be
# overridden through an environment variable of the same name so that real
# credentials do not have to be stored in the notebook; the fallbacks are the
# values that used to be hard-coded here.
NEO4J_SERVER = os.environ.get('NEO4J_SERVER', 'bolt://localhost:7687')
NEO4J_USER = os.environ.get('NEO4J_USER', 'neo4j')
NEO4J_PASSWORD = os.environ.get('NEO4J_PASSWORD', 'neo4j')

# Module-level driver used by get_neo4j_session() to open sessions.
driver = GraphDatabase.driver(NEO4J_SERVER,
                              auth=basic_auth(NEO4J_USER, NEO4J_PASSWORD))

def get_neo4j_session():
    """Open and return a new session on the module-level Neo4j ``driver``.

    Any exception raised while opening the session propagates to the caller
    unchanged. The caller is responsible for closing the returned session.
    """
    return driver.session()

def rchop(thestring, ending):
    """Return ``thestring`` with one trailing occurrence of ``ending`` removed.

    Args:
        thestring: the string to trim.
        ending: the suffix to remove.

    Returns:
        ``thestring`` without the suffix if it ends with ``ending``,
        otherwise ``thestring`` unchanged. An empty ``ending`` leaves the
        string untouched.
    """
    # Every string "ends with" '', and thestring[:-0] is the empty string, so
    # an empty suffix must be special-cased to avoid wiping the whole input.
    if ending and thestring.endswith(ending):
        return thestring[:-len(ending)]
    return thestring

## Export Data from Reactome

In [5]:
def get_species_list():
    """Return the display names of all ``Species`` nodes, sorted by name.

    Returns:
        A list with the ``displayName`` of every ``Species`` node, in the
        order produced by the query's ``order by name``.

    Raises:
        Whatever ``get_neo4j_session()`` or ``session.run()`` raises; the
        session, if one was opened, is closed before the error propagates.
    """
    # Bind the name before the try block: if opening the session fails, the
    # finally clause must still be able to test it without raising an
    # UnboundLocalError that would hide the original exception.
    session = None
    try:
        session = get_neo4j_session()
        query = """
        MATCH (n:Species) RETURN n.displayName AS name order by name
        """
        # Consume the result while the session is still open.
        return [record['name'] for record in session.run(query)]
    finally:
        if session is not None:
            session.close()

In [6]:
def get_pathway_dict(species, metabolic_pathway_only=True, leaf=True):
    """Return the pathways of ``species`` keyed by their stable identifier.

    Args:
        species: value matched against ``speciesName`` of the top-level
            pathway.
        metabolic_pathway_only: when true, only pathways below the top-level
            pathway whose ``displayName`` is 'Metabolism' are returned.
        leaf: when true, only pathways that have a direct ``hasEvent``
            relationship to a reaction-like event are returned.

    Returns:
        A dict mapping each pathway ``stId`` to ``{'display_name': <name>}``.

    Raises:
        Whatever ``get_neo4j_session()`` or ``session.run()`` raises; the
        session, if one was opened, is closed before the error propagates.
    """
    # Collect the WHERE conditions and join them once, instead of appending
    # "AND" after every condition and chopping the last one off again.
    # NOTE(review): `{species}` is the legacy Cypher parameter syntax; newer
    # Neo4j servers expect `$species` - confirm against the server in use.
    conditions = ['tp.speciesName = {species}']
    if leaf:  # retrieve only the leaf nodes in the pathway hierarchy
        conditions.append('(p)-[:hasEvent]->(rle)')
    if metabolic_pathway_only:  # only retrieves metabolic pathways
        conditions.append("tp.displayName = 'Metabolism'")

    query = """
        MATCH (tp:TopLevelPathway)-[:hasEvent*]->(p:Pathway)-[:hasEvent*]->(rle:ReactionLikeEvent)
        WHERE
            %s
        RETURN DISTINCT
            p.speciesName AS species_name,
            p.displayName AS pathway_name,
            p.stId AS pathway_id
    """ % ' AND '.join(conditions)
    params = {
        'species': species
    }

    # Bind the name before the try block so that a failure to open the
    # session is not masked by an UnboundLocalError in the finally clause.
    session = None
    try:
        session = get_neo4j_session()
        results = {}
        for record in session.run(query, params):
            results[record['pathway_id']] = {'display_name': record['pathway_name']}
        return results
    finally:
        if session is not None:
            session.close()

In [7]:
# Fetch the species names once; the export loop further down iterates over
# this list. The bare expression displays the list as the cell output.
all_species = get_species_list()
all_species

['Alphapapillomavirus 9',
 'Arabidopsis thaliana',
 'Arenicola marina',
 'Bacillus anthracis',
 'Bos taurus',
 'Caenorhabditis elegans',
 'Candida albicans',
 'Canis familiaris',
 'Cavia porcellus',
 'Cercopithecus aethiops',
 'Chlamydia trachomatis',
 'Chlorocebus sabaeus',
 'Clostridium botulinum',
 'Clostridium tetani',
 'Corynephage beta',
 'Cowpox virus',
 'Cricetulus griseus',
 'Crithidia fasciculata',
 'Danio rerio',
 'Dictyostelium discoideum',
 'Drosophila melanogaster',
 'Escherichia coli',
 'Felis catus',
 'Gallus gallus',
 'Hepatitis B virus',
 'Hepatitis C Virus',
 'Hepatitis C virus genotype 2a',
 'Hepatitis C virus subtype 1a',
 'Homarus americanus',
 'Homo sapiens',
 'Human alphaherpesvirus 2',
 'Human cytomegalovirus',
 'Human gammaherpesvirus 4',
 'Human herpesvirus 1',
 'Human herpesvirus 8',
 'Human immunodeficiency virus 1',
 'Human papillomavirus type 16',
 'Infectious bronchitis virus',
 'Influenza A virus',
 'Legionella pneumophila',
 'Listeria monocytogenes',
 

### Export Compound Data

Retrieve the mapping of compounds to pathways from Reactome

In [8]:
def get_compound_mapping_dict(species, database_name, metabolic_pathway_only=True, leaf=True):
    """Map external entity identifiers to the pathways they take part in.

    Args:
        species: value matched against ``speciesName`` of the top-level
            pathway.
        database_name: value matched against ``databaseName`` of the
            cross-referenced database object.
        metabolic_pathway_only: when true, only pathways below the top-level
            pathway whose ``displayName`` is 'Metabolism' are considered.
        leaf: when true, only pathways that have a direct ``hasEvent``
            relationship to a reaction-like event are considered.

    Returns:
        A plain dict mapping each ``identifier`` of a matching database
        object to the list of pathway ``stId`` values it was found in.

    Raises:
        Whatever ``get_neo4j_session()`` or ``session.run()`` raises; the
        session, if one was opened, is closed before the error propagates.
    """
    # Collect the WHERE conditions and join them once, instead of appending
    # "AND" after every condition and chopping the last one off again.
    # NOTE(review): `{species}` / `{database_name}` use the legacy Cypher
    # parameter syntax; newer Neo4j servers expect `$species` - confirm
    # against the server in use.
    conditions = [
        'tp.speciesName = {species}',
        'do.databaseName = {database_name}',
    ]
    if leaf:  # retrieve only the leaf nodes in the pathway hierarchy
        conditions.append('(p)-[:hasEvent]->(rle)')
    if metabolic_pathway_only:  # only retrieves metabolic pathways
        conditions.append("tp.displayName = 'Metabolism'")

    query = """
        MATCH (tp:TopLevelPathway)-[:hasEvent*]->
              (p:Pathway)-[:hasEvent*]->(rle:ReactionLikeEvent),
              (rle)-[:input|output|catalystActivity|physicalEntity|regulatedBy|regulator|hasComponent
              |hasMember|hasCandidate*]->(pe:PhysicalEntity),
              (pe:PhysicalEntity)-[:crossReference|:referenceEntity]->(do:DatabaseObject)
        WHERE
              %s
        RETURN DISTINCT
            p.stId AS pathway_id,
            do.identifier AS entity_id
    """ % ' AND '.join(conditions)
    params = {
        'species': species,
        'database_name': database_name
    }

    # Bind the name before the try block so that a failure to open the
    # session is not masked by an UnboundLocalError in the finally clause.
    session = None
    try:
        session = get_neo4j_session()
        results = defaultdict(list)
        for record in session.run(query, params):
            results[record['entity_id']].append(record['pathway_id'])
        # Hand back a plain dict so that looking up a missing key later on
        # does not silently create an empty entry.
        return dict(results)
    finally:
        if session is not None:
            session.close()

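Get KEGG compound dict from downloaded JSON file (this cell is currently commented out; the KEGG entity dict used for the export is loaded from the PiMP data in the next cell)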

In [9]:
# json_file = '../pals/data/all_kegg_compounds.json.test.zip'
# all_kegg_compounds = load_json(json_file, compressed=True)
# entity_dict = {}
# compound_info = all_kegg_compounds['cmpd_info']
# for compound_id in compound_info:
#     try:
#         name = compound_info[compound_id]['NAME'][0]        
#         formula = compound_info[compound_id]['FORMULA']
#         entity_dict[compound_id] = {
#             'unique_id': formula,
#             'display_name': name
#         }
#     except:
#         pass

Get KEGG compound dict from exported PiMP data

In [10]:
# Load the compressed PiMP KEGG export and keep only its 'entity_dict' entry,
# which the export loop passes to write_database() for the KEGG databases.
# load_json is not defined in this notebook - presumably it comes from
# `common` via the star import; verify.
json_file = '../pals/data/PiMP_KEGG.json.zip'
all_kegg_compounds = load_json(json_file, compressed=True)
kegg_entity_dict = all_kegg_compounds['entity_dict']

Get ChEBI compound dict from the Ontology file downloaded from https://www.ebi.ac.uk/chebi/downloadsForward.do

In [11]:
# Fields collected for the <owl:Class> element currently being read. They are
# cleared at every </owl:Class> so that nothing leaks into the next class.
chebi_id = None
display_name = None
formula = None

# ChEBI id -> {'display_name': label, 'unique_id': formula}; only classes for
# which all three fields were found are recorded.
chebi_entity_dict = {}

# NOTE(review): machine-specific absolute path - point this at a local copy of
# chebi.owl before running the cell.
with open('C:/Users/joewa/Downloads/chebi.owl', encoding='utf-8') as f:
    # The file is scanned line by line, so each tag of interest has to sit on
    # a single line; lines that do not match the expected shape are skipped
    # instead of raising AttributeError on a failed regex match.
    for line in f:
        found = line.strip()
        if 'owl:Class' in line and 'rdf:about' in line:
            # Opening tag of a class: the id is the text between 'CHEBI_' and
            # the closing quote of the attribute. A class without a ChEBI id
            # leaves chebi_id as None and is therefore never recorded.
            res = re.search(r'CHEBI_([^"]*)"', found)
            chebi_id = res.group(1) if res is not None else None
        if 'chebi:formula' in line:
            res = re.search(r'<chebi:formula.*>(.*)</chebi:formula>', found)
            if res is not None:
                formula = res.group(1)
        if 'rdfs:label' in line:
            res = re.search(r'<rdfs:label.*>(.*)</rdfs:label>', found)
            if res is not None:
                display_name = res.group(1)
        if '</owl:Class>' in line:
            if chebi_id is not None and display_name is not None and formula is not None:
                chebi_entity_dict[chebi_id] = {
                    'display_name': display_name,
                    'unique_id': formula
                }
            # Reset unconditionally: a class that lacked one of the fields
            # must not pass its partial state on to the following class.
            chebi_id = None
            display_name = None
            formula = None

Export the KEGG and ChEBI compound databases for every species, once restricted to the metabolic pathways and once for all pathways

In [12]:
def write_database(pathway_dict, entity_dict, mapping_dict, json_file):
    """Bundle the three dictionaries and save them as one compressed JSON file.

    Nothing is written when ``mapping_dict`` is empty.

    Args:
        pathway_dict: stored under the key 'pathway_dict'.
        entity_dict: stored under the key 'entity_dict'.
        mapping_dict: stored under the key 'mapping_dict'.
        json_file: output path handed to ``save_json`` (not defined in this
            notebook - presumably provided by ``common``; verify).
    """
    # Guard clause: no mapping means there is nothing to export.
    if len(mapping_dict) == 0:
        return

    save_json(
        {
            'pathway_dict': pathway_dict,
            'entity_dict': entity_dict,
            'mapping_dict': mapping_dict,
        },
        json_file,
        compressed=True,
    )

In [13]:
# Cross-reference databases to export, each paired with the entity dictionary
# that is written alongside its mapping (KEGG first, then ChEBI).
export_databases = [
    (DATABASE_REACTOME_KEGG, kegg_entity_dict),
    (DATABASE_REACTOME_CHEBI, chebi_entity_dict),
]

# Output sub-directory for each value of metabolic_pathway_only.
export_subdirs = {
    True: 'metabolic_pathways',
    False: 'all_pathways',
}

for species in all_species:
    # get_pathway_dict() does not depend on the database, so each variant is
    # queried at most once per species and shared between KEGG and ChEBI.
    species_pathway_dicts = {}

    for database_name, entity_dict in export_databases:
        for metabolic_pathway_only in (True, False):
            print('Writing %s - %s metabolic_pathway_only=%s' % (species, database_name, metabolic_pathway_only))

            if metabolic_pathway_only not in species_pathway_dicts:
                species_pathway_dicts[metabolic_pathway_only] = get_pathway_dict(species, metabolic_pathway_only)
            pathway_dict = species_pathway_dicts[metabolic_pathway_only]

            mapping_dict = get_compound_mapping_dict(species, database_name, metabolic_pathway_only)
            json_file = '../pals/data/reactome/%s/%s/%s.json.zip' % (
                export_subdirs[metabolic_pathway_only], database_name, species)
            # write_database() skips the file when mapping_dict is empty.
            write_database(pathway_dict, entity_dict, mapping_dict, json_file)

Writing Alphapapillomavirus 9 - COMPOUND metabolic_pathway_only=True
Writing Alphapapillomavirus 9 - COMPOUND metabolic_pathway_only=False
Writing Alphapapillomavirus 9 - ChEBI metabolic_pathway_only=True
Writing Alphapapillomavirus 9 - ChEBI metabolic_pathway_only=False
Writing Arabidopsis thaliana - COMPOUND metabolic_pathway_only=True
Writing Arabidopsis thaliana - COMPOUND metabolic_pathway_only=False
Writing Arabidopsis thaliana - ChEBI metabolic_pathway_only=True
Writing Arabidopsis thaliana - ChEBI metabolic_pathway_only=False
Writing Arenicola marina - COMPOUND metabolic_pathway_only=True
Writing Arenicola marina - COMPOUND metabolic_pathway_only=False
Writing Arenicola marina - ChEBI metabolic_pathway_only=True
Writing Arenicola marina - ChEBI metabolic_pathway_only=False
Writing Bacillus anthracis - COMPOUND metabolic_pathway_only=True
Writing Bacillus anthracis - COMPOUND metabolic_pathway_only=False
Writing Bacillus anthracis - ChEBI metabolic_pathway_only=True
Writing Baci

Writing Human cytomegalovirus - COMPOUND metabolic_pathway_only=False
Writing Human cytomegalovirus - ChEBI metabolic_pathway_only=True
Writing Human cytomegalovirus - ChEBI metabolic_pathway_only=False
Writing Human gammaherpesvirus 4 - COMPOUND metabolic_pathway_only=True
Writing Human gammaherpesvirus 4 - COMPOUND metabolic_pathway_only=False
Writing Human gammaherpesvirus 4 - ChEBI metabolic_pathway_only=True
Writing Human gammaherpesvirus 4 - ChEBI metabolic_pathway_only=False
Writing Human herpesvirus 1 - COMPOUND metabolic_pathway_only=True
Writing Human herpesvirus 1 - COMPOUND metabolic_pathway_only=False
Writing Human herpesvirus 1 - ChEBI metabolic_pathway_only=True
Writing Human herpesvirus 1 - ChEBI metabolic_pathway_only=False
Writing Human herpesvirus 8 - COMPOUND metabolic_pathway_only=True
Writing Human herpesvirus 8 - COMPOUND metabolic_pathway_only=False
Writing Human herpesvirus 8 - ChEBI metabolic_pathway_only=True
Writing Human herpesvirus 8 - ChEBI metabolic_path

Writing Saccharomyces cerevisiae - ChEBI metabolic_pathway_only=False
Writing Salmonella enterica - COMPOUND metabolic_pathway_only=True
Writing Salmonella enterica - COMPOUND metabolic_pathway_only=False
Writing Salmonella enterica - ChEBI metabolic_pathway_only=True
Writing Salmonella enterica - ChEBI metabolic_pathway_only=False
Writing Salmonella typhimurium - COMPOUND metabolic_pathway_only=True
Writing Salmonella typhimurium - COMPOUND metabolic_pathway_only=False
Writing Salmonella typhimurium - ChEBI metabolic_pathway_only=True
Writing Salmonella typhimurium - ChEBI metabolic_pathway_only=False
Writing Schizosaccharomyces pombe - COMPOUND metabolic_pathway_only=True
Writing Schizosaccharomyces pombe - COMPOUND metabolic_pathway_only=False
Writing Schizosaccharomyces pombe - ChEBI metabolic_pathway_only=True
Writing Schizosaccharomyces pombe - ChEBI metabolic_pathway_only=False
Writing Sendai virus - COMPOUND metabolic_pathway_only=True
Writing Sendai virus - COMPOUND metabolic_

### Export UniProt Data

### Export ENSEMBL Data