In [5]:
import urllib.request
import os

from indra.assemblers.html import HtmlAssembler
from indra.statements import *

from indra_cogex.client import Neo4jClient
client = Neo4jClient()

import networkx as nx
import obonet
import pandas as pd

In [2]:
def get_go_obo():
    """ Loads the GO terms data into a networkx graph
    Returns
    -------
    graph : Networkx graph with the GO terms hierarchical data 

    """
    # Reads and loads data into graph
    url = 'https://snapshot.geneontology.org/ontology/go.obo'
    graph = obonet.read_obo(url)

    # Prints number of nodes and edges for visualization
    print(f"Number of nodes: {len(graph)}")
    print(f"Number of edges: {graph.number_of_edges()}")

    return graph
graph = get_go_obo()

INFO: [2024-11-01 21:18:53] root - Will decode content from https://snapshot.geneontology.org/ontology/go.obo using utf-8 charset.


Number of nodes: 40665
Number of edges: 81412


In [3]:
def find_indra_statements():
    """ Writes query to get HGNC-GO INDRA relationships from database
    Returns
    -------
    graph : Networkx graph with the GO terms hierarchical data 

    """
    # Cypher query to collect data
    cypher = """MATCH p1=(source:BioEntity)-[:indra_rel]->(target:BioEntity) WHERE source.id starts with 'hgnc' AND target.type = 'biological_process' RETURN p1"""
    results = client.query_tx(cypher)
    return results
results = find_indra_statements()
results

[[<Path start=<Node element_id='64391068' labels=frozenset({'BioEntity'}) properties={'name': 'ASIC1', 'obsolete': False, 'id': 'hgnc:100', 'type': 'human_gene_protein'}> end=<Node element_id='65578558' labels=frozenset({'BioEntity'}) properties={'name': 'Synaptic Transmission', 'id': 'mesh:D009435', 'type': 'biological_process'}> size=1>],
 [<Path start=<Node element_id='64391068' labels=frozenset({'BioEntity'}) properties={'name': 'ASIC1', 'obsolete': False, 'id': 'hgnc:100', 'type': 'human_gene_protein'}> end=<Node element_id='65532965' labels=frozenset({'BioEntity'}) properties={'name': 'Hydrogen-Ion Concentration', 'id': 'mesh:D006863', 'type': 'biological_process'}> size=1>],
 [<Path start=<Node element_id='64391068' labels=frozenset({'BioEntity'}) properties={'name': 'ASIC1', 'obsolete': False, 'id': 'hgnc:100', 'type': 'human_gene_protein'}> end=<Node element_id='64095938' labels=frozenset({'BioEntity'}) properties={'name': 'positive regulation of calcium ion import', 'id': 'go

In [6]:
# Iterates through results to create a dataframe 
source_targ_ids = {}
hgnc_ids = []
go_ids = []
stmt_jsons = []
stmt_hash = []
for list in results:
    attrs = list[0].__dict__
    # COllecrs the GO id, HGNC id, stmt json and stmt hash for each statement
    if attrs["_nodes"][1].__dict__["_properties"]["id"].startswith("go"):
        go_ids.append(attrs["_nodes"][1].__dict__["_properties"]["id"])
        hgnc_ids.append(attrs["_nodes"][0].__dict__["_properties"]["id"])
        stmt_jsons.append(attrs["_relationships"][0].__dict__["_properties"]["stmt_json"])
        stmt_hash.append(attrs["_relationships"][0].__dict__["_properties"]["stmt_hash"])
        
source_targ_ids = {"hgnc_id" : hgnc_ids, "go_id": go_ids, "stmt_json": stmt_jsons, "stmt_hash": stmt_hash}
source_targ_ids
source_targ_df = pd.DataFrame(source_targ_ids)
source_targ_df

Unnamed: 0,hgnc_id,go_id,stmt_json,stmt_hash
0,hgnc:100,go:0090280,"{""type"": ""Inhibition"", ""subj"": {""name"": ""ASIC1...",-3274567538064804
1,hgnc:100,go:0061724,"{""type"": ""Activation"", ""subj"": {""name"": ""ASIC1...",-25024314781710374
2,hgnc:100,go:0060072,"{""type"": ""Complex"", ""members"": [{""name"": ""ASIC...",-21760308130759727
3,hgnc:100,go:0051899,"{""type"": ""Activation"", ""subj"": {""name"": ""ASIC1...",4961136544799410
4,hgnc:100,go:0051881,"{""type"": ""Inhibition"", ""subj"": {""name"": ""ASIC1...",27140835016821419
...,...,...,...,...
340709,hgnc:9999,go:0006935,"{""type"": ""Activation"", ""subj"": {""name"": ""RGS3""...",9354760842082735
340710,hgnc:9999,go:0006915,"{""type"": ""Inhibition"", ""subj"": {""name"": ""RGS3""...",-13605135813838622
340711,hgnc:9999,go:0006915,"{""type"": ""Activation"", ""subj"": {""name"": ""RGS3""...",-20316819234392314
340712,hgnc:9999,go:0006351,"{""type"": ""Activation"", ""subj"": {""name"": ""RGS3""...",6470332097344010


In [7]:
import pandas as pd
import gzip
def load_goa() -> pd.DataFrame:
    """Load GO annotations from the given URL and process them.
        df: Dataframe containing GO terms Associations data
    """
    
    GOA_URL = "http://geneontology.org/gene-associations/goa_human.gaf.gz"

    local_filename, _ = urllib.request.urlretrieve(GOA_URL) 
    
    # Use gzip to open the compressed file
    with gzip.open(local_filename, 'rt') as f: 
        df = pd.read_csv(
            f,
            sep="\t",
            comment="!",  # Ignore lines starting with '!'
            dtype=str,
            header=None,
            usecols=[1, 3, 4, 6],  # Select relevant columns (0-indexed)
            names=[
                "DB_Object_ID", 
                "Qualifier",
                "GO_ID",
                "Evidence_Code",
            ],
        )

    return df

df = load_goa()
df 

Unnamed: 0,DB_Object_ID,Qualifier,GO_ID,Evidence_Code
0,A0A024RBG1,enables,GO:0003723,IEA
1,A0A024RBG1,enables,GO:0005515,IPI
2,A0A024RBG1,enables,GO:0046872,IEA
3,A0A024RBG1,located_in,GO:0005829,IDA
4,A0A075B6H7,involved_in,GO:0002250,IEA
...,...,...,...,...
773769,Q5JT82,involved_in,GO:0006357,IBA
773770,Q9BZJ4,is_active_in,GO:0005739,IBA
773771,P0DJD0,is_active_in,GO:0005737,IBA
773772,Q6UXG8,enables,GO:0005102,IBA


In [8]:
from indra.databases import uniprot_client
# Converts the uniprot id to HGNC id
df["HGNC_ID"] = df.apply(lambda row: uniprot_client.get_hgnc_id(row["DB_Object_ID"]),axis=1)


In [9]:
grouped_data = {}
# Groups the GOA data by HGNC id, so for each HGNC id the known associated GO ids are listed
for index, row in df.iterrows():
    hgnc_id = row['HGNC_ID']
    go_id = row['GO_ID']
    if hgnc_id not in grouped_data:
        grouped_data[hgnc_id] = []
    grouped_data[hgnc_id].append(go_id)

df_grouped = pd.DataFrame({'HGNC_ID': grouped_data.keys(), 
                            'GO_ID_list': grouped_data.values()}) 
df_grouped

Unnamed: 0,HGNC_ID,GO_ID_list
0,18012,"[GO:0003723, GO:0005515, GO:0046872, GO:000582..."
1,5821,"[GO:0002250, GO:0005886, GO:0019814, GO:001981..."
2,5757,"[GO:0002250, GO:0005886, GO:0019814, GO:001981..."
3,5921,"[GO:0002250, GO:0005886, GO:0019814, GO:001981..."
4,5931,"[GO:0002250, GO:0005886, GO:0019814, GO:000695..."
...,...,...
19531,26656,[GO:0016020]
19532,29276,[GO:0007283]
19533,16785,[GO:0005634]
19534,27814,[GO:0005737]


In [35]:
import pandas as pd
import numpy as np

filter_df = np.ones(len(source_targ_df), dtype=bool)

# iterates through the INDRA db dataframe by using the hgnc id, and go id
for i, (hgnc_id, go_id) in enumerate(zip(source_targ_df["hgnc_id"], source_targ_df["go_id"])):
    hgnc_id = hgnc_id[5:]
    go_id = go_id.replace("go", "GO")
    # Finds the associated GO terms for each hgnc id 
    hgnc_id_go_terms = df_grouped[df_grouped["HGNC_ID"] == hgnc_id]["GO_ID_list"].values
    if len(hgnc_id_go_terms)>0:
        # If the GO id is already in the associated GO ids, filters that row out
        if go_id in hgnc_id_go_terms[0]:
            filter_df[i] = False  

source_targ_copy = source_targ_df.copy() 
source_targ_copy = source_targ_copy[filter_df] 
source_targ_copy

Unnamed: 0,hgnc_id,go_id,stmt_json,stmt_hash
0,hgnc:100,go:0090280,"{""type"": ""Inhibition"", ""subj"": {""name"": ""ASIC1...",-3274567538064804
1,hgnc:100,go:0061724,"{""type"": ""Activation"", ""subj"": {""name"": ""ASIC1...",-25024314781710374
2,hgnc:100,go:0060072,"{""type"": ""Complex"", ""members"": [{""name"": ""ASIC...",-21760308130759727
3,hgnc:100,go:0051899,"{""type"": ""Activation"", ""subj"": {""name"": ""ASIC1...",4961136544799410
4,hgnc:100,go:0051881,"{""type"": ""Inhibition"", ""subj"": {""name"": ""ASIC1...",27140835016821419
...,...,...,...,...
340709,hgnc:9999,go:0006935,"{""type"": ""Activation"", ""subj"": {""name"": ""RGS3""...",9354760842082735
340710,hgnc:9999,go:0006915,"{""type"": ""Inhibition"", ""subj"": {""name"": ""RGS3""...",-13605135813838622
340711,hgnc:9999,go:0006915,"{""type"": ""Activation"", ""subj"": {""name"": ""RGS3""...",-20316819234392314
340712,hgnc:9999,go:0006351,"{""type"": ""Activation"", ""subj"": {""name"": ""RGS3""...",6470332097344010


In [60]:
filter_df = np.ones(len(source_targ_copy), dtype=bool)
# Iterates through the filtered dataframe, so current dataframe has filtered out GO terms directly associated with 
# respective HGNC id
for i, (hgnc_id, go_id) in enumerate(zip(source_targ_copy["hgnc_id"], source_targ_copy["go_id"])):
    hgnc_id = hgnc_id[5:]
    hgnc_id_go_terms = df_grouped[df_grouped["HGNC_ID"] == hgnc_id]["GO_ID_list"].values
    go_id = go_id.replace("go", "GO")
    # Finds the children for each GO term
    try:
        descendants = nx.descendants(graph, go_id)
    except nx.NetworkXError:
        descendants = set()  
    for descendant in descendants:
        if len(hgnc_id_go_terms)>0:
            # If any of the children are in the associated list of GO terms, that row is filtered out
            if descendant in hgnc_id_go_terms[0]:
                filter_df[i] = False 

source_targ_filtered = source_targ_copy.copy()             
source_targ_filtered = source_targ_filtered[filter_df] 
source_targ_filtered

Unnamed: 0,hgnc_id,go_id,stmt_json,stmt_hash
0,hgnc:100,go:0090280,"{""type"": ""Inhibition"", ""subj"": {""name"": ""ASIC1...",-3274567538064804
1,hgnc:100,go:0061724,"{""type"": ""Activation"", ""subj"": {""name"": ""ASIC1...",-25024314781710374
5,hgnc:100,go:0046960,"{""type"": ""Activation"", ""subj"": {""name"": ""ASIC1...",-15287315324007122
6,hgnc:100,go:0042311,"{""type"": ""Activation"", ""subj"": {""name"": ""ASIC1...",22581260561020466
7,hgnc:100,go:0033344,"{""type"": ""Inhibition"", ""subj"": {""name"": ""ASIC1...",-13979930258279457
...,...,...,...,...
340709,hgnc:9999,go:0006935,"{""type"": ""Activation"", ""subj"": {""name"": ""RGS3""...",9354760842082735
340710,hgnc:9999,go:0006915,"{""type"": ""Inhibition"", ""subj"": {""name"": ""RGS3""...",-13605135813838622
340711,hgnc:9999,go:0006915,"{""type"": ""Activation"", ""subj"": {""name"": ""RGS3""...",-20316819234392314
340712,hgnc:9999,go:0006351,"{""type"": ""Activation"", ""subj"": {""name"": ""RGS3""...",6470332097344010
