In [1]:
import requests
import gzip
import pandas as pd
import networkx as nx
import sys

import ddot
from ddot import Ontology

In [3]:
ndex_server = 'http://test.ndexbio.org' 

# Set the NDEx server and the user account (replace with your own account)
ndex_user, ndex_pass = 'xiaoran.a.yan', 'a1l9i8f4e3'

In [4]:
# Download GO obo file
r = requests.get('http://purl.obolibrary.org/obo/go/go-basic.obo')
with open('go-basic.obo', 'wb') as f:
    f.write(r.content)

# Parse OBO file
ddot.parse_obo('go-basic.obo', 'go.tab', 'goID_2_name.tab', 'goID_2_namespace.tab', 'goID_2_alt_id.tab')

# Download gene-term annotations for human
r = requests.get('http://geneontology.org/gene-associations/goa_human.gaf.gz')
with open('goa_human.gaf.gz', 'wb') as f:
    f.write(r.content)

In [42]:
hierarchy = pd.read_table('go.tab',
                          sep='\t',
                          header=None,
                          names=['Parent', 'Child', 'Relation', 'Namespace'])
with gzip.open('goa_human.gaf.gz', 'rb') as f:
    mapping = ddot.parse_gaf(f)

In [43]:
go_human = Ontology.from_table(
    table=hierarchy,
    parent='Parent',
    child='Child',
    mapping=mapping,
    mapping_child='DB Object ID',
    mapping_parent='GO ID',
    add_root_name='GO:00SUPER',
    ignore_orphan_terms=True)
go_human.clear_node_attr()
go_human.clear_edge_attr()
go_human

Unifying 3 roots into one super-root


19626 genes, 44412 terms, 287930 gene-term relations, 89326 term-term relations
node_attributes: []
edge_attributes: []

In [11]:
go_human = go_human.collapse_ontology(method='mhkramer')
if 'GO:00SUPER' not in go_human.terms: go_human.add_root('GO:00SUPER', inplace=True)
print(go_human)

collapse command: /home/yan30/.ddot/lib/python3.7/site-packages/ddot/alignOntology/collapseRedundantNodes /tmp/tmpwhvvee09
19626 genes, 20153 terms, 228993 gene-term relations, 44286 term-term relations
node_attributes: []
edge_attributes: []


In [44]:
go_descriptions = pd.read_table('goID_2_name.tab',
                                header=None,
                                names=['Term', 'Term_Description'],
                                index_col=0)
go_human.update_node_attr(go_descriptions)

go_branches = pd.read_table('goID_2_namespace.tab',
                                header=None,
                                names=['Term', 'Branch'],
                                index_col=0)
go_human.update_node_attr(go_branches)

In [19]:
go_human.to_table('go_human.txt')

Unnamed: 0,Parent,Child,EdgeType
0,GO:0000002,GO:0032042,Child-Parent
1,GO:0000003,GO:0019953,Child-Parent
2,GO:0000003,GO:0022414,Child-Parent
3,GO:0000003,GO:0032504,Child-Parent
4,GO:0000003,GO:0071515,Child-Parent
5,GO:0000012,GO:1903516,Child-Parent
6,GO:0000012,GO:1903518,Child-Parent
7,GO:0000012,GO:1903824,Child-Parent
8,GO:0000014,GO:1990599,Child-Parent
9,GO:0000018,GO:0000019,Child-Parent


In [20]:
import mygene
mg = mygene.MyGeneInfo()
name = 'Human-specific Gene Ontology'

In [45]:
go_human_uniprot = go_human.copy()

# Write GO to file
go_human_uniprot.to_table('collapsed_go.uniprot', clixo_format=True)
go_human_uniprot.to_pickle('collapsed_go.uniprot.pkl')

url, G = go_human_uniprot.to_ndex(name='%s, %s' % (name, 'UniProt'),
                                  ndex_server=ndex_server,
                                  ndex_user=ndex_user,
                                  ndex_pass=ndex_pass,
                                  layout=None,
                                  visibility='PUBLIC')
print(url)

https://test.ndexbio.org/v2/network/91bd87aa-bb70-11ea-a4d3-0660b7976219


In [46]:
uniprot_2_symbol_df = mg.querymany(go_human.genes, scopes='uniprot', fields='symbol', species='human', as_dataframe=True)

def f(x):
    x = x['symbol']
    if len(x)==1:
        return x[0]
    else:
        return x.tolist()
uniprot_2_symbol = uniprot_2_symbol_df.dropna(subset=['symbol']).groupby('query').apply(f)

querying 1-1000...done.
querying 1001-2000...done.
querying 2001-3000...done.
querying 3001-4000...done.
querying 4001-5000...done.
querying 5001-6000...done.
querying 6001-7000...done.
querying 7001-8000...done.
querying 8001-9000...done.
querying 9001-10000...done.
querying 10001-11000...done.
querying 11001-12000...done.
querying 12001-13000...done.
querying 13001-14000...done.
querying 14001-15000...done.
querying 15001-16000...done.
querying 16001-17000...done.
querying 17001-18000...done.
querying 18001-19000...done.
querying 19001-19626...done.
Finished.
321 input query terms found dup hits:
	[('A0A075B6H7', 2), ('A0A075B6J1', 2), ('A0A075B6J2', 2), ('A0A075B6P5', 2), ('A0A075B6Q5', 2), ('A0
731 input query terms found no hit:
	['A0A075B734', 'A0A075B767', 'A0A087WSY4', 'A0A087WUL8', 'A0A087WUU8', 'A0A087WX78', 'A0A087X1C1', '
Pass "returnall=True" to return complete lists of duplicate or missing query terms.


In [47]:
go_human_symbol = go_human.delete(to_delete=set(go_human.genes) - set(uniprot_2_symbol.keys()))
go_human_symbol = go_human_symbol.rename(genes=uniprot_2_symbol.to_dict())
print(go_human_symbol)

# Write GO to file
go_human_symbol.to_table('collapsed_go.symbol', clixo_format=True)
go_human_symbol.to_pickle('collapsed_go.symbol.pkl')

url, G = go_human_symbol.to_ndex(name='%s, %s' % (name, 'Symbol'),
                                 ndex_server=ndex_server,
                                 ndex_user=ndex_user,
                                 ndex_pass=ndex_pass,
                                 layout=None,
                                 visibility='PUBLIC')
print(url)

19021 genes, 44412 terms, 285936 gene-term relations, 89326 term-term relations
node_attributes: ['Term_Description', 'Branch']
edge_attributes: []
https://test.ndexbio.org/v2/network/bd0eb3cd-bb70-11ea-a4d3-0660b7976219


In [26]:
go_human_symbol.to_table('go_human.txt')

Unnamed: 0,Parent,Child,EdgeType
0,GO:0000002,GO:0032042,Child-Parent
1,GO:0000003,GO:0019953,Child-Parent
2,GO:0000003,GO:0022414,Child-Parent
3,GO:0000003,GO:0032504,Child-Parent
4,GO:0000003,GO:0071515,Child-Parent
5,GO:0000012,GO:1903516,Child-Parent
6,GO:0000012,GO:1903518,Child-Parent
7,GO:0000012,GO:1903824,Child-Parent
8,GO:0000014,GO:1990599,Child-Parent
9,GO:0000018,GO:0000019,Child-Parent


In [32]:
"ALKBH3" in go_human_symbol.genes

True

In [33]:
import pickle

with open ('hierarchyQnew0.save', 'rb') as fp:
    itemlist = pickle.load(fp)

hierarchy  = [x for x in itemlist if "subsystem" in str(x[0])]
mapping = [x for x in itemlist if "subsystem" not in str(x[0])]

import ddot
from ddot import Ontology
# Construct ontology
ont = Ontology(hierarchy, mapping)

# Prints a summary of the ontology's structure
print(ont)
ont.to_table('toy_ontology2.txt')

306 genes, 394 terms, 357 gene-term relations, 470 term-term relations
node_attributes: []
edge_attributes: []


Unnamed: 0,Parent,Child,EdgeType
0,subsystem100,subsystem85,Child-Parent
1,subsystem100,subsystem88,Child-Parent
2,subsystem101,subsystem86,Child-Parent
3,subsystem101,subsystem94,Child-Parent
4,subsystem102,subsystem87,Child-Parent
5,subsystem102,subsystem93,Child-Parent
6,subsystem104,subsystem3,Child-Parent
7,subsystem104,subsystem6,Child-Parent
8,subsystem108,subsystem41,Child-Parent
9,subsystem109,subsystem95,Child-Parent


In [52]:
matchers = ont.genes
matching = [s for s in matchers if any(xs in s for xs in go_human_symbol.genes)]
matching

['ACD',
 'ADNP',
 'ALKBH2',
 'ALKBH3',
 'APEX1',
 'APEX2',
 'APLF',
 'APTX',
 'ATAD5',
 'ATM',
 'ATR',
 'ATRIP',
 'ATRX',
 'BARD1',
 'BCCIP',
 'BLM',
 'BRAT1',
 'BRCA1',
 'BRCA2',
 'BRCC3',
 'BRIP1',
 'CBX3',
 'CCNA2',
 'CCNH',
 'CDC45',
 'CDC7',
 'CDK2',
 'CDK7',
 'CENPK',
 'CEP152',
 'CETN2',
 'CETN3',
 'CHAF1A',
 'CHEK1',
 'CHEK2',
 'CHTF18',
 'CKS2',
 'CLK2',
 'CLSPN',
 'DBF4B',
 'DCLRE1A',
 'DCLRE1B',
 'DCLRE1C',
 'DDB1',
 'DDB2',
 'DHX9',
 'DMC1',
 'DNA2',
 'DRG1',
 'DSCC1',
 'DTL',
 'DUT',
 'EME1',
 'EME2',
 'ENDOV',
 'ERCC1',
 'ERCC2',
 'ERCC3',
 'ERCC4',
 'ERCC5',
 'ERCC6',
 'ERCC8',
 'EXO1',
 'FAAP100',
 'FAAP20',
 'FAAP24',
 'FAN1',
 'FANCA',
 'FANCB',
 'FANCC',
 'FANCD2',
 'FANCE',
 'FANCF',
 'FANCG',
 'FANCI',
 'FANCL',
 'FANCM',
 'FEN1',
 'GADD45A',
 'GADD45G',
 'GEN1',
 'GGCT',
 'GINS1',
 'GINS2',
 'GINS3',
 'GINS4',
 'GMPS',
 'GTF2H1',
 'GTF2H2',
 'GTF2H3',
 'GTF2H4',
 'GTF2H5',
 'HELQ',
 'HLTF',
 'HMGB1',
 'HMGB2',
 'HNRNPR',
 'HNRNPUL1',
 'HUS1',
 'HUS1B',
 'KIN',
 'L

In [53]:
len(matching)

303

In [63]:
go_human_symbol

19021 genes, 44412 terms, 285936 gene-term relations, 89326 term-term relations
node_attributes: ['Term_Description', 'Branch']
edge_attributes: []

In [54]:
ont0 = pd.read_csv('data/pearl2015.txt', sep='\t', header=None)
ont2 = Ontology.from_table(ont0)
print(ont2)
ont2.to_table('pearl2015.txt')

437 genes, 74 terms, 533 gene-term relations, 74 term-term relations
node_attributes: []
edge_attributes: [2]


Unnamed: 0,Parent,Child,EdgeType
0,Alternative mechanism,MRN Complex,Child-Parent
1,Associated process,Checkpoint factors,Child-Parent
2,Associated process,Chromatin remodelling,Child-Parent
3,Associated process,Chromosome segregation,Child-Parent
4,Associated process,DNA replication,Child-Parent
5,Associated process,Modulation of nucleotide pools,Child-Parent
6,Associated process,p53 pathway,Child-Parent
7,Associated process,Telomere maintenance,Child-Parent
8,Associated process,TLS,Child-Parent
9,Associated process,Topisomerase damage reversal,Child-Parent


In [55]:
matchers = ont2.genes
matching = [s for s in matchers if any(xs in s for xs in go_human_symbol.genes)]
matching

['ABL1',
 'ACD',
 'ACTL6A',
 'ACTR5',
 'ACTR8',
 'ALKBH1',
 'ALKBH2',
 'ALKBH3',
 'AMN1',
 'ANKRD28',
 'ANKRD44',
 'ANKRD52',
 'APEX1',
 'APEX2',
 'APLF',
 'APTX',
 'ARID1A',
 'ARID1B',
 'ARID2',
 'ASF1A',
 'ASF1B',
 'ATF2',
 'ATM',
 'ATR',
 'ATRIP',
 'ATRX',
 'ATXN3',
 'BAP1',
 'BARD1',
 'BAZ1A',
 'BAZ1B',
 'BCAS2',
 'BLM',
 'BRCA1',
 'BRCA2',
 'BRCC3',
 'BRD7',
 'BRIP1',
 'BTG2',
 'CCDC98',
 'CCNA1',
 'CCNA2',
 'CCNB1',
 'CCNB2',
 'CCNB3',
 'CCND1',
 'CCND2',
 'CCND3',
 'CCNE1',
 'CCNH',
 'CCNO',
 'CDC25A',
 'CDC25B',
 'CDC5L',
 'CDK2',
 'CDK4',
 'CDK7',
 'CDKN1A',
 'CDKN2A',
 'CDKN2D',
 'CETN2',
 'CHAF1A',
 'CHEK1',
 'CHEK2',
 'CHRAC1',
 'CIB1',
 'CLK2',
 'CLSPN',
 'CNOT7',
 'COPS2',
 'COPS3',
 'COPS4',
 'COPS5',
 'COPS6',
 'COPS7A',
 'COPS7B',
 'COPS8',
 'CRY1',
 'CRY2',
 'CSNK1D',
 'CSNK1E',
 'CTC1',
 'CUL3',
 'CUL4A',
 'CUL5',
 'DAXX',
 'DCLRE1A',
 'DCLRE1B',
 'DCLRE1C',
 'DDB1',
 'DDB2',
 'DDX11',
 'DKC1',
 'DLGAP5',
 'DMC1',
 'DNA2',
 'DNMT1',
 'DNTT',
 'DUT',
 'EID3',
 'EME1',

In [56]:
len(matching)

425

In [65]:
matchers = ont.genes
matching = [s for s in matchers if any(xs in s for xs in ont2.genes)]
len(matching)

233

In [None]:
[({218, 219, 220, 221, 222}, 0.5802251813002428), ({109, 111, 112, 113, 114}, 0.6067056040722213), ({192, 193, 170}, 0.6835762943989093), ({202, 127, 138}, 0.7005106393471607), ({248, 140, 247}, 0.7174214438597685), ({139, 141, 143}, 0.7297438599366777), ({232, 230, 231}, 0.7319602600860855), ({85, 86, 87}, 0.7359528143738937), ({142, 247}, 0.8489723014245765), ({264, 265}, 0.8565835752200133), ({177, 178}, 0.8597459057197241), ({168, 120}, 0.8650893417644118), ({165, 246}, 0.8719422626098627), ({50, 36}, 0.87737607502982), ({170, 171}, 0.8852184524534124), ({84, 85}, 0.8923500122737862), ({128, 132}, 0.9261346565729521), ({164, 246}, 0.9482938968281389), ({251, 252}, 0.9495998443046346), ({105, 300}, 0.9847271483003348)]
[(1, -3339724.0), (3, -3340049.0), (0, -3340576.0), (10, -3341805.0), (4, -3342391.0), (7, -3342517.0), (14, -3344757.2), (8, -3344895.5), (6, -3344945.2), (2, -3344954.5), (12, -3345258.0), (11, -3346821.5), (5, -3346951.2), (13, -3347552.0), (15, -3348224.0), (9, -3360879.8)]