### Subgraph Generation
Pull out subgraphs of shortest paths between nodes of interest  
  
---

In [11]:
# get all the hetio functions
import itertools
import random

# star imports first so the explicit bindings below always win
from hetio.readwrite import *
from hetio.pathtools import *
from hetio.stats import *
import hetio.pathtools
import hetio.readwrite
import hetio.stats
import pandas as pd

In [2]:
%%time
# Read Hetionet v1.0
url = 'https://github.com/dhimmel/hetionet/raw/{}/{}'.format(
    '00bf0b6f8886821d91cfdf00eadad145a7a1b6da',
    'hetnet/json/hetionet-v1.0.json.bz2',
)
graph = read_graph(url)
metagraph = graph.metagraph

Wall time: 1min 31s


In [3]:
# see how many gene nodes we have in the whole graph
# (counts only the nodes whose metanode is 'Gene'; the value is displayed as the cell output)
graph.count_nodes(metanode='Gene')

20945

In [4]:
%%time
# get list of possible Gene-to-Gene metapaths up to length 3 (max_length=3) for downstream calcs
possible_metapaths = metagraph.extract_metapaths(source='Gene', target='Gene', max_length=3)
print(len(possible_metapaths)) # how many are there?

435
Wall time: 22 ms


---
#### 1. High Correlation Genes 
Genes ID-ed by Erin as highly correlated with CD4+ T-cell differentiation data from Sui Huang.

In [5]:
# 'query 1' from Erin: gene symbols, later matched against node names in the graph
genes1 = [
    "TFE3",
    "TYK2",
    "CPT1A",
    "NUCB1",
    "ENTPD4",
    "DDX17",
    "KLC1",
    "JPH4",
    "FAM214B",
    "WDR48",
    "CPT2",
]
# genes not in HetioNet (that I know of) = ["H2A","H2B1"]

In [6]:
# 'PC 1' from Erin: gene symbols, later matched against node names in the graph
genes2 = [
    "BOP1",
    "ELAC2",
    "ADK2B",
    "IMP4",
    "MICAL2",
    "NAA10",
    "PIM2",
]

In [7]:
# 'random' from Erin: gene symbols, later matched against node names in the graph
genes3 = [
    "ATP2C1",
    "RBM6",
    "ITGAL",
    "SSR1",
    "ZNF157",
    "COX7A2",
    "EIF1AX",
    "PUF60",
    "PLCH1",
    "EIF3I",
    "HMG17",
]

In [8]:
# Nodes of interest: every graph node whose name is one of the 'query 1' gene symbols
subgraph_nodes = [node for node in graph.get_nodes() if node.name in genes1]

subgraph_nodes # see if we successfully grabbed gene nodes by name

[Gene::10521,
 Gene::57599,
 Gene::4924,
 Gene::84502,
 Gene::1376,
 Gene::3831,
 Gene::7297,
 Gene::80256,
 Gene::7030,
 Gene::1374,
 Gene::9583]

In [12]:
# Build a lookup table (node, gene name, originating query list) for every gene of
# interest and save it to CSV.
#
# The table is accumulated under its own name on purpose: `subgraph_nodes` is the
# list of 'query 1' nodes built in the previous cell, and the pairwise path search
# and graph.get_subgraph() call further down need it to remain a list of nodes.
# Rebinding it to this dict would make those cells iterate over the dict keys instead.
query_gene_lists = (
    ('query 1', genes1),
    ('PC 1', genes2),
    ('random', genes3),
)
gene_records = {'ID':[], 'name':[], 'query':[]} # init dictionary of nodes of interest
for n in graph.get_nodes():
    # A name that appears in several lists yields one row per list, in the order above
    for query_label, gene_names in query_gene_lists:
        if n.name in gene_names:
            gene_records['ID'].append(n)
            gene_records['name'].append(n.name)
            gene_records['query'].append(query_label)

node_dictionary = pd.DataFrame(gene_records)
node_dictionary.to_csv('gene_dict.csv')

In [15]:
%%time

paths = {'pair':[], 'source':[], 'target':[], 'metapath':[], 'paths':[], 'DWPC':[]} # init output dictionary

for pair in list(itertools.combinations(subgraph_nodes, 2)): # iterate through possible combinations of gene nodes of interest
    print('Analyzing paths between genes {} and {}'.format(pair[0].get_id()[1], pair[1].get_id()[1])) # status update
    for meta in possible_metapaths: # iterate through metapaths of len<=3 between source and target genes
        path = paths_between(graph=graph, source=pair[0], target=pair[1], metapath=meta) # get all paths between source and target of metapath type
        if len(path)!=0: # if that metapath exists between the source and target genes, append results to dictionary
            paths['pair'].append(pair)
            paths['source'].append(pair[0])
            paths['target'].append(pair[1])
            paths['metapath'].append(meta)
            paths['paths'].append(path)
            paths['DWPC'].append(DWPC(path, damping_exponent=0.4))

Analyzing paths between genes 10521 and 57599
Analyzing paths between genes 10521 and 4924
Analyzing paths between genes 10521 and 84502
Analyzing paths between genes 10521 and 1376
Analyzing paths between genes 10521 and 3831
Analyzing paths between genes 10521 and 7297
Analyzing paths between genes 10521 and 80256
Analyzing paths between genes 10521 and 7030
Analyzing paths between genes 10521 and 1374
Analyzing paths between genes 10521 and 9583
Analyzing paths between genes 57599 and 4924
Analyzing paths between genes 57599 and 84502
Analyzing paths between genes 57599 and 1376
Analyzing paths between genes 57599 and 3831
Analyzing paths between genes 57599 and 7297
Analyzing paths between genes 57599 and 80256
Analyzing paths between genes 57599 and 7030
Analyzing paths between genes 57599 and 1374
Analyzing paths between genes 57599 and 9583
Analyzing paths between genes 4924 and 84502
Analyzing paths between genes 4924 and 1376
Analyzing paths between genes 4924 and 3831
Analyzi

In [16]:
import pandas as pd
# Tabulate the collected path records (one column per key of `paths`) and save them to CSV
out = pd.DataFrame(data=paths)
out.to_csv(path_or_buf='out2_11Mar19.csv')

In [33]:
# Spot-check the DWPC of a single record. `paths` is the dict of result columns built
# by the pairwise search, so index into its 'paths' column rather than the dict itself
# (`paths[1]` raises KeyError on a dict keyed by column name).
# NOTE(review): the recorded output may come from an earlier kernel state — confirm
# which record was intended.
DWPC(paths['paths'][1], damping_exponent=0.4)

0.0006933577678448712

In [46]:
# Extract the subgraph restricted to the nodes held in `subgraph_nodes`
# NOTE(review): presumably this expects the list of node objects, not the name/ID table — confirm
subgraph1 = graph.get_subgraph(nodes=subgraph_nodes)

In [63]:
# Number of nodes in the extracted subgraph (displayed as the cell output)
subgraph1.n_nodes

11

In [5]:
# Endpoints of interest, each written as a (metanode, identifier) pair
compound_id = ('Compound', 'DB01156')  # Bupropion
disease_id = ('Disease', 'DOID:0050742')  # nicotine dependence
# Metapath of interest, resolved from its abbreviation
metapath = metagraph.metapath_from_abbrev('CbGpPWpGaD')

In [20]:
# Display the (metanode, identifier) pair used to look the compound up in graph.node_dict
compound_id

('Compound', 'DB01156')

In [6]:
# Extract walks from the compound to the disease that follow the CbGpPWpGaD metapath.
# The metapath must be passed: the subgraph-building cell that consumes `paths`
# collects incidental nodes "for the CbGpPWpGaD metapath", and paths_between takes
# `metapath` as a required argument (NOTE(review): confirm against the installed hetio).
# duplicates=True keeps walks that revisit a node, not only simple paths.
paths = hetio.pathtools.paths_between(
    graph,
    source=graph.node_dict[compound_id],
    target=graph.node_dict[disease_id],
    metapath=metapath,
    duplicates=True,
)

In [7]:
# Metaedges to keep: exactly those that make up the metapath
metaedges = set(metapath)
# Nodes to keep: accumulated from every walk found between the endpoints
nodes = set()
for walk in paths:
    nodes.update(walk.get_nodes())
    for step in walk:
        # Add incidental nodes along paths to enable correct DWWC/DWPC computations
        # for the CbGpPWpGaD metapath: the neighbours reachable from either end of
        # this edge through the same metaedge (or its inverse)
        nodes.update(
            (out_edge.target for out_edge in step.source.get_edges(step.metaedge)),
            (in_edge.target for in_edge in step.target.get_edges(step.metaedge.inverse)),
        )

# Add Gene-interacts-Gene metaedge (not essential but may be useful)
gig_metaedge = metagraph.metapath_from_abbrev('GiG')[0]
metaedges.add(gig_metaedge)

# Get subgraph
subgraph = graph.get_subgraph(metaedges=metaedges, nodes=nodes)

In [8]:
# Metagraph size: (number of metanodes, number of metaedges) of the subgraph's schema
subgraph.metagraph.n_nodes, subgraph.metagraph.n_edges

(4, 4)

In [9]:
# Graph size: (number of nodes, number of edges) in the subgraph
subgraph.n_nodes, subgraph.n_edges

(3060, 18474)

In [10]:
# Metanode info: per-metanode summary table of the subgraph (displayed as the cell output)
hetio.stats.get_metanode_df(subgraph)

Unnamed: 0,metanode,abbreviation,metaedges,nodes,unconnected_nodes
0,Compound,C,1,730,730
3,Disease,D,1,16,16
1,Gene,G,4,2226,612
2,Pathway,PW,1,88,0


In [11]:
# Metaedge info: per-metaedge summary table of the subgraph (displayed as the cell output)
hetio.stats.get_metaedge_df(subgraph)

Unnamed: 0,metaedge,abbreviation,edges,source_nodes,target_nodes
0,Compound - binds - Gene,CbG,5634,730,472
1,Disease - associates - Gene,DaG,937,16,557
2,Gene - interacts - Gene,GiG,5124,1061,1213
3,Gene - participates - Pathway,GpPW,6779,2222,88


In [12]:
# Export as JSON: write the compound-disease subgraph to a .json.xz file in the working directory
hetio.readwrite.write_graph(subgraph, 'bupropion-CbGpPWpGaD-subgraph.json.xz')

## Random subgraph of ~100 nodes per metanode

Choose a different number of nodes per metanode to increase the likelihood that rotten matrix operations cause misalignment.

In [13]:
# Draw a random sample of nodes from every metanode, taking one more node from each
# successive metanode (100 from the first, 101 from the next, ...)
metanode_to_nodes = graph.get_metanode_to_nodes()
n_nodes = 100
node_subset = []
random.seed(0, version=2)  # fixed seed so the same nodes are drawn on every run
for metanode, members in sorted(metanode_to_nodes.items()):
    # Sample from a sorted copy; presumably so the draw does not depend on container order
    node_subset += random.sample(sorted(members), n_nodes)
    n_nodes += 1

In [14]:
# Get subgraph: keep only the randomly sampled nodes (this rebinds `subgraph`)
subgraph = graph.get_subgraph(nodes=node_subset)

In [15]:
# Metagraph size: (number of metanodes, number of metaedges) of the random subgraph's schema
subgraph.metagraph.n_nodes, subgraph.metagraph.n_edges

(11, 24)

In [16]:
# Graph size: (number of nodes, number of edges) in the random subgraph
subgraph.n_nodes, subgraph.n_edges

(1155, 3123)

In [17]:
# Metanode info: per-metanode summary table of the random subgraph (displayed as the cell output)
hetio.stats.get_metanode_df(subgraph)

Unnamed: 0,metanode,abbreviation,metaedges,nodes,unconnected_nodes
0,Anatomy,A,4,100,7
1,Biological Process,BP,1,101,86
2,Cellular Component,CC,1,102,90
3,Compound,C,8,103,73
4,Disease,D,8,104,22
5,Gene,G,16,105,19
6,Molecular Function,MF,1,106,95
7,Pathway,PW,1,107,87
8,Pharmacologic Class,PC,1,108,108
9,Side Effect,SE,1,109,76


In [18]:
# Metaedge info: per-metaedge summary table of the random subgraph (displayed as the cell output)
hetio.stats.get_metaedge_df(subgraph)

Unnamed: 0,metaedge,abbreviation,edges,source_nodes,target_nodes
0,Anatomy - downregulates - Gene,AdG,173,13,64
1,Anatomy - expresses - Gene,AeG,735,29,80
2,Anatomy - upregulates - Gene,AuG,126,13,58
6,Compound - binds - Gene,CbG,10,10,1
9,Compound - causes - Side Effect,CcSE,170,53,33
7,Compound - downregulates - Gene,CdG,6,6,4
4,Compound - palliates - Disease,CpD,26,16,13
3,Compound - resembles - Compound,CrC,16,14,14
5,Compound - treats - Disease,CtD,37,18,26
8,Compound - upregulates - Gene,CuG,4,3,4


In [19]:
# Export as JSON: write the random subgraph to a .json.xz file in the working directory
hetio.readwrite.write_graph(subgraph, 'random-subgraph.json.xz')