### Subgraph Generation
Pull out subgraphs of shortest paths between nodes of interest  
  
---

In [3]:
import random
import hetio.readwrite
import hetio.pathtools
import hetio.stats

In [4]:
# Read Hetionet v1.0
url = 'https://github.com/dhimmel/hetionet/raw/{}/{}'.format(
    '00bf0b6f8886821d91cfdf00eadad145a7a1b6da',
    'hetnet/json/hetionet-v1.0.json.bz2',
)
graph = hetio.readwrite.read_graph(url)
metagraph = graph.metagraph

In [66]:
# get list of possible metapaths up to len==4 for downstream calcs
possible_metapaths = metagraph.extract_metapaths(source='Gene', target='Gene', max_length=4)
#metagraph.extract_all_metapaths(max_length=4, exclude_inverts=True)

In [67]:
len(possible_metapaths)

3977

In [68]:
possible_metapaths

[GcG,
 GiG,
 G<rG,
 Gr>G,
 GdAdG,
 GdAeG,
 GdAuG,
 GeAdG,
 GeAeG,
 GeAuG,
 GuAdG,
 GuAeG,
 GuAuG,
 GpBPpG,
 GpCCpG,
 GbCbG,
 GbCdG,
 GbCuG,
 GdCbG,
 GdCdG,
 GdCuG,
 GuCbG,
 GuCdG,
 GuCuG,
 GaDaG,
 GaDdG,
 GaDuG,
 GdDaG,
 GdDdG,
 GdDuG,
 GuDaG,
 GuDdG,
 GuDuG,
 GcGcG,
 GcGiG,
 GcG<rG,
 GcGr>G,
 GiGcG,
 GiGiG,
 GiG<rG,
 GiGr>G,
 G<rGcG,
 G<rGiG,
 G<rG<rG,
 G<rGr>G,
 Gr>GcG,
 Gr>GiG,
 Gr>G<rG,
 Gr>Gr>G,
 GpMFpG,
 GpPWpG,
 GdAlDaG,
 GdAlDdG,
 GdAlDuG,
 GdAdGcG,
 GdAdGiG,
 GdAdG<rG,
 GdAdGr>G,
 GdAeGcG,
 GdAeGiG,
 GdAeG<rG,
 GdAeGr>G,
 GdAuGcG,
 GdAuGiG,
 GdAuG<rG,
 GdAuGr>G,
 GeAlDaG,
 GeAlDdG,
 GeAlDuG,
 GeAdGcG,
 GeAdGiG,
 GeAdG<rG,
 GeAdGr>G,
 GeAeGcG,
 GeAeGiG,
 GeAeG<rG,
 GeAeGr>G,
 GeAuGcG,
 GeAuGiG,
 GeAuG<rG,
 GeAuGr>G,
 GuAlDaG,
 GuAlDdG,
 GuAlDuG,
 GuAdGcG,
 GuAdGiG,
 GuAdG<rG,
 GuAdGr>G,
 GuAeGcG,
 GuAeGiG,
 GuAeG<rG,
 GuAeGr>G,
 GuAuGcG,
 GuAuGiG,
 GuAuG<rG,
 GuAuGr>G,
 GpBPpGcG,
 GpBPpGiG,
 GpBPpG<rG,
 GpBPpGr>G,
 GpCCpGcG,
 GpCCpGiG,
 GpCCpG<rG,
 GpCCpGr>G,
 GbCrCbG,
 GbCrCdG

In [28]:
# see how many gene nodes we have in the whole graph
graph.count_nodes(metanode='Gene')

20945

#### 1. High Correlation Genes 
Genes ID-ed by Erin as highly correlated with CD4+ T-cell differentiation data from Sui Huang.

In [39]:
genes1 = ["TFE3","TYK2","H2A","H2B1","CPT1A","NUCB1","ENTPD4","DDX17","KLC1","JPH4","FAM214B","WDR48","CPT2"]

In [44]:
subgraph_nodes = [] # init list of nodes of interest
for n in graph.get_nodes():
    if n.name in genes1:
        subgraph_nodes.append(n)

In [45]:
subgraph_nodes

[Gene::10521,
 Gene::57599,
 Gene::4924,
 Gene::84502,
 Gene::1376,
 Gene::3831,
 Gene::7297,
 Gene::80256,
 Gene::7030,
 Gene::1374,
 Gene::9583]

In [46]:
subgraph1 = graph.get_subgraph(nodes=subgraph_nodes)

In [63]:
subgraph1.n_nodes

11

In [61]:
graph.get_metaedge_to_edges('{}'.format(possible_metapaths[0]))

{<hetio.hetnet.MetaEdge at 0x1f2083198>: [<hetio.hetnet.Edge at 0x15619c400>,
  <hetio.hetnet.Edge at 0x15619cb70>,
  <hetio.hetnet.Edge at 0x15619cd30>,
  <hetio.hetnet.Edge at 0x15619cda0>,
  <hetio.hetnet.Edge at 0x1561a6160>,
  <hetio.hetnet.Edge at 0x1561a61d0>,
  <hetio.hetnet.Edge at 0x1561a6780>,
  <hetio.hetnet.Edge at 0x1561a6b00>,
  <hetio.hetnet.Edge at 0x1561a6e80>,
  <hetio.hetnet.Edge at 0x1561af160>,
  <hetio.hetnet.Edge at 0x1561b6b00>,
  <hetio.hetnet.Edge at 0x1561bc2b0>,
  <hetio.hetnet.Edge at 0x1561c5a20>,
  <hetio.hetnet.Edge at 0x1561c5cc0>,
  <hetio.hetnet.Edge at 0x1561cc080>,
  <hetio.hetnet.Edge at 0x1561d3240>,
  <hetio.hetnet.Edge at 0x1561d36a0>,
  <hetio.hetnet.Edge at 0x1561d38d0>,
  <hetio.hetnet.Edge at 0x1561da6a0>,
  <hetio.hetnet.Edge at 0x1561e35c0>,
  <hetio.hetnet.Edge at 0x1561e3be0>,
  <hetio.hetnet.Edge at 0x1561e3d30>,
  <hetio.hetnet.Edge at 0x1561ea8d0>,
  <hetio.hetnet.Edge at 0x1561eae80>,
  <hetio.hetnet.Edge at 0x1561f16a0>,
  <hetio.h

In [74]:
for meta in possible_metapaths[0:10]:
    print(hetio.pathtools.paths_between(graph=graph, source=subgraph_nodes[0], target=subgraph_nodes[1], 
                                  metapath=meta))

[]
[]
[]
[]
[Gene::10521 - downregulates - Anatomy::UBERON:0002030 - downregulates - Gene::57599, Gene::10521 - downregulates - Anatomy::UBERON:0000997 - downregulates - Gene::57599, Gene::10521 - downregulates - Anatomy::UBERON:0002107 - downregulates - Gene::57599, Gene::10521 - downregulates - Anatomy::UBERON:0002369 - downregulates - Gene::57599]
[Gene::10521 - downregulates - Anatomy::UBERON:0001013 - expresses - Gene::57599, Gene::10521 - downregulates - Anatomy::UBERON:0002030 - expresses - Gene::57599, Gene::10521 - downregulates - Anatomy::UBERON:0002107 - expresses - Gene::57599, Gene::10521 - downregulates - Anatomy::UBERON:0000955 - expresses - Gene::57599, Gene::10521 - downregulates - Anatomy::UBERON:0000473 - expresses - Gene::57599, Gene::10521 - downregulates - Anatomy::UBERON:0000997 - expresses - Gene::57599, Gene::10521 - downregulates - Anatomy::UBERON:0002369 - expresses - Gene::57599, Gene::10521 - downregulates - Anatomy::UBERON:0001891 - expresses - Gene::57599

In [5]:
# Specify compound and disease
compound_id = 'Compound', 'DB01156'  # Bupropion
disease_id = 'Disease', 'DOID:0050742'  # nicotine dependences
metapath = metagraph.metapath_from_abbrev('CbGpPWpGaD')

In [20]:
compound_id

('Compound', 'DB01156')

In [6]:
# Extract walks between genes
paths = hetio.pathtools.paths_between(
    graph, 
    source=graph.node_dict[compound_id],
    target=graph.node_dict[disease_id],
    #metapath=metapath,
    duplicates=True,
)

In [7]:
metaedges = set(metapath)
nodes = set()
for path in paths:
    nodes.update(path.get_nodes())
    for edge in path:
        # Add incidental nodes along paths to enable correct DWWC/DWPC computations
        # for the CbGpPWpGaD metapath
        nodes.update(e.target for e in edge.source.get_edges(edge.metaedge))
        nodes.update(e.target for e in edge.target.get_edges(edge.metaedge.inverse))

# Add Gene-interacts-Gene metaedge (not essential but may be useful)
metaedges.add(metagraph.metapath_from_abbrev('GiG')[0])

# Get subgraph
subgraph = graph.get_subgraph(metaedges=metaedges, nodes=nodes)

In [8]:
# Metagraph size
subgraph.metagraph.n_nodes, subgraph.metagraph.n_edges

(4, 4)

In [9]:
# Graph size
subgraph.n_nodes, subgraph.n_edges

(3060, 18474)

In [10]:
# Metanode info
hetio.stats.get_metanode_df(subgraph)

Unnamed: 0,metanode,abbreviation,metaedges,nodes,unconnected_nodes
0,Compound,C,1,730,730
3,Disease,D,1,16,16
1,Gene,G,4,2226,612
2,Pathway,PW,1,88,0


In [11]:
# Metaedge info
hetio.stats.get_metaedge_df(subgraph)

Unnamed: 0,metaedge,abbreviation,edges,source_nodes,target_nodes
0,Compound - binds - Gene,CbG,5634,730,472
1,Disease - associates - Gene,DaG,937,16,557
2,Gene - interacts - Gene,GiG,5124,1061,1213
3,Gene - participates - Pathway,GpPW,6779,2222,88


In [12]:
# Export as JSON
hetio.readwrite.write_graph(subgraph, 'bupropion-CbGpPWpGaD-subgraph.json.xz')

## Random subgraph of ~100 nodes per metanode

Choose a different number of nodes per metanode to increase the liklihood rotten matrix operations cause misalignment.

In [13]:
metanode_to_nodes = graph.get_metanode_to_nodes()
n_nodes = 100
node_subset = list()
random.seed(0, version=2)
for metanode, nodes in sorted(metanode_to_nodes.items()):
    nodes = sorted(nodes)
    nodes = random.sample(nodes, n_nodes)
    node_subset.extend(nodes)
    n_nodes += 1

In [14]:
# Get subgraph
subgraph = graph.get_subgraph(nodes=node_subset)

In [15]:
# Metagraph size
subgraph.metagraph.n_nodes, subgraph.metagraph.n_edges

(11, 24)

In [16]:
# Graph size
subgraph.n_nodes, subgraph.n_edges

(1155, 3123)

In [17]:
# Metanode info
hetio.stats.get_metanode_df(subgraph)

Unnamed: 0,metanode,abbreviation,metaedges,nodes,unconnected_nodes
0,Anatomy,A,4,100,7
1,Biological Process,BP,1,101,86
2,Cellular Component,CC,1,102,90
3,Compound,C,8,103,73
4,Disease,D,8,104,22
5,Gene,G,16,105,19
6,Molecular Function,MF,1,106,95
7,Pathway,PW,1,107,87
8,Pharmacologic Class,PC,1,108,108
9,Side Effect,SE,1,109,76


In [18]:
# Metaedge info
hetio.stats.get_metaedge_df(subgraph)

Unnamed: 0,metaedge,abbreviation,edges,source_nodes,target_nodes
0,Anatomy - downregulates - Gene,AdG,173,13,64
1,Anatomy - expresses - Gene,AeG,735,29,80
2,Anatomy - upregulates - Gene,AuG,126,13,58
6,Compound - binds - Gene,CbG,10,10,1
9,Compound - causes - Side Effect,CcSE,170,53,33
7,Compound - downregulates - Gene,CdG,6,6,4
4,Compound - palliates - Disease,CpD,26,16,13
3,Compound - resembles - Compound,CrC,16,14,14
5,Compound - treats - Disease,CtD,37,18,26
8,Compound - upregulates - Gene,CuG,4,3,4


In [19]:
# Export as JSON
hetio.readwrite.write_graph(subgraph, 'random-subgraph.json.xz')