# Extracting degree information from `hetio-ind`

In [2]:
import gzip
import os

import pandas

import hetio.readwrite
import hetio.stats

In [2]:
# Download heterogeneous network
url = 'https://raw.githubusercontent.com/dhimmel/integrate/2256f1d6d01758c8bab59212a68d890ecb42bb7f/data/graph.json.gz'
! wget --no-verbose --timestamping --directory-prefix download {url}

No data received.
Last-modified header missing -- time-stamps turned off.
2015-08-16 11:20:08 URL:https://raw.githubusercontent.com/dhimmel/integrate/2256f1d6d01758c8bab59212a68d890ecb42bb7f/data/graph.json.gz [39660493/39660493] -> "download/graph.json.gz" [1]


In [3]:
# Read the heterogeneous network from the gzipped JSON file that the
# download cell saved under download/
graph = hetio.readwrite.read_json('download/graph.json.gz')

In [4]:
# Calculate degrees for genes: look up the 'gene' metanode in the metagraph,
# then ask hetio.stats for the degrees of the nodes of that metanode.
# NOTE(review): the displayed rows suggest one row per (node, metaedge) pair
# with a `degree` column -- confirm against hetio.stats.
gene = graph.metagraph.get_node('gene')
degree_df = hetio.stats.get_degrees_for_metanode(graph, gene)

In [5]:
def _entrez_gene_id(node):
    """Return the integer that follows '::' in the node's string form.

    For example, a node rendered as 'gene::3008' yields 3008.
    """
    return int(str(node).split('::')[1])


# Add the numeric gene identifier as its own column, then preview the table
degree_df['entrez_gene_id'] = degree_df['node'].map(_entrez_gene_id)
degree_df.head(2)

Unnamed: 0,node,metaedge,degree,entrez_gene_id
0,gene::3008,gene < overexpression downregulation < gene,0,3008
1,gene::3008,gene - participation - molecular function,3,3008


In [6]:
# Load the human Entrez Gene table (pinned to a specific commit of
# dhimmel/entrez-gene), keep only protein-coding genes, and reduce it to the
# gene identifier and symbol under lower-case column names.
url = 'https://raw.githubusercontent.com/dhimmel/entrez-gene/6e133f9ef8ce51a4c5387e58a6cc97564a66cec8/data/genes-human.tsv'
entrez_df = (
    pandas.read_table(url)
    .query("type_of_gene == 'protein-coding'")
    [['GeneID', 'Symbol']]
    .rename(columns={'GeneID': 'entrez_gene_id', 'Symbol': 'symbol'})
)
entrez_df.head(2)

Unnamed: 0,entrez_gene_id,symbol
0,1,A1BG
1,2,A2M


In [7]:
# Merge Entrez Gene info and node degrees.
# The join key and join type are spelled out instead of relying on pandas'
# default of joining on every column name the two frames share: an inner join
# on entrez_gene_id keeps only protein-coding genes that are in the network.
gene_df = entrez_df.merge(degree_df, on='entrez_gene_id', how='inner')
gene_df.head(2)

Unnamed: 0,entrez_gene_id,symbol,node,metaedge,degree
0,1,A1BG,gene::1,gene < overexpression downregulation < gene,0
1,1,A1BG,gene::1,gene - participation - molecular function,0


In [8]:
# Save as a gzip-compressed TSV.
# Create the output directory first so that a fresh checkout (where data/
# may not exist yet) does not fail with FileNotFoundError.
os.makedirs('data', exist_ok=True)
with gzip.open('data/gene-degrees.tsv.gz', 'wt') as write_file:
    gene_df.to_csv(write_file, sep='\t', index=False)

In [8]:
# Summary of metaedges starting on Gene: read the per-metaedge summary table
# (pinned to a specific commit of dhimmel/integrate), keep the rows whose
# metaedge string begins with 'gene ', and save a local copy.
url = 'https://raw.githubusercontent.com/dhimmel/integrate/2256f1d6d01758c8bab59212a68d890ecb42bb7f/data/summary/metaedges.tsv'
summary_df = pandas.read_table(url).loc[
    lambda df: df.metaedge.str.startswith('gene ')
]
summary_df.to_csv('download/network-summary.tsv', sep='\t', index=False)
summary_df.head(2)

Unnamed: 0,metaedge,abbreviation,inverted,edges,source_nodes,target_nodes,unbiased
0,gene - binding - compound,GbC,1,2502,572,554,0
2,gene - upregulation - compound,GuC,1,17261,953,729,17261
