In [1]:
import pandas as pd
import numpy as np
from scipy import sparse
from neo4j.v1 import GraphDatabase
import sys
import os
import hetio.readwrite
import hetio.neo4j
import hetio.pathtools
from hetio.matrix import metaedge_to_adjacency_matrix, get_node_to_position

In [2]:
# Put the directory two levels above this notebook at the front of the import
# path so the `hetmech` package can be imported from the working tree
# (presumably the repository root; confirm against the repo layout).
sys.path.insert(0,'../../')
# NOTE(review): `dwpc` is not referenced by any cell visible in this notebook.
from hetmech.degree_weight import dwpc

In [3]:
# Hetionet v1.0 JSON, pinned to a specific commit of dhimmel/hetionet
url = (
    'https://github.com/dhimmel/hetionet/raw/'
    '76550e6c93fbe92124edc71725e8c7dd4ca8b1f5/'
    'hetnet/json/hetionet-v1.0.json.bz2'
)

# Read the graph from the URL and keep a handle on its metagraph
graph = hetio.readwrite.read_graph(url)
metagraph = graph.metagraph

## Paths up to length 4

In [4]:
# Enumerate all metapaths up to the given length. `exclude_inverts=True`
# presumably keeps only one direction of each metapath/inverse pair (confirm
# against the hetio documentation).
max_metapath_length = 4
metapaths = metagraph.extract_all_metapaths(
    max_metapath_length,
    exclude_inverts=True,
)

# Number of metapaths found
len(metapaths)

19716

In [13]:
# Every metanode that appears at either end of a metapath.
# Targets are included as well as sources because the size lookup built from
# this list is later indexed by both `metapath.source()` and `metapath.target()`;
# collecting sources alone would raise KeyError for any metanode that only
# ever occurs as a target.
metanodes = list(
    {metapath.source() for metapath in metapaths}
    | {metapath.target() for metapath in metapaths}
)

In [26]:
# Map each metanode to the number of entries `get_node_to_position` yields
# for it in `graph`; these counts are used as the matrix dimensions below.
metanode_size_dict = dict(
    (metanode, len(list(get_node_to_position(graph, metanode))))
    for metanode in metanodes
)

In [29]:
# One row per metapath; size columns are added in the following cells
metapaths_df = pd.DataFrame(dict(metapath=metapaths))

In [30]:
# Source metanode of each metapath, and its size as recorded in metanode_size_dict
metapaths_df['source'] = metapaths_df['metapath'].map(lambda metapath: metapath.source())
metapaths_df['source_size'] = metapaths_df['source'].map(lambda metanode: metanode_size_dict[metanode])

In [34]:
# Target metanode of each metapath, and its size as recorded in metanode_size_dict
metapaths_df['target'] = metapaths_df['metapath'].map(lambda metapath: metapath.target())
metapaths_df['target_size'] = metapaths_df['target'].map(lambda metanode: metanode_size_dict[metanode])

In [36]:
# Number of entries in each metapath's (source x target) matrix
metapaths_df['array_size'] = metapaths_df['source_size'].mul(metapaths_df['target_size'])

In [38]:
# Preview the first five metapaths together with their matrix dimensions
metapaths_df.head(n=5)

Unnamed: 0,metapath,source,source_size,target,target_size,array_size
0,(Disease - upregulates - Gene),Disease,137,Gene,20945,2869465
1,(Disease - palliates - Compound),Disease,137,Compound,1552,212624
2,(Disease - presents - Symptom),Disease,137,Symptom,438,60006
3,(Disease - downregulates - Gene),Disease,137,Gene,20945,2869465
4,(Disease - resembles - Disease),Disease,137,Disease,137,18769


In [44]:
# Five of the metapaths with the largest source_size * target_size
(
    metapaths_df
    .sort_values(by='array_size', ascending=False)
    .head(n=5)
)

Unnamed: 0,metapath,source,source_size,target,target_size,array_size
14236,"(Gene - covaries - Gene, Gene < regulates < Ge...",Gene,20945,Gene,20945,438693025
17395,"(Gene < regulates < Gene, Gene - interacts - G...",Gene,20945,Gene,20945,438693025
17401,"(Gene < regulates < Gene, Gene - interacts - G...",Gene,20945,Gene,20945,438693025
17400,"(Gene < regulates < Gene, Gene - interacts - G...",Gene,20945,Gene,20945,438693025
17399,"(Gene < regulates < Gene, Gene - interacts - G...",Gene,20945,Gene,20945,438693025


In [45]:
# Mean number of values (source_size * target_size) per metapath matrix
metapaths_df.loc[:, 'array_size'].mean()

61573118.349969566

In [20]:
# Estimated total number of values to store across all metapath matrices:
# (average array size, truncated to an integer) * (number of metapaths).
# Both factors are derived from `metapaths_df` rather than hand-copied from
# earlier outputs, so the estimate follows the data if it changes.
# `metapaths_df['array_size'].sum()` would give the exact total.
"{:,}".format(int(metapaths_df['array_size'].mean()) * len(metapaths_df))

'1,213,975,594,488'

# Representative matrices

## Largest matrices
Gene-to-Gene metapaths (G---G): ~ 20,000 x 20,000 = 400,000,000 numbers

In [2]:
# Stand-in for the largest matrices: a 20,000 x 20,000 array of floats.
# A locally seeded generator makes the saved files, and therefore the sizes
# measured in the next cell, reproducible without touching the global RNG state.
random_array = np.random.RandomState(0).rand(20000, 20000)
# Zero every entry above 0.3, leaving roughly 30% of the entries nonzero
random_array[random_array > 0.3] = 0

# Dense .npy file
np.save('test_large_array.npy', random_array)

# Same data stored as a compressed-sparse-column matrix
sparse_array = sparse.csc_matrix(random_array)
sparse.save_npz('test_sparse_large_array.npz', sparse_array)
# Release the sparse copy before allocating another full-size dense array
del sparse_array

# log1p-transformed dense copy, saved to compare its size with the untransformed file
log_array = np.log1p(random_array)
np.save('test_log1p.npy', log_array)

# Free the remaining multi-gigabyte arrays; only the files on disk are needed later
del random_array, log_array

In [19]:
# On-disk size of each saved file in megabytes (1 MB = 1,000,000 bytes)
dense_mb = os.path.getsize('test_large_array.npy') / 1000000
sparse_mb = os.path.getsize('test_sparse_large_array.npz') / 1000000
log1p_mb = os.path.getsize('test_log1p.npy') / 1000000

print(f" Dense: {dense_mb:.4g} MB\n",
      f"Sparse: {sparse_mb:.4g} MB\n",
      f"Log1p: {log1p_mb:.4g} MB")

 Dense: 3200 MB
 Sparse: 1115 MB
 Log1p: 3200 MB


In [5]:
# Number of entries in a 20,000 x 20,000 matrix, with thousands separators
f"{20000 ** 2:,}"

'400,000,000'

400 million numbers require 3,200 MB on disk (8 bytes per value) in the dense `.npy` format we use.

## Average matrix 
61,573,118 numbers, or roughly a 7,847 x 7,847 matrix

In [10]:
# Average number of entries per matrix, with thousands separators
f"{61573118:,}"

'61,573,118'

In [11]:
# Stand-in for an average-sized matrix: a 7,847 x 7,847 array of floats.
# A locally seeded generator makes the saved files, and therefore the sizes
# measured in the next cell, reproducible without touching the global RNG state.
random_array = np.random.RandomState(0).rand(7847, 7847)
# Zero every entry above 0.3, leaving roughly 30% of the entries nonzero
random_array[random_array > 0.3] = 0

# Dense .npy file.
# NOTE: the 'avgerage' spelling is kept on purpose; the size-report cell reads this exact path.
np.save('test_avgerage_array.npy', random_array)

# Same data stored as a compressed-sparse-column matrix
sparse_array = sparse.csc_matrix(random_array)
sparse.save_npz('test_sparse_average_array.npz', sparse_array)

# Free the arrays; only the files on disk are needed later
del random_array, sparse_array

In [12]:
# On-disk size of each saved file in megabytes (1 MB = 1,000,000 bytes)
dense_mb = os.path.getsize('test_avgerage_array.npy') / 1000000
sparse_mb = os.path.getsize('test_sparse_average_array.npz') / 1000000

print(f"Dense: {dense_mb:.4g}", 'MB')
print(f"Sparse: {sparse_mb:.4g}", 'MB')

Dense: 492.6 MB
Sparse: 171.3 MB


An average matrix requires almost 500 megabytes in the dense format. With 19,716 such matrices, this would be nearly 10 million megabytes, or about 10 terabytes.

In [14]:
# Total megabytes for 19,716 matrices at roughly 500 MB each
f"{19716 * 500:,}"

'9,858,000'

## Sparse matrices

To make similar estimates for sparse matrices, we first need to know the average density of all matrices, and especially whether the densities correlate with size in some way. See `average-density.ipynb`