# Comparison of disk storage methods

This notebook is used to compare PyTables, an HDF5 Python implementation, with Numpy's native .npy/.npz file format

In [1]:
import tables

import pandas as pd
from neo4j.v1 import GraphDatabase
import hetio.readwrite
import hetio.neo4j
import hetio.pathtools
import numpy as np
from scipy import sparse
import time
from hetmech.matrix import get_node_to_position, metaedge_to_adjacency_matrix

In [2]:
url = 'https://github.com/dhimmel/hetionet/raw/76550e6c93fbe92124edc71725e8c7dd4ca8b1f5/hetnet/json/hetionet-v1.0.json.bz2'
graph = hetio.readwrite.read_graph(url)
metagraph = graph.metagraph

In [4]:
# Sparse matrices out of .npz
times1=[]
for i in range(100):
    t1 = time.time()
    sparse_gig_adj_load_sp = sparse.load_npz('data/sparse_gig_adj.npz')
    sparse_ctd_adj_load_sp = sparse.load_npz('data/sparse_ctd_adj.npz')
    t2 = time.time()
    times1.append(t2-t1)
print(f'Average: {np.mean(times1)*1000 :.4} ms, Stdev: {np.std(times1)*1000 :.4} ms, Max: {np.max(times1)*1000 :.4} ms')

Average: 4.181 ms, Stdev: 0.1207 ms, Max: 4.972 ms


# NPY Saving and loading

In [5]:
r, c, gig_adj = metaedge_to_adjacency_matrix(graph, 'GiG')
r, c, ctd_adj = metaedge_to_adjacency_matrix(graph, 'CtD')
r, c, sparse_gig_adj = metaedge_to_adjacency_matrix(graph, 'GiG', sparse_threshold=1)
r, c, sparse_ctd_adj = metaedge_to_adjacency_matrix(graph, 'CtD', sparse_threshold=1)

## Save matrices

#### np.save

In [6]:
# Dense matrices into .npy
t1 = time.time()
np.save('data/gig_adj', gig_adj)
np.save('data/ctd_adj', ctd_adj)
t2 = time.time()
print(f'{(t2 - t1)*1000 :.4} ms')

640.0 ms


In [7]:
# Sparse matrices into .npy
t1 = time.time()
np.save('data/sparse_gig_adj', sparse_gig_adj)
np.save('data/sparse_ctd_adj', sparse_ctd_adj)
t2 = time.time()
print(f'{(t2 - t1)*1000 :.4} ms')

5.203 ms


#### sparse.save_npz

In [8]:
# Sparse matrices into .npz with compression
t1 = time.time()
sparse.save_npz('data/sparse_gig_adj', sparse_gig_adj, compressed=True)
sparse.save_npz('data/sparse_ctd_adj', sparse_ctd_adj, compressed=True)
t2 = time.time()
print(f'{(t2 - t1)*1000 :.4} ms')

224.4 ms


In [9]:
# Sparse matrices into .npz without compression
t1 = time.time()
sparse.save_npz('data/sparse_gig_adj', sparse_gig_adj, compressed=False)
sparse.save_npz('data/sparse_ctd_adj', sparse_ctd_adj, compressed=False)
t2 = time.time()
print(f'{(t2 - t1)*1000 :.4} ms')

11.15 ms


## Load matrices

#### np.load

In [10]:
# Dense matrices out of .npy
t1 = time.time()
gig_adj_load = np.load('data/gig_adj.npy')
ctd_adj_load = np.load('data/ctd_adj.npy')
t2 = time.time()
print(f'{(t2 - t1)*1000 :.4} ms')

89.1 ms


In [11]:
# Sparse matrices out of .npy
t1 = time.time()
sparse_gig_adj_load = np.load('data/sparse_gig_adj.npy')
sparse_ctd_adj_load = np.load('data/sparse_ctd_adj.npy')
t2 = time.time()
print(f'{(t2 - t1)*1000 :.4} ms')

11.86 ms


#### sparse.load_npz

In [12]:
# Sparse matrices out of .npz
times=[]
for i in range(100):
    t1 = time.time()
    sparse_gig_adj_load_sp = sparse.load_npz('data/sparse_gig_adj.npz')
    sparse_ctd_adj_load_sp = sparse.load_npz('data/sparse_ctd_adj.npz')
    t2 = time.time()
    times.append(t2-t1)
print(f'Average: {np.mean(times)*1000 :.4} ms, Stdev: {np.std(times)*1000 :.4} ms, Max: {np.max(times)*1000 :.4} ms')

Average: 4.54 ms, Stdev: 2.209 ms, Max: 21.46 ms


In [19]:
import matplotlib.pyplot as plt
%matplotlib.inline

ModuleNotFoundError: No module named 'matplotlib'

# HDF5 Saving and loading

In [13]:
abbr = [metaedge.get_abbrev() for metaedge in metagraph.get_edges()]

In [14]:
h5file = tables.open_file('data/sparse_matrices.h5', mode='w', title='Adjacency Matrices')

for edge in abbr:
    group_path = f'{edge[0]}{edge[-1]}'
    try:
        h5file.create_group('/', group_path)
    except:
        pass
    r, c, sparse_matrix = metaedge_to_adjacency_matrix(graph, edge, sparse_threshold=1)
    group = h5file.create_group('/'+group_path+'/', edge)
    h5file.create_array(group, edge+'_data', sparse_matrix.data)
    h5file.create_array(group, edge+'_indices', sparse_matrix.indices)
    h5file.create_array(group, edge+'_indptr', sparse_matrix.indptr)
    h5file.create_array(group, edge+'_shape', sparse_matrix.shape)

h5file.close()



In [15]:
def pt_load_array(h5file, node):
    
    data = h5file.get_node(f'/{node[0]}{node[-1]}/{node}/{node}_data').read()
    indices = h5file.get_node(f'/{node[0]}{node[-1]}/{node}/{node}_indices').read()
    indptr = h5file.get_node(f'/{node[0]}{node[-1]}/{node}/{node}_indptr').read()
    mat_shape = h5file.get_node(f'/{node[0]}{node[-1]}/{node}/{node}_shape').read()
    return sparse.csc_matrix((data, indices, indptr), shape=mat_shape)

In [16]:
times_pt = []
for i in range(5000):
    t1 = time.time()
    h5file = tables.open_file('data/sparse_matrices.h5', 'a')

    sparse_ctd_load = pt_load_array(h5file, 'CtD')
    sparse_gig_load = pt_load_array(h5file, 'GiG')

    h5file.close()
    t2 = time.time()
    times.append(t2-t1)

print(f'Average: {np.mean(times_pt)*1000 :.4} ms, Stdev: {np.std(times_pt)*1000 :.4} ms')

Average: nan ms, Stdev: nan ms


  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  keepdims=keepdims)
  arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
  ret = ret.dtype.type(ret / rcount)
