# Evaluating efficiency gains from caching

In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline
import collections
import time

import hetmech.degree_weight
import hetmech.hetmat
import hetmech.hetmat.caching

In [2]:
# Open the Hetionet v1.0 HetMat from a relative path; every later cell reads from this object.
hetmat = hetmech.hetmat.HetMat('../data/hetionet-v1.0.hetmat/')

In [3]:
# Collect the metapaths extracted for ('Compound', 'Disease') with max_length=4,
# leaving out any that hetmech categorizes as 'long_repeat' or 'other'.
metapaths = [
    metapath
    for metapath in hetmat.metagraph.extract_metapaths('Compound', 'Disease', max_length=4)
    if hetmech.degree_weight.categorize(metapath) not in {'long_repeat', 'other'}
]
# metapaths = metapaths[:3]  # for development
len(metapaths)

1172

In [4]:
# Per-segment counts over the selected metapaths. The result is queried with .values() and
# .most_common() below, so it is presumably a collections.Counter — TODO confirm.
# It is later assigned as the cache priorities in compute_dwpcs.
segment_counts = hetmech.degree_weight.order_segments(hetmat.metagraph, metapaths, store_inverses=False)

In [5]:
# Sum of the counts over every entry in segment_counts.
sum(segment_counts.values())

4740

In [6]:
# The ten entries with the largest counts.
segment_counts.most_common(10)

[(CtD, 306),
 (CpD, 306),
 (CdG, 291),
 (CbG, 291),
 (CuG, 291),
 (GuD, 279),
 (GdD, 279),
 (GaD, 279),
 (Gr>G, 144),
 (CrC, 91)]

In [7]:
# Number of distinct entries in segment_counts.
# Open question: why are there so few segments?
len(segment_counts)

1255

## Caching performance when computing rephetio metapaths

In [8]:
def compute_dwpcs(allocate_GB):
    """
    Time one DWPC computation per entry of the notebook-level `metapaths` list
    while using a freshly created PathCountPriorityCache.

    The new cache is created with the requested allocation, given the
    notebook-level `segment_counts` as its priorities, and installed as
    `hetmat.path_counts_cache` (replacing whatever was set before).

    Parameters
    ----------
    allocate_GB : number
        Passed through as the `allocate_GB` argument of PathCountPriorityCache.

    Returns
    -------
    The cache object created for this run.

    Prints a start banner, the elapsed wall-clock time formatted as HH:MM:SS
    (whole seconds only), the cache's `get_stats()` text and a blank line.
    """
    print(f'Beginning DWPCs for {len(metapaths):,} metapaths with a cache allocation of {allocate_GB:.2f} GB')

    # Build the cache and register it on the hetmat before any timing starts.
    priority_cache = hetmech.hetmat.caching.PathCountPriorityCache(hetmat, allocate_GB=allocate_GB)
    priority_cache.priorities = segment_counts
    hetmat.path_counts_cache = priority_cache

    tic = time.perf_counter()
    for current_metapath in metapaths:
        # The DWPC results themselves are not used; only timing and cache statistics matter here.
        _row_ids, _col_ids, _matrix = hetmech.degree_weight.dwpc(
            hetmat, current_metapath, dense_threshold=0.7
        )
    elapsed_seconds = time.perf_counter() - tic

    total = time.strftime('%H:%M:%S', time.gmtime(elapsed_seconds))
    print(f'Computation complete in {total}')
    print(priority_cache.get_stats())
    print()
    return priority_cache

In [9]:
# Run the benchmark for increasing cache allocations, starting from 0 GB.
# `cache` is rebound on every pass, so only the last run's cache stays available afterwards.
for allocate_GB in 0, 0.2, 1, 5, 20:
    cache = compute_dwpcs(allocate_GB)

Beginning DWPCs for 1,172 metapaths with a cache allocation of 0.00 GB
Computation complete in 00:28:29
PathCountPriorityCache containing 0 items
  total gets: 4,141
  cache hits: memory = 0, disk = 0, absent = 4,141
  0.00 GB in use of 0.00 GB allocated

Beginning DWPCs for 1,172 metapaths with a cache allocation of 0.20 GB
Computation complete in 00:27:35
PathCountPriorityCache containing 222 items
  total gets: 4,141
  cache hits: memory = 2,877, disk = 0, absent = 1,264
  0.20 GB in use of 0.20 GB allocated

Beginning DWPCs for 1,172 metapaths with a cache allocation of 1.00 GB
Computation complete in 00:25:27
PathCountPriorityCache containing 1,228 items
  total gets: 4,141
  cache hits: memory = 2,937, disk = 0, absent = 1,204
  0.67 GB in use of 1.00 GB allocated

Beginning DWPCs for 1,172 metapaths with a cache allocation of 5.00 GB
Computation complete in 00:25:24
PathCountPriorityCache containing 1,228 items
  total gets: 4,141
  cache hits: memory = 2,937, disk = 0, absent = 1,204

## Caching performance when computing the _CbGiGaDrD_ metapath repeatedly

In [8]:
def repeated_compute_dwpcs(allocate_GB, repeats=200):
    """
    Time repeated DWPC computations of the single metapath 'CbGiGaDrD' while
    using a freshly created PathCountPriorityCache.

    The new cache is created with the requested allocation and installed as
    `hetmat.path_counts_cache` (replacing whatever was set before). No
    priorities are assigned to the cache in this function.

    Parameters
    ----------
    allocate_GB : number
        Passed through as the `allocate_GB` argument of PathCountPriorityCache.
    repeats : int, default 200
        How many times the DWPC is computed. The default keeps the iteration
        count this benchmark has always executed. The start banner reports
        this value, so the printed count cannot drift from the real one.

    Returns
    -------
    The cache object created for this run.

    Prints a start banner, the elapsed wall-clock time formatted as HH:MM:SS
    (whole seconds only), the cache's `get_stats()` text and a blank line.
    """
    metapath = 'CbGiGaDrD'
    # The banner used to hard-code "100 ×" although the loop ran 200 times.
    print(f'Beginning DWPCs for {repeats:,} × {metapath} with a cache allocation of {allocate_GB:.2f} GB')
    cache = hetmech.hetmat.caching.PathCountPriorityCache(hetmat, allocate_GB=allocate_GB)
    hetmat.path_counts_cache = cache
    start = time.perf_counter()
    for _ in range(repeats):
        # The result is discarded on purpose: only timing and cache statistics matter here.
        hetmech.degree_weight.dwpc(hetmat, metapath, dense_threshold=0.7)
    end = time.perf_counter()
    total = time.strftime('%H:%M:%S', time.gmtime(end - start))
    print(f'Computation complete in {total}')
    print(cache.get_stats())
    print()
    return cache

In [9]:
# Compare a 0 GB allocation against a 0.2 GB allocation.
# After the loop, `cache` refers to the cache returned for the 0.2 GB run.
for allocate_GB in 0, 0.2:
    cache = repeated_compute_dwpcs(allocate_GB)

Beginning DWPCs for 100 × CbGiGaDrD with a cache allocation of 0.00 GB
Computation complete in 00:00:13
PathCountPriorityCache containing 0 items
  total gets: 1,000
  cache hits: memory = 0, disk = 0, absent = 1,000
  0.00 GB in use of 0.00 GB allocated

Beginning DWPCs for 100 × CbGiGaDrD with a cache allocation of 0.20 GB
Computation complete in 00:00:00
PathCountPriorityCache containing 5 items
  total gets: 204
  cache hits: memory = 199, disk = 0, absent = 5
  0.01 GB in use of 0.20 GB allocated



In [10]:
# Show what get_all_segments reports for the metapath used in the repeated benchmark.
hetmech.degree_weight.get_all_segments(hetmat.metagraph, 'CbGiGaDrD')

[CbGiGaDrD, CbG, GiG, GaD, DrD]

In [11]:
# `cache` is whatever the most recently executed benchmark loop left bound to that name;
# display what iterating over its `.cache` attribute yields.
list(cache.cache)

[(CbG, 'dwpc', 0.5),
 (GiG, 'dwpc', 0.5),
 (GaD, 'dwpc', 0.5),
 (DrD, 'dwpc', 0.5),
 (CbGiGaDrD, 'dwpc', 0.5)]