In [3]:
from kipoi_cadd.data import cadd_train_valid_data, train_test_split_indexes, CaddDataset
from kipoi_cadd.utils import dump_to_pickle, load_pickle
import pandas as pd
import time
from tqdm import tqdm, trange
import numpy as np
from kipoi.data_utils import numpy_collate_concat
%load_ext line_profiler

In [4]:
training_dir = "/s/project/kipoi-cadd/data/raw/v1.3/training_data/"
lmdb_dir = training_dir + "lmdb"
valid_id_file = training_dir + "valid_idx.pkl"
all_ids_file = training_dir + "variant_ids.pkl"
ids_10k_file = training_dir + "ids_10k.pkl"

In [6]:
np.finfo(np.float16).max

65500.0

In [7]:
def load_n_batch(ds, num_workers=1, shuffle=False):
    D = len(ds.__getitem__(0)['inputs'])
    N = ds.__len__()
    X = np.zeros((N, D), dtype=np.float16)
    y = np.zeros(N, dtype=np.float16)

    it = ds.batch_train_iter(batch_size=N, shuffle=shuffle, num_workers=num_workers)
    X, y = next(it)
    return X, y

def load_n_single(ds, num_workers=1, shuffle=False):
    D = len(ds.__getitem__(0)['inputs'])
    N = ds.__len__()
    X = np.zeros((N, D), dtype=np.float16)
    y = np.zeros(N, dtype=np.float16)

    for idx in trange(N):
        item = ds.__getitem__(idx)
        X[idx,:] = item['inputs']
        y[idx] = item['targets']
    return X, y

def load_n_batch_loop(ds, num_workers=1, batch_size=64, shuffle=False):
    D = len(ds.__getitem__(0)['inputs'])
    N = ds.__len__()
    X = np.zeros((N, D), dtype=np.float16)
    y = np.zeros(N, dtype=np.float16)
    
    # Batch iter does not loop infinetly. When the ds will end, the last batch's size, will be that
    # of the number of remaining samples
    it = ds.batch_iter(batch_size=batch_size, shuffle=shuffle, num_workers=num_workers)
    i = 0
    for sample in tqdm(it, total=(n//64)-1):
        bs = sample['inputs'].shape[0]
        end = min(i+bs, n)
        X[i:end,:] = sample['inputs'][:end-i]
        y[i:end] = sample['targets'][:end-i]
        i = i + bs + 1
    return X, y

def load_n_collate(ds, num_workers=1, batch_size=64, shuffle=False):
    my_list = []
    it = ds.batch_iter(64, num_workers=num_workers, shuffle=shuffle)
    i = 0
    for x in tqdm(it, total=(n//64)-1):
        bs = x['inputs'].shape[0]
        if n - i >= bs:
            my_list.append(x)
        else:
            x['targets'] = x['targets'][:n-i]
            x['inputs'] = x['inputs'][:n-i]
            x['metadata']['variant_id'] = x['metadata']['variant_id'][:n-i]
            my_list.append(x)
        i = i + bs
        if i >= n:
            break
            
    return numpy_collate_concat(my_list)

In [8]:
%%time
"""
times_n_batch = {'bs': [], 'time': [], 'pos': []}
for s in [10000, 50000, 100000, 1000000, 10000000]:
    if s < 1000000:
        start = time.time()
        X, y = load_n_batch(valid_ds, n=s, num_workers=1, shuffle=False)
        end = time.time()
    else:
        end = start = 0
        y = [0]
    times_n_batch['bs'].append(s)
    times_n_batch['time'].append(end-start)
    times_n_batch['pos'].append(sum(y))
    
    print(sum(y), X.shape, end-start)
"""
valid_ds = CaddDataset(lmdb_dir, valid_id_file)
X, y = load_n_batch(valid_ds, n=1, num_workers=1, shuffle=False)
print(sum(y), X.shape, end-start)

NameError: name 's' is not defined

In [None]:
"""
times_n_single = {'bs': [], 'time': [], 'pos': []}
for s in [10000, 100000, 1000000, 7008611]:
    start = time.time()
    X, y = load_n_single(valid_ds, n=s, num_workers=10, shuffle=False)
    end = time.time()
    times_n_single['bs'].append(s)
    times_n_single['time'].append(end-start)
    times_n_single['pos'].append(y.sum())
    print(y.sum(), X.shape, end-start)
"""
valid_ds = CaddDataset(lmdb_dir, valid_id_file)
X, y = load_n_single(valid_ds, n=s, num_workers=10, shuffle=False)
print(sum(y), X.shape, end-start)

In [None]:
"""
times_n_collate = {'bs': [], 'time': [], 'pos': []}
for s in [10000, 100000, 1000000, 7008611]:
    start = time.time()
    items = load_n_collate(valid_ds, n=s, num_workers=10, shuffle=False)
    end = time.time()
    times_n_collate['bs'].append(s)
    times_n_collate['time'].append(end-start)
    times_n_collate['pos'].append(items['targets'].sum())
    print(items['targets'].sum(), items['inputs'].shape, end-start)
"""
valid_ds = CaddDataset(lmdb_dir, valid_id_file)
%time items = load_n_collate(valid_ds, n=s, num_workers=10, shuffle=False)
print(items['targets'].sum(), items['inputs'].shape, end-start)

In [None]:
"""
times_n_batch_loop = {'bs': [], 'time': [], 'pos': []}
for s in [10000, 100000, 1000000, 7008611]:
    start = time.time()
    X, y = load_n_batch_loop(valid_ds, n=s, num_workers=10, shuffle=False)
    end = time.time()
    times_n_batch_loop['bs'].append(s)
    times_n_batch_loop['time'].append(end-start)
    times_n_batch_loop['pos'].append(y.sum())
    print(y.sum(), X.shape, end-start)
"""
valid_ds = CaddDataset(lmdb_dir, valid_id_file)
%time X, y = load_n_batch_loop(valid_ds, n=s, num_workers=10, shuffle=False)
print(y.sum(), X.shape, end-start)

In [None]:
%%time
X, y = load_all_batch(valid_ds, num_workers=1, shuffle=False)
print(y.sum()/len(y), y.sum(), len(y))

In [None]:
%%time
X, y = load_all_single(valid_ds, num_workers=1, shuffle=False)
print(y.sum()/len(y), y.sum(), len(y))