## Generate LMDB by batches
First, we test the functionality in place and check that it works for a small number of batches before generating the whole DB.

In [74]:
from kipoi_cadd.data import CaddBatchDataset, cadd_serialize_numpy_row
from kipoi_cadd.data_utils import dir_batch_generator, OrderedDict, get_one_batch
from kipoi_cadd.utils import dump_to_pickle, load_pickle
import time
import pyarrow as pa
import lmdb
from tqdm import tqdm
import blosc
import pandas as pd
import numpy as np

In [78]:
# Root of the v1.3 training data; every other path in this cell is derived from it.
training_dir = "/s/project/kipoi-cadd/data/raw/v1.3/training_data/"

# LMDB environments (row-wise and batched).
lmdb_dir = "{}lmdb".format(training_dir)
lmdb_batch_dir = "{}lmdb_batched".format(training_dir)

# CSV inputs.
csv_file = "{}training_data.imputed.csv".format(training_dir)
short_csv = "{}last10k.csv".format(training_dir)
csv_folder = "{}shuffle_splits/".format(training_dir)

# Pickled indices, IDs and statistics.
valid_id_file = "{}valid_idx.pkl".format(training_dir)
all_ids_file = "{}variant_ids.pkl".format(training_dir)
ids_10k_file = "{}sample_indices/ids_10k.pkl".format(training_dir)
std_scaler_file = "{}stats/standard_scaler_first10k.pkl".format(training_dir)
batch_idx_file = "{}shuffle_splits/batch_idxs_256.pkl".format(training_dir)

In [44]:
# Largest value representable by np.int32, the dtype used below for batch_id and row_num.
np.iinfo(np.int32).max

2147483647

In [63]:
def create_lmdb_from_iterator(it, lmdb_batch_dir, variant_ids_file, num_batches=-1,
                              map_size=23399354270):
    """Write batches from an iterator into an LMDB environment, one record per batch.

    Every batch is packed into a dict with the keys ``batch_id``, ``inputs``,
    ``targets`` and ``metadata`` (``row_num`` and ``variant_id``), serialized
    with pyarrow, compressed with blosc (``blosclz``, ``typesize=8``) and stored
    under the ASCII-encoded batch number. All records are written inside a
    single write transaction.

    Parameters
    ----------
    it : iterable
        Yields batches indexable as ``batch[0]`` (inputs) and ``batch[1]``
        (targets). Both are read through ``.values`` and ``batch[0].index``
        -- presumably pandas objects; confirm against the generator.
    lmdb_batch_dir : str
        Path of the LMDB environment to write to.
    variant_ids_file : str
        Pickle read with ``load_pickle``; the loaded object is indexed with
        ``.loc[batch[0].index]`` to obtain the variant IDs of a batch.
    num_batches : int, optional
        Maximum number of batches to write. A negative value (the default)
        or ``None`` writes every batch the iterator yields; ``0`` writes none.
    map_size : int, optional
        Size of the LMDB memory map, forwarded to ``lmdb.Environment``.

    Raises
    ------
    lmdb.MapFullError
        Re-raised after printing a message when the map is too small for the
        data, so the transaction is not committed and the caller is not told
        that the write succeeded.
    """
    start = time.time()
    variant_ids = load_pickle(variant_ids_file)

    # Negative / None means "no limit"; a limit of zero must not touch the iterator.
    unlimited = num_batches is None or num_batches < 0
    limit = None if unlimited else num_batches
    batches = () if limit == 0 else it

    batch_num = 0
    env = lmdb.Environment(lmdb_batch_dir, map_size=map_size, max_dbs=0, lock=False)
    try:
        with env.begin(write=True, buffers=True) as txn:
            for batch in tqdm(batches, total=limit):
                b = {
                    "batch_id": np.int32(batch_num),
                    "inputs": batch[0].values.astype(np.float16),
                    "targets": batch[1].values.astype(np.float16),
                    "metadata": {
                        "row_num": np.array(batch[0].index, dtype=np.int32),
                        "variant_id": np.array(variant_ids.loc[batch[0].index], dtype='<U20')
                    }
                }

                # Serialize and compress
                buff = pa.serialize(b).to_buffer()
                blzpacked = blosc.compress(buff, typesize=8, cname='blosclz')

                try:
                    txn.put(str(batch_num).encode('ascii'), blzpacked)
                except lmdb.MapFullError as err:
                    print(str(err) + ". Exiting the program.")
                    # Propagate instead of continuing with a failed transaction.
                    raise

                batch_num += 1
                # Stop here rather than at the top of the loop so that no extra
                # batch is pulled from the iterator once the limit is reached.
                if limit is not None and batch_num >= limit:
                    break
    finally:
        # Release the environment even when writing fails.
        env.close()

    print("Finished putting " + str(batch_num) + " batches to lmdb.")
    end = time.time()
    print("Total elapsed time: {:.2f} minutes.".format(
        (end - start) / 60))

def calculate_map_size(row_example, nrows, multiplier=1.9):
    """Estimate an LMDB map size in bytes from one example record.

    The example is serialized with pyarrow; the size of the resulting buffer
    is multiplied by ``nrows`` and by the safety factor ``multiplier``, and
    the product is truncated to an integer.
    """
    serialized_bytes = pa.serialize(row_example).to_buffer().size
    return int(serialized_bytes * nrows * multiplier)

In [56]:
# Load the pickled variant IDs; their count is the total number of rows.
variant_ids = load_pickle(all_ids_file)
nrows = len(variant_ids)

# Pull one batch from the training CSVs to serve as a sizing example.
it = dir_batch_generator(training_dir + "shuffle_splits/training/", 256)
test_batch = next(it)

In [59]:
# Example record built from test_batch with the same keys and dtypes that
# create_lmdb_from_iterator stores per batch; passed to calculate_map_size
# below to measure the serialized size of one record.
row_example = {
    "batch_id": np.int32(0),
    "inputs": test_batch[0].values.astype(np.float16),
    "targets": test_batch[1].values.astype(np.float16),
    "metadata": {
        # Index of the inputs frame, stored as int32 row numbers.
        "row_num": np.array(test_batch[0].index, dtype=np.int32),
        # Variant IDs looked up by that index, stored as unicode strings of at most 20 characters.
        "variant_id": np.array(variant_ids.loc[test_batch[0].index], dtype='<U20')
        }
    }

In [72]:
# Smoke test: size the map for three records, then write three batches
# from a fresh generator.
ms = calculate_map_size(row_example, nrows=3)
it = dir_batch_generator(training_dir + "shuffle_splits/training/", 256)
create_lmdb_from_iterator(
    it,
    lmdb_batch_dir,
    all_ids_file,
    num_batches=3,
    map_size=ms,
)

 33%|███▎      | 1/3 [00:07<00:14,  7.42s/it]


Finished putting 3 batches to lmdb.
Total elapsed time: 0.29 minutes.


In [115]:
# Read back the record for batch index 0 from the batched LMDB. The cells below
# access it as a dict with 'inputs' and 'metadata' keys -- presumably the layout
# written above; verify against get_one_batch.
batch = get_one_batch(lmdb_batch_dir, 0)

In [106]:
# Shape of the inputs array stored in the batch that was read back.
batch['inputs'].shape

(256, 1063)

In [107]:
# Load the pickled batch index file (batch_idxs_256.pkl); the cells below read
# its entries as aja[2]["variant_ids"].
aja = load_pickle(batch_idx_file)

In [108]:
# Element-wise comparison of the variant IDs stored in the LMDB batch with the
# IDs of entry 2 in the batch index file; np.unique collapses the result to the
# distinct truth values.
# NOTE(review): the recorded output is array([False]), i.e. no position matches.
# The batch was read with index 0 while the index file is read at entry 2 --
# confirm which entry is the intended counterpart.
np.unique(batch['metadata']['variant_id'] == np.array(list(aja[2]["variant_ids"]), dtype='<U20'))

array([False])

In [None]:
# Display the variant IDs of entry 2 in the batch index file as unicode strings
# of at most 20 characters.
np.array(list(aja[2]["variant_ids"]), dtype='<U20')

In [117]:
# Read the first 256 rows of the training split 1.csv, using the first column as the index.
df = pd.read_csv("/s/project/kipoi-cadd/data/raw/v1.3/training_data/shuffle_splits/training/1.csv", nrows=256, index_col=0)

In [120]:
# Check that the CSV index equals the row numbers stored in the LMDB batch;
# a result containing only True means every position matches.
np.unique(df.index.values == batch['metadata']['row_num'])

array([ True])

array([  4144,   5520,  10989,  11561,  22753,  27405,  34208,  36401,
        40030,  42797,  43490,  47133,  48173,  49555,  51536,  51790,
        55042,  65924,  70222,  75696,  84510,  86846,  87233,  87555,
        89820,  91016,  91162,  92174,  95554, 101771, 106002, 109120,
       116459, 116662, 116980, 131516, 132235, 132726, 134684, 143608,
       144456, 145863, 148454, 148522, 149192, 150911, 153727, 153926,
       155002, 159122, 172325, 173566, 174851, 189663, 194331, 195512,
       199216, 201159, 201994, 202977, 215951, 222627, 231284, 234575,
       235140, 236271, 237225, 238171, 243617, 251878, 253858, 255525,
       264728, 265715, 269402, 274299, 275764, 283943, 286478, 289395,
       290387, 293957, 295314, 296892, 301884, 313565, 321225, 326149,
       327454, 331049, 331216, 333209, 333429, 336846, 341371, 342022,
       342977, 348426, 348611, 351716, 359720, 375612, 376450, 380540,
       384247, 387233, 390550, 391014, 396582, 406199, 406344, 407821,
      

## Try with the function

In [88]:
from kipoi_cadd.data_utils import cadd_generate_batched_lmdb_from_many_csv
# Paths needed by the packaged function, all derived from the training data root.
training_dir = "/s/project/kipoi-cadd/data/raw/v1.3/training_data/"
lmdb_batch_dir = "{}lmdb_batched".format(training_dir)
all_ids_file = "{}variant_ids.pkl".format(training_dir)
csv_folder = "{}shuffle_splits/".format(training_dir)

In [104]:
# Build the batched LMDB with the packaged function, reading CSVs from
# csv_folder with a batch size of 256.
# NOTE(review): num_batches=-1 presumably means "all batches", yet the recorded
# output below reports 3 batches -- confirm whether that output is stale.
cadd_generate_batched_lmdb_from_many_csv(lmdb_batch_dir, csv_folder, all_ids_file, num_batches=-1, batch_size=256)

 33%|███▎      | 1/3 [00:06<00:13,  6.90s/it]


Finished putting 3 batches to lmdb.
Total elapsed time: 0.29 minutes.


In [112]:
from kipoi_cadd.utils import get_all_files_extension

In [None]:
# List the .csv files returned for csv_folder, one per line. A plain loop
# replaces the list comprehension that was run only for its print side effect
# and stored a list of None values in a throwaway variable.
li = get_all_files_extension(csv_folder, ".csv")
for csv_path in li:
    print(csv_path)