In [1]:
import comet_ml
from kipoi_cadd.trainers import KipoiCaddBatchTrainer
from gin_train.trainers import KerasTrainer
from kipoi_cadd.data import CaddSparseDataset, KipoiCaddDataset
from kipoi_cadd.models import logistic_regression_keras
from kipoi.readers import Reader
from kipoi_cadd.data import Dataset
from tqdm import tqdm
import pandas as pd
import numpy as np

Using TensorFlow backend.


## Shuffle and split the npz dataset and the VCF file
... then split both into training and validation sets

In [15]:
from scipy.sparse import csr_matrix, load_npz, save_npz
from kipoi_cadd.utils import load_pickle, dump_to_pickle
from kipoi_cadd.data_utils import reorder_sparse_matrix, reorder_vcf

# Inputs: the sparse feature matrix, the VCF file, and a pickled list of row ids
# (stored under "shuffled/", so presumably a shuffled row order — verify).
npz_file = "/s/project/kipoi-cadd/data/raw/v1.4/training_data/GRCh37/sparse_matrices/sample_chr22_GRCh37.npz"
vcf_file = "/s/project/kipoi-cadd/data/raw/v1.4/training_data/GRCh37/sample_chr22_GRCh37.vcf"
row_ids_file = "/s/project/kipoi-cadd/data/raw/v1.4/training_data/GRCh37/shuffled/sample_chr22_GRCh37.pkl"
row_ids = load_pickle(row_ids_file)

# Fraction of the row ids assigned to the training split; the rest is validation.
TRAIN_FRACTION = 0.6

# Despite its name, `ratio` is the index of the first validation row, not a
# ratio. The name is kept so that any other cell referring to it still works.
ratio = int(len(row_ids) * TRAIN_FRACTION)
train_row_ids = row_ids[:ratio]
valid_row_ids = row_ids[ratio:]

In [16]:
# Output files for the sparse matrix of each split.
train_npz = "/s/project/kipoi-cadd/data/raw/v1.4/training_data/GRCh37/shuffled/sample_chr22_train_GRCh37.npz"
valid_npz = "/s/project/kipoi-cadd/data/raw/v1.4/training_data/GRCh37/shuffled/sample_chr22_valid_GRCh37.npz"

# Same call for both splits: training first, validation second.
for split_row_ids, split_npz in ((train_row_ids, train_npz), (valid_row_ids, valid_npz)):
    reorder_sparse_matrix(npz_file, split_row_ids, split_npz)

In [17]:
# Output files for the VCF of each split.
train_vcf = "/s/project/kipoi-cadd/data/raw/v1.4/training_data/GRCh37/shuffled/sample_chr22_train_GRCh37.vcf"
valid_vcf = "/s/project/kipoi-cadd/data/raw/v1.4/training_data/GRCh37/shuffled/sample_chr22_valid_GRCh37.vcf"

# Same call for both splits: training first, validation second.
for split_row_ids, split_vcf in ((train_row_ids, train_vcf), (valid_row_ids, valid_vcf)):
    reorder_vcf(vcf_file, split_row_ids, split_vcf)

In [18]:
# Save the row ids that define each split, so the split can be reproduced later.
train_ids_pkl = "/s/project/kipoi-cadd/data/raw/v1.4/training_data/GRCh37/shuffled/sample_chr22_train_GRCh37.pkl"
valid_ids_pkl = "/s/project/kipoi-cadd/data/raw/v1.4/training_data/GRCh37/shuffled/sample_chr22_valid_GRCh37.pkl"

for ids_pkl, split_row_ids in ((train_ids_pkl, train_row_ids), (valid_ids_pkl, valid_row_ids)):
    dump_to_pickle(ids_pkl, split_row_ids)

## Create KipoiCaddSparseDataset

In [1]:
# First we create the CaddSparseBatchDataset and the ZarrBatchDataset
# NOTE(review): the recorded run of this cell ended with
# `NameError: name 'ZarrReader' is not defined`. `ZarrReader` is not referenced
# in this cell, so the error was presumably raised inside the imported code —
# verify before relying on this cell.
from kipoi_cadd.data import CaddSparseBatchDataset, ZarrBatchDataset, KipoiCaddDataset

# Zarr stores and sparse matrices for the train/valid splits.
# NOTE(review): `zarr_valid` and `valid_npz` are assigned but not used in this cell.
zarr_train = "/s/project/kipoi-cadd/data/models/DeepSea_veff/sample_chr22_train_GRCh37.zarr"
zarr_valid = "/s/project/kipoi-cadd/data/models/DeepSea_veff/sample_chr22_valid_GRCh37.zarr"
train_npz = "/s/project/kipoi-cadd/data/raw/v1.4/training_data/GRCh37/shuffled/sample_chr22_train_GRCh37.npz"
valid_npz = "/s/project/kipoi-cadd/data/raw/v1.4/training_data/GRCh37/shuffled/sample_chr22_valid_GRCh37.npz"

# Both batch datasets are built with the same batch size (1024).
# NOTE(review): the meaning of the positional `0` is not visible here — confirm
# against the CaddSparseBatchDataset signature.
train_cds = CaddSparseBatchDataset(train_npz, 0, batch_size=1024)
train_zds = ZarrBatchDataset(zarr_train, ['preds/logit_diff'], batch_size=1024)
# NOTE(review): `KipoiCaddBatchDataset` is not among the names imported in this
# notebook (`KipoiCaddDataset` is) — confirm which class is intended and import it.
train_kcds = KipoiCaddBatchDataset(train_cds, train_zds)

Using TensorFlow backend.


NameError: name 'ZarrReader' is not defined

In [2]:
class CsvReader(Reader):
    """Reader over a delimited text file with one row of features per variant.

    The file must have a header line. The columns listed in
    ``_METADATA_COLUMNS`` are treated as variant metadata and are removed from
    the feature values returned by ``__getitem__`` and ``batch_train_iter``.

    Args:
        filename: path of the delimited text file.
        separator: column separator passed to ``pandas.read_csv`` (e.g. ``"\\t"``).
    """

    # Non-feature columns dropped before features are handed out.
    _METADATA_COLUMNS = ['line_idx', 'variant_alt', 'variant_chr', 'variant_id', 'variant_pos', 'variant_ref']

    def __init__(self, filename, separator):
        self.filename = filename
        self.separator = separator

    def __getitem__(self, idx):
        """Return data row ``idx`` (0-based, header excluded) as a dict.

        Returns:
            dict with ``'inputs'`` (flattened feature values of the row),
            ``'targets'`` (a list holding ``None``) and ``'variant_id'`` (the
            ``line_idx`` column of the row).

        Raises:
            IndexError: if ``idx`` is negative or past the last data row.
        """
        if idx < 0:
            raise IndexError("CsvReader does not support negative indices")
        # File line 0 is the header. Skipping file lines 1..idx makes the single
        # row that is read the requested one instead of always the first row.
        df = pd.read_csv(self.filename, sep=self.separator, skiprows=range(1, idx + 1), nrows=1)
        if df.empty:
            raise IndexError("CsvReader index out of range")
        features = df.drop(columns=self._METADATA_COLUMNS)
        item = {'inputs': features.values.ravel(), 'targets': [None]*df.shape[0], 'variant_id': df.line_idx}
        return item

    def batch_iter(self, batch_size):
        """Return an iterator over DataFrame chunks of ``batch_size`` rows, all columns included."""
        return pd.read_csv(self.filename, sep=self.separator, chunksize=batch_size)

    def batch_train_iter(self, batch_size=10):
        """Yield DataFrame chunks of ``batch_size`` rows with the metadata columns removed."""
        for chunk in pd.read_csv(self.filename, sep=self.separator, chunksize=batch_size):
            # Non-inplace drop: yields a new frame instead of mutating the chunk.
            yield chunk.drop(columns=self._METADATA_COLUMNS)

In [3]:
import threading
'''
    A generic iterator wrapper and generator decorator that take any iterator and make it thread safe.
    This method was introduced by Anand Chitipothu in http://anandology.com/blog/using-iterators-and-generators/
    but was not compatible with Python 3. This modified version implements `__next__` and therefore works with Python 3
    (Python 2 would additionally need a `next` method).
'''
class threadsafe_iter:
    """Takes an iterator/generator (or any other iterable) and makes it
    thread-safe by serializing calls to ``next`` on it.

    Args:
        it: an iterator or generator, used as is, or any other iterable
            (e.g. a list), which is converted with ``iter()``.
    """
    def __init__(self, it):
        # Objects that already implement __next__ are kept untouched; other
        # iterables have no __next__ and must be turned into an iterator first.
        self.it = it if hasattr(it, "__next__") else iter(it)
        self.lock = threading.Lock()

    def __iter__(self):
        return self

    def __next__(self):
        # Only one thread at a time may advance the wrapped iterator.
        # StopIteration from the wrapped iterator propagates to the caller.
        with self.lock:
            return next(self.it)

def threadsafe_generator(f):
    """A decorator that takes a generator function and makes it thread-safe.

    Calling the decorated function calls ``f`` with the same arguments and
    wraps the result in ``threadsafe_iter``. The decorated function keeps the
    name and docstring of ``f``.
    """
    import functools  # local import so this cell does not depend on another cell

    @functools.wraps(f)
    def g(*a, **kw):
        return threadsafe_iter(f(*a, **kw))
    return g

from kipoi.data_utils import numpy_collate_concat
class KipoiCaddDataset(Dataset):
    """Dataset that joins the features of several datasets item by item.

    Item ``idx`` is built by concatenating the ``'inputs'`` of item ``idx`` of
    every wrapped dataset; its ``'targets'`` come from the first dataset. The
    wrapped datasets are assumed to be curated, i.e. to hold the same variants
    in the same order.
    """

    def __init__(self, datasets):
        """
        Args:
          datasets: list of datasets or readers. Objects with method batch_train_iter.
        """
        self.datasets = datasets
        # Only used by the disabled batch_train_iter draft at the end of the
        # class; kept so code reading the attribute keeps working.
        self.iterators = []

    def __len__(self):
        # Assuming the datasets have been curated, the first one gives the length.
        return len(self.datasets[0])

    def __getitem__(self, idx):
        """Return ``{'inputs': concatenated inputs, 'targets': first dataset's targets}``."""
        # Fetch item `idx` from each dataset exactly once; the first dataset's
        # item supplies both its inputs and the targets.
        items = [ds[idx] for ds in self.datasets]
        f_inputs = [item['inputs'] for item in items]
        return {'inputs': np.concatenate(f_inputs), 'targets': items[0]['targets']}

    # NOTE(review): the bare string below is a disabled draft of
    # batch_train_iter. It is an expression statement with no runtime effect,
    # so this class itself defines no batch_train_iter method.
    """
    @threadsafe_generator
    def batch_train_iter(self, batch_size=32,**kwargs):
        for d in self.datasets:
            self.iterators.append(d.batch_train_iter(batch_size=batch_size))

        num_batches = 1
        for batch_num in tqdm(range(num_batches)):
            X_batch = None
            for it in self.iterators:
                # Connecting features from all kipoi datasets
                # we assume that the variants have been curated,
                # i.e. the same in the exact order.
                if X_batch is None:
                    # The first batch should be CADD's batch and batch_train_iter
                    # will return a tuple of X and y.
                    X_batch = next(it)
                    if isinstance(X_batch, tuple):
                        X_batch, y_batch = X_batch
                else:
                    t = next(it)
                    if isinstance(t, tuple): print(t)
                    print(X_batch.shape, t.shape)
                    X_batch = np.concatenate((X_batch, t), axis=1)
                    yield X_batch, y_batch
    """

In [4]:
# Initialize deepsea dataset or reader or whatever
deepsea_file = "/s/project/kipoi-cadd/data/models/DeepSea_veff/preds.tsv"
rdr = CsvReader(deepsea_file, "\t")

In [5]:
%%time
# Initialize cadd dataset
train_npz = "/s/project/kipoi-cadd/data/raw/v1.4/training_data/GRCh37/sparse_matrices/simulation_InDels.npz"
train_ids = "/s/project/kipoi-cadd/data/raw/v1.4/training_data/GRCh37/variant_ids/simulation_InDels.pkl"
valid_npz = "/s/project/kipoi-cadd/data/raw/v1.4/validation/clinVar-ExAC/sparse_matrices/clinvar_pathogenic_splice_site_GRCh37.npz"
valid_ids = "/s/project/kipoi-cadd/data/raw/v1.4/validation/clinVar-ExAC/variant_ids/clinvar_pathogenic_splice_site_GRCh37.pkl"
train_ds = CaddSparseDataset(train_npz, train_ids)
valid_ds = CaddSparseDataset(valid_npz, valid_ids)

CPU times: user 14.1 s, sys: 1.11 s, total: 15.2 s
Wall time: 15.4 s


In [6]:
# Initialize Zarr reader
from kipoi.readers import ZarrReader

zarr_file = "/s/project/kipoi-cadd/data/models/DeepSea_veff/clinvar_pathogenic_splice_site_GRCh37.zarr"
zr = ZarrReader(zarr_file)

In [None]:
zr.batch_iter()

In [9]:
np.array([None, None], dtype=str)

array(['None', 'None'], dtype='<U4')

In [6]:
# Initialize kipoi cadd dataset and Keras trainer
# NOTE(review): `ds` is not assigned in any cell visible in this notebook (the
# CADD datasets created earlier are named `train_ds` and `valid_ds`) — confirm
# which dataset is meant; on a fresh kernel this line raises NameError.
tds = KipoiCaddDataset([ds, rdr])
# NOTE(review): `vds` is the very same object as `tds`. If KerasTrainer uses its
# third argument for validation, validation runs on the training data — confirm
# this is intended.
vds = tds
# 6417 equals 5514 + 903, the two feature widths printed by the batch trainer
# run recorded further down in this notebook.
kt = KerasTrainer(logistic_regression_keras(6417), tds, vds, output_dir="/tmp/KipoiCadd/kerasTrainer")

In [7]:
# Train keras trainer
kt.train(batch_size=10, num_workers=1)

Got training iterator. Batch size: 10 num_workers: 1 Train batch sampler: None
Epoch 1/100
    19/183749 [..............................] - ETA: 101:20:56 - loss: 4.5080 - mean_absolute_error: 0.0208 - binary_accuracy: 0.9737

KeyboardInterrupt: 

In [31]:
# NOTE(review): `ds` is not assigned in any cell visible in this notebook —
# confirm which dataset is meant before re-running on a fresh kernel.
train_datasets = [ds, rdr]
# 6417 equals 5514 + 903, the two batch widths printed when this trainer is run.
tr = KipoiCaddBatchTrainer(logistic_regression_keras(6417), train_datasets, output_dir="/tmp/KipoiCadd")

In [32]:
tr.train(batch_size=10, num_batches=1)

  0%|          | 0/1 [00:00<?, ?it/s]

Started loading iterators and training set
Loading and training
(10, 5514) (10, 903)


100%|██████████| 1/1 [00:01<00:00,  1.18s/it]


In [92]:
it = tds.batch_train_iter(batch_size=10)

In [93]:
next(it)

(array([[0.        , 0.        , 0.        , ..., 0.00038068, 0.00040611,
         0.00048398],
        [0.        , 0.        , 0.        , ..., 0.00038068, 0.00040611,
         0.00048398],
        [0.        , 0.        , 0.        , ..., 0.00038068, 0.00040611,
         0.00048398],
        ...,
        [0.        , 0.        , 0.        , ..., 0.00038068, 0.00040611,
         0.00048398],
        [0.        , 0.        , 0.        , ..., 0.00038068, 0.00040611,
         0.00048398],
        [0.        , 0.        , 0.        , ..., 0.00038068, 0.00040611,
         0.00048398]]),
 array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.], dtype=float32))

In [87]:
np.concatenate([ds[0]['inputs'], ds[0]['inputs']]).shape

(1806,)

In [64]:
b = rdr[1]['inputs'].reshape((1, len(rdr[1]['inputs'])))

In [65]:
c = np.concatenate([a,b], axis=1)
c.shape

(1, 6417)

In [15]:
d = next(rdr.batch_train_iter(5))

In [17]:
d.dtype

dtype('float64')

In [24]:
a = next(rdr.batch_train_iter(10))

In [25]:
b = next(ds.batch_train_iter(batch_size=10))[0]

In [26]:
import numpy as np
c = np.concatenate((a,b), axis=1)

In [27]:
c.shape

(10, 6417)

In [29]:
b

array([[0., 0., 0., ..., 0., 1., 1.],
       [0., 0., 0., ..., 0., 1., 1.],
       [0., 0., 0., ..., 0., 1., 1.],
       ...,
       [0., 0., 0., ..., 0., 1., 1.],
       [0., 0., 0., ..., 0., 1., 1.],
       [0., 0., 0., ..., 0., 1., 1.]], dtype=float32)

In [None]:
# NOTE(review): scratch snippet, never executed (the cell has no execution count).
# It references `self`, `k`, `fbatch` and `dtype`, none of which is defined in a
# visible cell, so it cannot run as a standalone cell. The two lines differ only
# in the `dtype` argument (`dtype` vs `str`).
self.root.empty(k, shape=(0,) + fbatch[k].shape[1:], dtype=dtype, compressor=self.compressor, chunks=(self.chunk_size,) + fbatch[k].shape[1:])
self.root.empty(k, shape=(0,) + fbatch[k].shape[1:], dtype=str, compressor=self.compressor, chunks=(self.chunk_size,) + fbatch[k].shape[1:])