In [6]:
import kipoi_veff, kipoi
import pytest
import os
import sys
import lmdb
from tqdm import tqdm
import kipoi_veff.snv_predict as sp
import pandas as pd
import pyarrow as pa
from kipoi.readers import Reader
from kipoi_veff import analyse_model_preds
from kipoi_veff.scores import Diff, LogitRef
from kipoi_cadd.writers import LmdbWriter
from kipoi_cadd.utils import variant_id_string
# Logit, LogitRef, LogitAlt, , DeepSEA_effect, RCScore, scoring_options
from kipoi_veff.utils.io import SyncBatchWriter, SyncPredictonsWriter

In [7]:
os.chdir("/data/ouga/home/ag_gagneur/simancas/Projects/kipoi-veff")
# pytest.main(['-k', 'test_other_writers'])

In [8]:
INSTALL_REQ = False

def test_other_writers(tmpdir):
    if sys.version_info[0] == 2:
        pytest.skip("rbp example not supported on python 2 ")
    model_dir = "tests/models/var_seqlen_model/"
    if INSTALL_REQ:
        install_model_requirements(model_dir, "dir", and_dataloaders=True)

    model = kipoi.get_model(model_dir, source="dir")
    # The preprocessor
    Dataloader = kipoi.get_dataloader_factory(model_dir, source="dir")

    dataloader_arguments = {
        "fasta_file": "example_files/hg38_chr22.fa",
        "preproc_transformer": "dataloader_files/encodeSplines.pkl",
        "gtf_file": "example_files/gencode_v25_chr22.gtf.pkl.gz",
        "intervals_file": "example_files/variant_centered_intervals.tsv"
    }
    dataloader_arguments = {k: model_dir + v for k, v in dataloader_arguments.items()}
    vcf_path = model_dir + "example_files/variants.vcf"
    ref_out_vcf_fpath = model_dir + "example_files/variants_ref_out.vcf"

    vcf_path = kipoi_veff.ensure_tabixed_vcf(vcf_path)
    model_info = kipoi_veff.ModelInfoExtractor(model, Dataloader)

    from kipoi.writers import HDF5BatchWriter, TsvBatchWriter, MultipleBatchWriter

    h5_path = os.path.join(str(tmpdir), 'preds.h5')
    tsv_path = os.path.join(str(tmpdir), 'preds.tsv')
    lmdb_path = os.path.join(str(tmpdir), 'lmdb/')
    my_writer = LmdbBatchWriter(lmdb_path, "var_seqlen")
    writer = SyncBatchWriter(AsyncSyncPredictionsWriter(my_writer))
    # writer = SyncBatchWriter(MultipleBatchWriter([HDF5BatchWriter(h5_path),
    #                                               TsvBatchWriter(tsv_path)]))
    # writer = kipoi_veff.VcfWriter(model, vcf_path, out_vcf_fpath, standardise_var_id=True)
    # writer = LmdbWriter(lmdb_path, "var_seqlen")
    
    vcf_to_region = None
    with pytest.raises(Exception):
        # This has to raise an exception as the sequence length is None.
        vcf_to_region = kipoi_veff.SnvCenteredRg(model_info)
    output = sp.predict_snvs(model, Dataloader, vcf_path, dataloader_args=dataloader_arguments,
                    evaluation_function=analyse_model_preds, batch_size=32,
                    vcf_to_region=vcf_to_region,
                    evaluation_function_kwargs={'diff_types': {'diff': Diff("mean"), 'logitRef': LogitRef("max")}},
                    return_predictions=True,
                    sync_pred_writer=writer)
    return output

In [24]:
tmpdir = "/tmp/kipoi-veff/test_KipoiLmdbDataset"
out, batch = test_other_writers(tmpdir)

  0%|          | 0/1 [00:00<?, ?it/s]INFO:2019-01-16 13:44:00,276:genomelake] Running landmark extractors..
  ("strand", gtf.strand)])
INFO:2019-01-16 13:44:00,298:genomelake] Done!


pp_line              0
varpos_rel          49
ref                  A
alt                  T
start         21541541
end           21541641
id                   0
do_mutate         True
strand               .
Name: 0, dtype: object
Sequence:
TACCTATTTGGGTTTTCACTAGTAAGCAGTTGGTTTGTAAGCAGTTGGTAATTTTAGTTTGTCTGGGTTTCAGCCATGAATATTCTATTGTAAACTTAATT[0m
pp_line              1
varpos_rel          49
ref                  C
alt                  C
start         21541903
end           21542003
id                   1
do_mutate         True
strand               .
Name: 1, dtype: object
Sequence:
GTAGATACGGGGTTTCAACATGTTGCCCAGGCTGGTCTTGAATTCCTGTCCTCAAGCGATCCACTTGCCTCGCCTCCCAAAGTGCTGAGATTACAAGTATG[0m
pp_line              2
varpos_rel          49
ref                  T
alt                  G
start         30630171
end           30630271
id                   2
do_mutate         True
strand               .
Name: 2, dtype: object
Sequence:
GCCCTCAGACTCCCTTCACCCCAAGGTGTGCCATCCTCTCCATTCCACCTAGGCCTGTCCAGGCCTCG

100%|██████████| 1/1 [00:00<00:00,  3.07it/s]

[Variant(chr22:21541590 A/T), Variant(chr22:21541952 C/C), Variant(chr22:30630220 T/G), Variant(chr22:30630701 A/G), Variant(chr22:35503223 C/A), Variant(chr22:36702137 C/A)]


100%|██████████| 5/5 [00:00<00:00, 961.51it/s]



In [None]:
%%time
reader = LmdbReader(tmpdir + "lmdb")
it = reader.single_iter()
b = next(it)
ex = (str(b[0], encoding="ascii"), pa.deserialize(b[1]))

In [46]:
print(ex[0], ex[1].values)

"22:21541590:A:['T']"

In [54]:
from cyvcf2 import VCF
from kipoi_cadd.utils import variant_id_string

In [38]:
model_dir = "tests/models/var_seqlen_model/"
vcf_path = model_dir + "example_files/variants.vcf"
var_it = VCF(vcf_path)
var = next(var_it)
var

Variant(chr22:21541590 A/T)

## Try AsyncBatchWriter

In [11]:
dl_batch = next(it)
pred_batch_array = res

INFO:2019-01-13 16:18:47,108:genomelake] Running landmark extractors..
  ("strand", gtf.strand)])
INFO:2019-01-13 16:18:47,123:genomelake] Done!


In [3]:
# Standalone script
# from kipoi.writers import AsyncBatchWriter
from kipoi_cadd.writers import LmdbBatchWriter, AsyncSyncPredictionsWriter
import tempfile

tmpdir = "/tmp/kipoi-veff/test_AsyncBatchWriter/lmdb"
writer = AsyncSyncPredictionsWriter(LmdbBatchWriter(tmpdir, 'async'))
writer.batch_write(predictions, records, line_ids)
writer.batch_write(predictions, records, line_ids)
writer.close()

NameError: name 'predictions' is not defined

In [28]:
from kipoi_cadd.readers import LmdbReader
rdr = LmdbReader(tmpdir)
it = rdr.single_iter()
ele = next(it)

In [29]:
pa.deserialize(ele[1])

{'diff': var_seqlen:diff_rbp_prb    0.017238
 Name: 0, dtype: float32, 'logitRef': var_seqlen:logitRef_rbp_prb   NaN
 Name: 0, dtype: float32}

In [16]:
predictions, records, line_ids = batch

In [23]:
records = [variant_id_string(r.CHROM, r.POS, r.REF, r.ALT) for r in records]
records

["22:21541590:A:['T']",
 "22:21541952:C:['C']",
 "22:30630220:T:['G']",
 "22:30630701:A:['G']",
 "22:35503223:C:['A']",
 "22:36702137:C:['A']"]

In [24]:
batch = predictions, records, line_ids

In [21]:
for var_num, var in tqdm(enumerate(records), total=len(records)):
    variant_id = variant_id_string(var.CHROM, var.POS, var.REF, var.ALT)
    print(variant_id)
    annos = {}
    # Obtain predictions for this variant...
    for key, preds in predictions.items():
        annos[key] = preds.iloc[var_num, :]
    print(annos)

100%|██████████| 6/6 [00:00<00:00, 346.57it/s]

22:21541590:A:['T']
{'diff': var_seqlen:diff_rbp_prb    0.017238
Name: 0, dtype: float32, 'logitRef': var_seqlen:logitRef_rbp_prb   NaN
Name: 0, dtype: float32}
22:21541952:C:['C']
{'diff': var_seqlen:diff_rbp_prb    0.0
Name: 1, dtype: float32, 'logitRef': var_seqlen:logitRef_rbp_prb   NaN
Name: 1, dtype: float32}
22:30630220:T:['G']
{'diff': var_seqlen:diff_rbp_prb    0.017868
Name: 2, dtype: float32, 'logitRef': var_seqlen:logitRef_rbp_prb   -1.574677
Name: 2, dtype: float32}
22:30630701:A:['G']
{'diff': var_seqlen:diff_rbp_prb    0.032175
Name: 3, dtype: float32, 'logitRef': var_seqlen:logitRef_rbp_prb   NaN
Name: 3, dtype: float32}
22:35503223:C:['A']
{'diff': var_seqlen:diff_rbp_prb   -0.029189
Name: 4, dtype: float32, 'logitRef': var_seqlen:logitRef_rbp_prb   -0.080525
Name: 4, dtype: float32}
22:36702137:C:['A']
{'diff': var_seqlen:diff_rbp_prb   -0.078876
Name: 5, dtype: float32, 'logitRef': var_seqlen:logitRef_rbp_prb   -1.199388
Name: 5, dtype: float32}



