In [1]:
import kipoi
import kipoi_veff.snv_predict as sp
import numpy as np
import pandas as pd
import pyarrow as pa
from kipoi import get_model
from kipoi_veff import score_variants
from kipoi_veff.scores import Diff, LogitRef, Logit

from kipoi_cadd.data import KipoiLmdbDataset
from kipoi_cadd.data_utils import calculate_map_size
from kipoi_cadd.readers import LmdbReader
from kipoi_cadd.utils import load_pickle, decompose_variant_string, generate_intervals_file
from kipoi_cadd.writers import LmdbWriter

Using TensorFlow backend.


## Get DeepSEA features from the example files

In [2]:
# Project data: raw CADD v1.3 training data and the processed Kipoi features.
training_dir = "/s/project/kipoi-cadd/data/raw/v1.3/training_data/"
kipoi_features_dir = "/s/project/kipoi-cadd/data/processed/v1.3/kipoi_features/"
shuff_10k_file = training_dir + "ids_10k.pkl"
intervals_file = kipoi_features_dir + "intervals_10k.tsv"
fasta_file = "/s/genomes/human/hg19/ensembl_GRCh37.p13_release75/Homo_sapiens.GRCh37.75.dna.primary_assembly.fa"

# Example files that live in the local Kipoi cache of the DeepSEA/variantEffects model.
deepsea_example_dir = "/data/nasif12/home_if12/simancas/.kipoi/models/DeepSEA/variantEffects/downloaded/example_files/"
sample_intervals_file = deepsea_example_dir + "intervals_file"
sample_fasta_file = deepsea_example_dir + "fasta_file"
sample_vcf_file = deepsea_example_dir + "variants.vcf"

# Scratch location used by the LMDB round-trip cells.
test_dir = "/tmp/kipoi-veff/test_KipoiLmdbDataset/"
variant_ids_file = test_dir + "variant_ids.pkl"

In [None]:
# Running the model's bundled example is what triggers the download of the
# sample intervals and sample fasta files referenced by the path variables.
model = kipoi.get_model('DeepSEA/variantEffects')
pred = model.pipeline.predict_example()

In [51]:
# Load the example intervals: tab-separated, no header row, three columns.
sample_intervals = pd.read_csv(
    sample_intervals_file,
    sep='\t',
    header=None,
    names=['chr', 'start', 'end'],
)

# Disabled check, kept as a real comment instead of a bare string literal
# (a string expression is still evaluated and is easy to mistake for live code).
# Uncomment to print the first line of the sample FASTA file:
#
# with open(sample_fasta_file, "r") as f:
#     for line in f:
#         print(line)
#         break

sample_intervals.head()

>1 dna:chromosome chromosome:GRCh37:1:1:249250621:1 REF



## Generate the intervals file for our variants of interest

In [None]:
# Load the pickled variant ids (the 10k-id file) and write the matching intervals
# file to `intervals_file`, without a header row.
var_ids = load_pickle(shuff_10k_file)
# NOTE(review): use_chr_word=False presumably emits bare chromosome names ("1" rather
# than "chr1") — confirm against kipoi_cadd.utils.generate_intervals_file.
generate_intervals_file(var_ids, intervals_file, use_chr_word=False, header=None)

In [None]:
# Load the DeepSEA variant-effect model and collect the dataloader arguments.
model = get_model('DeepSEA/variantEffects')
# 'num_chr_fasta' is set to True to agree with every other cell that pairs this
# fasta file with the model's dataloader; it was missing from this dict only.
dl_kwargs = {'intervals_file': intervals_file, 'fasta_file': fasta_file, 'num_chr_fasta': True}

In [None]:
# Get the dataloader and instantiate it
dl = model.default_dataloader(**{'intervals_file': intervals_file, 'fasta_file': fasta_file, 'num_chr_fasta': True})
it = dl.batch_iter(batch_size=64)

# Predict batch by batch and join the results once at the end: concatenating
# inside the loop re-copies every earlier prediction on each iteration.
batch_preds = [model.predict_on_batch(batch['inputs']) for batch in it]
# As before, `preds` stays None when the dataloader yields no batches.
preds = np.concatenate(batch_preds, axis=0) if batch_preds else None
preds.shape

## Write results with `LmdbWriter` for other Kipoi predictions

In [4]:
# Standalone script: score the example VCF with DeepSEA and hand the
# predictions to an LmdbWriter.
import kipoi
import kipoi_veff.snv_predict as sp
from kipoi import get_model
from kipoi_cadd.writers import LmdbWriter
from kipoi_veff.scores import Logit

# --- Paths -----------------------------------------------------------------
example_files_dir = "/data/ouga/home/ag_gagneur/simancas/Projects/kipoi-veff/tests/models/var_seqlen_model/"
test_dir = "/tmp/kipoi-veff/test_KipoiLmdbDataset/"
lmdb_deep_sea = test_dir + "lmdb_DeepSea"

example_inputs_dir = example_files_dir + "example_files/"
sample_intervals_file = example_inputs_dir + "variant_centered_intervals.tsv"
sample_fasta_file = example_inputs_dir + "hg38_chr22.fa"
sample_vcf_file = example_inputs_dir + "variants.vcf.gz"
gtf_file = example_inputs_dir + "gencode_v25_chr22.gtf.pkl.gz"
preproc_transformer = example_files_dir + "dataloader_files/encodeSplines.pkl"

# --- Model, dataloader and output writer -----------------------------------
model = get_model('DeepSEA/variantEffects')
dataloader = model.default_dataloader
dl_kwargs = dict(
    intervals_file=sample_intervals_file,
    fasta_file=sample_fasta_file,
    num_chr_fasta=False,
)
writer = LmdbWriter(lmdb_deep_sea, "DeepSea_veff")

# --- Prediction --------------------------------------------------------------
# The fourth positional argument (64) is presumably the batch size — confirm
# against the kipoi_veff.snv_predict.predict_snvs signature.
preds = sp.predict_snvs(
    model,
    dataloader,
    sample_vcf_file,
    64,
    num_workers=1,
    dataloader_args=dl_kwargs,
    evaluation_function_kwargs={'diff_types': {'logit': Logit()}},
    return_predictions=True,
    sync_pred_writer=writer,
)

In [5]:
# Open the LMDB at `lmdb_deep_sea` (the path handed to LmdbWriter above) together
# with the variant-id pickle.
# NOTE(review): nothing visible in this notebook creates `variant_ids_file`
# (test_dir + "variant_ids.pkl") — confirm it exists before running this cell.
ds = KipoiLmdbDataset(lmdb_deep_sea, variant_ids_file)
# ds.load_all()

## Get DeepSEA scores for CADD's variants
To score the variants we need three inputs:
- a FASTA file
- a VCF file
- an intervals file

In [39]:
# Merge vcf files
def concatenate_vcf_files(directory, output=None):
    """Merge the ``vcf.gz`` files found for ``directory`` into one sorted table.

    Each file is read as a tab-separated table with the columns ``#CHROM``,
    ``POS``, ``ID``, ``REF`` and ``ALT`` (lines starting with ``#`` are
    skipped). The rows are concatenated, sorted by ``#CHROM`` then ``POS``
    (``#CHROM`` is read as a string, so the order is lexicographic), and the
    placeholder columns ``QUAL``, ``FILTER`` and ``INFO`` are added with the
    value ``'.'``.

    Args:
        directory: Directory handed to ``get_all_files_extension`` to find the
            input files.
        output: Optional path. When given, a ``##fileformat=VCFv4.0`` line
            followed by the merged table (tab-separated, with a column header
            row) is written there. An input file whose path equals ``output``
            is skipped so a previous result is not merged into itself.

    Returns:
        pandas.DataFrame: The merged, sorted table.

    Raises:
        ValueError: If no input file is found.

    NOTE(review): ``get_all_files_extension`` is not defined or imported in the
    visible notebook; it presumably comes from ``kipoi_cadd.utils`` — confirm.
    """
    import os

    ext = "vcf.gz"
    col_names = ['#CHROM', 'POS', 'ID', 'REF', 'ALT']
    col_dtypes = {'#CHROM': 'str', 'POS': 'int32', 'ID': 'str', 'REF': 'str', 'ALT': 'str'}
    output_path = None if output is None else os.path.abspath(output)

    frames = []
    # Search the directory passed by the caller rather than a notebook global.
    for f in get_all_files_extension(directory, ext):
        if output_path is not None and os.path.abspath(f) == output_path:
            continue
        # `dtype` (not `dtypes`) is the pandas keyword; keys are column names.
        frames.append(pd.read_csv(f, sep='\t', comment='#', names=col_names, dtype=col_dtypes))
        print(f)

    if not frames:
        raise ValueError("No '%s' files found in %r." % (ext, directory))

    # Concatenate once instead of growing the frame inside the loop.
    vcf = pd.concat(frames, ignore_index=True)
    vcf = vcf.sort_values(by=['#CHROM', 'POS']).reset_index(drop=True)

    for placeholder_col in ('QUAL', 'FILTER', 'INFO'):
        vcf[placeholder_col] = '.'

    if output is not None:
        with open(output, 'w') as f:
            f.write("##fileformat=VCFv4.0\n")
        # Append the table below the file-format line written above.
        vcf.to_csv(output, sep='\t', index=False, mode='a')
    return vcf

# NOTE(review): `vcf` is only a local inside concatenate_vcf_files and is never
# assigned at notebook scope in this cell; the saved output presumably comes from
# an earlier version that stored the function's result in `vcf` — confirm.
vcf.head()

  interactivity=interactivity, compiler=compiler, result=result)


/s/project/kipoi-cadd/data/raw/v1.4/training_data/GRCh37/humanDerived_InDels.vcf.gz
/s/project/kipoi-cadd/data/raw/v1.4/training_data/GRCh37/humanDerived_SNVs.vcf.gz
/s/project/kipoi-cadd/data/raw/v1.4/training_data/GRCh37/simulation_InDels.vcf.gz
/s/project/kipoi-cadd/data/raw/v1.4/training_data/GRCh37/simulation_SNVs.vcf.gz
/s/project/kipoi-cadd/data/raw/v1.4/training_data/GRCh37/out_merged.vcf.gz


Unnamed: 0,#CHROM,POS,ID,REF,ALT
1837498,1,379177,.,G,T
1837499,1,379274,.,G,C
1837500,1,379476,.,T,A
1837501,1,379631,.,G,C
1837502,1,379724,.,G,A


In [11]:
# Rewrite the merged VCF with a file-format line on top and constant '.'
# placeholders in the QUAL, FILTER and INFO columns.
all_original = "/s/project/kipoi-cadd/data/raw/v1.4/training_data/GRCh37/all.vcf.gz"
all_new = "/s/project/kipoi-cadd/data/raw/v1.4/training_data/GRCh37/all_new.vcf"

# The file-format line goes in first; the table is appended underneath it.
with open(all_new, 'w') as f:
    f.write("##fileformat=VCFv4.0\n")

edit_df = pd.read_csv(all_original, sep="\t", header=0)
for placeholder_col in ("QUAL", "FILTER", "INFO"):
    edit_df[placeholder_col] = "."
edit_df.to_csv(all_new, sep="\t", index=False, mode='a')

In [10]:
! cat {all_new} | head

##fileformat=VCFv4.0#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO
1	379177	.	G	T	.	.	.
1	379274	.	G	C	.	.	.
1	379476	.	T	A	.	.	.
1	379631	.	G	C	.	.	.
1	379724	.	G	A	.	.	.
1	379938	.	A	G	.	.	.
1	380028	.	T	C	.	.	.
1	380576	.	A	T	.	.	.
1	380577	.	T	TT	.	.	.
cat: write error: Broken pipe


### Get the vcf file

In [36]:
# Standalone script: merge the GRCh37 training VCFs and write them to all.vcf
# inside the same directory.
import os
from kipoi_cadd.utils import concatenate_vcf_files

training_dir_v14 = "/s/project/kipoi-cadd/data/raw/v1.4/training_data/"
training_dir_hg37 = os.path.join(training_dir_v14, "GRCh37")
merged_vcf_path = os.path.join(training_dir_hg37, "all.vcf")

concatenate_vcf_files(training_dir_hg37, output=merged_vcf_path)

In [117]:
# Build a small frame mixing chromosome-X rows with a slice of other rows, to
# check how sorting behaves once '#CHROM' is cast to str.
# NOTE(review): `new_vcf` is not assigned anywhere in the visible notebook —
# presumably left over from an earlier kernel session; confirm its origin.
x_df = new_vcf.loc[new_vcf.loc[:, '#CHROM'] == "X", :]
x_df = x_df.iloc[:5].copy()
tmp = new_vcf.loc[33292286:33292289, :].copy()
new_tmp = pd.concat([x_df, tmp], ignore_index=True)
new_tmp = new_tmp.astype(dtype={'#CHROM': 'str'})
# sort_values returns a new frame; the result has to be assigned, otherwise the
# frame displayed below is still in its unsorted order.
new_tmp = new_tmp.sort_values(by=['#CHROM', 'POS'])
new_tmp

Unnamed: 0,#CHROM,POS,ID,REF,ALT
33496006,X,224538,.,A,G
33496007,X,224658,.,A,G
33496008,X,224712,.,A,G
33496009,X,224894,.,G,T
33496010,X,224896,.,C,T


In [137]:
# Cast the columns to explicit dtypes, sort by chromosome (as a string) and
# position, then write the table as tab-separated text to all.vcf.
vcf_dtypes = {'#CHROM': 'str', 'POS': 'int32', 'ID': 'str', 'REF': 'str', 'ALT': 'str'}
new_vcf = new_vcf.astype(dtype=vcf_dtypes).sort_values(by=['#CHROM', 'POS'])
all_vcf_path = os.path.join(training_dir_hg37, "all.vcf")
new_vcf.to_csv(all_vcf_path, sep='\t', index=None)

### Get the intervals file

In [152]:
# Generate intervals file from vcf
def generate_intervals_from_vcf(vcf,
                                output=None,
                                col_names=['#CHROM', 'POS', 'ID', 'REF', 'ALT'],
                                dtypes={'#CHROM': 'str', 'POS': 'int32', 'ID': 'str', 'REF':'str', 'ALT':'str'}):
    """Derive one interval per variant from a VCF table.

    For every row the interval is ``start = POS - 1`` and
    ``end = start + len(REF)``; the chromosome is copied from ``#CHROM``.
    The result is sorted by ``chr`` then ``start`` and re-indexed from 0.

    Args:
        vcf: Either a path (``str``) to a tab-separated VCF file, or a
            ``pandas.DataFrame`` that has ``#CHROM``, ``POS`` and ``REF``
            columns. When a path is given, lines starting with ``#`` are
            skipped and only the first ``len(col_names)`` columns are read.
        output: Optional path; when given the intervals are written there as
            tab-separated text without index or header.
        col_names: Column names assigned when reading from a path.
        dtypes: Column dtypes used when reading from a path. Neither default
            is mutated by this function.

    Returns:
        pandas.DataFrame: Columns ``chr``, ``start`` and ``end``.

    Raises:
        ValueError: If ``vcf`` is neither a ``str`` nor a ``pandas.DataFrame``.
    """
    if isinstance(vcf, str):
        vcf = pd.read_csv(vcf,
                          sep='\t',
                          dtype=dtypes,  # pandas keyword is `dtype`, not `dtypes`
                          header=None,
                          names=col_names,
                          usecols=range(len(col_names)),
                          comment='#')
    elif not isinstance(vcf, pd.DataFrame):
        raise ValueError("Input must be either a path to a vcf(.gz) file or an object of pd.DataFrame type.")

    # Column-wise arithmetic replaces the per-row iterrows loop.
    start = vcf['POS'] - 1
    df = pd.DataFrame({'chr': vcf['#CHROM'],
                       'start': start,
                       'end': start + vcf['REF'].str.len()})
    df = df.sort_values(by=['chr', 'start']).reset_index(drop=True)
    if output is not None:
        df.to_csv(output, sep='\t', index=False, header=False)
    return df

In [None]:
# Standalone script: score the merged CADD GRCh37 VCF with DeepSEA and hand the
# results to an LmdbWriter created with an explicit map size.
import os

import kipoi
import kipoi_veff.snv_predict as sp
from kipoi import get_model
from kipoi_cadd.data_utils import calculate_map_size
from kipoi_cadd.utils import load_pickle
from kipoi_cadd.writers import LmdbWriter
from kipoi_veff.scores import Logit

cadd_files_dir = "/data/ouga/home/ag_gagneur/simancas/Projects/kipoi-veff/tests/models/var_seqlen_model/"
training_dir_hg37 = "/s/project/kipoi-cadd/data/raw/v1.4/training_data/GRCh37"
intervals_file = os.path.join(training_dir_hg37, "intervals.tsv")
fasta_file = "/s/genomes/human/hg19/ensembl_GRCh37.p13_release75/Homo_sapiens.GRCh37.75.dna.primary_assembly.fa"
vcf_file = os.path.join(training_dir_hg37, "all.vcf.gz")
lmdb_deep_sea = os.path.join(training_dir_hg37, "lmdb/lmdb_DeepSea_veff")

model = get_model("DeepSEA/variantEffects")
dl_kwargs = {'intervals_file': intervals_file, 'fasta_file': fasta_file, 'num_chr_fasta': True}
dataloader = model.default_dataloader

# Size the LMDB map from one sample record and the number of variant ids.
# NOTE(review): `ds` is not created in this cell; it must already exist in the
# kernel (a KipoiLmdbDataset is assigned to `ds` earlier in the notebook), so
# the cell is not fully standalone. The 1.9 factor is passed through unchanged;
# its meaning is defined by calculate_map_size — confirm there.
num_lines = len(load_pickle(os.path.join(training_dir_hg37, "variant_ids/all.pkl")))
map_size = calculate_map_size(ds[0], num_lines, 1.9)
writer = LmdbWriter(lmdb_deep_sea, "DeepSea_veff", map_size)

# The fourth positional argument (64) is presumably the batch size — confirm
# against the kipoi_veff.snv_predict.predict_snvs signature.
sp.predict_snvs(model,
                dataloader,
                vcf_file,
                64,
                num_workers=64,
                dataloader_args=dl_kwargs,
                evaluation_function_kwargs={'diff_types': {'logit': Logit()}},
                return_predictions=False,
                sync_pred_writer=writer)

In [8]:
# Recompute the LMDB map size on its own so the value can be inspected:
# number of pickled variant ids, one sample record from `ds`, and the 1.9 factor.
all_ids_path = os.path.join(training_dir_hg37, "variant_ids/all.pkl")
num_lines = len(load_pickle(all_ids_path))
map_size = calculate_map_size(ds[0], num_lines, 1.9)
map_size

274578419865