In [6]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix, load_npz, save_npz
from kipoi_cadd.utils import load_pickle, dump_to_pickle

## Generate variant IDs from the VCF files

In [2]:
def generate_variant_ids(inputfile, outputfile, separator='\t',
                         header=0,
                         variant_cols=['Chrom', 'Pos', 'Ref', 'Alt'],
                         dtype={'Chrom': 'str', 'Pos': np.int32, 'Ref': 'str',
                                'Alt': 'str'}):
    """Build one id string per variant row and pickle the resulting Series.

    Each id has the form ``chrom:pos:ref:[alt, ...]`` where the last field is
    the ``str()`` of the list obtained by splitting the alt column on commas,
    e.g. ``"1:100:A:['T', 'G']"``. This exact format is kept on purpose because
    the ids are compared against ids stored in other pickles.

    Parameters
    ----------
    inputfile : path or file-like
        Delimited text file, passed straight to ``pandas.read_csv``.
    outputfile : path
        Destination handed to ``dump_to_pickle``; it is also printed.
    separator : str
        Field delimiter of ``inputfile``.
    header : int or None
        Row used as column names; ``None`` when the file has no header row.
    variant_cols : list
        The four columns to use, in the order chromosome, position,
        reference, alternative. Either column names or integer positions.
    dtype : dict
        Column dtypes forwarded to ``pandas.read_csv``; its keys must match
        the kind of identifiers used in ``variant_cols``.

    Returns
    -------
    None. The ids are written to ``outputfile`` as a ``pandas.Series`` that
    carries the row index of the parsed file.
    """
    # The list/dict defaults are shared between calls; they are only read
    # here, never mutated, so they are left as-is to keep the signature stable.
    input_df = pd.read_csv(inputfile,
                           sep=separator,
                           header=header,
                           usecols=variant_cols,
                           dtype=dtype)

    # read_csv ignores the order of `usecols` and always returns the columns
    # in file order. Put them back into the caller's order so the positional
    # access below really sees chrom, pos, ref, alt.
    if not callable(variant_cols):
        requested = list(variant_cols)
        if all(isinstance(col, (int, np.integer)) for col in requested):
            # Integer selectors are file positions; the parsed frame holds
            # them in ascending order, whatever the column labels are.
            file_rank = {col: i for i, col in enumerate(sorted(requested))}
            input_df = input_df.iloc[:, [file_rank[col] for col in requested]]
        else:
            input_df = input_df[requested]

    # Access the four columns by position via iloc. This works for named and
    # numbered columns alike, avoids the deprecated integer fallback of
    # `row[0]` on a label-indexed Series, and makes the former double
    # transpose (used only to renumber the columns) unnecessary.
    chrom, pos, ref, alt = (input_df.iloc[:, i] for i in range(4))
    ids = [':'.join([str(c), str(p), r, str(a.split(','))])
           for c, p, r, a in zip(chrom, pos, ref, alt)]
    variant_ids = pd.Series(ids, index=input_df.index, dtype=object)

    print(outputfile)
    dump_to_pickle(outputfile, variant_ids)

In [3]:
# Standalone run: build the variant-id pickle for the ExAC validation VCF.
# from kipoi_cadd.utils import generate_variant_ids
inputfile = "/s/project/kipoi-cadd/data/raw/v1.4/validation/clinVar-ExAC/ExAC_MAF5p_all_GRCh37.vcf"
outputfile = "/s/project/kipoi-cadd/data/raw/v1.4/validation/clinVar-ExAC/ExAC_MAF5p_all_GRCh37.pkl"
# The file is read without a header row, so columns are selected by position
# (presumably CHROM, POS, REF, ALT of the usual VCF layout; confirm) and the
# dtype mapping is keyed by the same positions.
generate_variant_ids(
    inputfile,
    outputfile,
    header=None,
    variant_cols=[0, 1, 3, 4],
    dtype={0: 'str', 1: 'int32', 3: 'str', 4: 'str'},
)

/s/project/kipoi-cadd/data/raw/v1.4/validation/clinVar-ExAC/ExAC_MAF5p_all_GRCh37.pkl


## Check whether validation variants also appear in the training set

In [4]:
# Pickled variant-id collections that are compared against each other below.
# exac_ids_file is the same path the generate_variant_ids call above writes to.
clinvar_ids_file = "/s/project/kipoi-cadd/data/raw/v1.4/validation/clinVar-ExAC/clinvar_20180729_pathogenic_all_GRCh37_ids.pkl"
exac_ids_file = "/s/project/kipoi-cadd/data/raw/v1.4/validation/clinVar-ExAC/ExAC_MAF5p_all_GRCh37.pkl"
all_training_ids_file = "/s/project/kipoi-cadd/data/raw/v1.4/training_data/GRCh37/variant_ids/all.pkl"

In [11]:
# Load the three variant-id collections: training set, ClinVar and ExAC.
# NOTE(review): load_pickle presumably unpickles the file, so only point it
# at trusted files.
all_training_ids = load_pickle(all_training_ids_file)
clinvar_ids = load_pickle(clinvar_ids_file)
exac_ids = load_pickle(exac_ids_file)
# For a quick look at the loaded ids: clinvar_ids.head()

In [12]:
# Turn every id collection into a set so overlaps can be counted with set
# operations in the following cells.
set_clinvar, set_all, set_exac = (
    set(ids.values) for ids in (clinvar_ids, all_training_ids, exac_ids)
)

In [13]:
# Number of ExAC ids that also occur among the training ids.
len(set_exac.intersection(set_all))

1326

In [10]:
# Number of ClinVar ids that also occur among the training ids.
len(set_clinvar.intersection(set_all))

260

In [14]:
# Number of ids shared between the ClinVar and the ExAC collections.
len(set_clinvar.intersection(set_exac))

19

In [12]:
# Read only the first 100 rows of the v1.3 ClinVar table to inspect its shape
# without loading the whole file.
clinvar_v13_file = "/s/project/kipoi-cadd/data/raw/v1.3/validation/clinvar_ESP/clinvar_caddv1.3.tsv.gz"
clinvar_v13 = pd.read_csv(
    clinvar_v13_file,
    sep="\t",
    nrows=100,
)
clinvar_v13.shape

(100, 115)

## Check imputed clinvar file

In [12]:
# clinvar_imputed_file: CSV that is read into a DataFrame in the next cell.
# clinvar_csr_file: destination of the sparse (.npz) copy written further below.
clinvar_imputed_file = "/s/project/kipoi-cadd/data/raw/v1.4/validation/clinVar-ExAC/clinvar_20180729_pathogenic_all_GRCh37_imputed.csv.gz"
clinvar_csr_file = "/s/project/kipoi-cadd/data/raw/v1.4/validation/clinVar-ExAC/sparse_matrices/clinvar_20180729_pathogenic_all_GRCh37.npz"

In [7]:
# header=None: no row is treated as column names, so pandas numbers the
# columns itself.
clinvar_imputed = pd.read_csv(
    clinvar_imputed_file,
    header=None,
)
print(clinvar_imputed.shape)
clinvar_imputed.head(n=5)

(50123, 904)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,894,895,896,897,898,899,900,901,902,903
0,0,1,0,0,0,0,0,0,1,0,...,0,0,0,1,1,1,0,0,1,1
1,0,1,0,0,0,0,0,0,1,0,...,1,1,1,1,1,1,0,0,1,1
2,0,0,1,0,0,0,0,0,1,0,...,0,0,0,1,1,1,0,0,1,1
3,0,0,1,0,0,0,0,0,1,0,...,1,1,1,1,1,1,0,0,1,1
4,0,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1


In [13]:
# Convert the imputed table to CSR format and report which fraction of its
# cells is zero.
clinvar_csr = csr_matrix(clinvar_imputed)
n_cells = clinvar_csr.shape[0] * clinvar_csr.shape[1]
sparsity = 1 - clinvar_csr.count_nonzero() / n_cells
print(sparsity)
# Store the sparse matrix so later steps can load it with load_npz.
save_npz(clinvar_csr_file, clinvar_csr)

## Predict for clinvar

In [1]:
from kipoi_cadd.data import CaddSparseDataset
# scikit-learn deprecated its vendored joblib in 0.21 and removed it in 0.23.
# Keep the vendored import first so existing environments behave exactly as
# before, and fall back to the standalone package where it no longer exists.
try:
    from sklearn.externals import joblib
except ImportError:
    import joblib
import numpy as np
from scipy.sparse import csr_matrix, load_npz, save_npz

# Sparse ClinVar feature matrix (same path the CSR matrix is saved to earlier
# in this notebook).
clinvar_npz_file = "/s/project/kipoi-cadd/data/raw/v1.4/validation/clinVar-ExAC/sparse_matrices/clinvar_20180729_pathogenic_all_GRCh37.npz"
# NOTE(review): the model lives under /tmp, so it may be gone after a reboot.
model_file = "/tmp/gin-train/a8eb036bfe3a495bbbcfc434e1c59bdd/model.h5"
scaler_cadd_file = "/s/project/kipoi-cadd/data/models/CADD1.4-GRCh37.mod"
# Not referenced in the cells visible below.
scaler_kipoicadd_file = "/s/project/kipoi-cadd/data/models/KipoiCadd-GRCh37.scaler"

Using TensorFlow backend.


In [2]:
# SECURITY: joblib.load unpickles the file and can run arbitrary code; only
# load files from trusted locations.
model = joblib.load(model_file)
# The scaler file holds a two-item sequence of which only the second item is
# used. The first item is bound to a named throwaway instead of `_`, because
# assigning to `_` in IPython stops it from tracking the last cell output.
_unused_cadd_item, scaler_cadd = joblib.load(scaler_cadd_file)



In [3]:
"""
ds = CaddSparseDataset(clinvar_npz_file, clinvar_ids_file)
X_clinvar, y_clinvar = ds.load_all()
"""
X_clinvar = load_npz(clinvar_npz_file)
X_clinvar = scaler_cadd.transform(X_clinvar)
y_clinvar = np.ones(X_clinvar.shape[0])

In [4]:
# Run the loaded model on the scaled ClinVar feature matrix.
model.predict(X_clinvar)

array([1., 1., 1., ..., 1., 1., 1.], dtype=float32)

In [5]:
# Score the model against the all-ones labels built above.
# NOTE(review): what score() measures depends on the model class (presumably
# mean accuracy for a scikit-learn classifier); confirm.
model.score(X_clinvar, y_clinvar)

0.8995271631785807

In [6]:
import os

In [7]:
# Strip a double extension in two steps: first ".gz", then split off ".vcf".
p = "/s/project/kipoi-cadd/data/raw/v1.4/validation/clinVar-ExAC/clinvar_20180729_pathogenic_all_GRCh37.vcf.gz"
without_gz = os.path.splitext(p)[0]
os.path.splitext(without_gz)

('/s/project/kipoi-cadd/data/raw/v1.4/validation/clinVar-ExAC/clinvar_20180729_pathogenic_all_GRCh37',
 '.vcf')