In [1]:
import pandas as pd
import numpy as np
from kipoi_cadd.utils import load_pickle, dump_to_pickle

Using TensorFlow backend.


In [2]:
def generate_variant_ids(inputfile, outputfile, separator='\t',
                         header=0,
                         variant_cols=['Chrom', 'Pos', 'Ref', 'Alt'],
                         dtype={'Chrom': 'str', 'Pos': np.int32, 'Ref': 'str',
                                'Alt': 'str'}):
    """Build one variant id per row of a delimited variant table and pickle them.

    Each id is ``"<chrom>:<pos>:<ref>:<alts>"`` where ``<alts>`` is the Python
    list repr of the comma-split alt field, e.g. ``"1:100:A:['G']"`` or
    ``"2:200:C:['T', 'G']"``.

    Args:
        inputfile: Path or file-like object handed to ``pandas.read_csv``.
        outputfile: Destination handed to ``dump_to_pickle``; also printed.
        separator: Field delimiter of ``inputfile``.
        header: ``header`` argument for ``read_csv`` (``None`` for files
            without a header row).
        variant_cols: The four columns to read, in the order
            chromosome, position, reference allele, alternative allele(s).
            Column labels, or integer positions for header-less files.
        dtype: ``dtype`` mapping for ``read_csv``, keyed like ``variant_cols``.

    Returns:
        None. The ids are written to ``outputfile`` as a ``pandas.Series``
        of strings that shares the row index of the table that was read.

    Raises:
        ValueError: If fewer than four columns were read.
    """
    input_df = pd.read_csv(inputfile,
                           sep=separator,
                           header=header,
                           usecols=variant_cols,
                           dtype=dtype)

    # read_csv ignores the element order of `usecols` and always returns the
    # columns in file order. When the requested columns can be matched by
    # label, put them back into the caller's chrom/pos/ref/alt order so the
    # id fields cannot be silently swapped. Positional selections on a file
    # with a header row cannot be matched by label and stay in file order.
    if not callable(variant_cols):
        wanted = list(variant_cols)
        if (len(wanted) == input_df.shape[1]
                and all(col in input_df.columns for col in wanted)):
            input_df = input_df[wanted]

    if input_df.shape[1] < 4:
        raise ValueError(
            "Expected 4 variant columns (chrom, pos, ref, alt), got %d"
            % input_df.shape[1])

    # Address the fields strictly by position. This works for named and
    # header-less tables alike, so no column renumbering is needed and no
    # integer key is ever looked up on a label-indexed row.
    chroms, positions, refs, alts = (input_df.iloc[:, i] for i in range(4))
    ids = [
        ':'.join([str(chrom), str(pos), ref, str(alt.split(','))])
        for chrom, pos, ref, alt in zip(chroms, positions, refs, alts)
    ]
    # Explicit object dtype keeps the result a string Series even when empty.
    variant_ids = pd.Series(ids, index=input_df.index, dtype=object)

    print(outputfile)
    dump_to_pickle(outputfile, variant_ids)

In [3]:
# Generate variant ids for the ExAC (MAF >= 5%) validation VCF and pickle them
# next to the source file.
# (generate_variant_ids could alternatively be imported from kipoi_cadd.utils.)
# The file is read without a header row; columns 0, 1, 3 and 4 are used as
# chrom, pos, ref and alt.
inputfile = "/s/project/kipoi-cadd/data/raw/v1.4/validation/clinVar-ExAC/ExAC_MAF5p_all_GRCh37.vcf"
outputfile = "/s/project/kipoi-cadd/data/raw/v1.4/validation/clinVar-ExAC/ExAC_MAF5p_all_GRCh37.pkl"
generate_variant_ids(
    inputfile,
    outputfile,
    variant_cols=[0, 1, 3, 4],
    dtype={0: 'str', 1: 'int32', 3: 'str', 4: 'str'},
    header=None,
)

/s/project/kipoi-cadd/data/raw/v1.4/validation/clinVar-ExAC/ExAC_MAF5p_all_GRCh37.pkl


In [4]:
# Pickled variant-id collections compared below.
# Validation sets (ClinVar pathogenic, ExAC MAF >= 5%):
clinvar_ids_file = (
    "/s/project/kipoi-cadd/data/raw/v1.4/validation/clinVar-ExAC/clinvar_20180729_pathogenic_all_GRCh37_ids.pkl"
)
exac_ids_file = (
    "/s/project/kipoi-cadd/data/raw/v1.4/validation/clinVar-ExAC/ExAC_MAF5p_all_GRCh37.pkl"
)
# Ids of all training variants:
all_training_ids_file = (
    "/s/project/kipoi-cadd/data/raw/v1.4/training_data/GRCh37/variant_ids/all.pkl"
)

In [11]:
# Load the three id collections (ClinVar, training, ExAC) from their pickles.
# NOTE(review): load_pickle presumably unpickles the file — only use it on
# trusted files; confirm against kipoi_cadd.utils.
clinvar_ids, all_training_ids, exac_ids = (
    load_pickle(pickle_path)
    for pickle_path in (clinvar_ids_file, all_training_ids_file, exac_ids_file)
)

In [12]:
# Turn each id collection into a set so overlaps can be computed by intersection.
set_clinvar, set_all, set_exac = (
    set(ids.values) for ids in (clinvar_ids, all_training_ids, exac_ids)
)

In [13]:
# Number of ids shared by the ExAC set and the training set.
len(set_exac.intersection(set_all))

1326

In [10]:
# Number of ids shared by the ClinVar set and the training set.
len(set_clinvar.intersection(set_all))

260

In [14]:
# Number of ids shared by the ClinVar set and the ExAC set.
len(set_clinvar.intersection(set_exac))

19

In [12]:
# Peek at the first 100 rows of the CADD v1.3 ClinVar table to inspect its shape.
clinvar_v13_file = (
    "/s/project/kipoi-cadd/data/raw/v1.3/validation/clinvar_ESP/clinvar_caddv1.3.tsv.gz"
)
clinvar_v13 = pd.read_csv(
    clinvar_v13_file,
    sep="\t",
    nrows=100,
)
clinvar_v13.shape

(100, 115)