In [1]:
from kipoi_cadd.utils import dump_to_pickle, load_pickle, get_all_files_extension, generate_variant_ids
from kipoi_cadd.data_utils import load_csv_chunks_tosparse
from kipoi_cadd.data import sparse_cadd_dataset, CaddSparseDataset
from scipy.sparse import vstack, load_npz, save_npz
from sklearn.model_selection import ShuffleSplit
from tqdm import tqdm
import pandas as pd
import numpy as np
import os

Using TensorFlow backend.


In [2]:
# Root folder of the v1.4 training data; every other path below is derived from it.
training_dir = "/s/project/kipoi-cadd/data/raw/v1.4/training_data/"

# One sub-folder per genome build.
training_dir_hg37, training_dir_hg38 = (
    os.path.join(training_dir, build) for build in ("GRCh37", "GRCh38")
)

# Per-build folders holding the pickled variant ids and the sparse matrices.
variant_ids_dir_hg37, sparse_matrices_dir_hg37 = (
    os.path.join(training_dir_hg37, sub) for sub in ("variant_ids", "sparse_matrices")
)
variant_ids_dir_hg38, sparse_matrices_dir_hg38 = (
    os.path.join(training_dir_hg38, sub) for sub in ("variant_ids", "sparse_matrices")
)

## Generate training set
This includes converting all csv files to sparse matrices and keeping the variant ids.
### 1. Generate variant ids

In [None]:
def generate_variant_ids(inputfile, outputfile, separator='\t',
                         variant_cols=['Chrom', 'Pos', 'Ref', 'Alt'], nrows=None):
    """Build one id string per row of a delimited variant file and pickle the result.

    Each id is ``<chrom>:<pos>:<ref>:<alts>`` where ``<alts>`` is the ``str()`` of
    the Alt field split on commas, e.g. ``"1:200:G:['C', 'T']"``.

    NOTE(review): this definition shadows the ``generate_variant_ids`` that the
    notebook's import cell brings in from ``kipoi_cadd.utils``.

    Args:
        inputfile: Path of the delimited file, passed to ``pd.read_csv``.
        outputfile: Destination passed to ``dump_to_pickle``.
        separator: Field separator of ``inputfile``.
        variant_cols: Names of the chromosome, position, reference and alternative
            columns, in that order. The names may differ between files
            (e.g. ``'#Chr'`` or ``'#Chrom'`` for the chromosome column).
        nrows: Optional cap on the number of data rows to read. The default
            ``None`` reads the whole file.

    Raises:
        ValueError: If ``variant_cols`` does not contain exactly four names.
    """
    cols = list(variant_cols)
    if len(cols) != 4:
        raise ValueError(
            "variant_cols must name exactly four columns "
            "(chromosome, position, reference, alternative); got %r" % (cols,))
    chrom_col, pos_col, ref_col, alt_col = cols

    print(inputfile)
    input_df = pd.read_csv(inputfile,
                           sep=separator,
                           usecols=cols,
                           nrows=nrows,
                           # Key the dtypes on the names actually requested so the
                           # chromosome column is read as text whatever it is called.
                           dtype={
                               chrom_col: 'str',
                               pos_col: np.int32,
                               ref_col: 'str',
                               alt_col: 'str'})

    # Columns are addressed by name, so the result does not depend on the column
    # order inside the file or on positional Series indexing.
    variant_ids = pd.Series(
        [':'.join([str(chrom), str(pos), ref, str(alt.split(','))])
         for chrom, pos, ref, alt in zip(input_df[chrom_col], input_df[pos_col],
                                         input_df[ref_col], input_df[alt_col])],
        index=input_df.index,
        dtype=object)

    print(outputfile)
    dump_to_pickle(outputfile, variant_ids)

### 1.a GRCh37

In [None]:
%%time
# Write one pickle of variant ids per GRCh37 "*.tsv.gz" table.
ext = "tsv.gz"
hg37_id_columns = ['#Chr', 'Pos', 'Ref', 'Alt']
for tsv_path in get_all_files_extension(training_dir_hg37, ext):
    # Table name = whatever precedes the first ".tsv.gz" in the file name.
    table_name = os.path.basename(tsv_path).split("." + ext)[0]
    ids_path = os.path.join(training_dir_hg37, "variant_ids", table_name + ".pkl")
    generate_variant_ids(tsv_path, ids_path, variant_cols=hg37_id_columns)

### 1.b GRCh38

In [None]:
%%time
# Write one pickle of variant ids per GRCh38 "*.tsv.gz" table.
ext = "tsv.gz"
hg38_id_columns = ['#Chrom', 'Pos', 'Ref', 'Alt']
for tsv_path in get_all_files_extension(training_dir_hg38, ext):
    # Table name = whatever precedes the first ".tsv.gz" in the file name.
    table_name = os.path.basename(tsv_path).split("." + ext)[0]
    ids_path = os.path.join(training_dir_hg38, "variant_ids", table_name + ".pkl")
    generate_variant_ids(tsv_path, ids_path, variant_cols=hg38_id_columns)

## Generate sparse matrices
For v1.4 the training set comes as several separate files. As explained in the INFO file: _## *.csv.gz Imputed and transformed training set (directly usable as X in any machine learning setting with first column being Y)_. Namely:
- `humanDerived_InDels.csv.gz`
- `humanDerived_SNVs.csv.gz`
- `simulation_InDels.csv.gz`
- `simulation_SNVs.csv.gz`


### 2.a GRCh37

In [None]:
# Convert every GRCh37 training "*.csv.gz" into a sparse matrix stored as "<name>.npz".
ext = "csv.gz"
for f in get_all_files_extension(training_dir_hg37, ext):
    # Get the base of the name
    f_name = os.path.basename(f).split("." + ext)[0]
    output = os.path.join(sparse_matrices_dir_hg37, f_name + ".npz")
    # The conversion is lengthy, so a file whose .npz already exists is skipped. This lets
    # the loop cover all files and be re-run without redoing finished conversions.
    # NOTE(review): an incomplete .npz left by an interrupted run would also be skipped;
    # delete it before re-running.
    if os.path.exists(output):
        print(f_name, "already converted, skipping:", output)
        continue
    # Num lines is necessary to set the total in tqdm, important feedback in a lengthy function
    num_lines = len(load_pickle(os.path.join(variant_ids_dir_hg37, f_name + ".pkl")))
    print(f_name, num_lines, output)
    # 10000 is presumably the chunk size (rows per read) — TODO confirm against load_csv_chunks_tosparse.
    load_csv_chunks_tosparse(f, 10000, np.float32, num_lines=num_lines, output=output, header=None)

### 2.b GRCh38

In [None]:
# Convert every GRCh38 training "*.csv.gz" into a sparse matrix stored as "<name>.npz".
ext = "csv.gz"
for f in get_all_files_extension(training_dir_hg38, ext):
    # Get the base of the name
    f_name = os.path.basename(f).split("." + ext)[0]
    output = os.path.join(sparse_matrices_dir_hg38, f_name + ".npz")
    # The conversion is lengthy, so a file whose .npz already exists is skipped. This lets
    # the loop cover all files and be re-run without redoing finished conversions.
    # NOTE(review): an incomplete .npz left by an interrupted run would also be skipped;
    # delete it before re-running.
    if os.path.exists(output):
        print(f_name, "already converted, skipping:", output)
        continue
    # Num lines is necessary to set the total in tqdm, important feedback in a lengthy function
    num_lines = len(load_pickle(os.path.join(variant_ids_dir_hg38, f_name + ".pkl")))
    print(f_name, num_lines, output)
    # 10000 is presumably the chunk size (rows per read) — TODO confirm against load_csv_chunks_tosparse.
    load_csv_chunks_tosparse(f, 10000, np.float32, num_lines=num_lines, output=output, header=None)

## Merge sparse matrices
### 3 Merge variant ids

In [None]:
# Merge the per-file GRCh37 variant ids into a single pickle ("all.pkl").
ext = "pkl"
output = os.path.join(variant_ids_dir_hg37, "all.pkl")

# Pickles in this folder that are products of the notebook, not per-file inputs. They are
# left out so that re-running the cell does not fold earlier results back into the merge.
# "all.pkl", "valid.pkl" and "sample_10k.pkl" are written/read elsewhere in this notebook;
# "train.pkl" is assumed to be written next to "valid.pkl" — TODO confirm.
derived_pickles = {"all.pkl", "train.pkl", "valid.pkl", "sample_10k.pkl"}
id_files = [path for path in get_all_files_extension(variant_ids_dir_hg37, ext)
            if os.path.basename(path) not in derived_pickles]
if not id_files:
    raise FileNotFoundError("No per-file variant id pickles found in " + variant_ids_dir_hg37)

# Load everything first and concatenate once instead of re-copying the growing result
# on every iteration.
all_ids = pd.concat([load_pickle(path) for path in tqdm(id_files)], ignore_index=True)

print(len(all_ids))
dump_to_pickle(output, all_ids)

In [None]:
# Merge the per-file GRCh38 variant ids into a single pickle ("all.pkl").
ext = "pkl"
output = os.path.join(variant_ids_dir_hg38, "all.pkl")

# Pickles in this folder that are products of the notebook, not per-file inputs. They are
# left out so that re-running the cell does not fold earlier results back into the merge.
# "all.pkl" is this cell's own output; "train.pkl", "valid.pkl" and "sample_10k.pkl" are
# assumed to appear here as they do for GRCh37 — TODO confirm.
derived_pickles = {"all.pkl", "train.pkl", "valid.pkl", "sample_10k.pkl"}
id_files = [path for path in get_all_files_extension(variant_ids_dir_hg38, ext)
            if os.path.basename(path) not in derived_pickles]
if not id_files:
    raise FileNotFoundError("No per-file variant id pickles found in " + variant_ids_dir_hg38)

# Load everything first and concatenate once instead of re-copying the growing result
# on every iteration.
all_ids = pd.concat([load_pickle(path) for path in tqdm(id_files)], ignore_index=True)

print(len(all_ids))
dump_to_pickle(output, all_ids)

### 3.a GRCh37

In [None]:
# Stack the per-file GRCh37 sparse matrices row-wise into a single matrix ("all.npz").
ext = "npz"
output = os.path.join(sparse_matrices_dir_hg37, "all.npz")

# Matrices in this folder that are products of the notebook, not per-file inputs. They are
# left out so that re-running the cell does not stack earlier results onto themselves.
# "all.npz", "valid.npz" and "sample_10k.npz" are written/read elsewhere in this notebook;
# "train.npz" is assumed to be written next to "valid.npz" — TODO confirm.
derived_matrices = {"all.npz", "train.npz", "valid.npz", "sample_10k.npz"}
npz_files = [path for path in get_all_files_extension(sparse_matrices_dir_hg37, ext)
             if os.path.basename(path) not in derived_matrices]
if not npz_files:
    raise FileNotFoundError("No per-file sparse matrices found in " + sparse_matrices_dir_hg37)

# Load all parts, then stack once rather than re-copying the growing matrix per file.
# A single part is kept as loaded so its sparse format is not altered.
parts = [load_npz(path) for path in tqdm(npz_files)]
all_npz = parts[0] if len(parts) == 1 else vstack(parts)
save_npz(output, all_npz)

In [6]:
# Verify that y is binary
# Reload the merged GRCh37 matrix from disk, so the check does not depend on the merge
# cell having been executed in the current kernel session.
all_npz = load_npz(os.path.join(sparse_matrices_dir_hg37, "all.npz"))

In [None]:
# Column 0 holds y. Show its distinct values: a binary target has at most two of them.
# The column is densified (rather than reading .data) so that implicit zeros are counted too.
np.unique(all_npz[:, 0].toarray())

### 3.b GRCh38

In [None]:
# Stack the per-file GRCh38 sparse matrices row-wise into a single matrix ("all.npz").
ext = "npz"
output = os.path.join(sparse_matrices_dir_hg38, "all.npz")

# Matrices in this folder that are products of the notebook, not per-file inputs. They are
# left out so that re-running the cell does not stack earlier results onto themselves.
# "all.npz" is this cell's own output; "train.npz", "valid.npz" and "sample_10k.npz" are
# assumed to appear here as they do for GRCh37 — TODO confirm.
derived_matrices = {"all.npz", "train.npz", "valid.npz", "sample_10k.npz"}
npz_files = [path for path in get_all_files_extension(sparse_matrices_dir_hg38, ext)
             if os.path.basename(path) not in derived_matrices]
if not npz_files:
    raise FileNotFoundError("No per-file sparse matrices found in " + sparse_matrices_dir_hg38)

# Load all parts, then stack once rather than re-copying the growing matrix per file.
# A single part is kept as loaded so its sparse format is not altered.
parts = [load_npz(path) for path in tqdm(npz_files)]
all_npz = parts[0] if len(parts) == 1 else vstack(parts)
save_npz(output, all_npz)

## Split into train and validation set
### 4.a GRCh37

In [3]:
# Inputs of the GRCh37 train/validation split: the merged variant ids and the merged
# sparse matrix. Both names are read by the cell that calls sparse_cadd_dataset.
variant_ids_file = os.path.join(variant_ids_dir_hg37, "all.pkl")
s = os.path.join(sparse_matrices_dir_hg37, "all.npz")

In [8]:
# Inspect the shape of a single per-file matrix.
# The path gets its own name so that `s` (the merged "all.npz" path used by the split
# cell) is not overwritten when the notebook is run top to bottom.
indels_npz_path = os.path.join(sparse_matrices_dir_hg37, "humanDerived_InDels.npz")
sn = load_npz(indels_npz_path)
sn.shape

(1837707, 905)

In [4]:
# Split the merged GRCh37 matrix and its ids into a training and a validation part.
# The output folders are handed over as well; "valid.npz" / "valid.pkl" are read back
# from them later in the notebook.
(train, train_ids), (valid, valid_ids) = sparse_cadd_dataset(
    s,
    variant_ids_file,
    output_npz=sparse_matrices_dir_hg37,
    output_ids=variant_ids_dir_hg37,
)

In [5]:
# Sanity check: each matrix should have as many rows as it has variant ids.
print(train.shape, len(train_ids), valid.shape, len(valid_ids))

(1286248, 905) 1286248 (551250, 905) 551250


In [14]:
# Number of stored entries in column 0 (y) of the training matrix.
# Presumably `train` is a scipy sparse matrix, where .data lists the stored values — TODO confirm.
len(train[:,0].data)

1286248

In [15]:
# Same count for column 1, for comparison with column 0.
len(train[:,1].data)

0

### 4.b GRCh38

In [None]:
# GRCh38: split the merged matrix and its ids into a training and a validation part.
s = os.path.join(sparse_matrices_dir_hg38, "all.npz")
variant_ids_file = os.path.join(variant_ids_dir_hg38, "all.pkl")
(train, train_ids), (valid, valid_ids) = sparse_cadd_dataset(
    s,
    variant_ids_file,
    output_npz=sparse_matrices_dir_hg38,
    output_ids=variant_ids_dir_hg38,
)

## Generate sample set

In [17]:
# Locate the stored GRCh37 validation split ...
valid_ids_file = os.path.join(variant_ids_dir_hg37, "valid.pkl")
valid_npz_file = os.path.join(sparse_matrices_dir_hg37, "valid.npz")

# ... and load both halves: the variant ids and the sparse matrix.
valid_ids = load_pickle(valid_ids_file)
valid_npz = load_npz(valid_npz_file)

In [22]:
# Cut a 10,000-row sample (rows 90000 up to, not including, 100000) out of the validation
# split. One slice object is shared so the matrix rows and the ids stay in step.
sample_rows = slice(90000, 100000)
sample_npz_path = os.path.join(sparse_matrices_dir_hg37, "sample_10k.npz")
sample_ids_path = os.path.join(variant_ids_dir_hg37, "sample_10k.pkl")
save_npz(sample_npz_path, valid_npz[sample_rows, :])
dump_to_pickle(sample_ids_path, valid_ids[sample_rows])

## Test as a sparse dataset

In [3]:
# Open the 10k sample through CaddSparseDataset and fetch one item as a smoke test.
sample_matrix_file = os.path.join(sparse_matrices_dir_hg37, "sample_10k.npz")
sample_ids_file = os.path.join(variant_ids_dir_hg37, "sample_10k.pkl")
ds = CaddSparseDataset(sample_matrix_file, sample_ids_file)
res = ds[1]

In [None]:
# Create a batch iterator over the sample dataset (batch_size=40, one worker).
# cycle=False presumably makes the iterator stop after one pass — TODO confirm.
it = ds.batch_train_iter(cycle=False,num_workers=1,batch_size=40)

In [None]:
# Pull the first batch to confirm the iterator yields data.
r = next(it)