In [7]:
import cyvcf2

In [8]:
from pathlib import Path

In [9]:
ddir = Path("/s/project/kipoi-cadd/data/raw/v1.4/training_data/GRCh37/")

In [10]:
ls {ddir}

INFO                            sample_chr22.vcf
all.vcf                         [0m[01;31msample_chr22_GRCh37.csv.gz[0m
example_10.vcf                  [01;31msample_chr22_GRCh37.tsv.gz[0m
[01;31mexample_10.vcf.gz[0m               sample_chr22_GRCh37.vcf
example_10.vcf.gz.tbi           sample_chr22_prefixed.vcf
example_100.vcf                 [01;31msimulation_InDels.csv.gz[0m
[01;31mhumanDerived_InDels.csv.gz[0m      [01;31msimulation_InDels.tsv.gz[0m
[01;31mhumanDerived_InDels.tsv.gz[0m      [01;31msimulation_InDels.vcf.gz[0m
[01;31mhumanDerived_InDels.vcf.gz[0m      simulation_InDels.vcf.gz.tbi
humanDerived_InDels.vcf.gz.tbi  [01;31msimulation_SNVs.csv.gz[0m
[01;31mhumanDerived_SNVs.csv.gz[0m        [01;31msimulation_SNVs.tsv.gz[0m
[01;31mhumanDerived_SNVs.tsv.gz[0m        simulation_SNVs.vcf
humanDerived_SNVs.vcf           [01;31msimulation_SNVs.vcf.gz[0m
[01;31mhumanDerived_SNVs.vcf.gz[0m        simulation_SNVs.vcf.gz.tbi
humanDerived_SNVs.vcf.gz.tb

In [11]:
vcf = cyvcf2.VCF(str(ddir / "example_10.vcf"))

In [14]:
variant = next(vcf)

In [15]:
def variant_to_dict(variant):
    """Convert one VCF record into an ordered dict of its core fields.

    Args:
      variant: a record exposing `ID`, `CHROM`, `POS`, `REF` and `ALT`
        attributes (e.g. what iterating a `cyvcf2.VCF` yields), where
        `ALT` is a sequence of alternative alleles

    Returns:
      OrderedDict with the keys `variant_id`, `variant_chr`, `variant_pos`,
      `variant_ref` and `variant_alt`, in that order. A missing ID and a
      missing ALT allele are both reported as ".".
    """
    # Imported here so the function works even when this cell is executed
    # before the notebook cell that imports OrderedDict.
    from collections import OrderedDict

    alts = variant.ALT
    return OrderedDict([
        ('variant_id', "." if variant.ID is None else str(variant.ID)),
        ('variant_chr', variant.CHROM),
        ('variant_pos', variant.POS),
        ('variant_ref', variant.REF),
        # WARNING - assuming a single alternative: only the first ALT allele
        # is kept. An empty ALT list would otherwise raise IndexError, so it
        # is mapped to the same "." placeholder used for a missing ID.
        ('variant_alt', alts[0] if alts else "."),
    ])

In [17]:
from collections import OrderedDict

In [18]:
variant_to_dict(variant)

OrderedDict([('variant_id', '.'),
             ('variant_chr', 'chr1'),
             ('variant_pos', 381581),
             ('variant_ref', 'AA'),
             ('variant_alt', 'A')])

In [12]:
v = next(vcf)

In [None]:
!head 

In [13]:
v.ID

In [20]:
import numpy as np

In [21]:
np.arange(10).ndim

1

## Notes

Write a function:

```python
def reorder_vcf(input_vcf, row_ids, output_vcf, discard_metadata=False):
    """Re-order the vcf file. Note: the output vcf 
    
    Args:
      input_vcf: path to a vcf file
      row_ids List[int]: a list of integer numbers or a path 
        to a .txt file containing the shuffled rows
      output_vcf: output vcf file path
      discard_metadata: if True, the INFO field of the vcf is ignored
    
    1. Load the vcf into pandas
    2. df_vcf.iloc[row_ids]
    3. Store the data-frame into a vcf file
    """
    pass 


def reorder_sparse_matrix(input_npz, row_ids, output_npz):
    """Re-order the vcf file. Note: the output vcf 
    
    Args:
      input_npz: path to a .npz file
      row_ids List[int]: a list of integer numbers or a path 
        to a .txt file containing the shuffled rows
      output_npz: output .npz file path
    """
    pass 
```

In [26]:
import pandas as pd

# Build a random permutation of the record positions of a small example VCF.
# NOTE(review): `vcf` was bound to a cyvcf2.VCF reader in an earlier cell; here
# it is rebound to a plain path string.
vcf = "/s/project/kipoi-cadd/data/raw/v1.4/training_data/GRCh37/example_100.vcf"
# Only the first column is read: the frame is used solely to count the records.
vcf_ds = pd.read_csv(vcf, sep="\t", header=None, comment="#", usecols=[0])
row_ids = np.arange(len(vcf_ds))
# Shuffles in place. No seed is set in this cell, so the order shown below is
# not reproducible across runs.
np.random.shuffle(row_ids)
# Preview the first ten shuffled positions.
row_ids[:10]

array([93, 42, 76, 29, 45, 17, 33,  7, 38, 49])

In [28]:
! head {vcf}

##fileformat=VCFv4.0
#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO
chr1	380577	.	T	TT	.	.	.
chr1	381581	.	AA	A	.	.	.
chr1	382073	.	T	TT	.	.	.
chr1	383722	.	CCTC	C	.	.	.
chr1	384279	.	T	TT	.	.	.
chr1	384386	.	CC	C	.	.	.
chr1	386695	.	C	CC	.	.	.
chr1	387054	.	A	AT	.	.	.


In [1]:
from kipoi_cadd.data_utils import load_pickle, dump_to_pickle
import pandas as pd
import numpy as np

def reorder_vcf(input_vcf, row_ids, output_vcf, discard_metadata=False):
    """Write a copy of a vcf file with its records in a new order.

    Every field is read and written as plain text (no dtype or NA inference,
    no quoting), so records are re-ordered without their content being
    altered.

    Args:
      input_vcf: path to an uncompressed, tab-separated vcf file
      row_ids List[int]: zero-based record positions in the order they
        should appear in the output, or a path to a pickle file (read with
        `load_pickle`) holding such a list
      output_vcf: output vcf file path; overwritten if it already exists
      discard_metadata: if False, all lines starting with "#" are copied
        to the output unchanged. If True, the ID, QUAL, FILTER and INFO
        columns are dropped (when present) and the header is replaced by a
        single "##fileformat=VCFv4.0" line followed by the names of the
        remaining columns.

    Raises:
      ValueError: if `discard_metadata` is True but the input has no
        "#CHROM" column-name line, so the columns cannot be identified.
    """
    import csv
    import os

    if isinstance(row_ids, (str, os.PathLike)):
        # NOTE(review): load_pickle presumably unpickles the file, which can
        # execute arbitrary code - only pass paths to trusted files.
        row_ids = load_pickle(os.fspath(row_ids))

    # Single pass over the input: remember every "#" line (to copy it through
    # and to tell pandas which lines to skip) and the column names.
    header_lines = []
    header_rows = []
    colnames = None
    with open(input_vcf, 'r') as f:
        for line_no, line in enumerate(f):
            if not line.startswith("#"):
                continue
            header_lines.append(line)
            header_rows.append(line_no)
            if colnames is None and line.startswith("#CHROM"):
                colnames = line.rstrip("\r\n").split("\t")

    if discard_metadata and colnames is None:
        raise ValueError(
            "no '#CHROM' header line found in {}; cannot tell which columns "
            "to discard".format(input_vcf))

    # Header lines are skipped by line number rather than with comment="#",
    # because `comment` also truncates data lines that merely contain a "#".
    # dtype=str / na_filter=False keep values such as "NA" or "29.10" as
    # written, and QUOTE_NONE keeps double quotes as ordinary characters.
    vcf_df = pd.read_csv(
        input_vcf,
        sep="\t",
        header=None,
        names=colnames,
        skiprows=header_rows or None,
        dtype=str,
        na_filter=False,
        quoting=csv.QUOTE_NONE,
    )
    # row_ids are positions, not index labels.
    vcf_df = vcf_df.iloc[row_ids]

    if discard_metadata:
        # errors="ignore" lets reduced inputs (e.g. only #CHROM/POS/REF/ALT)
        # pass through instead of raising KeyError.
        vcf_df = vcf_df.drop(columns=['ID', 'QUAL', 'FILTER', 'INFO'],
                             errors="ignore")
        header = "##fileformat=VCFv4.0\n"
        write_colnames = True
    else:
        header = "".join(header_lines)
        write_colnames = False

    with open(output_vcf, 'w') as f:
        f.write(header)
    vcf_df.to_csv(output_vcf, sep="\t", mode='a', header=write_colnames,
                  index=False, quoting=csv.QUOTE_NONE)


# Shuffle the records of the chr22 sample VCF and store both the shuffled file
# and the permutation that produced it.
# NOTE(review): the output paths point into a "shuffled/" sub-directory; nothing
# in this cell creates it, so it presumably has to exist already - confirm.
tmp_vcf = "/s/project/kipoi-cadd/data/raw/v1.4/training_data/GRCh37/sample_chr22_GRCh37.vcf"
out_vcf = "/s/project/kipoi-cadd/data/raw/v1.4/training_data/GRCh37/shuffled/sample_chr22_GRCh37.vcf"
out_ids = "/s/project/kipoi-cadd/data/raw/v1.4/training_data/GRCh37/shuffled/sample_chr22_GRCh37.pkl"
# Alternative inputs tried earlier:
# tmp_vcf = "/s/project/kipoi-cadd/data/raw/v1.4/validation/clinVar-ExAC/clinvar_pathogenic_splice_site_GRCh37_original.vcf"
# tmp_vcf = "/data/ouga/home/ag_gagneur/simancas/Downloads/UAVnhAumGekZZAJa.vcf"
# Only the first column is read: the frame is used solely to count the records.
vcf_ds = pd.read_csv(tmp_vcf, sep="\t", header=None, comment="#", usecols=[0])
row_ids = np.arange(len(vcf_ds))
# Shuffles in place. No seed is set in this cell, so each run yields a
# different permutation.
np.random.shuffle(row_ids)
# Persist the permutation before using it so the same order can be re-applied
# later from the pickle file.
dump_to_pickle(out_ids, row_ids)
reorder_vcf(tmp_vcf, row_ids, out_vcf, False)
! head {out_vcf}

Using TensorFlow backend.


##fileformat=VCFv4.0
#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO
22	30527907	.	T	C	.	.	.
22	36171531	.	A	C	.	.	.
22	16361788	.	G	A	.	.	.
22	28748698	.	T	C	.	.	.
22	41396825	.	A	G	.	.	.
22	19732135	.	T	A	.	.	.
22	43975818	.	G	C	.	.	.
22	38038057	.	C	T	.	.	.


In [2]:
! head {tmp_vcf}

##fileformat=VCFv4.0
#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO
22	16139326	.	A	G	.	.	.
22	16139870	.	T	C	.	.	.
22	16139966	.	A	G	.	.	.
22	16140322	.	C	T	.	.	.
22	16140325	.	A	G	.	.	.
22	16141058	.	T	C	.	.	.
22	16141162	.	A	G	.	.	.
22	16141372	.	T	C	.	.	.


In [33]:
"#CHROM\tPOS\tREF\tALT\n".replace("\n", "").split("\t")

['#CHROM', 'POS', 'REF', 'ALT']

In [44]:
! ls /s/project/kipoi-cadd/data/raw/v1.4/validation/clinVar-ExAC/

ExAC_MAF5p_all_GRCh37.vcf
ExAC_MAF5p_missense_GRCh37.vcf
INFO
clinvar_20180729_pathogenic_all_GRCh37.csv.gz
clinvar_20180729_pathogenic_all_GRCh37.tsv.gz
clinvar_20180729_pathogenic_all_GRCh37.vcf
clinvar_20180729_pathogenic_all_GRCh37_vep_annotated.tsv
clinvar_20180729_pathogenic_all_GRCh37_vep_annotated.vcf
clinvar_20180729_pathogenic_missense_GRCh37.vcf
clinvar_pathogenic_splice_site_GRCh37.csv.gz
clinvar_pathogenic_splice_site_GRCh37.tsv.gz
clinvar_pathogenic_splice_site_GRCh37.vcf
clinvar_pathogenic_splice_site_GRCh37.vcf.gz
clinvar_pathogenic_splice_site_GRCh37.vcf.gz.tbi
clinvar_pathogenic_splice_site_GRCh37_original.vcf
sample_15_GRCh37.vcf
sparse_matrices
variant_ids


In [None]:
def reorder_sparse_matrix(input_npz, row_ids, output_npz):
    """Write a copy of a sparse matrix with its rows in a new order.

    Args:
      input_npz: path to a .npz file holding a scipy sparse matrix
        (as written by `scipy.sparse.save_npz`)
      row_ids List[int]: zero-based row positions in the order they should
        appear in the output, or a path to a pickle file (read with
        `load_pickle`) holding such a list
      output_npz: output .npz file path

    The output matrix is stored in the same sparse format as the input.
    """
    import os

    # Imported locally because nothing in the visible notebook brings these
    # names into scope.
    # NOTE(review): assumes the scipy.sparse functions were intended - confirm.
    from scipy.sparse import load_npz, save_npz

    if isinstance(row_ids, (str, os.PathLike)):
        # NOTE(review): load_pickle presumably unpickles the file, which can
        # execute arbitrary code - only pass paths to trusted files.
        row_ids = load_pickle(os.fspath(row_ids))

    matrix = load_npz(input_npz)
    # Not every sparse format supports row indexing (COO, for instance, does
    # not), so select the rows in CSR form and convert back afterwards.
    reordered = matrix.tocsr()[row_ids].asformat(matrix.format)
    save_npz(output_npz, reordered)
    