## Saving each chromosome as a separate parquet file

In [2]:
import os

import dask.dataframe as dd
import pandas as pd
from fastparquet import write

In [3]:
# Location of the combined variants parquet dataset.
all_variants = '/s/project/kipoi-cadd/data/processed/kipoi/all_variants'

# Open the dataset with dask, using the "ID" column as the index,
# then preview the first rows.
parquet = dd.read_parquet(
    all_variants,
    index="ID",
)
parquet.head()

Unnamed: 0_level_0,y,Chrom,Pos,Ref,Alt
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0,1,379177,T,G
2,0,1,379274,C,G
3,0,1,379476,A,T
4,0,1,379631,C,G
5,0,1,379724,A,G


In [12]:
def save_chromosome_parquet(variants, chrom,
                            out_root='/s/project/kipoi-cadd/data/processed/kipoi'):
    """Write the variants of a single chromosome to their own parquet file.

    Parameters
    ----------
    variants : dask DataFrame
        Frame with a ``Chrom`` column; it is filtered and then materialised
        with ``.compute()``.
    chrom : int or str
        Chromosome to extract. It is compared against ``Chrom`` as
        ``str(chrom)``, so ``Chrom`` is presumably stored as strings.
    out_root : str
        Directory under which ``chr<chrom>/variant_labels.parq`` is written.

    Returns
    -------
    (pandas.DataFrame, str)
        The in-memory frame that was written and the path it was written to.

    Raises
    ------
    ValueError
        From ``astype('int64')`` if a ``Chrom`` value of the selected rows
        cannot be parsed as an integer.
    """
    chrom_variants = variants[variants.Chrom == str(chrom)].compute()
    # Store the chromosome as an integer column in the per-chromosome file.
    chrom_variants = chrom_variants.assign(
        Chrom=chrom_variants.Chrom.astype('int64'))

    out_path = out_root + '/chr' + str(chrom) + '/variant_labels.parq'
    # The write below fails if the chr<chrom>/ directory is missing,
    # so make sure it exists first.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    write(out_path, chrom_variants)
    return chrom_variants, out_path


# Chromosome exported by this cell; change the value (or loop over several
# values) to export other chromosomes.
chrom = 2
chrom_df, chrom_parquet = save_chromosome_parquet(parquet, chrom)

### Checking that it worked...

In [16]:
from fastparquet import ParquetFile

# Read the chromosome 1 file back from disk and preview it to confirm
# that the export produced a readable parquet file.
chr1_parquet = '/s/project/kipoi-cadd/data/processed/kipoi/chr1/variant_labels.parq'
pf = ParquetFile(chr1_parquet)
df = pf.to_pandas()
df.head()

Unnamed: 0_level_0,y,Chrom,Pos,Ref,Alt
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0,1,379177,T,G
2,0,1,379274,C,G
3,0,1,379476,A,T
4,0,1,379631,C,G
5,0,1,379724,A,G


## Saving each chromosome as VCF file
This section uses some of the tools provided by Kipoi to manage VCF files:
1. Pandas -> VCF file: [Kipoi code](https://github.com/kipoi/manuscript/blob/master/src/splicing/dbscSNV.smk#L44-L65)
2. Compress the resulting VCF file with bgzip and index it with tabix: [Kipoi code](https://github.com/kipoi/manuscript/blob/master/src/splicing/dbscSNV.smk#L75-L77)

In [None]:
import pandas as pd
import numpy as np
from m_kipoi.config import VCF_HEADER  # hg19 based
from collections import OrderedDict

def write_vcf(variants, vcf_path, vcf_header,
              chrom_col="Chr", pos_col="Position", ref_col="Ref", alt_col="Alt"):
    """Write a table of variants to ``vcf_path`` as a tab-separated VCF body.

    The file is created (or truncated) with ``vcf_header`` first; the column
    header row and the variant rows are then appended below it.

    Parameters
    ----------
    variants : pandas.DataFrame
        One row per variant, with chromosome, position, reference allele and
        alternate allele columns.
    vcf_path : str
        Destination file; any existing content is overwritten.
    vcf_header : str
        Text written verbatim at the top of the file.
    chrom_col, pos_col, ref_col, alt_col : str
        Names of the corresponding columns in ``variants``. The defaults
        match the original TSV layout; pass e.g. ``chrom_col="Chrom",
        pos_col="Pos"`` for frames that use those names instead.
    """
    # Write the header
    with open(vcf_path, "w") as f:
        f.write(vcf_header)

    chrom = variants[chrom_col].astype(str)
    # The ID column has the form  <chrom>:<pos>:<ref>:['<alt>']
    variant_id = (chrom + ":" + variants[pos_col].astype(str) + ":"
                  + variants[ref_col] + ":['" + variants[alt_col] + "']")

    # QUAL / FILTER / INFO are filled with "." on every row.
    records = pd.DataFrame(OrderedDict([("#CHROM", chrom),
                                        ("POS", variants[pos_col]),
                                        ("ID", variant_id),
                                        ("REF", variants[ref_col]),
                                        ("ALT", variants[alt_col]),
                                        ("QUAL", "."),
                                        ("FILTER", "."),
                                        ("INFO", "."),
                                        ]))
    # Append the variants (including the "#CHROM ..." column header row)
    records.to_csv(vcf_path, mode='a', header=True, index=False, sep="\t")


# NOTE(review): `input.in_tsv` and `output.vcf` look like objects injected by
# Snakemake (this cell appears to be adapted from the linked rule) — confirm.
# In a plain notebook they must be replaced by real paths before this cell
# can run, e.g.
#   write_vcf(chrom_df, "<path>.vcf", VCF_HEADER, chrom_col="Chrom", pos_col="Pos")
df = pd.read_csv(input.in_tsv, sep="\t")
write_vcf(df, output.vcf, VCF_HEADER)