In [1]:
from kipoi_aparent2.dataloader import VCFApaDataloader
from kipoi_aparent2.dataloader.apa_annotation import EnsemblAPAAnnotation
from pathlib import Path
import polars as pl

In [2]:
# Input locations used throughout the notebook.
# Reference genome sequence (FASTA) handed to the dataloader.
FASTA_FILE = Path(
    '/s/genomes/Gencode/Gencode_human/release_40/GRCh37_mapping/GRCh37.primary_assembly.genome.fa'
)
# Gene annotation (GTF) from which the APA annotation is built.
GTF_FILE = Path(
    '/s/genomes/Gencode/Gencode_human/release_40/GRCh37_mapping/gencode.v40lift37.annotation.gtf.gz'
)
# Directory that is searched for the '*.vcf.gz' variant files.
VCF_DIR = Path(
    '/s/project/rep/processed/training_results_v14/distinct_variants.valid_snp_indel.vcf.restored/distinct_variants.valid_snp_indel.vcf/'
)

In [3]:
# Collect every '*.vcf.gz' file directly inside VCF_DIR.
# Path.glob yields entries in arbitrary, filesystem-dependent order, so the
# paths are sorted to keep the order of the per-file frames (and therefore the
# row order of the concatenated table) reproducible between runs.
VCF_FILES = sorted(VCF_DIR.glob('*.vcf.gz'))

In [4]:
# APA annotation built from GTF_FILE. It is created once at module level and
# passed to the dataloader inside vcf_to_polars for every VCF file.
APA_ANNOTATION = EnsemblAPAAnnotation(
    GTF_FILE,
    canonical_only=False,
    protein_coding_only=True,
)

In [5]:
def vcf_to_polars(vcf_file, variant_upstream_cse=70, variant_downstream_cse=130,
                  canonical_only=False, protein_coding_only=True):
    """Load the dataloader metadata for one VCF file into a polars DataFrame.

    Args:
        vcf_file: Path of the VCF file handed to ``VCFApaDataloader``.
        variant_upstream_cse: Forwarded unchanged to ``VCFApaDataloader``
            (presumably the upstream window around the CSE in bp - confirm
            against the dataloader documentation).
        variant_downstream_cse: Forwarded unchanged to ``VCFApaDataloader``
            (presumably the downstream window around the CSE in bp - confirm).
        canonical_only: Passed to ``EnsemblAPAAnnotation`` when a dedicated
            annotation has to be built.
        protein_coding_only: Passed to ``EnsemblAPAAnnotation`` when a
            dedicated annotation has to be built.

    Returns:
        A ``pl.DataFrame`` built from the ``'metadata'`` entry of
        ``dataloader.load_all()``, without the ``seq_start`` / ``seq_end``
        columns and with an extra ``vcf_file`` column holding ``str(vcf_file)``.
    """
    # The module-level APA_ANNOTATION is constructed with canonical_only=False
    # and protein_coding_only=True. Reuse it for exactly those settings (the
    # defaults); for any other combination build a matching annotation so the
    # two filter arguments are honoured instead of being silently ignored.
    if not canonical_only and protein_coding_only:
        apa_annotation = APA_ANNOTATION
    else:
        apa_annotation = EnsemblAPAAnnotation(GTF_FILE, canonical_only=canonical_only,
                                              protein_coding_only=protein_coding_only)

    dataloader = VCFApaDataloader(vcf_file=vcf_file, variant_upstream_cse=variant_upstream_cse,
                                  variant_downstream_cse=variant_downstream_cse,
                                  fasta_file=FASTA_FILE, apa_annotation=apa_annotation,
                                  seq_length=10000, cse_pos_index=3000)
    metadata = dataloader.load_all()['metadata']
    return (
        pl.DataFrame(metadata)
        .select(pl.exclude(['seq_start', 'seq_end']))
        # Record the source file so rows stay traceable after concatenation.
        .with_columns(vcf_file=pl.lit(str(vcf_file)))
    )

In [6]:
# Run the dataloader once per VCF file (default window arguments) and stack
# the resulting per-file frames into a single table.
variant_df = pl.concat(list(map(vcf_to_polars, VCF_FILES)))

6it [00:01,  4.98it/s]
14it [00:01, 11.18it/s]
19it [00:01, 14.45it/s]
18it [00:01, 13.76it/s]
12it [00:01,  9.32it/s]
93it [00:02, 40.92it/s]
18it [00:01, 13.40it/s]
17it [00:01, 11.62it/s]
13it [00:01,  9.94it/s]
11it [00:01,  9.09it/s]
11it [00:01,  8.87it/s]
12it [00:01, 10.24it/s]
7it [00:01,  6.16it/s]
8it [00:01,  7.58it/s]
9it [00:01,  6.52it/s]
7it [00:01,  5.69it/s]
15it [00:01, 12.49it/s]
17it [00:01, 12.65it/s]
9it [00:01,  6.71it/s]
8it [00:01,  7.83it/s]
7it [00:01,  6.12it/s]
32it [00:01, 19.87it/s]
25it [00:01, 17.27it/s]
39it [00:01, 24.29it/s]
9it [00:01,  7.49it/s]
19it [00:01, 13.75it/s]
25it [00:01, 16.44it/s]
29it [00:01, 19.52it/s]
10it [00:01,  8.10it/s]
9it [00:01,  7.16it/s]
20it [00:01, 13.39it/s]
6it [00:01,  5.82it/s]
12it [00:01,  8.64it/s]
15it [00:01, 10.26it/s]
33it [00:01, 21.74it/s]
15it [00:01, 11.63it/s]
10it [00:01,  8.08it/s]
6it [00:01,  4.79it/s]
58it [00:01, 30.78it/s]
15it [00:01, 10.35it/s]
8it [00:01,  6.58it/s]
14it [00:01, 11.43it/s]
34it 

In [7]:
# Preview the first five rows of the combined table.
variant_df.head(n=5)

pas_pos,cse_pos,chrom,strand,pas_id,gene_id,transcript_id,variant_start,variant_end,ref,alt,vcf_file
i64,i64,str,str,str,str,str,i64,i64,str,str,str
70320621,70320591,"""chrX""","""+""","""chrX:70320621:...","""ENSG0000018448...","""ENST0000046687...",70320652,70320653,"""C""","""T""","""/s/project/rep..."
70320692,70320662,"""chrX""","""+""","""chrX:70320692:...","""ENSG0000018448...","""ENST0000046459...",70320652,70320653,"""C""","""T""","""/s/project/rep..."
70344909,70344879,"""chrX""","""+""","""chrX:70344909:...","""ENSG0000018463...","""ENST0000069286...",70344989,70344990,"""C""","""T""","""/s/project/rep..."
70346805,70346775,"""chrX""","""+""","""chrX:70346805:...","""ENSG0000018463...","""ENST0000069138...",70346868,70346869,"""G""","""A""","""/s/project/rep..."
70346814,70346784,"""chrX""","""+""","""chrX:70346814:...","""ENSG0000018463...","""ENST0000068900...",70346868,70346869,"""G""","""A""","""/s/project/rep..."


In [8]:
# (number of rows, number of columns) of the combined table.
(variant_df.height, variant_df.width)

(163448, 12)

In [9]:
# Persist the combined variant table as parquet.
# The parent directory is created first so that the long dataloader run is not
# lost to a missing output folder at the very last step.
OUTPUT_FILE = Path('/data/nasif12/home_if12/tsi/kipoi_expression_prediction/variant_tables/apa_variants_70_130.parquet')
OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
variant_df.write_parquet(OUTPUT_FILE)