In [3]:
from kipoi_aparent2.dataloader import VCFApaDataloader
from kipoi_aparent2.dataloader.apa_annotation import EnsemblAPAAnnotation
from pathlib import Path
import polars as pl

In [4]:
# Input locations. The reference genome and gene annotation both come from the
# same Gencode release 40 "GRCh37_mapping" directory.
FASTA_FILE = Path(
    '/s/genomes/Gencode/Gencode_human/release_40/GRCh37_mapping/GRCh37.primary_assembly.genome.fa'
)
GTF_FILE = Path(
    '/s/genomes/Gencode/Gencode_human/release_40/GRCh37_mapping/gencode.v40lift37.annotation.gtf.gz'
)
# Directory that is searched for the per-file `*.vcf.gz` inputs.
VCF_DIR = Path(
    '/s/project/rep/processed/training_results_v14/distinct_variants.valid_snp_indel.vcf.restored/distinct_variants.valid_snp_indel.vcf/'
)

In [5]:
# Collect every bgzipped VCF directly inside VCF_DIR. `Path.glob` yields entries
# in arbitrary (filesystem-dependent) order, so sort them to make the processing
# order -- and therefore the row order of the resulting table -- reproducible.
VCF_FILES = sorted(VCF_DIR.glob('*.vcf.gz'))

In [6]:
# Build the annotation once at module level so every per-VCF dataloader can
# share the same object instead of re-reading the GTF.
APA_ANNOTATION = EnsemblAPAAnnotation(
    GTF_FILE,
    canonical_only=False,
    protein_coding_only=True,
)

In [7]:
def vcf_to_polars(vcf_file, variant_upstream_cse=2000, variant_downstream_cse=2000,
                  canonical_only=False, protein_coding_only=True,
                  seq_length=10000, cse_pos_index=3000):
    """Load the APA dataloader metadata for one VCF file into a polars DataFrame.

    A ``VCFApaDataloader`` is created for ``vcf_file``, all of its data is
    loaded via ``load_all()``, and the ``'metadata'`` entry is converted to a
    DataFrame. The ``seq_start`` and ``seq_end`` columns are dropped and a
    ``vcf_file`` column holding ``str(vcf_file)`` is added to every row.

    Args:
        vcf_file: Path of the VCF file to process.
        variant_upstream_cse: Forwarded unchanged to ``VCFApaDataloader``.
        variant_downstream_cse: Forwarded unchanged to ``VCFApaDataloader``.
        canonical_only: Annotation filter flag passed to
            ``EnsemblAPAAnnotation``.
        protein_coding_only: Annotation filter flag passed to
            ``EnsemblAPAAnnotation``.
        seq_length: Forwarded unchanged to ``VCFApaDataloader``.
        cse_pos_index: Forwarded unchanged to ``VCFApaDataloader``.

    Returns:
        pl.DataFrame: the metadata table for ``vcf_file``.
    """
    # The shared module-level APA_ANNOTATION is built with canonical_only=False
    # and protein_coding_only=True. Reuse it for that combination (the default)
    # and only pay for a new annotation when the caller asks for other flags;
    # previously these two arguments were accepted but silently ignored.
    if not canonical_only and protein_coding_only:
        apa_annotation = APA_ANNOTATION
    else:
        apa_annotation = EnsemblAPAAnnotation(GTF_FILE, canonical_only=canonical_only,
                                              protein_coding_only=protein_coding_only)

    dataloader = VCFApaDataloader(vcf_file=vcf_file, variant_upstream_cse=variant_upstream_cse,
                                  variant_downstream_cse=variant_downstream_cse,
                                  fasta_file=FASTA_FILE, apa_annotation=apa_annotation,
                                  seq_length=seq_length, cse_pos_index=cse_pos_index)
    metadata = pl.DataFrame(dataloader.load_all()['metadata'])
    return (
        metadata
        .select(pl.exclude(['seq_start', 'seq_end']))
        # Record the source file so rows stay traceable after concatenation.
        .with_columns(vcf_file=pl.lit(str(vcf_file)))
    )

In [8]:
# Run the dataloader once per VCF file (with default settings) and stack the
# resulting per-file tables into a single DataFrame.
variant_df = pl.concat(list(map(vcf_to_polars, VCF_FILES)))

120it [00:02, 57.35it/s]
281it [00:03, 71.57it/s]
376it [00:05, 74.73it/s] 
348it [00:04, 75.05it/s]
256it [00:03, 70.35it/s]
1746it [00:20, 83.71it/s] 
333it [00:04, 74.79it/s]
327it [00:04, 73.60it/s] 
273it [00:03, 71.06it/s]
219it [00:03, 68.93it/s]
226it [00:03, 67.83it/s] 
239it [00:03, 69.54it/s]
157it [00:02, 64.80it/s]
173it [00:02, 63.19it/s] 
186it [00:02, 65.77it/s]
140it [00:02, 59.67it/s]
290it [00:04, 70.60it/s] 
315it [00:04, 71.65it/s]
161it [00:02, 63.40it/s]
159it [00:02, 65.38it/s]
134it [00:02, 57.14it/s]
680it [00:08, 78.08it/s] 
517it [00:06, 76.84it/s] 
763it [00:09, 79.96it/s] 
197it [00:03, 64.43it/s]
385it [00:05, 73.55it/s] 
517it [00:06, 77.99it/s] 
568it [00:07, 77.14it/s] 
223it [00:03, 68.20it/s]
177it [00:02, 65.05it/s] 
402it [00:05, 75.55it/s]
140it [00:02, 60.85it/s]
235it [00:03, 66.08it/s]
310it [00:04, 73.57it/s] 
683it [00:08, 78.71it/s] 
308it [00:04, 73.42it/s]
201it [00:03, 64.67it/s]
142it [00:02, 59.21it/s]
1122it [00:13, 81.97it/s] 
335it [

In [16]:
# Preview the first rows of the combined variant table as a sanity check.
variant_df.head()

pas_pos,cse_pos,chrom,strand,pas_id,gene_id,transcript_id,variant_start,variant_end,ref,alt,vcf_file
i64,i64,str,str,str,str,str,i64,i64,str,str,str
70320621,70320591,"""chrX""","""+""","""chrX:70320621:...","""ENSG0000018448...","""ENST0000046687...",70318603,70318604,"""T""","""TA""","""/s/project/rep..."
70320621,70320591,"""chrX""","""+""","""chrX:70320621:...","""ENSG0000018448...","""ENST0000046687...",70318678,70318679,"""G""","""A""","""/s/project/rep..."
70320692,70320662,"""chrX""","""+""","""chrX:70320692:...","""ENSG0000018448...","""ENST0000046459...",70318678,70318679,"""G""","""A""","""/s/project/rep..."
70320621,70320591,"""chrX""","""+""","""chrX:70320621:...","""ENSG0000018448...","""ENST0000046687...",70318905,70318906,"""T""","""C""","""/s/project/rep..."
70320692,70320662,"""chrX""","""+""","""chrX:70320692:...","""ENSG0000018448...","""ENST0000046459...",70318905,70318906,"""T""","""C""","""/s/project/rep..."


In [14]:
# Persist the combined variant table as a parquet file for downstream use.
variant_df.write_parquet(
    '/data/nasif12/home_if12/tsi/kipoi_expression_prediction/variant_tables/apa_variants.parquet'
)