In [1]:
from kipoi_enformer.dataloader import VCFTSSDataloader
from pathlib import Path
import polars as pl
from joblib import Parallel, delayed
import multiprocessing
import pyranges as pr

# Report how many CPUs this machine exposes (shown as the cell output).
multiprocessing.cpu_count()

128

In [2]:
# Genome FASTA handed to the dataloader; the path points at GENCODE release 40, GRCh37 mapping.
FASTA_FILE = Path('/s/genomes/Gencode/Gencode_human/release_40/GRCh37_mapping/GRCh37.primary_assembly.genome.fa')
# Gzipped GTF annotation located in the same GENCODE directory as the FASTA.
GTF_FILE = Path('/s/genomes/Gencode/Gencode_human/release_40/GRCh37_mapping/gencode.v40lift37.annotation.gtf.gz')
# Directory that is searched for '*.vcf.gz' variant files.
VCF_DIR = Path('/s/project/rep/processed/training_results_v14/distinct_variants.valid_snp_indel.vcf.restored/distinct_variants.valid_snp_indel.vcf/')

In [3]:
# Collect every gzipped VCF in VCF_DIR. Path.glob yields entries in an
# arbitrary, filesystem-dependent order, so sort them to keep the processing
# order (and the row order of the concatenated table) reproducible across runs.
VCF_FILES = sorted(VCF_DIR.glob('*.vcf.gz'))

In [4]:
# Parse the annotation once up front; the per-VCF calls further down each
# receive their own copy of this frame.
GTF_DF = pr.read_gtf(
    GTF_FILE,
    as_df=True,
    duplicate_attr=True,
)

In [5]:
def vcf_to_polars(vcf_file, gtf, variant_upstream_tss=50, variant_downstream_tss=500,
                  canonical_only=False, protein_coding_only=True,
                  fasta_file=None, seq_length=10000):
    """Load the dataloader metadata for one VCF file into a polars DataFrame.

    A ``VCFTSSDataloader`` is built for ``vcf_file`` (always with
    ``shifts=[0]``), everything it yields is loaded at once, and only the
    ``'metadata'`` entry of the result is kept.

    Args:
        vcf_file: VCF to read; also recorded (as a string) in the output.
        gtf: Annotation forwarded unchanged as the dataloader's ``gtf`` argument.
        variant_upstream_tss: Forwarded to the dataloader (presumably the
            window, in bp, upstream of a TSS in which variants are kept).
        variant_downstream_tss: Forwarded to the dataloader (presumably the
            matching downstream window).
        canonical_only: Forwarded to the dataloader.
        protein_coding_only: Forwarded to the dataloader.
        fasta_file: Reference FASTA for the dataloader. ``None`` (the default)
            falls back to the notebook-level ``FASTA_FILE``.
        seq_length: Forwarded to the dataloader; defaults to the previously
            hard-coded 10000.

    Returns:
        polars.DataFrame holding the metadata columns except ``seq_start`` and
        ``seq_end``, plus a constant ``vcf_file`` column with ``str(vcf_file)``.
    """
    dataloader = VCFTSSDataloader(
        vcf_file=vcf_file,
        variant_upstream_tss=variant_upstream_tss,
        variant_downstream_tss=variant_downstream_tss,
        shifts=[0],
        # Resolve the global lazily so the default still follows FASTA_FILE.
        fasta_file=FASTA_FILE if fasta_file is None else fasta_file,
        gtf=gtf,
        canonical_only=canonical_only,
        protein_coding_only=protein_coding_only,
        seq_length=seq_length,
    )
    metadata = dataloader.load_all()['metadata']
    return (
        pl.DataFrame(metadata)
        # The sequence window columns are not kept in the variant table.
        .select(pl.exclude(['seq_start', 'seq_end']))
        # Tag every row with the file it came from.
        .with_columns(vcf_file=pl.lit(str(vcf_file)))
    )

In [6]:
# Build one table per VCF (each call gets a fresh copy of the annotation
# frame) and stack them into a single table.
variant_df = pl.concat(
    [
        vcf_to_polars(vcf_path, GTF_DF.copy())
        for vcf_path in VCF_FILES
    ]
)

22it [00:01, 13.10it/s]
49it [00:01, 27.28it/s]
66it [00:01, 33.87it/s]
70it [00:01, 36.70it/s]
44it [00:01, 23.00it/s]
271it [00:04, 63.70it/s] 
61it [00:01, 31.57it/s]
51it [00:01, 27.20it/s]
65it [00:01, 33.83it/s]
34it [00:01, 20.70it/s]
57it [00:01, 31.60it/s]
45it [00:01, 24.49it/s]
24it [00:01, 16.99it/s]
34it [00:01, 18.72it/s]
38it [00:01, 22.39it/s]
40it [00:01, 21.40it/s]
55it [00:01, 30.66it/s]
50it [00:01, 26.59it/s]
36it [00:01, 24.56it/s]
30it [00:01, 17.70it/s]
25it [00:01, 16.91it/s]
120it [00:02, 47.45it/s]
97it [00:02, 39.86it/s]
136it [00:02, 50.62it/s]
36it [00:01, 23.23it/s]
69it [00:01, 36.87it/s]
98it [00:02, 43.40it/s]
104it [00:02, 44.01it/s]
43it [00:01, 25.18it/s]
33it [00:01, 16.86it/s]
71it [00:02, 32.08it/s]
26it [00:01, 19.14it/s]
49it [00:01, 28.03it/s]
50it [00:01, 25.79it/s]
141it [00:02, 48.68it/s]
66it [00:01, 34.21it/s]
33it [00:01, 18.86it/s]
32it [00:01, 17.01it/s]
167it [00:02, 57.54it/s]
57it [00:02, 24.89it/s]
33it [00:01, 20.49it/s]
52it [00:

In [7]:
# Preview the first rows of the combined table.
variant_df.head()

tss,chrom,strand,gene_id,transcript_id,transcript_start,transcript_end,variant_start,variant_end,ref,alt,vcf_file
i64,str,str,str,str,i64,i64,i64,i64,str,str,str
70315700,"""chrX""","""+""","""ENSG00000184481.17_6""","""ENST00000374259.8_3""",70315700,70323382,70315840,70315841,"""C""","""G""","""/s/project/rep/processed/train…"
70315700,"""chrX""","""+""","""ENSG00000184481.17_6""","""ENST00000374259.8_3""",70315700,70323382,70315994,70315995,"""A""","""G""","""/s/project/rep/processed/train…"
70315700,"""chrX""","""+""","""ENSG00000184481.17_6""","""ENST00000374259.8_3""",70315700,70323382,70316097,70316098,"""T""","""C""","""/s/project/rep/processed/train…"
70316102,"""chrX""","""+""","""ENSG00000184481.17_6""","""ENST00000466874.1_1""",70316102,70320622,70316097,70316098,"""T""","""C""","""/s/project/rep/processed/train…"
70319157,"""chrX""","""+""","""ENSG00000184481.17_6""","""ENST00000464598.1_1""",70319157,70320693,70319566,70319567,"""A""","""T""","""/s/project/rep/processed/train…"


In [8]:
# (rows, columns) of the combined table.
variant_df.shape

(620877, 12)

In [9]:
# Persist the combined table as parquet. The "50_500" in the file name
# presumably mirrors the default upstream/downstream window of vcf_to_polars.
variant_df.write_parquet('/data/nasif12/home_if12/tsi/kipoi_expression_prediction/variant_tables/tss_variants_50_500.parquet')