In [1]:
from kipoi_enformer.dataloader import VCFTSSDataloader
from pathlib import Path
import polars as pl
from joblib import Parallel, delayed
import multiprocessing
import pyranges as pr

# Display the number of CPUs reported for this machine as the cell output.
multiprocessing.cpu_count()

128

In [2]:
# Input locations used by the rest of the notebook.
# Reference genome sequence (Gencode release 40, GRCh37 mapping, per the path).
FASTA_FILE = Path('/s/genomes/Gencode/Gencode_human/release_40/GRCh37_mapping/GRCh37.primary_assembly.genome.fa')
# Matching gene annotation (gzip-compressed GTF from the same Gencode release).
GTF_FILE = Path('/s/genomes/Gencode/Gencode_human/release_40/GRCh37_mapping/gencode.v40lift37.annotation.gtf.gz')
# Directory that is searched for the '*.vcf.gz' variant files.
VCF_DIR = Path('/s/project/rep/processed/training_results_v14/distinct_variants.valid_snp_indel.vcf.restored/distinct_variants.valid_snp_indel.vcf/')

In [3]:
# Collect every gzip-compressed VCF in VCF_DIR. The matches are sorted because
# glob yields entries in file-system order, which is not guaranteed to be
# stable; a fixed order keeps the row order of the combined variant table
# reproducible across runs and machines.
VCF_FILES = sorted(VCF_DIR.glob('*.vcf.gz'))

In [4]:
# Parse the annotation once up front so it can be reused for every VCF file.
# as_df=True asks pyranges for a plain DataFrame instead of a PyRanges object.
# NOTE(review): duplicate_attr=True presumably keeps every value of attributes
# that occur more than once per record (e.g. repeated tags) — confirm against
# the pyranges documentation.
GTF_DF = pr.read_gtf(GTF_FILE, as_df=True, duplicate_attr=True)

In [5]:
def vcf_to_polars(vcf_file, gtf, variant_upstream_tss=2000, variant_downstream_tss=2000,
                  canonical_only=False, protein_coding_only=True):
    """Return the dataloader metadata of one VCF file as a polars DataFrame.

    A ``VCFTSSDataloader`` is created for ``vcf_file`` with the module-level
    ``FASTA_FILE``, ``shifts=[0]`` and ``seq_length=10000``; ``load_all()`` is
    called on it and only the ``'metadata'`` entry of the result is kept.

    Args:
        vcf_file: VCF file passed to the dataloader. Its string form is also
            written into the ``vcf_file`` column of the returned frame.
        gtf: Annotation forwarded to the dataloader's ``gtf`` argument.
        variant_upstream_tss: Forwarded unchanged to the dataloader.
        variant_downstream_tss: Forwarded unchanged to the dataloader.
        canonical_only: Forwarded unchanged to the dataloader.
        protein_coding_only: Forwarded unchanged to the dataloader.

    Returns:
        A ``polars.DataFrame`` holding the metadata without the ``seq_start``
        and ``seq_end`` columns, plus a constant ``vcf_file`` column.
    """
    loader = VCFTSSDataloader(
        vcf_file=vcf_file,
        fasta_file=FASTA_FILE,
        gtf=gtf,
        variant_upstream_tss=variant_upstream_tss,
        variant_downstream_tss=variant_downstream_tss,
        canonical_only=canonical_only,
        protein_coding_only=protein_coding_only,
        shifts=[0],
        seq_length=10000,
    )
    metadata = loader.load_all()['metadata']

    # Drop the sequence-window columns, then tag each row with its source file.
    frame = pl.DataFrame(metadata)
    frame = frame.select(pl.exclude(['seq_start', 'seq_end']))
    return frame.with_columns(vcf_file=pl.lit(str(vcf_file)))

In [6]:
# Build one metadata table per VCF file (processed one after another) and
# stack them into a single frame. Every call receives its own copy of the
# annotation frame — presumably because the dataloader may modify the frame it
# is given; TODO confirm.
variant_df = pl.concat([vcf_to_polars(vcf_file, GTF_DF.copy()) for vcf_file in VCF_FILES])

163it [00:02, 56.71it/s] 
321it [00:04, 70.14it/s] 
446it [00:06, 74.10it/s] 
439it [00:05, 74.25it/s] 
311it [00:04, 68.44it/s]
1930it [00:21, 88.74it/s] 
381it [00:04, 77.05it/s] 
376it [00:04, 77.85it/s] 
380it [00:04, 77.05it/s] 
227it [00:03, 67.97it/s] 
345it [00:04, 71.74it/s] 
332it [00:04, 69.39it/s] 
159it [00:02, 54.49it/s]
225it [00:03, 60.13it/s] 
255it [00:04, 61.16it/s]
213it [00:03, 61.20it/s] 
361it [00:04, 74.01it/s] 
335it [00:04, 70.52it/s] 
236it [00:03, 67.84it/s] 
198it [00:03, 62.54it/s] 
158it [00:03, 52.30it/s]
767it [00:09, 78.24it/s] 
630it [00:08, 76.56it/s] 
835it [00:10, 79.02it/s] 
238it [00:03, 61.44it/s]
423it [00:05, 73.39it/s] 
610it [00:07, 78.15it/s] 
701it [00:08, 80.93it/s] 
262it [00:03, 68.68it/s] 
241it [00:03, 60.78it/s] 
507it [00:06, 73.57it/s] 
160it [00:02, 55.97it/s]
315it [00:04, 66.01it/s]
355it [00:05, 65.88it/s] 
878it [00:10, 81.07it/s] 
436it [00:06, 72.44it/s] 
234it [00:03, 60.35it/s]
176it [00:03, 56.81it/s]
1179it [00:13, 84.39

In [9]:
# Preview the first rows of the combined table as a quick sanity check.
variant_df.head()

tss,chrom,strand,gene_id,transcript_id,transcript_start,transcript_end,variant_start,variant_end,ref,alt,vcf_file
i64,str,str,str,str,i64,i64,i64,i64,str,str,str
70315700,"""chrX""","""+""","""ENSG00000184481.17_6""","""ENST00000374259.8_3""",70315700,70323382,70315840,70315841,"""C""","""G""","""/s/project/rep/processed/train…"
70316102,"""chrX""","""+""","""ENSG00000184481.17_6""","""ENST00000466874.1_1""",70316102,70320622,70315840,70315841,"""C""","""G""","""/s/project/rep/processed/train…"
70316378,"""chrX""","""+""","""ENSG00000184481.17_6""","""ENST00000341558.3_1""",70316378,70321934,70315840,70315841,"""C""","""G""","""/s/project/rep/processed/train…"
70315700,"""chrX""","""+""","""ENSG00000184481.17_6""","""ENST00000374259.8_3""",70315700,70323382,70315994,70315995,"""A""","""G""","""/s/project/rep/processed/train…"
70316102,"""chrX""","""+""","""ENSG00000184481.17_6""","""ENST00000466874.1_1""",70316102,70320622,70315994,70315995,"""A""","""G""","""/s/project/rep/processed/train…"


In [8]:
# Persist the combined table. The destination is named explicitly and its
# parent directory is created first, so a missing output directory cannot make
# the write fail after the long dataloader run above.
OUTPUT_PARQUET = Path('/data/nasif12/home_if12/tsi/kipoi_expression_prediction/variant_tables/tss_variants.parquet')
OUTPUT_PARQUET.parent.mkdir(parents=True, exist_ok=True)
variant_df.write_parquet(OUTPUT_PARQUET)