In [1]:
import polars as pl
from pathlib import Path

In [2]:
tissue_list = [
    "Adipose - Subcutaneous",
    "Adipose - Visceral (Omentum)",
    "Adrenal Gland",
    "Artery - Aorta",
    "Artery - Coronary",
    "Artery - Tibial",
    "Brain - Amygdala",
    "Brain - Anterior cingulate cortex (BA24)",
    "Brain - Caudate (basal ganglia)",
    "Brain - Cerebellar Hemisphere",
    "Brain - Cerebellum",
    "Brain - Cortex",
    "Brain - Frontal Cortex (BA9)",
    "Brain - Hippocampus",
    "Brain - Hypothalamus",
    "Brain - Nucleus accumbens (basal ganglia)",
    "Brain - Putamen (basal ganglia)",
    "Brain - Spinal cord (cervical c-1)",
    "Brain - Substantia nigra",
    "Breast - Mammary Tissue",
    "Cells - EBV-transformed lymphocytes",
    "Cells - Cultured fibroblasts",
    "Colon - Sigmoid",
    "Colon - Transverse",
    "Esophagus - Gastroesophageal Junction",
    "Esophagus - Mucosa",
    "Esophagus - Muscularis",
    "Heart - Atrial Appendage",
    "Heart - Left Ventricle",
    "Kidney - Cortex",
    "Liver",
    "Lung",
    "Minor Salivary Gland",
    "Muscle - Skeletal",
    "Nerve - Tibial",
    "Ovary",
    "Pancreas",
    "Pituitary",
    "Prostate",
    "Skin - Not Sun Exposed (Suprapubic)",
    "Skin - Sun Exposed (Lower leg)",
    "Small Intestine - Terminal Ileum",
    "Spleen",
    "Stomach",
    "Testis",
    "Thyroid",
    "Uterus",
    "Vagina",
    "Whole Blood"
]


In [3]:
tissue_df = pl.DataFrame(dict(tissue=tissue_list))

## Match LM scores to genes and write variant-effect files

### APAs

In [4]:
apa_variant_path = '/s/project/promoter_prediction/kipoi_expression_prediction/variant_tables/apa_variants_2000_2000.parquet'
apa_influence_path = '/s/project/multispecies/fungi_code/ci_matrices/influence_score/data/apa_abex_inf_scores.tsv'

In [5]:
apa_variant_df = pl.read_parquet(apa_variant_path)
apa_influence_df = pl.read_csv(apa_influence_path, separator='\t'). \
    select(['Chromosome', 'variant_start', 'variant_end', 'ref', 'alt', 'influence_score']). \
    rename({'Chromosome': 'chrom',
            'influence_score': 'veff_score'})

In [6]:
apa_df = apa_variant_df.join(apa_influence_df, on=['chrom', 'variant_start', 'variant_end', 'ref', 'alt'], how='inner'). \
    filter(~pl.col('veff_score').is_null())

In [7]:
apa_df.shape

(3082141, 13)

In [8]:
# group by gene
apa_df = apa_df.sort("veff_score", descending=True).group_by(['chrom', 'strand', 'gene_id', 'variant_start', 'variant_end', 'ref', 'alt', 'vcf_file']). \
    agg(pl.col(['pas_pos', 'cse_pos', 'pas_id', 'transcript_id', 'veff_score']).first())

In [9]:
# make the score negative to match the pipeline (there is no directionality in the score)
# add empty tissue column
apa_df = apa_df.with_columns(veff_score=-pl.col('veff_score'))

In [10]:
apa_df.join(tissue_df, how='cross')

chrom,strand,gene_id,variant_start,variant_end,ref,alt,vcf_file,pas_pos,cse_pos,pas_id,transcript_id,veff_score,tissue
str,str,str,i64,i64,str,str,str,i64,i64,str,str,f64,str
"""chr10""","""-""","""ENSG00000151474.23_13""",13715311,13715312,"""A""","""G""","""/s/project/rep/processed/train…",13717037,13717067,"""chr10:13717037:-""","""ENST00000632570.1_6""",-0.034918,"""Adipose - Subcutaneous"""
"""chr10""","""-""","""ENSG00000151474.23_13""",13715311,13715312,"""A""","""G""","""/s/project/rep/processed/train…",13717037,13717067,"""chr10:13717037:-""","""ENST00000632570.1_6""",-0.034918,"""Adipose - Visceral (Omentum)"""
"""chr10""","""-""","""ENSG00000151474.23_13""",13715311,13715312,"""A""","""G""","""/s/project/rep/processed/train…",13717037,13717067,"""chr10:13717037:-""","""ENST00000632570.1_6""",-0.034918,"""Adrenal Gland"""
"""chr10""","""-""","""ENSG00000151474.23_13""",13715311,13715312,"""A""","""G""","""/s/project/rep/processed/train…",13717037,13717067,"""chr10:13717037:-""","""ENST00000632570.1_6""",-0.034918,"""Artery - Aorta"""
"""chr10""","""-""","""ENSG00000151474.23_13""",13715311,13715312,"""A""","""G""","""/s/project/rep/processed/train…",13717037,13717067,"""chr10:13717037:-""","""ENST00000632570.1_6""",-0.034918,"""Artery - Coronary"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""chr3""","""+""","""ENSG00000058063.16_10""",182614778,182614779,"""C""","""T""","""/s/project/rep/processed/train…",182615112,182615082,"""chr3:182615112:+""","""ENST00000482070.1_1""",-0.031092,"""Testis"""
"""chr3""","""+""","""ENSG00000058063.16_10""",182614778,182614779,"""C""","""T""","""/s/project/rep/processed/train…",182615112,182615082,"""chr3:182615112:+""","""ENST00000482070.1_1""",-0.031092,"""Thyroid"""
"""chr3""","""+""","""ENSG00000058063.16_10""",182614778,182614779,"""C""","""T""","""/s/project/rep/processed/train…",182615112,182615082,"""chr3:182615112:+""","""ENST00000482070.1_1""",-0.031092,"""Uterus"""
"""chr3""","""+""","""ENSG00000058063.16_10""",182614778,182614779,"""C""","""T""","""/s/project/rep/processed/train…",182615112,182615082,"""chr3:182615112:+""","""ENST00000482070.1_1""",-0.031092,"""Vagina"""


In [11]:
apa_output_path = Path('/s/project/promoter_prediction/kipoi_expression_prediction/veff/lm_apa_2kb.parquet/')
# apa_output_path.mkdir()
for vcf_file, group_df in apa_df.join(tissue_df, how='cross').group_by("vcf_file"):
    vcf_file = Path(vcf_file[0]).name
    group_df.select(pl.exclude('vcf_file')).write_parquet(apa_output_path / f'{vcf_file}.parquet')

### TSS

In [12]:
tss_variants_path = '/s/project/promoter_prediction/kipoi_expression_prediction/variant_tables/tss_variants_2000_2000.parquet'
tss_influence_path = '/s/project/multispecies/fungi_code/ci_matrices/influence_score/data/tss_abex_inf_scores.tsv'

In [13]:
tss_variant_df = pl.read_parquet(tss_variants_path)
tss_influence_df = pl.read_csv(tss_influence_path, separator='\t'). \
    select(['Chromosome', 'variant_start', 'variant_end', 'ref', 'alt', 'influence_score']). \
    rename({'Chromosome': 'chrom',
            'influence_score': 'veff_score'})

In [14]:
tss_df = tss_variant_df.join(tss_influence_df, on=['chrom', 'variant_start', 'variant_end', 'ref', 'alt'], how='inner'). \
    filter(~pl.col('veff_score').is_null())

In [15]:
tss_df.shape

(3739997, 13)

In [16]:
# group by gene
tss_df = tss_df.sort("veff_score", descending=True).group_by(['chrom', 'strand', 'gene_id', 'variant_start', 'variant_end', 'ref', 'alt', 'vcf_file']). \
    agg(pl.col(['tss', 'transcript_start', 'transcript_end', 'transcript_id', 'veff_score']).first())

In [17]:
# make the score negative to match the pipeline (there is no directionality in the score)
tss_df = tss_df.with_columns(veff_score=-pl.col('veff_score'))

In [18]:
tss_df.join(tissue_df, how='cross')

chrom,strand,gene_id,variant_start,variant_end,ref,alt,vcf_file,tss,transcript_start,transcript_end,transcript_id,veff_score,tissue
str,str,str,i64,i64,str,str,str,i64,i64,i64,str,f64,str
"""chr8""","""+""","""ENSG00000168522.13_11""",42941211,42941212,"""A""","""G""","""/s/project/rep/processed/train…",42939770,42939770,42940649,"""ENST00000528400.1_1""",-0.017692,"""Adipose - Subcutaneous"""
"""chr8""","""+""","""ENSG00000168522.13_11""",42941211,42941212,"""A""","""G""","""/s/project/rep/processed/train…",42939770,42939770,42940649,"""ENST00000528400.1_1""",-0.017692,"""Adipose - Visceral (Omentum)"""
"""chr8""","""+""","""ENSG00000168522.13_11""",42941211,42941212,"""A""","""G""","""/s/project/rep/processed/train…",42939770,42939770,42940649,"""ENST00000528400.1_1""",-0.017692,"""Adrenal Gland"""
"""chr8""","""+""","""ENSG00000168522.13_11""",42941211,42941212,"""A""","""G""","""/s/project/rep/processed/train…",42939770,42939770,42940649,"""ENST00000528400.1_1""",-0.017692,"""Artery - Aorta"""
"""chr8""","""+""","""ENSG00000168522.13_11""",42941211,42941212,"""A""","""G""","""/s/project/rep/processed/train…",42939770,42939770,42940649,"""ENST00000528400.1_1""",-0.017692,"""Artery - Coronary"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""chr19""","""+""","""ENSG00000064961.19_7""",3572414,3572415,"""G""","""A""","""/s/project/rep/processed/train…",3572774,3572774,3576942,"""ENST00000416526.5_3""",-0.023363,"""Testis"""
"""chr19""","""+""","""ENSG00000064961.19_7""",3572414,3572415,"""G""","""A""","""/s/project/rep/processed/train…",3572774,3572774,3576942,"""ENST00000416526.5_3""",-0.023363,"""Thyroid"""
"""chr19""","""+""","""ENSG00000064961.19_7""",3572414,3572415,"""G""","""A""","""/s/project/rep/processed/train…",3572774,3572774,3576942,"""ENST00000416526.5_3""",-0.023363,"""Uterus"""
"""chr19""","""+""","""ENSG00000064961.19_7""",3572414,3572415,"""G""","""A""","""/s/project/rep/processed/train…",3572774,3572774,3576942,"""ENST00000416526.5_3""",-0.023363,"""Vagina"""


In [19]:
tss_output_path = Path('/s/project/promoter_prediction/kipoi_expression_prediction/veff/lm_tss_2kb.parquet/')
# tss_output_path.mkdir()
for vcf_file, group_df in tss_df.join(tissue_df, how='cross').group_by("vcf_file"):
    vcf_file = Path(vcf_file[0]).name
    group_df.select(pl.exclude('vcf_file')).write_parquet(tss_output_path / f'{vcf_file}.parquet')

### Merged

In [20]:
tss_df

chrom,strand,gene_id,variant_start,variant_end,ref,alt,vcf_file,tss,transcript_start,transcript_end,transcript_id,veff_score
str,str,str,i64,i64,str,str,str,i64,i64,i64,str,f64
"""chr8""","""+""","""ENSG00000168522.13_11""",42941211,42941212,"""A""","""G""","""/s/project/rep/processed/train…",42939770,42939770,42940649,"""ENST00000528400.1_1""",-0.017692
"""chr2""","""-""","""ENSG00000132031.13_7""",20211183,20211184,"""A""","""G""","""/s/project/rep/processed/train…",20212425,20192797,20212426,"""ENST00000421259.2_3""",-0.047707
"""chr7""","""-""","""ENSG00000155660.11_13""",148705243,148705244,"""C""","""T""","""/s/project/rep/processed/train…",148703374,148701290,148703375,"""ENST00000466592.1_3""",-0.041383
"""chr16""","""-""","""ENSG00000005339.15_13""",3793638,3793639,"""T""","""C""","""/s/project/rep/processed/train…",3793468,3781911,3793469,"""ENST00000576720.1_3""",-0.019578
"""chr19""","""+""","""ENSG00000088256.9_6""",3114000,3114001,"""G""","""A""","""/s/project/rep/processed/train…",3113460,3113460,3121455,"""ENST00000587636.1_1""",-0.023546
…,…,…,…,…,…,…,…,…,…,…,…,…
"""chr11""","""-""","""ENSG00000149243.17_9""",75139698,75139699,"""T""","""C""","""/s/project/rep/processed/train…",75137883,75136550,75137884,"""ENST00000527491.1_3""",-0.021486
"""chr15""","""-""","""ENSG00000273025.1_11""",72614150,72614151,"""C""","""T""","""/s/project/rep/processed/train…",72612286,72559086,72612287,"""ENST00000569547.1_4""",-0.017484
"""chr8""","""+""","""ENSG00000180938.6_6""",125984519,125984520,"""C""","""T""","""/s/project/rep/processed/train…",125985536,125985536,125991631,"""ENST00000319286.6_3""",-0.016123
"""chr11""","""+""","""ENSG00000177106.17_12""",727896,727897,"""G""","""A""","""/s/project/rep/processed/train…",726080,726080,727113,"""ENST00000534679.1_1""",-0.027457


In [21]:
apa_df

chrom,strand,gene_id,variant_start,variant_end,ref,alt,vcf_file,pas_pos,cse_pos,pas_id,transcript_id,veff_score
str,str,str,i64,i64,str,str,str,i64,i64,str,str,f64
"""chr10""","""-""","""ENSG00000151474.23_13""",13715311,13715312,"""A""","""G""","""/s/project/rep/processed/train…",13717037,13717067,"""chr10:13717037:-""","""ENST00000632570.1_6""",-0.034918
"""chr10""","""+""","""ENSG00000171798.19_8""",135024649,135024650,"""G""","""A""","""/s/project/rep/processed/train…",135024227,135024197,"""chr10:135024227:+""","""ENST00000682309.1_2""",-0.049111
"""chr2""","""+""","""ENSG00000115718.18_11""",128178841,128178842,"""C""","""T""","""/s/project/rep/processed/train…",128180672,128180642,"""chr2:128180672:+""","""ENST00000474030.5_2""",-0.04467
"""chr22""","""-""","""ENSG00000015475.19_11""",18222938,18222939,"""G""","""A""","""/s/project/rep/processed/train…",18222213,18222243,"""chr22:18222213:-""","""ENST00000552886.1_3""",-0.039908
"""chr11""","""+""","""ENSG00000165973.19_9""",21251222,21251223,"""G""","""A""","""/s/project/rep/processed/train…",21250933,21250903,"""chr11:21250933:+""","""ENST00000530672.1_1""",-0.037994
…,…,…,…,…,…,…,…,…,…,…,…,…
"""chr4""","""-""","""ENSG00000164093.18_13""",111542625,111542626,"""G""","""A""","""/s/project/rep/processed/train…",111541432,111541462,"""chr4:111541432:-""","""ENST00000557119.2_3""",-0.038825
"""chr11""","""+""","""ENSG00000110330.10_12""",102248707,102248708,"""T""","""G""","""/s/project/rep/processed/train…",102248327,102248297,"""chr11:102248327:+""","""ENST00000534130.1_1""",-0.047017
"""chr1""","""-""","""ENSG00000117090.16_10""",160576287,160576288,"""G""","""A""","""/s/project/rep/processed/train…",160577895,160577925,"""chr1:160577895:-""","""ENST00000302035.11_9""",-0.064608
"""chr17""","""-""","""ENSG00000132514.14_10""",6979476,6979477,"""C""","""T""","""/s/project/rep/processed/train…",6977855,6977885,"""chr17:6977855:-""","""ENST00000416562.7_6;ENST000002…",-0.04321


In [22]:
apa_df = apa_df.rename({'veff_score': 'pas_score', 'transcript_id': 'pas_transcript_id'})
tss_df = tss_df.rename({'veff_score': 'tss_score', 'transcript_id': 'tss_transcript_id', 
                        'transcript_start': 'tss_transcript_start', 'transcript_end': 'tss_transcript_end'})

In [23]:
merged_df = (apa_df.join(tss_df, how='full', on=['chrom', 'strand', 'gene_id', 'variant_start',
                                     'variant_end','ref', 'alt', 'vcf_file', ], coalesce=True).
            with_columns(is_near_tss=~pl.col('tss').is_null(),
                         is_near_pas=~pl.col('pas_id').is_null()))

In [24]:
merged_df = merged_df.join(tissue_df, how='cross')

In [25]:
merged_df

chrom,strand,gene_id,variant_start,variant_end,ref,alt,vcf_file,pas_pos,cse_pos,pas_id,pas_transcript_id,pas_score,tss,tss_transcript_start,tss_transcript_end,tss_transcript_id,tss_score,is_near_tss,is_near_pas,tissue
str,str,str,i64,i64,str,str,str,i64,i64,str,str,f64,i64,i64,i64,str,f64,bool,bool,str
"""chr10""","""-""","""ENSG00000151474.23_13""",13715311,13715312,"""A""","""G""","""/s/project/rep/processed/train…",13717037,13717067,"""chr10:13717037:-""","""ENST00000632570.1_6""",-0.034918,,,,,,false,true,"""Adipose - Subcutaneous"""
"""chr10""","""-""","""ENSG00000151474.23_13""",13715311,13715312,"""A""","""G""","""/s/project/rep/processed/train…",13717037,13717067,"""chr10:13717037:-""","""ENST00000632570.1_6""",-0.034918,,,,,,false,true,"""Adipose - Visceral (Omentum)"""
"""chr10""","""-""","""ENSG00000151474.23_13""",13715311,13715312,"""A""","""G""","""/s/project/rep/processed/train…",13717037,13717067,"""chr10:13717037:-""","""ENST00000632570.1_6""",-0.034918,,,,,,false,true,"""Adrenal Gland"""
"""chr10""","""-""","""ENSG00000151474.23_13""",13715311,13715312,"""A""","""G""","""/s/project/rep/processed/train…",13717037,13717067,"""chr10:13717037:-""","""ENST00000632570.1_6""",-0.034918,,,,,,false,true,"""Artery - Aorta"""
"""chr10""","""-""","""ENSG00000151474.23_13""",13715311,13715312,"""A""","""G""","""/s/project/rep/processed/train…",13717037,13717067,"""chr10:13717037:-""","""ENST00000632570.1_6""",-0.034918,,,,,,false,true,"""Artery - Coronary"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""chr10""","""-""","""ENSG00000107929.16_8""",936490,936491,"""A""","""G""","""/s/project/rep/processed/train…",,,,,,935773,910209,935774,"""ENST00000693234.1_1""",-0.019352,true,false,"""Testis"""
"""chr10""","""-""","""ENSG00000107929.16_8""",936490,936491,"""A""","""G""","""/s/project/rep/processed/train…",,,,,,935773,910209,935774,"""ENST00000693234.1_1""",-0.019352,true,false,"""Thyroid"""
"""chr10""","""-""","""ENSG00000107929.16_8""",936490,936491,"""A""","""G""","""/s/project/rep/processed/train…",,,,,,935773,910209,935774,"""ENST00000693234.1_1""",-0.019352,true,false,"""Uterus"""
"""chr10""","""-""","""ENSG00000107929.16_8""",936490,936491,"""A""","""G""","""/s/project/rep/processed/train…",,,,,,935773,910209,935774,"""ENST00000693234.1_1""",-0.019352,true,false,"""Vagina"""


In [27]:
merged_output_path = Path('/s/project/promoter_prediction/kipoi_expression_prediction/veff/lm_merged_2kb.parquet/')
# merged_output_path.mkdir()
for vcf_file, group_df in merged_df.group_by("vcf_file"):
    vcf_file = Path(vcf_file[0]).name
    group_df.select(pl.exclude('vcf_file')).write_parquet(merged_output_path / f'{vcf_file}.parquet')