In [1]:
import polars as pl

## Match LM scores to genes and write variant-effect files

### APAs

In [2]:
# input: parquet table of APA variants
apa_variant_path = '/s/project/promoter_prediction/kipoi_expression_prediction/variant_tables/apa_variants.parquet'
# input: tab-separated table holding the influence score of each APA variant
apa_influence_path = '/s/project/multispecies/fungi_code/ci_matrices/influence_score/data/apa_abex_inf_scores.tsv'

In [7]:
# variant table is used as stored
apa_variant_df = pl.read_parquet(apa_variant_path)

# influence scores: keep only the variant key columns and the score, renaming
# 'Chromosome' -> 'chrom' (the join key name) and 'influence_score' -> 'veff_score'
apa_influence_df = pl.read_csv(apa_influence_path, separator='\t').select(
    pl.col('Chromosome').alias('chrom'),
    'variant_start',
    'variant_end',
    'ref',
    'alt',
    pl.col('influence_score').alias('veff_score'),
)

In [8]:
# variant key columns shared by the variant table and the influence-score table
apa_join_keys = ['chrom', 'variant_start', 'variant_end', 'ref', 'alt']

# keep only variants present in both tables, then drop rows that have no score
apa_df = (
    apa_variant_df
    .join(apa_influence_df, on=apa_join_keys, how='inner')
    .filter(pl.col('veff_score').is_not_null())
)

In [9]:
# sanity check: (rows, columns) of the joined APA table
apa_df.shape

(3082141, 13)

In [10]:
# group by gene: one row per (gene, variant, vcf file).
# The frame is sorted by descending score first, so `.first()` takes the
# annotation columns and the score from the highest-scoring row of each group.
apa_group_keys = ['chrom', 'strand', 'gene_id', 'variant_start', 'variant_end', 'ref', 'alt', 'vcf_file']
apa_value_cols = ['pas_pos', 'cse_pos', 'pas_id', 'transcript_id', 'veff_score']

apa_df = (
    apa_df
    .sort("veff_score", descending=True)
    .group_by(apa_group_keys)
    .agg(pl.col(apa_value_cols).first())
)

In [22]:
# make the score negative to match the pipeline (there is no directionality in the score)
# `-abs()` rather than a bare negation keeps this cell idempotent: re-running it
# can no longer flip the sign back to positive before the table is written.
# NOTE(review): assumes the raw influence scores are non-negative, in which case the
# first run gives exactly the same values as the plain `-veff_score` -- confirm.
apa_df = apa_df.with_columns(veff_score=-pl.col('veff_score').abs())

In [23]:
# write the per-gene APA variant-effect table
apa_df.write_parquet('/s/project/promoter_prediction/kipoi_expression_prediction/veff/lm_apa.parquet')

### TSS

In [13]:
# input: parquet table of TSS variants
tss_variants_path = '/s/project/promoter_prediction/kipoi_expression_prediction/variant_tables/tss_variants.parquet'
# input: tab-separated table holding the influence score of each TSS variant
tss_influence_path = '/s/project/multispecies/fungi_code/ci_matrices/influence_score/data/tss_abex_inf_scores.tsv'

In [14]:
# variant table is used as stored
tss_variant_df = pl.read_parquet(tss_variants_path)

# influence scores: keep only the variant key columns and the score, renaming
# 'Chromosome' -> 'chrom' (the join key name) and 'influence_score' -> 'veff_score'
tss_influence_df = pl.read_csv(tss_influence_path, separator='\t').select(
    pl.col('Chromosome').alias('chrom'),
    'variant_start',
    'variant_end',
    'ref',
    'alt',
    pl.col('influence_score').alias('veff_score'),
)

In [15]:
# variant key columns shared by the variant table and the influence-score table
tss_join_keys = ['chrom', 'variant_start', 'variant_end', 'ref', 'alt']

# keep only variants present in both tables, then drop rows that have no score
tss_df = (
    tss_variant_df
    .join(tss_influence_df, on=tss_join_keys, how='inner')
    .filter(pl.col('veff_score').is_not_null())
)

In [16]:
# sanity check: (rows, columns) of the joined TSS table
tss_df.shape

(3739997, 13)

In [17]:
# group by gene: one row per (gene, variant, vcf file).
# The frame is sorted by descending score first, so `.first()` takes the
# annotation columns and the score from the highest-scoring row of each group.
tss_group_keys = ['chrom', 'strand', 'gene_id', 'variant_start', 'variant_end', 'ref', 'alt', 'vcf_file']
tss_value_cols = ['tss', 'transcript_start', 'transcript_end', 'transcript_id', 'veff_score']

tss_df = (
    tss_df
    .sort("veff_score", descending=True)
    .group_by(tss_group_keys)
    .agg(pl.col(tss_value_cols).first())
)

In [24]:
# make the score negative to match the pipeline (there is no directionality in the score)
# `-abs()` rather than a bare negation keeps this cell idempotent: re-running it
# can no longer flip the sign back to positive before the table is written.
# NOTE(review): assumes the raw influence scores are non-negative, in which case the
# first run gives exactly the same values as the plain `-veff_score` -- confirm.
tss_df = tss_df.with_columns(veff_score=-pl.col('veff_score').abs())

In [25]:
# preview the final TSS table (scores are negative at this point) before writing it
tss_df

chrom,strand,gene_id,variant_start,variant_end,ref,alt,vcf_file,tss,transcript_start,transcript_end,transcript_id,veff_score
str,str,str,i64,i64,str,str,str,i64,i64,i64,str,f64
"""chr7""","""+""","""ENSG00000174945.14_5""",2744061,2744062,"""T""","""C""","""/s/project/rep/processed/train…",2742109,2742109,2748748,"""ENST00000480560.5_1""",-0.024154
"""chr2""","""+""","""ENSG00000128654.14_7""",177133343,177133344,"""T""","""C""","""/s/project/rep/processed/train…",177134122,177134122,177202753,"""ENST00000420864.5_1""",-0.017664
"""chr5""","""+""","""ENSG00000145819.18_14""",142289332,142289333,"""G""","""A""","""/s/project/rep/processed/train…",142288286,142288286,142421455,"""ENST00000469396.5_2""",-0.013821
"""chr11""","""+""","""ENSG00000060749.16_10""",32979058,32979059,"""A""","""G""","""/s/project/rep/processed/train…",32979351,32979351,32990614,"""ENST00000528034.1_1""",-0.019565
"""chr12""","""+""","""ENSG00000118308.16_14""",25189746,25189747,"""A""","""G""","""/s/project/rep/processed/train…",25188413,25188413,25190952,"""ENST00000555165.1_1""",-0.015859
…,…,…,…,…,…,…,…,…,…,…,…,…
"""chr1""","""+""","""ENSG00000197056.11_7""",35495773,35495774,"""T""","""C""","""/s/project/rep/processed/train…",35497772,35497772,35581225,"""ENST00000650449.1_2""",-0.014425
"""chr1""","""+""","""ENSG00000152061.24_9""",174606453,174606454,"""A""","""G""","""/s/project/rep/processed/train…",174606515,174606515,174674888,"""ENST00000474375.6_1""",-0.017078
"""chr1""","""+""","""ENSG00000058453.17_9""",17294467,17294468,"""G""","""C""","""/s/project/rep/processed/train…",17292522,17292522,17297237,"""ENST00000465021.5_1""",-0.053038
"""chr8""","""-""","""ENSG00000129422.15_12""",17512591,17512592,"""G""","""A""","""/s/project/rep/processed/train…",17512533,17504512,17512534,"""ENST00000523471.5_3""",-0.037099


In [26]:
# write the per-gene TSS variant-effect table
tss_df.write_parquet('/s/project/promoter_prediction/kipoi_expression_prediction/veff/lm_tss.parquet')