# Enformer-GTEx results

Explore Enformer predictions on GTEx variants

## Setup

In [1]:
import polars as pl
import matplotlib.pyplot as plt
from pathlib import Path
import pandas as pd
import seaborn as sns
import numpy as np
import statsmodels.api as sm
import plotnine as pn
from scipy.stats import ranksums
from datetime import datetime

%load_ext autoreload
%autoreload 2

In [2]:
plt.rcParams['figure.dpi'] = 150

In [3]:
pl.Config.set_fmt_str_lengths(100)

polars.config.Config

In [4]:
pn.theme_set(pn.theme_bw())
pn.theme_update(dpi=150)

In [5]:
# Files relevant to the GTEx analysis
veff_path = '/s/project/promoter_prediction/kipoi_expression_prediction/veff/lm_apa_2kb.parquet/*.parquet'
# GTEx variants
variant_path = '/s/project/rep/processed/training_results_v16/gtex_v8_old_dna/private_variants.parquet/rare_variants.vcf.parquet/**/*.parquet'
# abexp benchmark dataset
gtex_benchmark_with_annotation_path = "/s/project/rep/processed/training_results_v16/gtex_benchmark_with_annotation.parquet/*.parquet"

## Analysis

We are only loading ensembl canonical transcripts for this analysis. So there is 1 transcript per gene!

In [6]:
veff_transcript_ldf = (pl.scan_parquet(Path(veff_path)).
                       select(pl.col(['gene_id', 'pas_id', 'cse_pos']), 
                              pl.col('strand').cast(pl.Enum(['-', '+'])),
                              pl.col(['chrom', 'variant_start', 'variant_end', 'ref', 'alt', 'veff_score'])).
                       rename({'gene_id': 'gene', 'pas_id': 'pas'}).
                       with_columns(pl.col('gene').str.replace(r'([^\.]+)\..+$', "${1}").alias('gene')))

# It is possible that a gene comes multiple times (different versions)

In [7]:
variant_ldf = pl.scan_parquet(variant_path, hive_partitioning=True).select(['sampleId', 'chrom', 'start', 'end', 'ref', 'alt']).rename({'sampleId': 'individual',
                                                                                                                'start': 'variant_start',
                                                                                                                'end': 'variant_end'})

In [8]:
training_benchmark_ldf = (pl.scan_parquet(gtex_benchmark_with_annotation_path)
                          .select(['gene', 'individual', 'tissue', 'FDR', 
                                   'mu', 'zscore', 'l2fc'])
                          .unique()
                          .rename({'l2fc': 'l2fc_outrider'})
                          .with_columns((pl.when(pl.col('FDR') > 0.05)
                                         .then(pl.lit('normal'))
                                         .otherwise(
                                             pl.when(pl.col('zscore') > 0)
                                             .then(pl.lit('overexpressed'))
                                             .otherwise(
                                                 pl.when(pl.col('zscore') < 0)
                                                 .then(pl.lit('underexpressed'))
                                                 # this should never be the case
                                                 .otherwise(pl.lit('CHECK'))
                                             ))).cast(pl.Enum(['underexpressed', 'normal', 'overexpressed'])).alias('outlier_state')))

### What is the enformer variant-effect-score distribution around the TSS?

In [9]:
veff_transcript_ldf = (veff_transcript_ldf
                       .with_columns(cse_distance=(pl.when(pl.col('strand') == '+')
                                                   .then(pl.col('variant_start') - pl.col('cse_pos'))
                                                   .otherwise(pl.col('cse_pos') - pl.col('variant_start'))))
                       .with_columns(absolute_cse_distance=pl.col('cse_distance').abs(),                
                                     absolute_veff_score=pl.col('veff_score').abs()))

veff_transcript_ldf.select(upstream_tss=pl.col('cse_distance').min(), downstream_tss=pl.col('cse_distance').max()).collect()

upstream_tss,downstream_tss
i64,i64
-2000,2000


In [10]:
upstream=2000
downstream=2000

# filter out variants out of this range
veff_transcript_ldf = veff_transcript_ldf.filter((pl.col('cse_distance') >= -upstream) & (pl.col('cse_distance') <= downstream))

# join tss variants with individuals
veff_individual_ldf = (variant_ldf.join(veff_transcript_ldf, how='inner', on=['chrom', 'variant_start', 'variant_end', 'ref', 'alt']).
                              select([
                                  'individual', 'chrom', 'variant_start', 'variant_end', 'ref', 'alt',
                                  'gene', 'pas', 'strand', 'cse_distance', 'absolute_cse_distance', 
                                  'veff_score', 'absolute_veff_score'
                                ])
                             )

In [11]:
# keep the variant with the signed maximum absolute enformer score for each individual
veff_individual_ldf = veff_individual_ldf.sort('absolute_veff_score', descending=True). \
    group_by(['individual', 'gene', 'chrom', 'strand']). \
    agg(pl.col(['variant_start', 'variant_end', 'ref', 'alt', 'pas', 'cse_distance', 'veff_score']).first())

# join outrider with variants on individual
veff_outrider_ldf = (
    veff_individual_ldf
    .join(training_benchmark_ldf, how='inner', on=['individual', 'gene'])
    .select(['gene', 'tissue', 'individual', 'cse_distance', 'outlier_state', 
             'zscore', 'FDR', 'veff_score']))

In [12]:
df = veff_outrider_ldf.collect()

In [13]:
df.shape

(11924158, 8)

In [14]:
df.head()

gene,tissue,individual,cse_distance,outlier_state,zscore,FDR,veff_score
str,str,str,i64,enum,f32,f64,f64
"""ENSG00000186318""","""Brain - Frontal Cortex (BA9)""","""GTEX-NL4W""",1614,"""normal""",0.22586,1.0,-0.037641
"""ENSG00000097021""","""Cells - Cultured fibroblasts""","""GTEX-TKQ1""",1617,"""normal""",2.016294,1.0,-0.040787
"""ENSG00000164039""","""Artery - Coronary""","""GTEX-13VXT""",1550,"""normal""",0.720083,1.0,-0.035602
"""ENSG00000135241""","""Spleen""","""GTEX-POMQ""",5,"""normal""",0.865943,1.0,-0.051857
"""ENSG00000145348""","""Brain - Hypothalamus""","""GTEX-1AX9I""",-326,"""normal""",1.001836,1.0,-0.032431


In [15]:
df_bac = df

# cse_distance in bins
bin_size=10
cuts = list(range(-upstream + bin_size, downstream, bin_size))
cut_labels = [str(x) for x in [-upstream, *cuts]]
cse_distance_labels = {c: f'[{c}, {int(c) + bin_size})' for c in cut_labels}
df = (df.with_columns(cse_distance_bin=(pl.col('cse_distance').cut(cuts, labels=cut_labels))
                      .cast(pl.Enum(cut_labels)))
      .with_columns(is_underexpressed=(pl.col('outlier_state') == 'underexpressed')))

df = (df.with_columns(cse_distance_bin_label=pl.col("cse_distance_bin").replace_strict(cse_distance_labels), 
                                                      bin_size=pl.lit(50)))

In [16]:
df.head()

gene,tissue,individual,cse_distance,outlier_state,zscore,FDR,veff_score,cse_distance_bin,is_underexpressed,cse_distance_bin_label,bin_size
str,str,str,i64,enum,f32,f64,f64,enum,bool,str,i32
"""ENSG00000186318""","""Brain - Frontal Cortex (BA9)""","""GTEX-NL4W""",1614,"""normal""",0.22586,1.0,-0.037641,"""1610""",False,"""[1610, 1620)""",50
"""ENSG00000097021""","""Cells - Cultured fibroblasts""","""GTEX-TKQ1""",1617,"""normal""",2.016294,1.0,-0.040787,"""1610""",False,"""[1610, 1620)""",50
"""ENSG00000164039""","""Artery - Coronary""","""GTEX-13VXT""",1550,"""normal""",0.720083,1.0,-0.035602,"""1540""",False,"""[1540, 1550)""",50
"""ENSG00000135241""","""Spleen""","""GTEX-POMQ""",5,"""normal""",0.865943,1.0,-0.051857,"""0""",False,"""[0, 10)""",50
"""ENSG00000145348""","""Brain - Hypothalamus""","""GTEX-1AX9I""",-326,"""normal""",1.001836,1.0,-0.032431,"""-330""",False,"""[-330, -320)""",50


In [19]:
# set new bins
# new_bins = [(-2000, -500),
#             *[(i, i + 100) for i in range(-500, -100, 100)],
#             (-100, -50),
#             (-50, 0),
#             (0, 50),
#             (50, 100),
#             *[(i, i + 100) for i in range(100, 500, 100)],]
new_bins = [(-2000, -1000),
            (-1000, -500),
            *[(i, i + 100) for i in range(-500, -100, 100)],
            (-100, -50),
            (-50, 0),
            (0, 50),
            (50, 100),
            *[(i, i + 100) for i in range(100, 500, 100)],
            (500, 1000),
            (1000, 2000),
           ]
new_bin_labels  = [f'[{start}, {stop})' for start, stop in new_bins]

df_bac = df
for start, stop in new_bins:
    df = df.with_columns(cse_distance_bin_label=(pl.when((pl.col('cse_distance_bin').cast(pl.Int16) >= start) & (pl.col('cse_distance_bin').cast(pl.Int16) <= stop))
                                                 .then(pl.lit(f'[{start}, {stop})'))                                  
                                                 .otherwise(pl.col('cse_distance_bin_label'))),
                         bin_size = (pl.when((pl.col('cse_distance_bin').cast(pl.Int16) >= start) & (pl.col('cse_distance_bin').cast(pl.Int16) < stop))
                                     .then(pl.lit(stop - start))
                                     .otherwise(pl.col('bin_size'))))

df = df.with_columns(cse_distance_bin_label=pl.col('cse_distance_bin_label').cast(pl.Enum(new_bin_labels)))

In [20]:
df.head()

gene,tissue,individual,cse_distance,outlier_state,zscore,FDR,veff_score,cse_distance_bin,is_underexpressed,cse_distance_bin_label,bin_size
str,str,str,i64,enum,f32,f64,f64,enum,bool,enum,i32
"""ENSG00000186318""","""Brain - Frontal Cortex (BA9)""","""GTEX-NL4W""",1614,"""normal""",0.22586,1.0,-0.037641,"""1610""",False,"""[1000, 2000)""",1000
"""ENSG00000097021""","""Cells - Cultured fibroblasts""","""GTEX-TKQ1""",1617,"""normal""",2.016294,1.0,-0.040787,"""1610""",False,"""[1000, 2000)""",1000
"""ENSG00000164039""","""Artery - Coronary""","""GTEX-13VXT""",1550,"""normal""",0.720083,1.0,-0.035602,"""1540""",False,"""[1000, 2000)""",1000
"""ENSG00000135241""","""Spleen""","""GTEX-POMQ""",5,"""normal""",0.865943,1.0,-0.051857,"""0""",False,"""[0, 50)""",50
"""ENSG00000145348""","""Brain - Hypothalamus""","""GTEX-1AX9I""",-326,"""normal""",1.001836,1.0,-0.032431,"""-330""",False,"""[-400, -300)""",100


In [21]:
# calculate counts per bins
bin_count_df = (df.group_by(['outlier_state', 'cse_distance_bin_label', 'bin_size'])
                .agg((pl.len()).alias('count')))

# # calculate mean of each bin and then sum the means in each outlier state
totals_df = (bin_count_df.group_by('outlier_state').agg(pl.sum('count').alias('total_count')))
# # normalize each count by the mean calculated above
enrichment_df = (bin_count_df.join(totals_df, on='outlier_state')
                 .with_columns((pl.col('count') / pl.col('total_count')).alias('enrichment')))
ci_low, ci_high = sm.stats.proportion_confint(enrichment_df["count"], enrichment_df["total_count"])
enrichment_df = enrichment_df.with_columns(pl.Series(ci_low).alias('ci_low'), pl.Series(ci_high).alias('ci_high'))
# normalize by bin size
enrichment_df = enrichment_df.with_columns(enrichment = pl.col('enrichment') / pl.col('bin_size'),
                                           ci_low = pl.col('ci_low') / pl.col('bin_size'),
                                           ci_high = pl.col('ci_high') / pl.col('bin_size'))

In [22]:
enrichment_df

outlier_state,cse_distance_bin_label,bin_size,count,total_count,enrichment,ci_low,ci_high
enum,enum,i32,u32,u32,f64,f64,f64
"""normal""","""[-50, 0)""",50,130239,11918765,0.000219,0.000217,0.00022
"""underexpressed""","""[-50, 0)""",50,68,2851,0.000477,0.000365,0.000589
"""underexpressed""","""[-200, -100)""",100,56,2851,0.000196,0.000145,0.000247
"""normal""","""[-100, -50)""",50,130964,11918765,0.00022,0.000219,0.000221
"""normal""","""[50, 100)""",50,133884,11918765,0.000225,0.000223,0.000226
…,…,…,…,…,…,…,…
"""normal""","""[-300, -200)""",100,266740,11918765,0.000224,0.000223,0.000225
"""underexpressed""","""[300, 400)""",100,30,2851,0.000105,0.000068,0.000143
"""overexpressed""","""[-100, -50)""",50,38,2542,0.000299,0.000205,0.000393
"""normal""","""[500, 1000)""",500,1421304,11918765,0.000238,0.000238,0.000239


In [23]:
totals_df

outlier_state,total_count
enum,u32
"""normal""",11918765
"""underexpressed""",2851
"""overexpressed""",2542


In [24]:
base_path = Path('.')
df.write_parquet(base_path / 'pas_veff_bin.parquet', use_pyarrow=True)
enrichment_df.write_parquet(base_path / 'pas_enrichment.parquet', use_pyarrow=True)
base_path

PosixPath('.')