# Enformer-GTEx results

Explore Enformer predictions on GTEx variants

## Setup

In [1]:
import polars as pl
import matplotlib.pyplot as plt
from pathlib import Path
import pandas as pd
import seaborn as sns
import numpy as np
import statsmodels.api as sm
import plotnine as pn
from scipy.stats import ranksums
from datetime import datetime

%load_ext autoreload
%autoreload 2

In [2]:
plt.rcParams['figure.dpi'] = 150

In [3]:
pl.Config.set_fmt_str_lengths(100)

polars.config.Config

In [4]:
pn.theme_set(pn.theme_bw())
pn.theme_update(dpi=150)

In [5]:
# Files relevant to the GTEx analysis
veff_path = '/s/project/promoter_prediction/kipoi_expression_prediction/veff/lm_tss_2kb.parquet/*.parquet'
# GTEx variants
variant_path = '/s/project/rep/processed/training_results_v16/gtex_v8_old_dna/private_variants.parquet/rare_variants.vcf.parquet/**/*.parquet'
# abexp benchmark dataset
gtex_benchmark_with_annotation_path = "/s/project/rep/processed/training_results_v16/gtex_benchmark_with_annotation.parquet/*.parquet"

## Analysis

We are only loading ensembl canonical transcripts for this analysis. So there is 1 transcript per gene!

In [6]:
veff_transcript_ldf = (pl.scan_parquet(Path(veff_path)).
                       select(pl.col(['tissue', 'gene_id', 'transcript_id', 'transcript_start', 'transcript_end']), 
                              pl.col('strand').cast(pl.Enum(['-', '+'])),
                              pl.col(['chrom', 'variant_start', 'variant_end', 'ref', 'alt', 'veff_score',])).
                       rename({'gene_id': 'gene', 'transcript_id': 'transcript'}).
                       with_columns(pl.col('gene').str.replace(r'([^\.]+)\..+$', "${1}").alias('gene'),
                                    pl.col('transcript').str.replace(r'([^\.]+)\..+$', "${1}").alias('transcript')))


# It is possible that a gene comes multiple times (different versions)

In [7]:
variant_ldf = pl.scan_parquet(variant_path, hive_partitioning=True).select(['sampleId', 'chrom', 'start', 'end', 'ref', 'alt']).rename({'sampleId': 'individual',
                                                                                                                'start': 'variant_start',
                                                                                                                'end': 'variant_end'})

In [8]:
training_benchmark_ldf = (pl.scan_parquet(gtex_benchmark_with_annotation_path)
                          .select(['gene', 'individual', 'tissue', 'FDR', 
                                   'mu', 'zscore', 'l2fc'])
                          .unique()
                          .rename({'l2fc': 'l2fc_outrider'})
                          .with_columns((pl.when(pl.col('FDR') > 0.05)
                                         .then(pl.lit('normal'))
                                         .otherwise(
                                             pl.when(pl.col('zscore') > 0)
                                             .then(pl.lit('overexpressed'))
                                             .otherwise(
                                                 pl.when(pl.col('zscore') < 0)
                                                 .then(pl.lit('underexpressed'))
                                                 # this should never be the case
                                                 .otherwise(pl.lit('CHECK'))
                                             ))).cast(pl.Enum(['underexpressed', 'normal', 'overexpressed'])).alias('outlier_state')))

### What is the enformer variant-effect-score distribution around the TSS?

In [9]:
veff_transcript_ldf = veff_transcript_ldf. with_columns(tss_distance=(
    pl.when(pl.col('strand') == '+')
    .then(pl.col('variant_start') - pl.col('transcript_start'))
    .otherwise(pl.col('transcript_end') - pl.col('variant_start'))
)).with_columns(absolute_tss_distance=pl.col('tss_distance').abs(),
                absolute_veff_score=pl.col('veff_score').abs())

veff_transcript_ldf.select(upstream_tss=pl.col('tss_distance').min(), downstream_tss=pl.col('tss_distance').max()).collect()

upstream_tss,downstream_tss
i64,i64
-2000,2001


In [10]:
upstream=2000
downstream=2000

# filter out variants out of this range
veff_transcript_ldf = veff_transcript_ldf.filter((pl.col('tss_distance') >= -upstream) & (pl.col('tss_distance') <= downstream))

# join tss variants with individuals
veff_individual_ldf = (variant_ldf.join(veff_transcript_ldf, how='inner', on=['chrom', 'variant_start', 'variant_end', 'ref', 'alt']).
                              select([
                                  'individual', 'chrom', 'variant_start', 'variant_end', 'ref', 'alt',
                                  'gene', 'transcript', 'strand', 'tss_distance', 'absolute_tss_distance', 
                                  'veff_score', 'absolute_veff_score'
                                ])
                             )

In [11]:
# keep the variant with the signed maximum absolute enformer score for each individual
veff_individual_ldf = veff_individual_ldf.sort('absolute_veff_score', descending=True). \
    group_by(['individual', 'gene', 'chrom', 'strand']). \
    agg(pl.col(['variant_start', 'variant_end', 'ref', 'alt', 'transcript', 'tss_distance', 'veff_score']).first())

# join outrider with variants on individual
veff_outrider_ldf = (
    veff_individual_ldf
    .join(training_benchmark_ldf, how='inner', on=['individual', 'gene'])
    .select(['gene', 'tissue', 'individual', 'tss_distance', 'outlier_state', 
             'zscore', 'FDR', 'veff_score']))

In [12]:
training_benchmark_ldf.collect().shape

(99459917, 8)

In [13]:
veff_individual_ldf.collect().shape

(1169408, 11)

In [14]:
veff_transcript_ldf.collect().shape

(1271282, 15)

In [15]:
df = veff_outrider_ldf.collect()

In [16]:
df.shape

(11332884, 8)

In [17]:
df.head()

gene,tissue,individual,tss_distance,outlier_state,zscore,FDR,veff_score
str,str,str,i64,enum,f32,f64,f64
"""ENSG00000204131""","""Artery - Aorta""","""GTEX-Y3IK""",926,"""normal""",0.088653,1.0,-0.031
"""ENSG00000178927""","""Skin - Sun Exposed (Lower leg)""","""GTEX-1EU9M""",-1576,"""normal""",2.536026,1.0,-0.081665
"""ENSG00000178980""","""Minor Salivary Gland""","""GTEX-ZQUD""",191,"""normal""",2.287483,1.0,-0.01621
"""ENSG00000221988""","""Adipose - Subcutaneous""","""GTEX-14PJ2""",1262,"""normal""",-1.608454,1.0,-0.019317
"""ENSG00000131051""","""Small Intestine - Terminal Ileum""","""GTEX-WRHK""",-1350,"""normal""",-0.862461,1.0,-0.018581


In [18]:
df_bac = df

# tss_distance in bins
bin_size=50
cuts = list(range(-upstream + bin_size, downstream, bin_size))
cut_labels = [str(x) for x in [-upstream, *cuts]]
tss_distance_labels = {c: f'[{c}, {int(c) + bin_size})' for c in cut_labels}
df = (df.with_columns(tss_distance_bin=(pl.col('tss_distance').cut(cuts, labels=cut_labels))
                      .cast(pl.Enum(cut_labels)))
      .with_columns(is_underexpressed=(pl.col('outlier_state') == 'underexpressed')))

df = (df.with_columns(tss_distance_bin_label=pl.col("tss_distance_bin").replace_strict(tss_distance_labels), 
                                                      bin_size=pl.lit(50)))

In [19]:
df.head()

gene,tissue,individual,tss_distance,outlier_state,zscore,FDR,veff_score,tss_distance_bin,is_underexpressed,tss_distance_bin_label,bin_size
str,str,str,i64,enum,f32,f64,f64,enum,bool,str,i32
"""ENSG00000204131""","""Artery - Aorta""","""GTEX-Y3IK""",926,"""normal""",0.088653,1.0,-0.031,"""900""",False,"""[900, 950)""",50
"""ENSG00000178927""","""Skin - Sun Exposed (Lower leg)""","""GTEX-1EU9M""",-1576,"""normal""",2.536026,1.0,-0.081665,"""-1600""",False,"""[-1600, -1550)""",50
"""ENSG00000178980""","""Minor Salivary Gland""","""GTEX-ZQUD""",191,"""normal""",2.287483,1.0,-0.01621,"""150""",False,"""[150, 200)""",50
"""ENSG00000221988""","""Adipose - Subcutaneous""","""GTEX-14PJ2""",1262,"""normal""",-1.608454,1.0,-0.019317,"""1250""",False,"""[1250, 1300)""",50
"""ENSG00000131051""","""Small Intestine - Terminal Ileum""","""GTEX-WRHK""",-1350,"""normal""",-0.862461,1.0,-0.018581,"""-1400""",False,"""[-1400, -1350)""",50


In [20]:
# set new bins
# new_bins = [(-2000, -500),
#             *[(i, i + 100) for i in range(-500, -100, 100)],
#             (-100, -50),
#             (-50, 0),
#             (0, 50),
#             (50, 100),
#             *[(i, i + 100) for i in range(100, 500, 100)],]
new_bins = [(-2000, -1000),
            (-1000, -500),
            *[(i, i + 100) for i in range(-500, -100, 100)],
            (-100, -50),
            (-50, 0),
            (0, 50),
            (50, 100),
            *[(i, i + 100) for i in range(100, 500, 100)],
            (500, 1000),
            (1000, 2000),
           ]
new_bin_labels  = [f'[{start}, {stop})' for start, stop in new_bins]

df_bac = df
for start, stop in new_bins:
    df = df.with_columns(tss_distance_bin_label=(pl.when((pl.col('tss_distance_bin').cast(pl.Int16) >= start) & (pl.col('tss_distance_bin').cast(pl.Int16) <= stop))
                                                 .then(pl.lit(f'[{start}, {stop})'))                                  
                                                 .otherwise(pl.col('tss_distance_bin_label'))),
                         bin_size = (pl.when((pl.col('tss_distance_bin').cast(pl.Int16) >= start) & (pl.col('tss_distance_bin').cast(pl.Int16) < stop))
                                     .then(pl.lit(stop - start))
                                     .otherwise(pl.col('bin_size'))))

df = df.with_columns(tss_distance_bin_label=pl.col('tss_distance_bin_label').cast(pl.Enum(new_bin_labels)))

In [21]:
df.head()

gene,tissue,individual,tss_distance,outlier_state,zscore,FDR,veff_score,tss_distance_bin,is_underexpressed,tss_distance_bin_label,bin_size
str,str,str,i64,enum,f32,f64,f64,enum,bool,enum,i32
"""ENSG00000204131""","""Artery - Aorta""","""GTEX-Y3IK""",926,"""normal""",0.088653,1.0,-0.031,"""900""",False,"""[500, 1000)""",500
"""ENSG00000178927""","""Skin - Sun Exposed (Lower leg)""","""GTEX-1EU9M""",-1576,"""normal""",2.536026,1.0,-0.081665,"""-1600""",False,"""[-2000, -1000)""",1000
"""ENSG00000178980""","""Minor Salivary Gland""","""GTEX-ZQUD""",191,"""normal""",2.287483,1.0,-0.01621,"""150""",False,"""[100, 200)""",100
"""ENSG00000221988""","""Adipose - Subcutaneous""","""GTEX-14PJ2""",1262,"""normal""",-1.608454,1.0,-0.019317,"""1250""",False,"""[1000, 2000)""",1000
"""ENSG00000131051""","""Small Intestine - Terminal Ileum""","""GTEX-WRHK""",-1350,"""normal""",-0.862461,1.0,-0.018581,"""-1400""",False,"""[-2000, -1000)""",1000


In [22]:
# calculate counts per bins
bin_count_df = (df.group_by(['outlier_state', 'tss_distance_bin_label', 'bin_size'])
                .agg((pl.len()).alias('count')))

# # calculate mean of each bin and then sum the means in each outlier state
totals_df = (bin_count_df.group_by('outlier_state').agg(pl.sum('count').alias('total_count')))
# # normalize each count by the mean calculated above
enrichment_df = (bin_count_df.join(totals_df, on='outlier_state')
                 .with_columns((pl.col('count') / pl.col('total_count')).alias('enrichment')))
ci_low, ci_high = sm.stats.proportion_confint(enrichment_df["count"], enrichment_df["total_count"])
enrichment_df = enrichment_df.with_columns(pl.Series(ci_low).alias('ci_low'), pl.Series(ci_high).alias('ci_high'))
# normalize by bin size
enrichment_df = enrichment_df.with_columns(enrichment = pl.col('enrichment') / pl.col('bin_size'),
                                           ci_low = pl.col('ci_low') / pl.col('bin_size'),
                                           ci_high = pl.col('ci_high') / pl.col('bin_size'))

In [23]:
enrichment_df

outlier_state,tss_distance_bin_label,bin_size,count,total_count,enrichment,ci_low,ci_high
enum,enum,i32,u32,u32,f64,f64,f64
"""overexpressed""","""[400, 500)""",100,73,2599,0.000281,0.000217,0.000344
"""underexpressed""","""[-500, -400)""",100,24,2962,0.000081,0.000049,0.000113
"""overexpressed""","""[1000, 2000)""",1000,644,2599,0.000248,0.000231,0.000264
"""underexpressed""","""[50, 100)""",50,97,2962,0.000655,0.000527,0.000783
"""normal""","""[-100, -50)""",50,145962,11327323,0.000258,0.000256,0.000259
…,…,…,…,…,…,…,…
"""underexpressed""","""[-50, 0)""",50,128,2962,0.000864,0.000718,0.001011
"""overexpressed""","""[-400, -300)""",100,62,2599,0.000239,0.00018,0.000297
"""overexpressed""","""[-200, -100)""",100,76,2599,0.000292,0.000228,0.000357
"""underexpressed""","""[-400, -300)""",100,66,2962,0.000223,0.00017,0.000276


In [24]:
totals_df

outlier_state,total_count
enum,u32
"""normal""",11327323
"""overexpressed""",2599
"""underexpressed""",2962


In [25]:
base_path = Path('.')
df.write_parquet(base_path / 'promoter_veff_bin.parquet', use_pyarrow=True)
enrichment_df.write_parquet(base_path / 'promoter_enrichment.parquet', use_pyarrow=True)
base_path

PosixPath('.')