# Enformer-GTEx results

Explore Enformer predictions on GTEx variants

## Setup

In [1]:
import polars as pl
import matplotlib.pyplot as plt
from pathlib import Path
import pandas as pd
import seaborn as sns
import numpy as np
import statsmodels.api as sm
import plotnine as pn
from scipy.stats import ranksums
from datetime import datetime

%load_ext autoreload
%autoreload 2

In [2]:
plt.rcParams['figure.dpi'] = 150

In [3]:
pl.Config.set_fmt_str_lengths(100)

polars.config.Config

In [4]:
pn.theme_set(pn.theme_bw())
pn.theme_update(dpi=150)

In [5]:
# Files relevant to the GTEx analysis
enformer_run_name = 'aparent2_ensembl_deltapdui_wide_70_130'
# enformer variant effect on GTEx variants
enformer_path = f'/s/project/promoter_prediction/kipoi_expression_prediction/veff/aparent2.parquet/run={enformer_run_name}'
# GTEx variants
variant_path = '/s/project/rep/processed/training_results_v16/gtex_v8_old_dna/private_variants.parquet/rare_variants.vcf.parquet/**/*.parquet'
# abexp benchmark dataset
gtex_benchmark_with_annotation_path = "/s/project/rep/processed/training_results_v16/gtex_benchmark_with_annotation.parquet/*.parquet"

## Analysis

We are only loading ensembl canonical transcripts for this analysis. So there is 1 transcript per gene!

In [6]:
veff_transcript_ldf = (pl.scan_parquet(Path(enformer_path) / '*.parquet').
                       select(pl.col(['gene_id', 'pas_id', 'cse_pos']), 
                              pl.col('strand').cast(pl.Enum(['-', '+'])),
                              pl.col(['chrom', 'variant_start', 'variant_end', 'ref', 'alt', 'veff_score'])).
                       rename({'gene_id': 'gene', 'pas_id': 'pas'}).
                       with_columns(pl.col('gene').str.replace(r'([^\.]+)\..+$', "${1}").alias('gene')))

# It is possible that a gene comes multiple times (different versions)

In [7]:
variant_ldf = pl.scan_parquet(variant_path, hive_partitioning=True).select(['sampleId', 'chrom', 'start', 'end', 'ref', 'alt']).rename({'sampleId': 'individual',
                                                                                                                'start': 'variant_start',
                                                                                                                'end': 'variant_end'})

In [8]:
training_benchmark_ldf = (pl.scan_parquet(gtex_benchmark_with_annotation_path)
                          .select(['gene', 'individual', 'tissue', 'FDR', 
                                   'mu', 'zscore', 'l2fc'])
                          .unique()
                          .rename({'l2fc': 'l2fc_outrider'})
                          .with_columns((pl.when(pl.col('FDR') > 0.05)
                                         .then(pl.lit('normal'))
                                         .otherwise(
                                             pl.when(pl.col('zscore') > 0)
                                             .then(pl.lit('overexpressed'))
                                             .otherwise(
                                                 pl.when(pl.col('zscore') < 0)
                                                 .then(pl.lit('underexpressed'))
                                                 # this should never be the case
                                                 .otherwise(pl.lit('CHECK'))
                                             ))).cast(pl.Enum(['underexpressed', 'normal', 'overexpressed'])).alias('outlier_state')))

### What is the enformer variant-effect-score distribution around the TSS?

In [9]:
veff_transcript_ldf = (veff_transcript_ldf
                       .with_columns(cse_distance=(pl.when(pl.col('strand') == '+')
                                                   .then(pl.col('variant_start') - pl.col('cse_pos'))
                                                   .otherwise(pl.col('cse_pos') - pl.col('variant_start'))))
                       .with_columns(absolute_cse_distance=pl.col('cse_distance').abs(),                
                                     absolute_veff_score=pl.col('veff_score').abs()))

veff_transcript_ldf.select(upstream_tss=pl.col('cse_distance').min(), downstream_tss=pl.col('cse_distance').max()).collect()

upstream_tss,downstream_tss
i64,i64
-70,130


In [10]:
upstream=70
downstream=130

# filter out variants out of this range
veff_transcript_ldf = veff_transcript_ldf.filter((pl.col('cse_distance') >= -upstream) & (pl.col('cse_distance') <= downstream))

# join tss variants with individuals
veff_individual_ldf = (variant_ldf.join(veff_transcript_ldf, how='inner', on=['chrom', 'variant_start', 'variant_end', 'ref', 'alt']).
                              select([
                                  'individual', 'chrom', 'variant_start', 'variant_end', 'ref', 'alt',
                                  'gene', 'pas', 'strand', 'cse_distance', 'absolute_cse_distance', 
                                  'veff_score', 'absolute_veff_score'
                                ])
                             )

In [11]:
# keep the variant with the signed maximum absolute enformer score for each individual
veff_individual_ldf = veff_individual_ldf.sort('absolute_veff_score', descending=True). \
    group_by(['individual', 'gene', 'chrom', 'strand']). \
    agg(pl.col(['variant_start', 'variant_end', 'ref', 'alt', 'pas', 'cse_distance', 'veff_score']).first())

# join outrider with variants on individual
veff_outrider_ldf = (
    veff_individual_ldf
    .join(training_benchmark_ldf, how='inner', on=['individual', 'gene'])
    .select(['gene', 'tissue', 'individual', 'cse_distance', 'outlier_state', 
             'zscore', 'FDR', 'veff_score']))

In [26]:
df = veff_outrider_ldf.collect()

In [27]:
df.shape

(1209239, 8)

In [28]:
df.head()

gene,tissue,individual,cse_distance,outlier_state,zscore,FDR,veff_score
str,str,str,i64,enum,f32,f64,f64
"""ENSG00000162066""","""Pituitary""","""GTEX-13VXU""",59,"""normal""",-0.824748,1.0,0.0
"""ENSG00000159202""","""Brain - Cortex""","""GTEX-QMR6""",89,"""normal""",1.085511,1.0,0.002301
"""ENSG00000178397""","""Cells - EBV-transformed lymphocytes""","""GTEX-WFON""",19,"""normal""",0.306976,1.0,-0.079601
"""ENSG00000076924""","""Skin - Sun Exposed (Lower leg)""","""GTEX-XMD2""",61,"""normal""",-1.820325,1.0,-0.003106
"""ENSG00000119684""","""Brain - Frontal Cortex (BA9)""","""GTEX-12WSA""",94,"""normal""",0.029072,1.0,0.019442


In [29]:
df_bac = df

# cse_distance in bins
bin_size=20
cuts = list(range(-upstream + bin_size, downstream, bin_size))
cut_labels = [str(x) for x in [-upstream, *cuts]]
cse_distance_labels = {c: f'[{c}, {int(c) + bin_size})' for c in cut_labels}
df = (df.with_columns(cse_distance_bin=(pl.col('cse_distance').cut(cuts, labels=cut_labels))
                      .cast(pl.Enum(cut_labels)))
      .with_columns(is_underexpressed=(pl.col('outlier_state') == 'underexpressed')))

df = (df.with_columns(cse_distance_bin_label=pl.col("cse_distance_bin").replace_strict(cse_distance_labels), 
                                                      bin_size=pl.lit(50)))

In [16]:
df.head()

gene,tissue,individual,cse_distance,outlier_state,zscore,FDR,veff_score,cse_distance_bin,is_underexpressed,cse_distance_bin_label,bin_size
str,str,str,i64,enum,f32,f64,f64,enum,bool,str,i32
"""ENSG00000165661""","""Brain - Hippocampus""","""GTEX-14JIY""",119,"""normal""",0.080449,1.0,0.002283,"""110""",False,"""[110, 130)""",50
"""ENSG00000134698""","""Liver""","""GTEX-17F96""",6,"""normal""",0.134341,1.0,-0.063687,"""-10""",False,"""[-10, 10)""",50
"""ENSG00000204348""","""Skin - Sun Exposed (Lower leg)""","""GTEX-183FY""",-39,"""normal""",1.089896,1.0,0.005534,"""-50""",False,"""[-50, -30)""",50
"""ENSG00000092020""","""Adipose - Subcutaneous""","""GTEX-1FIGZ""",-68,"""normal""",-0.618512,1.0,0.000268,"""-70""",False,"""[-70, -50)""",50
"""ENSG00000082898""","""Brain - Anterior cingulate cortex (BA24)""","""GTEX-13VXU""",-44,"""normal""",-0.450936,1.0,-0.0001,"""-50""",False,"""[-50, -30)""",50


In [30]:
new_bins = [*[(i, i + 20) for i in range(-70, 130, 20)]]
new_bin_labels  = [f'[{start}, {stop})' for start, stop in new_bins]

df_bac = df
for start, stop in new_bins:
    df = df.with_columns(cse_distance_bin_label=(pl.when((pl.col('cse_distance_bin').cast(pl.Int16) >= start) & (pl.col('cse_distance_bin').cast(pl.Int16) <= stop))
                                                 .then(pl.lit(f'[{start}, {stop})'))                                  
                                                 .otherwise(pl.col('cse_distance_bin_label'))),
                         bin_size = (pl.when((pl.col('cse_distance_bin').cast(pl.Int16) >= start) & (pl.col('cse_distance_bin').cast(pl.Int16) < stop))
                                     .then(pl.lit(stop - start))
                                     .otherwise(pl.col('bin_size'))))

df = df.with_columns(cse_distance_bin_label=pl.col('cse_distance_bin_label').cast(pl.Enum(new_bin_labels)))

In [32]:
df.head()

gene,tissue,individual,cse_distance,outlier_state,zscore,FDR,veff_score,cse_distance_bin,is_underexpressed,cse_distance_bin_label,bin_size
str,str,str,i64,enum,f32,f64,f64,enum,bool,enum,i32
"""ENSG00000162066""","""Pituitary""","""GTEX-13VXU""",59,"""normal""",-0.824748,1.0,0.0,"""50""",False,"""[50, 70)""",20
"""ENSG00000159202""","""Brain - Cortex""","""GTEX-QMR6""",89,"""normal""",1.085511,1.0,0.002301,"""70""",False,"""[70, 90)""",20
"""ENSG00000178397""","""Cells - EBV-transformed lymphocytes""","""GTEX-WFON""",19,"""normal""",0.306976,1.0,-0.079601,"""10""",False,"""[10, 30)""",20
"""ENSG00000076924""","""Skin - Sun Exposed (Lower leg)""","""GTEX-XMD2""",61,"""normal""",-1.820325,1.0,-0.003106,"""50""",False,"""[50, 70)""",20
"""ENSG00000119684""","""Brain - Frontal Cortex (BA9)""","""GTEX-12WSA""",94,"""normal""",0.029072,1.0,0.019442,"""90""",False,"""[90, 110)""",20


In [33]:
# calculate counts per bins
bin_count_df = (df.group_by(['outlier_state', 'cse_distance_bin_label', 'bin_size'])
                .agg((pl.len()).alias('count')))

# # calculate mean of each bin and then sum the means in each outlier state
totals_df = (bin_count_df.group_by('outlier_state').agg(pl.sum('count').alias('total_count')))
# # normalize each count by the mean calculated above
enrichment_df = (bin_count_df.join(totals_df, on='outlier_state')
                 .with_columns((pl.col('count') / pl.col('total_count')).alias('enrichment')))
ci_low, ci_high = sm.stats.proportion_confint(enrichment_df["count"], enrichment_df["total_count"])
enrichment_df = enrichment_df.with_columns(pl.Series(ci_low).alias('ci_low'), pl.Series(ci_high).alias('ci_high'))
# normalize by bin size
enrichment_df = enrichment_df.with_columns(enrichment = pl.col('enrichment') / pl.col('bin_size'),
                                           ci_low = pl.col('ci_low') / pl.col('bin_size'),
                                           ci_high = pl.col('ci_high') / pl.col('bin_size'))

In [34]:
enrichment_df

outlier_state,cse_distance_bin_label,bin_size,count,total_count,enrichment,ci_low,ci_high
enum,enum,i32,u32,u32,f64,f64,f64
"""underexpressed""","""[30, 50)""",20,126,1042,0.006046,0.005056,0.007036
"""overexpressed""","""[-50, -30)""",20,33,369,0.004472,0.003016,0.005927
"""overexpressed""","""[30, 50)""",20,47,369,0.006369,0.004668,0.008069
"""normal""","""[-50, -30)""",20,122784,1207828,0.005083,0.005056,0.00511
"""normal""","""[-10, 10)""",20,116883,1207828,0.004839,0.004812,0.004865
…,…,…,…,…,…,…,…
"""underexpressed""","""[110, 130)""",20,81,1042,0.003887,0.003074,0.0047
"""overexpressed""","""[110, 130)""",20,38,369,0.005149,0.003599,0.0067
"""overexpressed""","""[10, 30)""",20,35,369,0.004743,0.003248,0.006237
"""normal""","""[30, 50)""",20,118820,1207828,0.004919,0.004892,0.004945


In [35]:
totals_df

outlier_state,total_count
enum,u32
"""normal""",1207828
"""underexpressed""",1042
"""overexpressed""",369


In [36]:
base_path = Path('.')
df.write_parquet(base_path / 'veff_bin.parquet', use_pyarrow=True)
enrichment_df.write_parquet(base_path / 'pas_enrichment.parquet', use_pyarrow=True)
base_path

PosixPath('.')