# GTEx rare variants analysis

## Imports

In [32]:
import polars as pl
import matplotlib.pyplot as plt
from pathlib import Path
import pandas as pd
import statsmodels.api as sm
import plotnine as pn
import pyranges as pr

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [33]:
plt.rcParams['figure.dpi'] = 150

In [34]:
pl.Config.set_fmt_str_lengths(100)

polars.config.Config

In [35]:
pn.theme_set(pn.theme_bw())
pn.theme_update(dpi=150)

In [36]:
# tss variant path
tss_variant_path = '/s/project/promoter_prediction/kipoi_expression_prediction/variant_tables/tss_variants_2000_2000.parquet'
# GTEx variants
variant_path = '/s/project/rep/processed/training_results_v16/gtex_v8_old_dna/private_variants.parquet/rare_variants.vcf.parquet/**/*.parquet'
# abexp benchmark dataset
gtex_benchmark_with_annotation_path = "/s/project/rep/processed/training_results_v16/gtex_benchmark_with_annotation.parquet/*.parquet"

## Analysis

In [68]:
tss_variants_ldf = (pl.scan_parquet(Path(tss_variant_path)).
                       select(pl.col(['gene_id', 'transcript_id', 'transcript_start', 'transcript_end']), 
                              pl.col('strand').cast(pl.Enum(['-', '+'])),
                              pl.col(['chrom', 'variant_start', 'variant_end', 'ref', 'alt',])).
                       rename({'gene_id': 'gene', 'transcript_id': 'transcript'}).
                       with_columns(pl.col('gene').str.replace(r'([^\.]+)\..+$', "${1}").alias('gene'),
                                    pl.col('transcript').str.replace(r'([^\.]+)\..+$', "${1}").alias('transcript')))


# It is possible that a gene comes multiple times (different versions)

In [69]:
variant_ldf = (pl.scan_parquet(variant_path, hive_partitioning=True)
               .select(['sampleId', 'chrom', 'start', 'end', 'ref', 'alt'])
               .rename({'sampleId': 'individual','start': 'variant_start','end': 'variant_end'}))

In [70]:
benchmark_columns = pl.scan_parquet(gtex_benchmark_with_annotation_path).columns
feature_columns = [c for c in benchmark_columns if '@' in c]
training_benchmark_ldf = (pl.scan_parquet(gtex_benchmark_with_annotation_path)
                          .select(['gene', 'individual', 'tissue', 
                                   'FDR', 'zscore', 'AbExp',
                                   *feature_columns,
                                  ])
                          .unique()
                          .with_columns(outlier_state=(pl.when(pl.col('FDR') > 0.05)
                                                       .then(pl.lit('normal'))
                                                       .otherwise(
                                                           pl.when(pl.col('zscore') > 0)
                                                           .then(pl.lit('overexpressed'))
                                                           .otherwise(
                                                               pl.when(pl.col('zscore') < 0)
                                                               .then(pl.lit('underexpressed'))
                                                               # this should never be the case
                                                               .otherwise(pl.lit('CHECK'))
                                                           ))).cast(pl.Enum(['underexpressed', 'normal', 'overexpressed'])),
                                       AbExp=-pl.col('AbExp')))



In [71]:
feature_columns

['expected_expr@theta',
 'splice_ensemble@AbSplice',
 'splice_ensemble@MMSplice_SpliceMap_Psi_ref',
 'splice_ensemble@SpliceAI',
 'vep@cadd_raw.max',
 'vep@LoF_HC.proportion',
 'vep@transcript_ablation.proportion',
 'vep@stop_gained.proportion',
 'vep@frameshift_variant.proportion',
 'vep@coding_sequence_variant.proportion',
 'vep@missense_variant.proportion',
 'vep@inframe_deletion.proportion',
 'vep@inframe_insertion.proportion',
 'vep@stop_lost.proportion',
 'vep@3_prime_UTR_variant.proportion',
 'vep@5_prime_UTR_variant.proportion',
 'vep@NMD_transcript_variant.proportion',
 'vep@NMD_escaping_variant.proportion',
 'vep@start_lost.proportion',
 'vep@splice_donor_variant.proportion',
 'vep@splice_acceptor_variant.proportion',
 'vep@splice_region_variant.proportion']

### What is the enformer variant-effect-score distribution around the TSS?

In [72]:
tss_variants_ldf = tss_variants_ldf. with_columns(tss_distance=(
    pl.when(pl.col('strand') == '+')
    .then(pl.col('variant_start') - pl.col('transcript_start'))
    .otherwise(pl.col('transcript_end') - pl.col('variant_start'))
)).with_columns(absolute_tss_distance=pl.col('tss_distance').abs())

tss_variants_ldf.select(upstream_tss=pl.col('tss_distance').min(), downstream_tss=pl.col('tss_distance').max()).collect()

upstream_tss,downstream_tss
i64,i64
-2301,2195


In [73]:
upstream=2000
downstream=2000

# filter out variants out of this range
tss_variants_ldf = tss_variants_ldf.filter((pl.col('tss_distance') >= -upstream) & (pl.col('tss_distance') <= downstream))

# join tss variants with individuals
tss_individual_variant_ldf = (variant_ldf.join(tss_variants_ldf, how='inner', on=['chrom', 'variant_start', 'variant_end', 'ref', 'alt']).
                              select([
                                  'individual', 'chrom', 'variant_start', 'variant_end', 'ref', 'alt',
                                  'gene', 'transcript', 'strand', 'tss_distance', 'absolute_tss_distance'])
                             )

In [74]:
# keep the variant closest to the TSS for each individual and gene
# tss_individual_variant_ldf = tss_individual_variant_ldf.sort('absolute_tss_distance'). \
#     group_by(['individual', 'gene', 'chrom', 'strand']). \
#     agg(pl.col(['variant_start', 'variant_end', 'ref', 'alt', 'transcript', 'tss_distance']).first())

tss_individual_variant_ldf = tss_individual_variant_ldf. \
    group_by(['individual', 'gene', 'chrom', 'strand', 'ref', 'alt', 'variant_start', 'variant_end']). \
    agg(pl.col(['transcript', 'tss_distance']).first())

# join outrider with variants on individual
tss_individual_variant_outrider_ldf = (
    tss_individual_variant_ldf
    .join(training_benchmark_ldf, how='inner', on=['individual', 'gene'])
    .select(['gene', 'tissue', 'individual', 'tss_distance', 'outlier_state', 
             'zscore', 'FDR', 'AbExp', *feature_columns]))

In [75]:
training_benchmark_ldf.collect().shape

(99459917, 29)

In [76]:
tss_individual_variant_ldf.collect().shape

(1506893, 10)

In [77]:
tss_variants_ldf.head().collect()

gene,transcript,transcript_start,transcript_end,strand,chrom,variant_start,variant_end,ref,alt,tss_distance,absolute_tss_distance
str,str,i64,i64,enum,str,i64,i64,str,str,i64,i64
"""ENSG00000184481""","""ENST00000374259""",70315700,70323382,"""+""","""chrX""",70315840,70315841,"""C""","""G""",140,140
"""ENSG00000184481""","""ENST00000466874""",70316102,70320622,"""+""","""chrX""",70315840,70315841,"""C""","""G""",-262,262
"""ENSG00000184481""","""ENST00000341558""",70316378,70321934,"""+""","""chrX""",70315840,70315841,"""C""","""G""",-538,538
"""ENSG00000184481""","""ENST00000374259""",70315700,70323382,"""+""","""chrX""",70315994,70315995,"""A""","""G""",294,294
"""ENSG00000184481""","""ENST00000466874""",70316102,70320622,"""+""","""chrX""",70315994,70315995,"""A""","""G""",-108,108


In [78]:
tss_variants_ldf.collect().shape

(4081025, 12)

In [79]:
# pl.Config.set_streaming_chunk_size(100)
# print(veff_outrider_ldf.explain(streaming=True))

In [104]:
df = tss_individual_variant_outrider_ldf.collect()

In [105]:
df.shape

(14903908, 30)

In [106]:
df.head()

gene,tissue,individual,tss_distance,outlier_state,zscore,FDR,AbExp,expected_expr@theta,splice_ensemble@AbSplice,splice_ensemble@MMSplice_SpliceMap_Psi_ref,splice_ensemble@SpliceAI,vep@cadd_raw.max,vep@LoF_HC.proportion,vep@transcript_ablation.proportion,vep@stop_gained.proportion,vep@frameshift_variant.proportion,vep@coding_sequence_variant.proportion,vep@missense_variant.proportion,vep@inframe_deletion.proportion,vep@inframe_insertion.proportion,vep@stop_lost.proportion,vep@3_prime_UTR_variant.proportion,vep@5_prime_UTR_variant.proportion,vep@NMD_transcript_variant.proportion,vep@NMD_escaping_variant.proportion,vep@start_lost.proportion,vep@splice_donor_variant.proportion,vep@splice_acceptor_variant.proportion,vep@splice_region_variant.proportion
str,str,str,i64,enum,f32,f64,f64,f32,f64,f64,f64,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32
"""ENSG00000110090""","""Colon - Transverse""","""GTEX-WFG7""",-41,"""normal""",0.316841,1.0,0.00066,35.676929,0.000178,0.000559,0.0,-0.283923,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""ENSG00000154845""","""Testis""","""GTEX-ZLFU""",-1048,"""normal""",-0.57041,1.0,0.000334,401.348145,,,,1.005231,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.068593,0.0,0.0,0.0,0.0,0.0
"""ENSG00000154845""","""Testis""","""GTEX-ZLFU""",-1028,"""normal""",-0.57041,1.0,0.000334,401.348145,,,,1.005231,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.068593,0.0,0.0,0.0,0.0,0.0
"""ENSG00000154845""","""Testis""","""GTEX-ZLFU""",-1157,"""normal""",-0.57041,1.0,0.000334,401.348145,,,,1.005231,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.068593,0.0,0.0,0.0,0.0,0.0
"""ENSG00000154845""","""Testis""","""GTEX-ZLFU""",-449,"""normal""",-0.57041,1.0,0.000334,401.348145,,,,1.005231,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.068593,0.0,0.0,0.0,0.0,0.0


In [107]:
df_bac = df

# tss_distance in bins
bin_size=50
cuts = list(range(-upstream + bin_size, downstream, bin_size))
cut_labels = [str(x) for x in [-upstream, *cuts]]
tss_distance_labels = {c: f'[{c}, {int(c) + bin_size})' for c in cut_labels}
df = (df.with_columns(tss_distance_bin=(pl.col('tss_distance').cut(cuts, labels=cut_labels))
                      .cast(pl.Enum(cut_labels)))
      .with_columns(is_underexpressed=(pl.col('outlier_state') == 'underexpressed')))

df = (df.with_columns(tss_distance_bin_label=pl.col("tss_distance_bin").replace_strict(tss_distance_labels), 
                                                      bin_size=pl.lit(50)))

In [108]:
df.head()

gene,tissue,individual,tss_distance,outlier_state,zscore,FDR,AbExp,expected_expr@theta,splice_ensemble@AbSplice,splice_ensemble@MMSplice_SpliceMap_Psi_ref,splice_ensemble@SpliceAI,vep@cadd_raw.max,vep@LoF_HC.proportion,vep@transcript_ablation.proportion,vep@stop_gained.proportion,vep@frameshift_variant.proportion,vep@coding_sequence_variant.proportion,vep@missense_variant.proportion,vep@inframe_deletion.proportion,vep@inframe_insertion.proportion,vep@stop_lost.proportion,vep@3_prime_UTR_variant.proportion,vep@5_prime_UTR_variant.proportion,vep@NMD_transcript_variant.proportion,vep@NMD_escaping_variant.proportion,vep@start_lost.proportion,vep@splice_donor_variant.proportion,vep@splice_acceptor_variant.proportion,vep@splice_region_variant.proportion,tss_distance_bin,is_underexpressed,tss_distance_bin_label,bin_size
str,str,str,i64,enum,f32,f64,f64,f32,f64,f64,f64,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,enum,bool,str,i32
"""ENSG00000110090""","""Colon - Transverse""","""GTEX-WFG7""",-41,"""normal""",0.316841,1.0,0.00066,35.676929,0.000178,0.000559,0.0,-0.283923,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"""-50""",False,"""[-50, 0)""",50
"""ENSG00000154845""","""Testis""","""GTEX-ZLFU""",-1048,"""normal""",-0.57041,1.0,0.000334,401.348145,,,,1.005231,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.068593,0.0,0.0,0.0,0.0,0.0,"""-1050""",False,"""[-1050, -1000)""",50
"""ENSG00000154845""","""Testis""","""GTEX-ZLFU""",-1028,"""normal""",-0.57041,1.0,0.000334,401.348145,,,,1.005231,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.068593,0.0,0.0,0.0,0.0,0.0,"""-1050""",False,"""[-1050, -1000)""",50
"""ENSG00000154845""","""Testis""","""GTEX-ZLFU""",-1157,"""normal""",-0.57041,1.0,0.000334,401.348145,,,,1.005231,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.068593,0.0,0.0,0.0,0.0,0.0,"""-1200""",False,"""[-1200, -1150)""",50
"""ENSG00000154845""","""Testis""","""GTEX-ZLFU""",-449,"""normal""",-0.57041,1.0,0.000334,401.348145,,,,1.005231,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.068593,0.0,0.0,0.0,0.0,0.0,"""-450""",False,"""[-450, -400)""",50


In [109]:
# set new bins
# new_bins = [(-2000, -500),
#             *[(i, i + 100) for i in range(-500, -100, 100)],
#             (-100, -50),
#             (-50, 0),
#             (0, 50),
#             (50, 100),
#             *[(i, i + 100) for i in range(100, 500, 100)],]
# new_bins = [(-2000, -1000),
#             (-1000, -500),
#             *[(i, i + 100) for i in range(-500, -100, 100)],
#             (-100, 0),
#             (0, 100),
#             *[(i, i + 100) for i in range(100, 500, 100)],
#             (500, 1000),
#             (1000, 2000)]
new_bins = [(i, i + 250) for i in range(-2000, 2000, 250)]
new_bin_labels  = [f'[{start}, {stop})' for start, stop in new_bins]

df_bac = df
for start, stop in new_bins:
    df = df.with_columns(tss_distance_bin_label=(pl.when((pl.col('tss_distance_bin').cast(pl.Int16) >= start) & (pl.col('tss_distance_bin').cast(pl.Int16) <= stop))
                                                 .then(pl.lit(f'[{start}, {stop})'))                                  
                                                 .otherwise(pl.col('tss_distance_bin_label'))),
                         bin_size = (pl.when((pl.col('tss_distance_bin').cast(pl.Int16) >= start) & (pl.col('tss_distance_bin').cast(pl.Int16) < stop))
                                     .then(pl.lit(stop - start))
                                     .otherwise(pl.col('bin_size'))))

df = df.with_columns(tss_distance_bin_label=pl.col('tss_distance_bin_label').cast(pl.Enum(new_bin_labels)))

In [110]:
df.head()

gene,tissue,individual,tss_distance,outlier_state,zscore,FDR,AbExp,expected_expr@theta,splice_ensemble@AbSplice,splice_ensemble@MMSplice_SpliceMap_Psi_ref,splice_ensemble@SpliceAI,vep@cadd_raw.max,vep@LoF_HC.proportion,vep@transcript_ablation.proportion,vep@stop_gained.proportion,vep@frameshift_variant.proportion,vep@coding_sequence_variant.proportion,vep@missense_variant.proportion,vep@inframe_deletion.proportion,vep@inframe_insertion.proportion,vep@stop_lost.proportion,vep@3_prime_UTR_variant.proportion,vep@5_prime_UTR_variant.proportion,vep@NMD_transcript_variant.proportion,vep@NMD_escaping_variant.proportion,vep@start_lost.proportion,vep@splice_donor_variant.proportion,vep@splice_acceptor_variant.proportion,vep@splice_region_variant.proportion,tss_distance_bin,is_underexpressed,tss_distance_bin_label,bin_size
str,str,str,i64,enum,f32,f64,f64,f32,f64,f64,f64,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,enum,bool,enum,i32
"""ENSG00000110090""","""Colon - Transverse""","""GTEX-WFG7""",-41,"""normal""",0.316841,1.0,0.00066,35.676929,0.000178,0.000559,0.0,-0.283923,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"""-50""",False,"""[-250, 0)""",250
"""ENSG00000154845""","""Testis""","""GTEX-ZLFU""",-1048,"""normal""",-0.57041,1.0,0.000334,401.348145,,,,1.005231,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.068593,0.0,0.0,0.0,0.0,0.0,"""-1050""",False,"""[-1250, -1000)""",250
"""ENSG00000154845""","""Testis""","""GTEX-ZLFU""",-1028,"""normal""",-0.57041,1.0,0.000334,401.348145,,,,1.005231,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.068593,0.0,0.0,0.0,0.0,0.0,"""-1050""",False,"""[-1250, -1000)""",250
"""ENSG00000154845""","""Testis""","""GTEX-ZLFU""",-1157,"""normal""",-0.57041,1.0,0.000334,401.348145,,,,1.005231,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.068593,0.0,0.0,0.0,0.0,0.0,"""-1200""",False,"""[-1250, -1000)""",250
"""ENSG00000154845""","""Testis""","""GTEX-ZLFU""",-449,"""normal""",-0.57041,1.0,0.000334,401.348145,,,,1.005231,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.068593,0.0,0.0,0.0,0.0,0.0,"""-450""",False,"""[-500, -250)""",250


In [112]:
# calculate counts per bins
bin_count_df = (df.group_by(['outlier_state', 'tss_distance_bin_label', 'bin_size'])
                .agg((pl.len()).alias('count')))

# # calculate mean of each bin and then sum the means in each outlier state
totals_df = (bin_count_df.group_by('outlier_state').agg(pl.sum('count').alias('total_count')))
# # normalize each count by the mean calculated above
enrichment_df = (bin_count_df.join(totals_df, on='outlier_state')
                 .with_columns((pl.col('count') / pl.col('total_count')).alias('enrichment')))
ci_low, ci_high = sm.stats.proportion_confint(enrichment_df["count"], enrichment_df["total_count"])
enrichment_df = enrichment_df.with_columns(pl.Series(ci_low).alias('ci_low'), pl.Series(ci_high).alias('ci_high'))
# normalize by bin size
enrichment_df = enrichment_df.with_columns(enrichment = pl.col('enrichment') / pl.col('bin_size'),
                                           ci_low = pl.col('ci_low') / pl.col('bin_size'),
                                           ci_high = pl.col('ci_high') / pl.col('bin_size'))

In [113]:
enrichment_df

outlier_state,tss_distance_bin_label,bin_size,count,total_count,enrichment,ci_low,ci_high
enum,enum,i32,u32,u32,f64,f64,f64
"""underexpressed""","""[-2000, -1750)""",250,260,5154,0.000202,0.000178,0.000226
"""overexpressed""","""[1250, 1500)""",250,219,3434,0.000255,0.000222,0.000288
"""overexpressed""","""[-250, 0)""",250,285,3434,0.000332,0.000295,0.000369
"""underexpressed""","""[750, 1000)""",250,289,5154,0.000224,0.000199,0.000249
"""normal""","""[1500, 1750)""",250,972201,14895320,0.000261,0.000261,0.000262
…,…,…,…,…,…,…,…
"""underexpressed""","""[-1000, -750)""",250,192,5154,0.000149,0.000128,0.00017
"""overexpressed""","""[1500, 1750)""",250,213,3434,0.000248,0.000216,0.00028
"""overexpressed""","""[1750, 2000)""",250,227,3434,0.000264,0.000231,0.000298
"""normal""","""[250, 500)""",250,882047,14895320,0.000237,0.000236,0.000237


In [114]:
totals_df

outlier_state,total_count
enum,u32
"""overexpressed""",3434
"""normal""",14895320
"""underexpressed""",5154


In [115]:
base_path = Path('.')
df.write_parquet(base_path / 'promoter_bin_abexp.parquet', use_pyarrow=True)
enrichment_df.write_parquet(base_path / 'promoter_enrichment.parquet', use_pyarrow=True)
base_path

PosixPath('.')