# Variant-effect prediction comparison

## Setup

In [1]:
if (! exists("snakemake")) {
    library(methods)
    # S4 stand-in for the `snakemake` object, defined only when the notebook is
    # run without one already present (see the enclosing `exists()` check).
    # Each slot mirrors one field of the object built right below.
    Snakemake <- methods::setClass(
        Class = "Snakemake",
        slots = list(
            # file lists and rule parameters
            input = "list",
            output = "list",
            params = "list",
            wildcards = "list",
            # execution settings
            threads = "numeric",
            log = "list",
            resources = "list",
            config = "list",
            # rule metadata
            rule = "character",
            bench_iteration = "numeric",
            scriptdir = "character",
            # helper closure used to source files relative to `scriptdir`
            source = "function"
        )
    )
    snakemake <- Snakemake(
        input = list('output/smk/evaluation/prc.parquet/run=enformer_veff1/data.parquet', 'output/smk/evaluation/prc.parquet/run=enformer_veff2/data.parquet', 'output/smk/evaluation/prc.parquet/run=enformer_veff3/data.parquet', 'output/smk/evaluation/prc_tissue.parquet/run=enformer_veff1/data.parquet', 'output/smk/evaluation/prc_tissue.parquet/run=enformer_veff2/data.parquet', 'output/smk/evaluation/prc_tissue.parquet/run=enformer_veff3/data.parquet', 'output/smk/evaluation/prc_tissue_type.parquet/run=enformer_veff1/data.parquet', 'output/smk/evaluation/prc_tissue_type.parquet/run=enformer_veff2/data.parquet', 'output/smk/evaluation/prc_tissue_type.parquet/run=enformer_veff3/data.parquet', 'output/smk/evaluation/prc_fold.parquet/run=enformer_veff1/data.parquet', 'output/smk/evaluation/prc_fold.parquet/run=enformer_veff2/data.parquet', 'output/smk/evaluation/prc_fold.parquet/run=enformer_veff3/data.parquet', 'output/smk/evaluation/r2.parquet/run=enformer_veff1/data.parquet', 'output/smk/evaluation/r2.parquet/run=enformer_veff2/data.parquet', 'output/smk/evaluation/r2.parquet/run=enformer_veff3/data.parquet', 'output/smk/evaluation/r2_tissue.parquet/run=enformer_veff1/data.parquet', 'output/smk/evaluation/r2_tissue.parquet/run=enformer_veff2/data.parquet', 'output/smk/evaluation/r2_tissue.parquet/run=enformer_veff3/data.parquet', 'output/smk/evaluation/r2_tissue_type.parquet/run=enformer_veff1/data.parquet', 'output/smk/evaluation/r2_tissue_type.parquet/run=enformer_veff2/data.parquet', 'output/smk/evaluation/r2_tissue_type.parquet/run=enformer_veff3/data.parquet', 'output/smk/evaluation/r2_fold.parquet/run=enformer_veff1/data.parquet', 'output/smk/evaluation/r2_fold.parquet/run=enformer_veff2/data.parquet', 'output/smk/evaluation/r2_fold.parquet/run=enformer_veff3/data.parquet'),
        output = list(),
        params = list(),
        wildcards = list('dev1', "comparison_id" = 'dev1'),
        threads = 1,
        log = list('output/smk/comparison/notebooks/dev1.r.ipynb', "notebook" = 'output/smk/comparison/notebooks/dev1.r.ipynb'),
        resources = list('tmpdir', 'mem_mb', 'mem_mib', "tmpdir" = '/tmp', "mem_mb" = 121000, "mem_mib" = 115395),
        config = list("output_path" = 'output/smk', "debug" = TRUE, "test" = list("dataloader_size" = 100, "is_random_enformer" = TRUE, "precomputed_enformer_mapper_path" = 'example_files/gtex_enformer_lm_models_pseudocount1.pkl'), "benchmark" = list("genotypes_path" = 'example_files/gtex_samples/rare_variants.vcf.parquet', "annotation_path" = 'example_files/gtex_samples/benchmark_with_annotation.parquet', "folds_path" = 'example_files/gtex_samples/folds.parquet', "fdr_cutoff" = 0.2), "runs" = list("enformer_veff1" = list("predictor" = 'enformer', "alternative" = 'alt1', "mapper" = 'elasticnet_cage', "isoform_file" = 'example_files/isoform_proportions.tsv', "aggregation_mode" = 'logsumexp', "upstream_tss" = 2000, "downstream_tss" = 500), "enformer_veff2" = list("predictor" = 'enformer', "alternative" = 'alt1', "mapper" = 'elasticnet_cage', "isoform_file" = NULL, "aggregation_mode" = 'canonical', "upstream_tss" = 2000, "downstream_tss" = 500), "enformer_veff3" = list("predictor" = 'enformer', "alternative" = 'alt1', "mapper" = 'lightgbm_cage', "isoform_file" = 'example_files/isoform_proportions.tsv', "aggregation_mode" = 'logsumexp', "upstream_tss" = 2000, "downstream_tss" = 500), "enformer_veff4" = list("predictor" = 'enformer', "alternative" = 'alt1', "mapper" = 'lightgbm_cage', "isoform_file" = NULL, "aggregation_mode" = 'canonical', "upstream_tss" = 2000, "downstream_tss" = 500), "enformer_veff5" = list("predictor" = 'enformer', "alternative" = 'alt1', "mapper" = 'lightgbm_cage', "isoform_file" = 'example_files/isoform_proportions.tsv', "aggregation_mode" = 'logsumexp', "upstream_tss" = 100, "downstream_tss" = 500), "enformer_veff6" = list("predictor" = 'enformer', "alternative" = 'alt1', "mapper" = 'lightgbm_cage', "isoform_file" = NULL, "aggregation_mode" = 'canonical', "upstream_tss" = 100, "downstream_tss" = 500), "enformer_veff7" = list("predictor" = 'enformer', "alternative" = 'alt1', "mapper" = 'lightgbm_cage', "isoform_file" = NULL, "aggregation_mode" = 
'canonical', "upstream_tss" = NULL, "downstream_tss" = NULL)), "comparisons" = list("dev1" = c('enformer_veff1', 'enformer_veff2', 'enformer_veff3')), "genomes" = list("chr21_22" = list("gtf_file" = 'example_files/annot.gtf.gz', "fasta_file" = 'example_files/seq.fa', "canonical_only" = FALSE, "protein_coding_only" = TRUE, "chromosomes" = c('chr21', 'chr22'))), "vcfs" = list("chr21_22_var" = list("path" = 'example_files/vcf', "variant_upstream_tss" = 2000, "variant_downstream_tss" = 500)), "enformer" = list("models" = list("short" = list("shift" = 43, "batch_size" = 2, "num_output_bins" = 21), "complete" = list("shift" = 43, "batch_size" = 2, "num_output_bins" = 896)), "references" = list("ref1" = list("genome" = 'chr21_22', "model" = 'short')), "alternatives" = list("alt1" = list("reference" = 'ref1', "vcf" = 'chr21_22_var')), "mappers" = list("elasticnet_cage" = list("num_agg_bins" = 3, "expression_path" = 'example_files/gtex_samples/transcripts_tpms.zarr', "tracks_path" = 'assets/enformer_tracks/cage_nonuniversal_enformer_tracks.yaml', "type" = 'ElasticNet', "params" = list("cv" = 2, "max_iter" = 1000)), "lightgbm_cage" = list("num_agg_bins" = 3, "expression_path" = 'example_files/gtex_samples/transcripts_tpms.zarr', "tracks_path" = 'assets/enformer_tracks/cage_nonuniversal_enformer_tracks.yaml', "type" = 'LightGBM')))),
        rule = 'comparison',
        bench_iteration = as.numeric(NA),
        scriptdir = '/home/george/Projects/kipoi_enformer/workflow/notebooks',
        # Source a file with the working directory temporarily switched to
        # `snakemake@scriptdir`, so relative paths resolve against the script
        # location. All arguments are forwarded unchanged to source().
        #
        # The previous working directory is restored through on.exit(), so it
        # is reset even when the sourced file raises an error (the earlier
        # version only restored it on success). The value of source() is
        # returned.
        source = function(...){
            # setwd() returns the directory that was current before the change
            wd <- setwd(snakemake@scriptdir)
            on.exit(setwd(wd), add = TRUE)
            source(...)
        }
    )
    setwd('/home/george/Projects/kipoi_enformer');
}

In [2]:
library('data.table')
library('arrow')
library('dplyr')
library('rstatix')
library('ggplot2')
library('cowplot')

“package ‘data.table’ was built under R version 4.3.3”
“package ‘arrow’ was built under R version 4.3.3”

Attaching package: ‘arrow’


The following object is masked from ‘package:utils’:

    timestamp


“package ‘dplyr’ was built under R version 4.3.2”

Attaching package: ‘dplyr’


The following objects are masked from ‘package:data.table’:

    between, first, last


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union



Attaching package: ‘rstatix’


The following object is masked from ‘package:stats’:

    filter


“package ‘ggplot2’ was built under R version 4.3.3”
“package ‘cowplot’ was built under R version 4.3.2”


In [3]:
# Shared plot styling constants used by the figures in this notebook.
LINESIZE <- 1   # line width for curve geoms
FONTSIZE <- 12  # base font size passed to the cowplot theme

# Base theme: cowplot defaults in Helvetica, with bold plot tags.
THEME <- theme_cowplot(font_size = FONTSIZE, font_family = "Helvetica") +
    theme(plot.tag = element_text(face = "bold"))

In [5]:
# Load every evaluation dataset into `eval_list`, keyed by its file name
# ("prc.parquet", "prc_tissue.parquet", ..., "r2_fold.parquet").
# Each dataset is opened with "run" as the partition field and materialised
# as a data.table.
# NOTE(review): this opens each dataset directory under `evaluation_path`
# rather than the files listed in `snakemake@input`, so it presumably picks up
# every `run=` partition present on disk -- confirm that is intended.
evaluation_path <- file.path(snakemake@config[["output_path"]], "evaluation")
eval_list <- list()
for (metric in c("prc", "r2")) {
    for (feature in c("", "_tissue", "_tissue_type", "_fold")) {
        fname <- paste0(metric, feature, ".parquet")
        eval_list[[fname]] <- as.data.table(
            open_dataset(
                file.path(evaluation_path, fname),
                partitioning = c("run")
            )
        )
    }
}

## Analysis

In [6]:
# Category labels ("types") that the planned per-type plots iterate over.
types <- c("total", "obvious", "not_obvious")

In [7]:
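# # PRC curve faceted by type and colored by run
# # NOTE: disabled draft. It references `prc_df`, `nonbinary_model_colors`,
# # `geom_text_repel` and `new_scale`, none of which are defined or loaded in
# # the cells above; they must be provided before this can be re-enabled.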
# prc_plot = (
#     ggplot(prc_df[is_binary == FALSE,], aes(x=`recall`, y=`precision`))
#     + geom_step(aes(color=reorder(`model`, `idx`)), linewidth=LINESIZE, direction="hv")
#     + scale_color_manual(name = element_blank(), values=nonbinary_model_colors)
#     # + new_scale("color")
#     + geom_point(
#         data=prc_df[is_binary == TRUE, .(`precision`, `recall`, `baseline model type` = `model`)],
#         aes(x=`recall`, y=`precision`),
#         # shape="x",
#         size=2
#         # size = 10
#     )
#     + geom_text_repel(
#         data=prc_df[`is_binary` == TRUE,],
#         aes(label=`model`),
#         nudge_x = 0.02,
#         nudge_y = 0.02,
#         min.segment.length = 0,
#         box.padding = 0.5
#         # nudge_x = 0.1,
#     )
#     + THEME
#     + background_grid(major = "xy", minor = "xy")
#     + scale_x_continuous(
#         limits = c(0, 0.35),
#         breaks = seq(0, 1, 0.1),
#         minor_breaks = seq(0, 1, 0.2)
#     )
#     + scale_y_continuous(
#         limits = c(-0.0, 0.3),
#         breaks = seq(0, 1, 0.05),
#         minor_breaks = seq(0, 1, 0.2)
#         # breaks = seq(0, 1, 0.05),
#         # minor_breaks = seq(0, 1, 0.025)
#     )
#     + scale_shape_manual(name=element_blank(), values=c(4, 4))
#     + guides(color=guide_legend(order = 1), shape=guide_legend(order = 2))
#     + theme(
#         #legend.position="right",
#         legend.position = c(0.95, 0.95),
#         legend.justification = c("right", "top")
#         # axis.text.x=element_text(angle=45, hjust=1),
#     )
#     + labs(
#         x='Recall',
#         y='Precision',
#         title="All tissues"
#     )
# )

In [None]:
# AUPRC Box plots across tissues with p-values
# only test best model against all others
# for each type

In [None]:
# AUPRC Box plots across tissue types with p-values
# only test best model against all others
# for each type

In [5]:
# AUPRC Box plots across folds with p-values
# only test best model against all others
# for each type

In [None]:
# R2 Box plots across tissues with p-values
# only test best model against all others
# for each type

In [None]:
# R2 Box plots across tissue types with p-values
# only test best model against all others
# for each type

In [5]:
# R2 Box plots across folds with p-values
# only test best model against all others
# for each type