# Variant-effect prediction comparison

## Setup

In [2]:
if (! exists("snakemake")) {
    library(methods)
    Snakemake <- setClass(
        "Snakemake",
        slots = c(
            input = "list",
            output = "list",
            params = "list",
            wildcards = "list",
            threads = "numeric",
            log = "list",
            resources = "list",
            config = "list",
            rule = "character",
            bench_iteration = "numeric",
            scriptdir = "character",
            source = "function"
        )
    )
    snakemake <- Snakemake(
        input = list('output/smk/evaluation/prc.parquet/run=enformer_veff1/data.parquet', 'output/smk/evaluation/prc.parquet/run=enformer_veff2/data.parquet', 'output/smk/evaluation/prc.parquet/run=enformer_veff3/data.parquet', 'output/smk/evaluation/prc_tissue.parquet/run=enformer_veff1/data.parquet', 'output/smk/evaluation/prc_tissue.parquet/run=enformer_veff2/data.parquet', 'output/smk/evaluation/prc_tissue.parquet/run=enformer_veff3/data.parquet', 'output/smk/evaluation/prc_tissue_type.parquet/run=enformer_veff1/data.parquet', 'output/smk/evaluation/prc_tissue_type.parquet/run=enformer_veff2/data.parquet', 'output/smk/evaluation/prc_tissue_type.parquet/run=enformer_veff3/data.parquet', 'output/smk/evaluation/prc_fold.parquet/run=enformer_veff1/data.parquet', 'output/smk/evaluation/prc_fold.parquet/run=enformer_veff2/data.parquet', 'output/smk/evaluation/prc_fold.parquet/run=enformer_veff3/data.parquet', 'output/smk/evaluation/r2.parquet/run=enformer_veff1/data.parquet', 'output/smk/evaluation/r2.parquet/run=enformer_veff2/data.parquet', 'output/smk/evaluation/r2.parquet/run=enformer_veff3/data.parquet', 'output/smk/evaluation/r2_tissue.parquet/run=enformer_veff1/data.parquet', 'output/smk/evaluation/r2_tissue.parquet/run=enformer_veff2/data.parquet', 'output/smk/evaluation/r2_tissue.parquet/run=enformer_veff3/data.parquet', 'output/smk/evaluation/r2_tissue_type.parquet/run=enformer_veff1/data.parquet', 'output/smk/evaluation/r2_tissue_type.parquet/run=enformer_veff2/data.parquet', 'output/smk/evaluation/r2_tissue_type.parquet/run=enformer_veff3/data.parquet', 'output/smk/evaluation/r2_fold.parquet/run=enformer_veff1/data.parquet', 'output/smk/evaluation/r2_fold.parquet/run=enformer_veff2/data.parquet', 'output/smk/evaluation/r2_fold.parquet/run=enformer_veff3/data.parquet'),
        output = list(),
        params = list(),
        wildcards = list('dev1', "comparison_id" = 'dev1'),
        threads = 1,
        log = list('output/smk/comparison/notebooks/dev1.r.ipynb', "notebook" = 'output/smk/comparison/notebooks/dev1.r.ipynb'),
        resources = list('tmpdir', 'mem_mb', 'mem_mib', "tmpdir" = '/tmp', "mem_mb" = 121000, "mem_mib" = 115395),
        config = list("output_path" = 'output/smk', "debug" = TRUE, "test" = list("dataloader_size" = 100, "is_random_enformer" = TRUE, "precomputed_enformer_mapper_path" = 'example_files/gtex_enformer_lm_models_pseudocount1.pkl'), "benchmark" = list("genotypes_path" = 'example_files/gtex_samples/rare_variants.vcf.parquet', "annotation_path" = 'example_files/gtex_samples/benchmark_with_annotation.parquet', "folds_path" = 'example_files/gtex_samples/folds.parquet', "fdr_cutoff" = 0.2), "runs" = list("enformer_veff1" = list("predictor" = 'enformer', "alternative" = 'alt1', "mapper" = 'elasticnet_cage', "isoform_file" = 'example_files/isoform_proportions.tsv', "aggregation_mode" = 'logsumexp', "upstream_tss" = 2000, "downstream_tss" = 500), "enformer_veff2" = list("predictor" = 'enformer', "alternative" = 'alt1', "mapper" = 'elasticnet_cage', "isoform_file" = NULL, "aggregation_mode" = 'canonical', "upstream_tss" = 2000, "downstream_tss" = 500), "enformer_veff3" = list("predictor" = 'enformer', "alternative" = 'alt1', "mapper" = 'lightgbm_cage', "isoform_file" = 'example_files/isoform_proportions.tsv', "aggregation_mode" = 'logsumexp', "upstream_tss" = 2000, "downstream_tss" = 500), "enformer_veff4" = list("predictor" = 'enformer', "alternative" = 'alt1', "mapper" = 'lightgbm_cage', "isoform_file" = NULL, "aggregation_mode" = 'canonical', "upstream_tss" = 2000, "downstream_tss" = 500), "enformer_veff5" = list("predictor" = 'enformer', "alternative" = 'alt1', "mapper" = 'lightgbm_cage', "isoform_file" = 'example_files/isoform_proportions.tsv', "aggregation_mode" = 'logsumexp', "upstream_tss" = 100, "downstream_tss" = 500), "enformer_veff6" = list("predictor" = 'enformer', "alternative" = 'alt1', "mapper" = 'lightgbm_cage', "isoform_file" = NULL, "aggregation_mode" = 'canonical', "upstream_tss" = 100, "downstream_tss" = 500), "enformer_veff7" = list("predictor" = 'enformer', "alternative" = 'alt1', "mapper" = 'lightgbm_cage', "isoform_file" = NULL, "aggregation_mode" = 'canonical', "upstream_tss" = NULL, "downstream_tss" = NULL)), "comparisons" = list("dev1" = c('enformer_veff1', 'enformer_veff2', 'enformer_veff3')), "genomes" = list("chr21_22" = list("gtf_file" = 'example_files/annot.gtf.gz', "fasta_file" = 'example_files/seq.fa', "canonical_only" = FALSE, "protein_coding_only" = TRUE, "chromosomes" = c('chr21', 'chr22'))), "vcfs" = list("chr21_22_var" = list("path" = 'example_files/vcf', "variant_upstream_tss" = 2000, "variant_downstream_tss" = 500)), "enformer" = list("models" = list("short" = list("shift" = 43, "batch_size" = 2, "num_output_bins" = 21), "complete" = list("shift" = 43, "batch_size" = 2, "num_output_bins" = 896)), "references" = list("ref1" = list("genome" = 'chr21_22', "model" = 'short')), "alternatives" = list("alt1" = list("reference" = 'ref1', "vcf" = 'chr21_22_var')), "mappers" = list("elasticnet_cage" = list("num_agg_bins" = 3, "expression_path" = 'example_files/gtex_samples/transcripts_tpms.zarr', "tracks_path" = 'assets/enformer_tracks/cage_nonuniversal_enformer_tracks.yaml', "type" = 'ElasticNet', "params" = list("cv" = 2, "max_iter" = 1000)), "lightgbm_cage" = list("num_agg_bins" = 3, "expression_path" = 'example_files/gtex_samples/transcripts_tpms.zarr', "tracks_path" = 'assets/enformer_tracks/cage_nonuniversal_enformer_tracks.yaml', "type" = 'LightGBM')))),
        rule = 'comparison',
        bench_iteration = as.numeric(NA),
        scriptdir = '/home/george/Projects/kipoi_enformer/workflow/notebooks',
        source = function(...){
            wd <- getwd()
            setwd(snakemake@scriptdir)
            source(...)
            setwd(wd)
        }
    )
    setwd('/home/george/Projects/kipoi_enformer');
}

In [23]:
library('data.table')
library('arrow')
library('dplyr')
library('rstatix')


Attaching package: ‘rstatix’


The following object is masked from ‘package:stats’:

    filter




In [4]:
snakemake@input

In [14]:
prc_df = as.data.table(open_dataset("output/smk/evaluation/prc.parquet", partitioning = c("run")))
prc_tissue_df = as.data.table(open_dataset("output/smk/evaluation/prc_tissue.parquet", partitioning = c("run")))
prc_tissue_type_df = as.data.table(open_dataset("output/smk/evaluation/prc_tissue_type.parquet", partitioning = c("run")))
prc_fold_df = as.data.table(open_dataset("output/smk/evaluation/prc_fold.parquet", partitioning = c("run")))
r2_df = as.data.table(open_dataset("output/smk/evaluation/r2.parquet", partitioning = c("run")))
r2_tissue_df = as.data.table(open_dataset("output/smk/evaluation/r2_tissue.parquet", partitioning = c("run")))
r2_tissue_type_df = as.data.table(open_dataset("output/smk/evaluation/r2_tissue_type.parquet", partitioning = c("run")))
r2_fold_df = as.data.table(open_dataset("output/smk/evaluation/r2_fold.parquet", partitioning = c("run")))

## Analysis

In [27]:
types = c('total', 'obvious', 'not_obvious')

In [18]:
head(prc_tissue_df)

precision,recall,threshold,is_binary,auc,tissue,type_,run
<dbl>,<dbl>,<dbl>,<lgl>,<dbl>,<chr>,<chr>,<chr>
0.0,0,1.9486269,False,0.0001287333,Adipose - Subcutaneous,total,enformer_veff1
0.0001,1,0.0,False,0.0001287333,Adipose - Subcutaneous,total,enformer_veff1
0.0,0,2.174142,False,6.333924e-05,Adipose - Visceral (Omentum),total,enformer_veff1
0.0001,1,0.0,False,6.333924e-05,Adipose - Visceral (Omentum),total,enformer_veff1
0.0,0,0.4322566,True,0.0001009285,Adrenal Gland,total,enformer_veff1
0.0,0,0.0,True,0.0,Artery - Aorta,total,enformer_veff1


In [29]:
stat.test <- (
    prc_tissue_df
    %>% filter(type_=='total')
    %>% wilcox_test(`auc` ~ `run`, paired=TRUE)
    %>% add_significance("p")
)

In [34]:
stat.test

.y.,group1,group2,n1,n2,statistic,p,p.adj,p.adj.signif,p.signif
<chr>,<chr>,<chr>,<int>,<int>,<dbl>,<dbl>,<dbl>,<chr>,<chr>
auc,enformer_veff1,enformer_veff2,41,41,0,,,,
auc,enformer_veff1,enformer_veff3,41,41,0,,,,
auc,enformer_veff2,enformer_veff3,41,41,0,,,,


In [35]:
(prc_tissue_df
%>% filter(type_=='obvious')
%>% wilcox_test(`auc` ~ `run`, paired=TRUE))

Unnamed: 0_level_0,.y.,group1,group2,n1,n2,statistic,p,p.adj,p.adj.signif
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<int>,<int>,<dbl>,<dbl>,<dbl>,<chr>
1,auc,enformer_veff1,enformer_veff2,30,30,0,,,
2,auc,enformer_veff1,enformer_veff3,30,30,0,,,
3,auc,enformer_veff2,enformer_veff3,30,30,0,,,
