# Evaluation

How often does SOPHIE, compared with a traditional differential expression (DE) analysis, prioritize specific genes over generic genes?

1. Simulate 1 template perturbation experiment using the technique above
2. Apply SOPHIE to get ranking of specific and generic genes based on their z-score.
3. Apply traditional DE analysis and get ranking of specific and generic genes based on their log fold change value
4. Compare the difference in ranking between specific and generic genes using SOPHIE vs traditional metrics.

In [1]:
%load_ext autoreload
%autoreload 2
%load_ext rpy2.ipython
import os
import pickle
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from rpy2.robjects import pandas2ri
from ponyo import utils, train_vae_modules, simulate_expression_data
from generic_expression_patterns_modules import (
    process,
    new_experiment_process,  # REMOVE
    stats,
    ranking,
)

np.random.seed(1)

examples.directory is deprecated; in the future, examples will be found relative to the 'datapath' directory.
  "found relative to the 'datapath' directory.".format(key))







Using TensorFlow backend.


In [2]:
# Parent of the current working directory; the R cells below use it to
# locate DE_analysis.R
base_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# Parse the configuration file (path is relative to the working directory)
config_filename = "config_sophie_vs_trad.tsv"
params = utils.read_config(config_filename)

In [3]:
# Unpack the configuration values used throughout this notebook

# --- General ---
# Local directory where intermediate files are stored
local_dir = params["local_dir"]

# Dataset name as given in the config
# NOTE(review): not referenced by any later cell visible here -- confirm it is still needed
dataset_name = params["dataset_name"]

# ID of the template experiment to select
project_id = params["project_id"]

# How many simulated experiments to generate
num_runs = params["num_simulated"]

# --- Expression data ---
# Un-normalized template experiment
raw_template_filename = params["raw_template_filename"]

# Un-normalized compendium
raw_compendium_filename = params["raw_compendium_filename"]

# Normalized compendium
normalized_compendium_filename = params["normalized_compendium_filename"]

# --- VAE model ---
# Directory holding the trained VAE model
vae_model_dir = params["vae_model_dir"]

# Dimensionality of the latent space
latent_dim = params["latent_dim"]

# Scaler transform that maps compendium data into the 0-1 range for training
scaler_filename = params["scaler_filename"]

# --- Differential expression (DE) analysis ---
# DE method to use.
# Recommendation: DESeq2 for RNA-seq data, limma for microarray data
de_method = params["DE_method"]

# When using DESeq, genes below this count threshold are removed
count_threshold = params["count_threshold"]

# Optional metadata file listing which samples to keep for DE analysis
template_process_samples_filename = params["template_process_samples_filename"]

# Metadata file giving the sample grouping for DE analysis
template_DE_grouping_filename = params["template_DE_grouping_filename"]

# Statistic used to rank genes or pathways
# TODO: list the valid choices here
col_to_rank_genes = params["rank_genes_by"]

# --- Gene sets ---
# Pickle files holding the specific and generic gene ids.
# Note: the generic gene ids path is hard-coded relative to the working
# directory rather than read from the config.
template_specific_gene_ids_filename = params["template_specific_gene_ids_filename"]
generic_gene_ids_filename = "generic_gene_ids.pickle"

In [4]:
# Paths of the files this notebook writes

# Template experiment with gene ids mapped to the compendium gene ids
mapped_template_filename = params["mapped_template_filename"]

# Template experiment after normalization
normalized_template_filename = params["normalized_template_filename"]

# Template experiment after selecting the samples compared in the DE analysis
processed_template_filename = params["processed_template_filename"]

# Summary output file
output_filename = params["output_filename"]

## SOPHIE

In [5]:
# Process the template experiment.
# Positional arguments, in order: raw template, raw compendium and scaler
# files, followed by the mapped and normalized template filenames
# (the latter two are listed above as files generated by this notebook).
template_processing_args = (
    raw_template_filename,
    raw_compendium_filename,
    scaler_filename,
    mapped_template_filename,
    normalized_template_filename,
)
new_experiment_process.process_template_experiment(*template_processing_args)

(8, 1000)
(720, 1000)


In [6]:
# Simulate `num_runs` experiments from the template experiment.
# Later cells read the following files from "<local_dir>/pseudo_experiment/":
#   - selected_simulated_data_<project_id>_<n>.txt
# in which "<n>" is an integer in the range of [0, num_runs-1].
# NOTE(review): the helper presumably also writes
#   - selected_simulated_encoded_data_<project_id>_<n>.txt
#   - template_normalized_data_<project_id>_test.txt
# to the same directory -- confirm against the helper's implementation.

# Load the pickled scaler. The context manager closes the file handle
# (the previous `pickle.load(open(...))` form leaked it).
# `scaler` is not used in this cell; the helper below receives `scaler_filename`.
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
with open(scaler_filename, "rb") as scaler_fh:
    scaler = pickle.load(scaler_fh)

# Make sure the directory for the simulated experiments exists
simulated_dir = os.path.join(local_dir, "pseudo_experiment")
os.makedirs(simulated_dir, exist_ok=True)

# TODO: have the helper take filenames (instead of data frames) to be consistent
normalized_compendium = pd.read_csv(
    normalized_compendium_filename, header=0, sep="\t", index_col=0
)
normalized_template = pd.read_csv(
    normalized_template_filename, header=0, sep="\t", index_col=0
)

# TODO: update this call when the new version of ponyo is available
for run_id in range(num_runs):
    new_experiment_process.embed_shift_template_experiment(
        normalized_compendium,
        normalized_template,
        vae_model_dir,
        project_id,
        scaler_filename,
        local_dir,
        latent_dim,
        run_id,
    )

Instructions for updating:
If using Keras pass *_constraint arguments to layers.



In [7]:
# Prepare the template and simulated expression files for DE analysis.

# The sample-selection metadata file is optional: use None when it does not
# exist on disk. The `is not None` guard keeps this cell re-runnable, since
# os.path.exists(None) raises TypeError once the variable has been reset
# (or when the config value is None to begin with).
if template_process_samples_filename is not None and not os.path.exists(
    template_process_samples_filename
):
    template_process_samples_filename = None

simulated_dir = os.path.join(local_dir, "pseudo_experiment")


def _process_samples_for_de(in_filename, out_filename):
    """Process one expression file with the helper matching `de_method`.

    Parameters
    ----------
    in_filename : str
        Expression file to process.
    out_filename : str
        Destination passed to the helper as its output filename.
    """
    if de_method == "deseq":
        stats.process_samples_for_DESeq(
            in_filename,
            template_DE_grouping_filename,
            out_filename,
            count_threshold,
            template_process_samples_filename,
        )
    else:
        stats.process_samples_for_limma(
            in_filename,
            template_DE_grouping_filename,
            out_filename,
            template_process_samples_filename,
        )


# Process template data
_process_samples_for_de(raw_template_filename, processed_template_filename)

# Process simulated data.
# Both methods write "<...>_<n>_processed.txt" because the R cell that computes
# DE statistics for the simulated experiments reads that filename regardless
# of the method. (The limma branch previously passed None as the output
# filename, so the file the R cell expects was not named explicitly.)
for i in range(num_runs):
    simulated_filename = os.path.join(
        simulated_dir, f"selected_simulated_data_{project_id}_{i}.txt"
    )
    out_simulated_filename = os.path.join(
        simulated_dir, f"selected_simulated_data_{project_id}_{i}_processed.txt"
    )
    _process_samples_for_de(simulated_filename, out_simulated_filename)

sample ids are ordered correctly
sample ids are ordered correctly
sample ids are ordered correctly
sample ids are ordered correctly
sample ids are ordered correctly
sample ids are ordered correctly
sample ids are ordered correctly
sample ids are ordered correctly
sample ids are ordered correctly
sample ids are ordered correctly
sample ids are ordered correctly


In [8]:
# Ensure the "<local_dir>/DE_stats/" subdirectory exists
de_stats_dir = os.path.join(local_dir, "DE_stats")
os.makedirs(de_stats_dir, exist_ok=True)

In [9]:
%%R -i template_DE_grouping_filename -i project_id -i processed_template_filename -i local_dir -i base_dir -i de_method

source(paste0(base_dir, '/generic_expression_patterns_modules/DE_analysis.R'))

# Select the DE routine for the configured method:
# "deseq" uses the DESeq routine, anything else uses limma
get_DE_stats <- if (de_method == "deseq") get_DE_stats_DESeq else get_DE_stats_limma

# File created: "<local_dir>/DE_stats/DE_stats_template_data_<project_id>_real.txt"
get_DE_stats(
    template_DE_grouping_filename,
    project_id,
    processed_template_filename,
    "template",
    local_dir,
    "real"
)

R[write to console]: Loading required package: S4Vectors

R[write to console]: Loading required package: stats4

R[write to console]: Loading required package: BiocGenerics

R[write to console]: Loading required package: parallel

R[write to console]: 
Attaching package: ‘BiocGenerics’


R[write to console]: The following objects are masked from ‘package:parallel’:

    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, parApply, parCapply, parLapply,
    parLapplyLB, parRapply, parSapply, parSapplyLB


R[write to console]: The following object is masked from ‘package:limma’:

    plotMA


R[write to console]: The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs


R[write to console]: The following objects are masked from ‘package:base’:

    anyDuplicated, append, as.data.frame, basename, cbind, colnames,
    dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
    grepl, intersect, is.unsorted, lapply

[1] "Checking sample ordering..."
[1] TRUE


In [10]:
%%R -i template_DE_grouping_filename -i project_id -i base_dir -i local_dir -i num_runs -i de_method

source(paste0(base_dir, '/generic_expression_patterns_modules/DE_analysis.R'))

# Select the DE routine once, outside the loop:
# "deseq" uses the DESeq routine, anything else uses limma
get_DE_stats <- if (de_method == "deseq") get_DE_stats_DESeq else get_DE_stats_limma

# Files created: "<local_dir>/DE_stats/DE_stats_simulated_data_<project_id>_<n>.txt"
# seq_len() yields an empty sequence when num_runs is 0, whereas
# 0:(num_runs-1) would iterate over 0 and -1.
for (i in seq_len(num_runs) - 1L){
    # The path is built by plain concatenation, so local_dir must end with a
    # path separator.
    simulated_data_filename <- paste0(
        local_dir,
        "pseudo_experiment/selected_simulated_data_",
        project_id,
        "_",
        i,
        "_processed.txt"
    )
    get_DE_stats(
        template_DE_grouping_filename,
        project_id,
        simulated_data_filename,
        "simulated",
        local_dir,
        i
    )
}

[1] "Checking sample ordering..."
[1] TRUE


R[write to console]: -- note: fitType='parametric', but the dispersion trend was not well captured by the
   function: y = a/x + b, and a local regression fit was automatically substituted.
   specify fitType='local' or 'mean' to avoid this message next time.



[1] "Checking sample ordering..."
[1] TRUE


R[write to console]: -- note: fitType='parametric', but the dispersion trend was not well captured by the
   function: y = a/x + b, and a local regression fit was automatically substituted.
   specify fitType='local' or 'mean' to avoid this message next time.



[1] "Checking sample ordering..."
[1] TRUE


R[write to console]: -- note: fitType='parametric', but the dispersion trend was not well captured by the
   function: y = a/x + b, and a local regression fit was automatically substituted.
   specify fitType='local' or 'mean' to avoid this message next time.



[1] "Checking sample ordering..."
[1] TRUE


R[write to console]: -- note: fitType='parametric', but the dispersion trend was not well captured by the
   function: y = a/x + b, and a local regression fit was automatically substituted.
   specify fitType='local' or 'mean' to avoid this message next time.



[1] "Checking sample ordering..."
[1] TRUE


R[write to console]: -- note: fitType='parametric', but the dispersion trend was not well captured by the
   function: y = a/x + b, and a local regression fit was automatically substituted.
   specify fitType='local' or 'mean' to avoid this message next time.



[1] "Checking sample ordering..."
[1] TRUE


R[write to console]: -- note: fitType='parametric', but the dispersion trend was not well captured by the
   function: y = a/x + b, and a local regression fit was automatically substituted.
   specify fitType='local' or 'mean' to avoid this message next time.



[1] "Checking sample ordering..."
[1] TRUE


R[write to console]: -- note: fitType='parametric', but the dispersion trend was not well captured by the
   function: y = a/x + b, and a local regression fit was automatically substituted.
   specify fitType='local' or 'mean' to avoid this message next time.



[1] "Checking sample ordering..."
[1] TRUE


R[write to console]: -- note: fitType='parametric', but the dispersion trend was not well captured by the
   function: y = a/x + b, and a local regression fit was automatically substituted.
   specify fitType='local' or 'mean' to avoid this message next time.



[1] "Checking sample ordering..."
[1] TRUE


R[write to console]: -- note: fitType='parametric', but the dispersion trend was not well captured by the
   function: y = a/x + b, and a local regression fit was automatically substituted.
   specify fitType='local' or 'mean' to avoid this message next time.



[1] "Checking sample ordering..."
[1] TRUE


R[write to console]: -- note: fitType='parametric', but the dispersion trend was not well captured by the
   function: y = a/x + b, and a local regression fit was automatically substituted.
   specify fitType='local' or 'mean' to avoid this message next time.



In [11]:
# Column names for the log fold change and adjusted p-value; these differ
# depending on which DE method produced the statistics file
logFC_name, pvalue_name = (
    ("log2FoldChange", "padj") if de_method == "deseq" else ("logFC", "adj.P.Val")
)

# DE statistics of the template experiment, written by the R cell above
analysis_type = "DE"
template_DE_stats_filename = os.path.join(
    local_dir, "DE_stats", f"DE_stats_template_data_{project_id}_real.txt"
)

# Process and rank the template and simulated DE statistics
(
    template_DE_stats,
    simulated_DE_summary_stats,
) = ranking.process_and_rank_genes_pathways(
    template_DE_stats_filename,
    local_dir,
    num_runs,
    project_id,
    analysis_type,
    col_to_rank_genes,
    logFC_name,
    pvalue_name,
)



In [12]:
# Build the per-gene summary table from the template and simulated statistics
summary_gene_ranks = ranking.generate_summary_table(
    template_DE_stats_filename,
    template_DE_stats,
    simulated_DE_summary_stats,
    col_to_rank_genes,
    local_dir,
    "gene",
    params,
)

# Preview the 10 genes with the largest z-scores
genes_by_z_score = summary_gene_ranks.sort_values(by="Z score", ascending=False)
genes_by_z_score.head(10)

Unnamed: 0,Gene ID,Adj P-value (Real),Rank (Real),abs(log2FoldChange) (Real),log2FoldChange (Real),Median adj p-value (simulated),Rank (simulated),Percentile (simulated),Percent DE (simulated),Mean abs(log2FoldChange) (simulated),Std deviation (simulated),Number of experiments (simulated),Z score
G_937,G_937,0.00011,988.0,5.570529,5.570529,0.712998,316.0,31.531532,0.0,1.147347,0.289673,10,15.269569
G_336,G_336,0.000189,986.0,5.084454,5.084454,0.857904,174.0,17.317317,0.0,0.867031,0.355073,10,11.877627
G_243,G_243,7e-06,990.5,6.244824,6.244824,0.70789,694.0,69.369369,0.0,1.46997,0.409972,10,11.646779
G_815,G_815,0.000119,987.0,5.523477,5.523477,0.660331,201.0,20.02002,0.0,1.0301,0.423226,10,10.616967
G_881,G_881,0.000162,982.0,4.47974,4.47974,0.835015,567.0,56.656657,0.0,1.275472,0.318806,10,10.050831
G_644,G_644,0.000163,980.0,3.941796,3.941796,0.710424,504.0,50.35035,0.0,1.241366,0.283321,10,9.531341
G_391,G_391,7e-06,992.0,6.268838,6.268838,0.70789,971.0,97.097097,0.0,2.304053,0.416849,10,9.511322
G_488,G_488,7e-06,990.5,6.244824,6.244824,0.70789,732.0,73.173173,0.0,1.645245,0.637893,10,7.210577
G_236,G_236,2e-05,989.0,5.902202,5.902202,0.782481,870.0,86.986987,0.0,1.544682,0.619812,10,7.030395
G_645,G_645,0.000417,983.0,4.826379,4.826379,0.782481,948.0,94.794795,0.0,2.010571,0.413573,10,6.808484


In [13]:
# Order genes from largest to smallest z-score
z_score_column = "Z score"
summary_gene_ranks_sorted = summary_gene_ranks.sort_values(
    z_score_column, ascending=False
)

In [14]:
# Rank genes by z-score; ranks ascend with the z-score, so the gene with the
# largest z-score receives the largest rank value
z_scores = summary_gene_ranks_sorted["Z score"]
summary_gene_ranks_sorted["rank"] = z_scores.rank(ascending=True)

In [15]:
# Preview the 10 top-ranked genes, now including the z-score based "rank" column
summary_gene_ranks_sorted.head(n=10)

Unnamed: 0,Gene ID,Adj P-value (Real),Rank (Real),abs(log2FoldChange) (Real),log2FoldChange (Real),Median adj p-value (simulated),Rank (simulated),Percentile (simulated),Percent DE (simulated),Mean abs(log2FoldChange) (simulated),Std deviation (simulated),Number of experiments (simulated),Z score,rank
G_937,G_937,0.00011,988.0,5.570529,5.570529,0.712998,316.0,31.531532,0.0,1.147347,0.289673,10,15.269569,992.0
G_336,G_336,0.000189,986.0,5.084454,5.084454,0.857904,174.0,17.317317,0.0,0.867031,0.355073,10,11.877627,991.0
G_243,G_243,7e-06,990.5,6.244824,6.244824,0.70789,694.0,69.369369,0.0,1.46997,0.409972,10,11.646779,990.0
G_815,G_815,0.000119,987.0,5.523477,5.523477,0.660331,201.0,20.02002,0.0,1.0301,0.423226,10,10.616967,989.0
G_881,G_881,0.000162,982.0,4.47974,4.47974,0.835015,567.0,56.656657,0.0,1.275472,0.318806,10,10.050831,988.0
G_644,G_644,0.000163,980.0,3.941796,3.941796,0.710424,504.0,50.35035,0.0,1.241366,0.283321,10,9.531341,987.0
G_391,G_391,7e-06,992.0,6.268838,6.268838,0.70789,971.0,97.097097,0.0,2.304053,0.416849,10,9.511322,986.0
G_488,G_488,7e-06,990.5,6.244824,6.244824,0.70789,732.0,73.173173,0.0,1.645245,0.637893,10,7.210577,985.0
G_236,G_236,2e-05,989.0,5.902202,5.902202,0.782481,870.0,86.986987,0.0,1.544682,0.619812,10,7.030395,984.0
G_645,G_645,0.000417,983.0,4.826379,4.826379,0.782481,948.0,94.794795,0.0,2.010571,0.413573,10,6.808484,983.0


## Traditional DE

In [16]:
%%R -i template_DE_grouping_filename -i project_id -i processed_template_filename -i local_dir -i base_dir -i de_method

source(paste0(base_dir, '/generic_expression_patterns_modules/DE_analysis.R'))

# Select the DE routine for the configured method:
# "deseq" uses the DESeq routine, anything else uses limma
get_DE_stats <- if (de_method == "deseq") get_DE_stats_DESeq else get_DE_stats_limma

# File created: "<local_dir>/DE_stats/DE_stats_template_data_<project_id>_real.txt"
# Note: this is the same call, with the same arguments, as the template DE
# cell in the SOPHIE section, so it rewrites the same file.
get_DE_stats(
    template_DE_grouping_filename,
    project_id,
    processed_template_filename,
    "template",
    local_dir,
    "real"
)

[1] "Checking sample ordering..."
[1] TRUE


In [17]:
# Read the template DE statistics written by the R cell above
de_stats_dir = os.path.join(local_dir, "DE_stats")
trad_de_stats_filename = os.path.join(
    de_stats_dir, f"DE_stats_template_data_{project_id}_real.txt"
)

trad_de_stats = pd.read_csv(
    trad_de_stats_filename,
    sep="\t",
    header=0,
    index_col=0,
)

In [18]:
# Sort genes from largest to smallest log fold change.
# `logFC_name` (chosen earlier in this notebook based on `de_method`) is used
# instead of a hard-coded "log2FoldChange", so limma output (column "logFC")
# is handled as well.
trad_de_stats_sorted = trad_de_stats.sort_values(by=logFC_name, ascending=False)

In [19]:
# Rank genes by log fold change; ranks ascend with the value, so the gene with
# the largest log fold change receives the largest rank value.
# `logFC_name` (chosen earlier in this notebook based on `de_method`) replaces
# the hard-coded "log2FoldChange" so limma output (column "logFC") works too.
trad_de_stats_sorted["rank"] = trad_de_stats_sorted[logFC_name].rank(
    ascending=True
)

In [20]:
# Preview the 10 genes with the largest log fold change
trad_de_stats_sorted.head(n=10)

Unnamed: 0,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,rank
G_391,7.125108,6.268838,1.115622,5.619143,1.919067e-08,7e-06,992.0
G_488,7.007174,6.244824,1.116568,5.592876,2.233383e-08,7e-06,990.5
G_243,7.007174,6.244824,1.116568,5.592876,2.233383e-08,7e-06,990.5
G_236,9.241785,5.902202,1.100117,5.365066,8.091953e-08,2e-05,989.0
G_937,7.362413,5.570529,1.112559,5.006952,5.52987e-07,0.00011,988.0
G_815,7.137844,5.523477,1.114443,4.956265,7.186114e-07,0.000119,987.0
G_336,8.766909,5.084454,1.097677,4.632012,3.621293e-06,0.000189,986.0
G_649,7.616165,4.872302,1.105285,4.408185,1.042404e-05,0.000383,985.0
G_383,7.48993,4.850577,1.106579,4.3834,1.168413e-05,0.000386,984.0
G_645,7.373417,4.826379,1.107106,4.359456,1.303862e-05,0.000417,983.0


## Compare

1. Compute (mean rank of specific genes) - (mean rank of generic genes) for the template experiment
2. We will need to re-run this notebook for each template experiment and then plot the distribution of difference scores

We want to compare the mean rank of the specific genes with the mean rank of the generic genes. A large difference indicates that the method can distinguish specific genes from generic ones. Beyond the size of the difference, we also want the specific genes to be ranked higher than the generic ones, so a large positive value means the method is performing better.

In [21]:
# Load the pickled specific and generic gene ids (in that order).
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
loaded_gene_ids = []
for gene_ids_filename in (
    template_specific_gene_ids_filename,
    generic_gene_ids_filename,
):
    with open(gene_ids_filename, "rb") as gene_ids_fh:
        loaded_gene_ids.append(pickle.load(gene_ids_fh))

specific_gene_ids, generic_gene_ids = loaded_gene_ids

In [22]:
# Mean rank of the specific genes under each method
sophie_specific_ranks = summary_gene_ranks_sorted["rank"].loc[specific_gene_ids]
trad_specific_ranks = trad_de_stats_sorted["rank"].loc[specific_gene_ids]

sophie_specific_mean = sophie_specific_ranks.mean()
trad_specific_mean = trad_specific_ranks.mean()

In [23]:
# Mean rank of the specific genes: SOPHIE first, then traditional DE
print(sophie_specific_mean, trad_specific_mean, sep="\n")

842.1
893.9


In [24]:
# SOPHIE (z-score based) ranks of the specific genes
summary_gene_ranks_sorted["rank"].loc[specific_gene_ids]

G_487    967.0
G_453    177.0
G_236    984.0
G_833    914.0
G_6      927.0
G_755    903.0
G_936    981.0
G_914    915.0
G_582    695.0
G_186    958.0
Name: rank, dtype: float64

In [25]:
# Traditional (log fold change based) ranks of the specific genes
trad_de_stats_sorted["rank"].loc[specific_gene_ids]

G_487    972.0
G_453    531.0
G_236    989.0
G_833    949.0
G_6      930.0
G_755    929.0
G_936    964.0
G_914    941.0
G_582    795.0
G_186    939.0
Name: rank, dtype: float64

In [26]:
# Mean rank of the generic genes under each method
sophie_generic_ranks = summary_gene_ranks_sorted["rank"].loc[generic_gene_ids]
trad_generic_ranks = trad_de_stats_sorted["rank"].loc[generic_gene_ids]

sophie_generic_mean = sophie_generic_ranks.mean()
trad_generic_mean = trad_generic_ranks.mean()

In [27]:
# Mean rank of the generic genes: SOPHIE first, then traditional DE
print(sophie_generic_mean, trad_generic_mean, sep="\n")

655.05
839.85


In [28]:
# SOPHIE (z-score based) ranks of the generic genes
summary_gene_ranks_sorted["rank"].loc[generic_gene_ids]

G_716    895.0
G_146    960.0
G_351    800.0
G_886     27.0
G_318     55.0
G_336    991.0
G_389    757.0
G_472    922.0
G_371    970.0
G_315    844.0
G_288    187.0
G_113     26.0
G_109     34.0
G_858    972.0
G_542    787.0
G_70     943.0
G_903    976.0
G_645    983.0
G_649    980.0
G_20     901.0
G_313    332.0
G_918    929.0
G_270    968.0
G_671    969.0
G_489    912.0
G_259    932.0
G_338    161.0
G_269    160.0
G_200    838.0
G_209    532.0
         ...  
G_878     91.0
G_127    535.0
G_937    992.0
G_214    595.0
G_357     50.0
G_419    872.0
G_34     722.0
G_108    657.0
G_383    982.0
G_780    211.0
G_228    973.0
G_815    989.0
G_901    444.0
G_506    979.0
G_102    674.0
G_905    894.0
G_992     36.0
G_890    971.0
G_666    687.0
G_792     12.0
G_813     33.0
G_860    952.0
G_317    849.0
G_539    199.0
G_644    987.0
G_996    563.0
G_414    966.0
G_942    951.0
G_965    881.0
G_359    168.0
Name: rank, Length: 100, dtype: float64

In [29]:
# Traditional (log fold change based) ranks of the generic genes
trad_de_stats_sorted["rank"].loc[generic_gene_ids]

G_716    947.0
G_146    950.0
G_351    876.0
G_886    903.0
G_318    634.0
G_336    986.0
G_389    890.0
G_472    960.0
G_371    952.0
G_315    920.0
G_288    452.0
G_113    340.0
G_109    747.0
G_858    975.0
G_542    878.0
G_70     954.0
G_903    974.0
G_645    983.0
G_649    985.0
G_20     961.0
G_313    410.0
G_918    940.0
G_270    969.0
G_671    979.0
G_489    948.0
G_259    946.0
G_338    843.0
G_269    702.0
G_200    902.0
G_209    866.0
         ...  
G_878    678.0
G_127    806.0
G_937    988.0
G_214    778.0
G_357    861.0
G_419    935.0
G_34     918.0
G_108    816.0
G_383    984.0
G_780    277.0
G_228    978.0
G_815    987.0
G_901    647.0
G_506    981.0
G_102    936.0
G_905    910.0
G_992    641.0
G_890    976.0
G_666    895.0
G_792    767.0
G_813    326.0
G_860    968.0
G_317    922.0
G_539    363.0
G_644    980.0
G_996    856.0
G_414    958.0
G_942    963.0
G_965    932.0
G_359    280.0
Name: rank, Length: 100, dtype: float64

In [30]:
# Difference between the mean rank of specific and generic genes per method;
# a larger positive value means specific genes are ranked above generic ones
diff_sophie = sophie_specific_mean - sophie_generic_mean
diff_trad = trad_specific_mean - trad_generic_mean

for difference_label, difference_value in (
    ("sophie difference: ", diff_sophie),
    ("traditional difference: ", diff_trad),
):
    print(difference_label, difference_value)

sophie difference:  187.05000000000007
traditional difference:  54.049999999999955
