# Evaluation

How often do regular differential expression analysis vs sophie prioritize the specific vs generic genes?

1. Simulate 1 template perturbation experiment using the technique above
2. Apply SOPHIE to get ranking of specific and generic genes based on their z-score.
3. Apply traditional DE analysis and get ranking of specific and generic genes based on their log fold change value
4. Compare the difference in ranking between specific and generic genes using SOPHIE vs traditional metrics.

In [1]:
%load_ext autoreload
%autoreload 2
%load_ext rpy2.ipython
import os
import pickle
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from rpy2.robjects import pandas2ri
from ponyo import utils, train_vae_modules, simulate_expression_data
from generic_expression_patterns_modules import (
    process,
    new_experiment_process,  # REMOVE
    stats,
    ranking,
)

np.random.seed(1)

examples.directory is deprecated; in the future, examples will be found relative to the 'datapath' directory.
  "found relative to the 'datapath' directory.".format(key))







Using TensorFlow backend.


In [2]:
# Read in config variables
base_dir = os.path.abspath(os.path.join(os.getcwd(), "../"))

config_filename = "config_sophie_vs_trad.tsv"

params = utils.read_config(config_filename)

In [3]:
# Load config params

# Local directory to store intermediate files
local_dir = params["local_dir"]

#
dataset_name = params["dataset_name"]

# File containing un-normalized template experiment
raw_template_filename = params["raw_template_filename"]

# Un-normalized compendium filename
raw_compendium_filename = params["raw_compendium_filename"]

# Normalized compendium filename
normalized_compendium_filename = params["normalized_compendium_filename"]

# ID for template experiment to be selected
project_id = params["project_id"]

# Number of simulated experiments to generate
num_runs = params["num_simulated"]

# Directory containing trained VAE model
vae_model_dir = params["vae_model_dir"]

# Size of the latent dimension
latent_dim = params["latent_dim"]

# Scaler transform used to scale compendium data into 0-1 range for training
scaler_filename = params["scaler_filename"]

# Which DE method to use
# We recommend that if data is RNA-seq then use DESeq2
# If data is microarray then use Limma
de_method = params["DE_method"]

# If using DE-seq, setting this parameter will
# remove genes below a certain threshold
count_threshold = params["count_threshold"]

# Metadata file that specifies which samples to keep for DE analysis (Optional)
template_process_samples_filename = params["template_process_samples_filename"]

# Metadata file that specifies sample grouping for DE analysis
template_DE_grouping_filename = params["template_DE_grouping_filename"]

# Statistic to use to rank genes or pathways by
# Choices are {} FILL IN
col_to_rank_genes = params["rank_genes_by"]

# Pickle files saving specific and generic gene ids
template_specific_gene_ids_filename = params["template_specific_gene_ids_filename"]
generic_gene_ids_filename = "generic_gene_ids.pickle"

In [4]:
# Files generated by this notebook

# File storing template experiment with gene ids mapped to compendium gene ids
mapped_template_filename = params["mapped_template_filename"]

# File storing normalized template experiment
normalized_template_filename = params["normalized_template_filename"]

# File storing processed template experiment,
# after samples have been selected for comparison in DE analysis
processed_template_filename = params["processed_template_filename"]

# Output summary file
output_filename = params["output_filename"]

## SOPHIE

In [5]:
# Process template
new_experiment_process.process_template_experiment(
    raw_template_filename,
    raw_compendium_filename,
    scaler_filename,
    mapped_template_filename,
    normalized_template_filename,
)

(4, 1000)
(360, 1000)


In [6]:
# Simulate multiple experiments UPDATE COMMENT
# This step creates the following files in "<local_dir>/pseudo_experiment/" directory:
#   - selected_simulated_data_SRP012656_<n>.txt
#   - selected_simulated_encoded_data_SRP012656_<n>.txt
#   - template_normalized_data_SRP012656_test.txt
# in which "<n>" is an integer in the range of [0, num_runs-1]

# REMOVE LATER
# dataset_name = "pre_model_unseen_template"
# Load pickled file
scaler = pickle.load(open(scaler_filename, "rb"))

# Update simulated dir
os.makedirs(os.path.join(local_dir, "pseudo_experiment"), exist_ok=True)

# Update to take in file to be consisten
normalized_compendium = pd.read_csv(
    normalized_compendium_filename, header=0, sep="\t", index_col=0
)
normalized_template = pd.read_csv(
    normalized_template_filename, header=0, sep="\t", index_col=0
)
# ------------
# Update call when new version of ponyo
for run_id in range(num_runs):
    new_experiment_process.embed_shift_template_experiment(
        normalized_compendium,
        normalized_template,
        vae_model_dir,
        project_id,
        scaler_filename,
        local_dir,
        latent_dim,
        run_id,
    )

Instructions for updating:
If using Keras pass *_constraint arguments to layers.



In [7]:
## Update simulated dir
if not os.path.exists(template_process_samples_filename):
    template_process_samples_filename = None

if de_method == "deseq":
    # Process template data
    stats.process_samples_for_DESeq(
        raw_template_filename,
        template_DE_grouping_filename,
        processed_template_filename,
        count_threshold,
        template_process_samples_filename,
    )

    # Process simulated data
    for i in range(num_runs):
        simulated_filename = os.path.join(
            local_dir,
            "pseudo_experiment",
            f"selected_simulated_data_{project_id}_{i}.txt",
        )
        out_simulated_filename = os.path.join(
            local_dir,
            "pseudo_experiment",
            f"selected_simulated_data_{project_id}_{i}_processed.txt",
        )
        stats.process_samples_for_DESeq(
            simulated_filename,
            template_DE_grouping_filename,
            out_simulated_filename,
            count_threshold,
            template_process_samples_filename,
        )
else:
    stats.process_samples_for_limma(
        raw_template_filename,
        template_DE_grouping_filename,
        processed_template_filename,
        template_process_samples_filename,
    )

    for i in range(num_runs):
        simulated_filename = os.path.join(
            local_dir,
            "pseudo_experiment",
            f"selected_simulated_data_{project_id}_{i}.txt",
        )
        stats.process_samples_for_limma(
            simulated_filename,
            template_DE_grouping_filename,
            None,
            template_process_samples_filename,
        )

sample ids are ordered correctly
sample ids are ordered correctly
sample ids are ordered correctly
sample ids are ordered correctly
sample ids are ordered correctly
sample ids are ordered correctly
sample ids are ordered correctly
sample ids are ordered correctly
sample ids are ordered correctly
sample ids are ordered correctly
sample ids are ordered correctly


In [8]:
# Create subdirectory: "<local_dir>/DE_stats/"
os.makedirs(os.path.join(local_dir, "DE_stats"), exist_ok=True)

In [9]:
%%R -i template_DE_grouping_filename -i project_id -i processed_template_filename -i local_dir -i base_dir -i de_method

source(paste0(base_dir, '/generic_expression_patterns_modules/DE_analysis.R'))

# File created: "<local_dir>/DE_stats/DE_stats_template_data_<project_id>_real.txt"
if (de_method == "deseq"){
    get_DE_stats_DESeq(
        template_DE_grouping_filename,
        project_id,
        processed_template_filename,
        "template",
        local_dir,
        "real"
    )
}
else{
    get_DE_stats_limma(
        template_DE_grouping_filename,
        project_id,
        processed_template_filename,
        "template",
        local_dir,
        "real"
    )
}

R[write to console]: Loading required package: S4Vectors

R[write to console]: Loading required package: stats4

R[write to console]: Loading required package: BiocGenerics

R[write to console]: Loading required package: parallel

R[write to console]: 
Attaching package: ‘BiocGenerics’


R[write to console]: The following objects are masked from ‘package:parallel’:

    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, parApply, parCapply, parLapply,
    parLapplyLB, parRapply, parSapply, parSapplyLB


R[write to console]: The following object is masked from ‘package:limma’:

    plotMA


R[write to console]: The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs


R[write to console]: The following objects are masked from ‘package:base’:

    anyDuplicated, append, as.data.frame, basename, cbind, colnames,
    dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
    grepl, intersect, is.unsorted, lapply

[1] "Checking sample ordering..."
[1] TRUE


R[write to console]:   the design formula contains one or more numeric variables with integer values,
  specifying a model with increasing fold change for higher values.
  did you mean for this to be a factor? if so, first convert
  this variable to a factor using the factor() function

R[write to console]: -- note: fitType='parametric', but the dispersion trend was not well captured by the
   function: y = a/x + b, and a local regression fit was automatically substituted.
   specify fitType='local' or 'mean' to avoid this message next time.



In [10]:
%%R -i template_DE_grouping_filename -i project_id -i base_dir -i local_dir -i num_runs -i de_method

source(paste0(base_dir, '/generic_expression_patterns_modules/DE_analysis.R'))

# Files created: "<local_dir>/DE_stats/DE_stats_simulated_data_<project_id>_<n>.txt"
for (i in 0:(num_runs-1)){
    simulated_data_filename <- paste(
        local_dir,
        "pseudo_experiment/selected_simulated_data_",
        project_id,
        "_",
        i,
        "_processed.txt",
        sep = ""
    )
    if (de_method == "deseq"){
        get_DE_stats_DESeq(
            template_DE_grouping_filename,
            project_id,
            simulated_data_filename,
            "simulated",
            local_dir,
            i
            )
    }
    else {
        get_DE_stats_limma(
            template_DE_grouping_filename,
            project_id,
            simulated_data_filename,
            "simulated",
            local_dir,
            i
            )
        }
    }

[1] "Checking sample ordering..."
[1] TRUE


R[write to console]:   the design formula contains one or more numeric variables with integer values,
  specifying a model with increasing fold change for higher values.
  did you mean for this to be a factor? if so, first convert
  this variable to a factor using the factor() function



[1] "Checking sample ordering..."
[1] TRUE


R[write to console]:   the design formula contains one or more numeric variables with integer values,
  specifying a model with increasing fold change for higher values.
  did you mean for this to be a factor? if so, first convert
  this variable to a factor using the factor() function

R[write to console]: Error in estimateDispersionsFit(object, fitType = fitType, quiet = quiet) : 
  all gene-wise dispersion estimates are within 2 orders of magnitude
  from the minimum value, and so the standard curve fitting techniques will not work.
  One can instead use the gene-wise estimates as final estimates:
  dds <- estimateDispersionsGeneEst(dds)
  dispersions(dds) <- mcols(dds)$dispGeneEst
  ...then continue with testing using nbinomWaldTest or nbinomLRT

R[write to console]: In addition: 
R[write to console]: 




Error in estimateDispersionsFit(object, fitType = fitType, quiet = quiet) : 
  all gene-wise dispersion estimates are within 2 orders of magnitude
  from the minimum value, and so the standard curve fitting techniques will not work.
  One can instead use the gene-wise estimates as final estimates:
  dds <- estimateDispersionsGeneEst(dds)
  dispersions(dds) <- mcols(dds)$dispGeneEst
  ...then continue with testing using nbinomWaldTest or nbinomLRT


In [11]:
analysis_type = "DE"
template_DE_stats_filename = os.path.join(
    local_dir, "DE_stats", f"DE_stats_template_data_{project_id}_real.txt"
)

# Added
if de_method == "deseq":
    logFC_name = "log2FoldChange"
    pvalue_name = "padj"
else:
    logFC_name = "logFC"
    pvalue_name = "adj.P.Val"

template_DE_stats, simulated_DE_summary_stats = ranking.process_and_rank_genes_pathways(
    template_DE_stats_filename,
    local_dir,
    num_runs,
    project_id,
    analysis_type,
    col_to_rank_genes,
    logFC_name,
    pvalue_name,
)



In [12]:
# Get summary table
summary_gene_ranks = ranking.generate_summary_table(
    template_DE_stats_filename,
    template_DE_stats,
    simulated_DE_summary_stats,
    col_to_rank_genes,
    local_dir,
    "gene",
    params,
)

summary_gene_ranks.sort_values(by="Z score", ascending=False).head(10)

Unnamed: 0,Gene ID,Adj P-value (Real),Rank (Real),abs(log2FoldChange) (Real),log2FoldChange (Real),Median adj p-value (simulated),Rank (simulated),Percentile (simulated),Percent DE (simulated),Mean abs(log2FoldChange) (simulated),Std deviation (simulated),Number of experiments (simulated),Z score
G_94,G_94,0.999304,972.0,0.242436,0.242436,0.999405,46.0,4.504505,0.0,0.008432,0.005742,10,40.750666
G_677,G_677,0.999304,980.0,0.261352,0.261352,0.999405,4.0,0.3003,0.0,0.007409,0.008002,10,31.733043
G_452,G_452,0.999304,765.0,0.123959,0.123959,0.999405,61.0,6.006006,0.0,0.007558,0.003923,10,29.673908
G_481,G_481,0.999304,962.0,0.22616,0.22616,0.999405,342.0,34.134134,0.0,0.014111,0.007596,10,27.914393
G_299,G_299,0.999304,953.0,0.21887,0.21887,0.999405,388.0,38.738739,0.0,0.017358,0.007378,10,27.31168
G_780,G_780,0.999304,971.0,0.239619,0.239619,0.999405,263.0,26.226226,0.0,0.015758,0.008373,10,26.734516
G_27,G_27,0.999304,998.0,0.394053,0.394053,0.999405,827.0,82.682683,0.0,0.023328,0.01405,10,26.386273
G_674,G_674,0.999304,990.0,0.293388,-0.293388,0.999405,537.0,53.653654,0.0,0.019042,0.010638,10,25.788839
G_461,G_461,0.999304,986.0,0.2761,-0.2761,0.999405,59.0,5.805806,0.0,0.011628,0.010431,10,25.355455
G_986,G_986,0.999304,976.0,0.251112,-0.251112,0.999405,88.0,8.708709,0.0,0.010601,0.009685,10,24.83327


In [13]:
summary_gene_ranks_sorted = summary_gene_ranks.sort_values(
    by="Z score", ascending=False
)

In [14]:
# Add ranking based on Z-score
summary_gene_ranks_sorted["rank"] = summary_gene_ranks_sorted["Z score"].rank(
    ascending=True
)

In [15]:
summary_gene_ranks_sorted.head(10)

Unnamed: 0,Gene ID,Adj P-value (Real),Rank (Real),abs(log2FoldChange) (Real),log2FoldChange (Real),Median adj p-value (simulated),Rank (simulated),Percentile (simulated),Percent DE (simulated),Mean abs(log2FoldChange) (simulated),Std deviation (simulated),Number of experiments (simulated),Z score,rank
G_94,G_94,0.999304,972.0,0.242436,0.242436,0.999405,46.0,4.504505,0.0,0.008432,0.005742,10,40.750666,1000.0
G_677,G_677,0.999304,980.0,0.261352,0.261352,0.999405,4.0,0.3003,0.0,0.007409,0.008002,10,31.733043,999.0
G_452,G_452,0.999304,765.0,0.123959,0.123959,0.999405,61.0,6.006006,0.0,0.007558,0.003923,10,29.673908,998.0
G_481,G_481,0.999304,962.0,0.22616,0.22616,0.999405,342.0,34.134134,0.0,0.014111,0.007596,10,27.914393,997.0
G_299,G_299,0.999304,953.0,0.21887,0.21887,0.999405,388.0,38.738739,0.0,0.017358,0.007378,10,27.31168,996.0
G_780,G_780,0.999304,971.0,0.239619,0.239619,0.999405,263.0,26.226226,0.0,0.015758,0.008373,10,26.734516,995.0
G_27,G_27,0.999304,998.0,0.394053,0.394053,0.999405,827.0,82.682683,0.0,0.023328,0.01405,10,26.386273,994.0
G_674,G_674,0.999304,990.0,0.293388,-0.293388,0.999405,537.0,53.653654,0.0,0.019042,0.010638,10,25.788839,993.0
G_461,G_461,0.999304,986.0,0.2761,-0.2761,0.999405,59.0,5.805806,0.0,0.011628,0.010431,10,25.355455,992.0
G_986,G_986,0.999304,976.0,0.251112,-0.251112,0.999405,88.0,8.708709,0.0,0.010601,0.009685,10,24.83327,991.0


## Traditional DE

In [16]:
%%R -i template_DE_grouping_filename -i project_id -i processed_template_filename -i local_dir -i base_dir -i de_method

source(paste0(base_dir, '/generic_expression_patterns_modules/DE_analysis.R'))

# File created: "<local_dir>/DE_stats/DE_stats_template_data_<project_id>_real.txt"
if (de_method == "deseq"){
    get_DE_stats_DESeq(
        template_DE_grouping_filename,
        project_id,
        processed_template_filename,
        "template",
        local_dir,
        "real"
    )
}
else{
    get_DE_stats_limma(
        template_DE_grouping_filename,
        project_id,
        processed_template_filename,
        "template",
        local_dir,
        "real"
    )
}

[1] "Checking sample ordering..."
[1] TRUE


R[write to console]:   the design formula contains one or more numeric variables with integer values,
  specifying a model with increasing fold change for higher values.
  did you mean for this to be a factor? if so, first convert
  this variable to a factor using the factor() function

R[write to console]: -- note: fitType='parametric', but the dispersion trend was not well captured by the
   function: y = a/x + b, and a local regression fit was automatically substituted.
   specify fitType='local' or 'mean' to avoid this message next time.



In [17]:
# Load DE statistics file
trad_de_stats_filename = os.path.join(
    local_dir, "DE_stats", f"DE_stats_template_data_{project_id}_real.txt"
)

trad_de_stats = pd.read_csv(trad_de_stats_filename, sep="\t", index_col=0, header=0)

In [18]:
# Sort by log fold change
trad_de_stats_sorted = trad_de_stats.sort_values(by="log2FoldChange", ascending=False)

In [19]:
# Add ranking based on log2FoldChange
trad_de_stats_sorted["rank"] = trad_de_stats_sorted["log2FoldChange"].rank(
    ascending=True
)

In [20]:
trad_de_stats_sorted.head(10)

Unnamed: 0,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,rank
G_409,247.896269,0.426834,0.341703,1.249135,0.211616,0.999304,1000.0
G_455,256.032597,0.422385,0.497494,0.849026,0.395867,0.999304,999.0
G_27,244.078791,0.394053,0.408895,0.963701,0.335196,0.999304,998.0
G_225,254.563331,0.37569,0.511622,0.734312,0.462758,0.999304,997.0
G_775,247.954304,0.348207,0.322448,1.079887,0.280193,0.999304,996.0
G_732,268.965494,0.307362,0.204365,1.503986,0.132585,0.999304,995.0
G_526,270.449262,0.294261,0.205304,1.433293,0.151774,0.999304,994.0
G_423,246.780815,0.285675,0.349865,0.81653,0.414197,0.999304,993.0
G_243,254.108202,0.283231,0.2827,1.001875,0.316404,0.999304,992.0
G_786,271.956177,0.279859,0.455698,0.614133,0.539127,0.999304,991.0


## Compare

1. mean rank of specific genes - mean rank of generic genes for template experiment
2. We will need to re-run this notebook for each template experiment and then plot the distribution of difference scores

In [21]:
# Load pickled file
with open(template_specific_gene_ids_filename, "rb") as specific_fh:
    specific_gene_ids = pickle.load(specific_fh)

with open(generic_gene_ids_filename, "rb") as generic_fh:
    generic_gene_ids = pickle.load(generic_fh)

In [22]:
# Get mean of specific gene ranks
sophie_specific_mean = summary_gene_ranks_sorted.loc[specific_gene_ids, "rank"].mean()
trad_specific_mean = trad_de_stats_sorted.loc[specific_gene_ids, "rank"].mean()

In [23]:
print(sophie_specific_mean)
print(trad_specific_mean)

514.0
543.5


In [24]:
summary_gene_ranks_sorted.loc[specific_gene_ids, "rank"]

G_77     892.0
G_483    776.0
G_455    927.0
G_263    117.0
G_549    261.0
G_901    675.0
G_751    384.0
G_911    280.0
G_381    372.0
G_689    456.0
Name: rank, dtype: float64

In [25]:
trad_de_stats_sorted.loc[specific_gene_ids, "rank"]

G_77      70.0
G_483    905.0
G_455    999.0
G_263    553.0
G_549    355.0
G_901    202.0
G_751    697.0
G_911    610.0
G_381    336.0
G_689    708.0
Name: rank, dtype: float64

In [26]:
# Get mean of generic gene ranks
sophie_generic_mean = summary_gene_ranks_sorted.loc[generic_gene_ids, "rank"].mean()
trad_generic_mean = trad_de_stats_sorted.loc[generic_gene_ids, "rank"].mean()

In [27]:
print(sophie_generic_mean)
print(trad_generic_mean)

466.43
509.89


In [28]:
summary_gene_ranks_sorted.loc[generic_gene_ids, "rank"]

G_668    952.0
G_229    928.0
G_341    469.0
G_15     119.0
G_314    606.0
G_925     48.0
G_57     674.0
G_198    815.0
G_87     970.0
G_47     653.0
G_567    737.0
G_86     514.0
G_75     982.0
G_246     67.0
G_974    250.0
G_659    596.0
G_322     87.0
G_495     34.0
G_203    487.0
G_458    344.0
G_0      124.0
G_192    411.0
G_954    458.0
G_296    353.0
G_490    406.0
G_62     316.0
G_843    789.0
G_480    870.0
G_810    265.0
G_872    203.0
         ...  
G_200    489.0
G_861    235.0
G_918    122.0
G_806    253.0
G_541    678.0
G_646    491.0
G_466    578.0
G_811    972.0
G_213    447.0
G_821     27.0
G_238    697.0
G_621    597.0
G_76     695.0
G_180    317.0
G_570    692.0
G_261    284.0
G_558    479.0
G_574    365.0
G_698    595.0
G_662    733.0
G_712    292.0
G_794    435.0
G_818    562.0
G_516    884.0
G_329    502.0
G_728    817.0
G_210    318.0
G_840    193.0
G_223    552.0
G_823    925.0
Name: rank, Length: 100, dtype: float64

In [29]:
trad_de_stats_sorted.loc[generic_gene_ids, "rank"]

G_668     44.0
G_229    841.0
G_341    719.0
G_15     443.0
G_314    887.0
G_925    502.0
G_57     145.0
G_198    874.0
G_87     979.0
G_47     785.0
G_567    925.0
G_86     302.0
G_75     985.0
G_246    470.0
G_974    401.0
G_659    188.0
G_322    554.0
G_495    512.0
G_203    712.0
G_458    748.0
G_0      656.0
G_192    670.0
G_954    804.0
G_296    263.0
G_490    179.0
G_62     737.0
G_843    913.0
G_480     62.0
G_810    630.0
G_872    340.0
         ...  
G_200    270.0
G_861    345.0
G_918    406.0
G_806    344.0
G_541    826.0
G_646     94.0
G_466    131.0
G_811    975.0
G_213    726.0
G_821    516.0
G_238    878.0
G_621    694.0
G_76     919.0
G_180    372.0
G_570    149.0
G_261    284.0
G_558    206.0
G_574    713.0
G_698    102.0
G_662     56.0
G_712    310.0
G_794    747.0
G_818    226.0
G_516     14.0
G_329    609.0
G_728    888.0
G_210    332.0
G_840    381.0
G_223    820.0
G_823     35.0
Name: rank, Length: 100, dtype: float64

In [30]:
# Difference
diff_sophie = sophie_specific_mean - sophie_generic_mean
diff_trad = trad_specific_mean - trad_generic_mean

print("sophie difference: ", diff_sophie)
print("traditional difference: ", diff_trad)

sophie difference:  47.56999999999999
traditional difference:  33.610000000000014
