# Evaluation

How often does traditional differential expression (DE) analysis, compared with SOPHIE, prioritize specific genes over generic genes?

1. Simulate one template perturbation experiment using the technique described above.
2. Apply SOPHIE to rank the specific and generic genes by their z-score.
3. Apply traditional DE analysis to rank the specific and generic genes by their log fold change.
4. Compare the difference in ranking between specific and generic genes under SOPHIE versus the traditional metric.

In [1]:
%load_ext autoreload
%autoreload 2
%load_ext rpy2.ipython
import os
import pickle
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from rpy2.robjects import pandas2ri
from ponyo import utils, train_vae_modules, simulate_expression_data
from generic_expression_patterns_modules import (
    process,
    new_experiment_process,  # REMOVE
    stats,
    ranking,
)

# Seed NumPy's global random number generator so that NumPy-based randomness
# in this notebook is repeatable from run to run.
np.random.seed(1)

examples.directory is deprecated; in the future, examples will be found relative to the 'datapath' directory.
  "found relative to the 'datapath' directory.".format(key))







Using TensorFlow backend.


In [2]:
# Resolve the parent of the current working directory (used later to locate
# the R analysis script) and read this notebook's settings from its config file
parent_of_cwd = os.path.join(os.getcwd(), "../")
base_dir = os.path.abspath(parent_of_cwd)

config_filename = "config_sophie_vs_trad.tsv"
params = utils.read_config(config_filename)

In [3]:
# Unpack the config values used throughout this notebook

# --- Compendium and intermediate storage ---
# Local directory to store intermediate files
local_dir = params["local_dir"]

# Compendium before (raw) and after normalization
raw_compendium_filename = params["raw_compendium_filename"]
normalized_compendium_filename = params["normalized_compendium_filename"]

# Scaler transform used to scale compendium data into 0-1 range for training
scaler_filename = params["scaler_filename"]

# --- Template experiment ---
# Dataset name from the config
# (TODO: describe its role; the only other mention in this notebook is commented out)
dataset_name = params["dataset_name"]

# ID for template experiment to be selected
project_id = params["project_id"]

# File containing un-normalized template experiment
raw_template_filename = params["raw_template_filename"]

# Metadata file that specifies which samples to keep for DE analysis (Optional)
template_process_samples_filename = params["template_process_samples_filename"]

# Metadata file that specifies sample grouping for DE analysis
template_DE_grouping_filename = params["template_DE_grouping_filename"]

# --- Simulation ---
# Directory containing trained VAE model
vae_model_dir = params["vae_model_dir"]

# Size of the latent dimension
latent_dim = params["latent_dim"]

# Number of simulated experiments to generate
num_runs = params["num_simulated"]

# --- Differential expression (DE) analysis ---
# Which DE method to use: DESeq2 is recommended for RNA-seq data,
# Limma for microarray data
de_method = params["DE_method"]

# If using DE-seq, setting this parameter will
# remove genes below a certain threshold
count_threshold = params["count_threshold"]

# Statistic to use to rank genes or pathways by
# TODO: document the accepted values
col_to_rank_genes = params["rank_genes_by"]

# --- Ground-truth gene sets ---
# Pickle files saving specific and generic gene ids
# NOTE(review): the generic gene ids path is a hard-coded relative filename,
# unlike the other paths, which come from the config file.
template_specific_gene_ids_filename = params["template_specific_gene_ids_filename"]
generic_gene_ids_filename = "generic_gene_ids.pickle"

In [4]:
# Files generated by this notebook (their paths are read from the config file)

# File storing template experiment with gene ids mapped to compendium gene ids
mapped_template_filename = params["mapped_template_filename"]

# File storing normalized template experiment
normalized_template_filename = params["normalized_template_filename"]

# File storing processed template experiment,
# after samples have been selected for comparison in DE analysis
processed_template_filename = params["processed_template_filename"]

# Output summary file
# NOTE(review): this variable is not referenced again in the visible cells;
# presumably the path is consumed through `params` by
# `ranking.generate_summary_table` — confirm.
output_filename = params["output_filename"]

## SOPHIE

In [5]:
# Process the template experiment.
# Passed in: the raw template, the raw compendium and the pickled scaler.
# Also passed: the filenames for the gene-id-mapped template and the normalized
# template (presumably written by this call — confirm in
# `new_experiment_process.process_template_experiment`).
new_experiment_process.process_template_experiment(
    raw_template_filename,
    raw_compendium_filename,
    scaler_filename,
    mapped_template_filename,
    normalized_template_filename,
)

(4, 1000)
(360, 1000)


In [6]:
# Simulate `num_runs` experiments from the template experiment.
# Per the original notes, this step creates the following files in the
# "<local_dir>/pseudo_experiment/" directory (only the first is read by later
# cells of this notebook):
#   - selected_simulated_data_<project_id>_<n>.txt
#   - selected_simulated_encoded_data_<project_id>_<n>.txt
#   - template_normalized_data_<project_id>_test.txt
# in which "<n>" is an integer in the range of [0, num_runs-1]

# Load the pickled scaler, closing the file handle deterministically.
# NOTE: unpickling can execute arbitrary code; only load scaler files that
# come from a trusted source.
# `scaler` is not used in this cell (the call below takes `scaler_filename`);
# it is still loaded so the name stays defined for any other code relying on it.
with open(scaler_filename, "rb") as scaler_fh:
    scaler = pickle.load(scaler_fh)

# Make sure the output directory for simulated experiments exists
os.makedirs(os.path.join(local_dir, "pseudo_experiment"), exist_ok=True)

# TODO: update the simulation call to take filenames, to be consistent
normalized_compendium = pd.read_csv(
    normalized_compendium_filename, header=0, sep="\t", index_col=0
)
normalized_template = pd.read_csv(
    normalized_template_filename, header=0, sep="\t", index_col=0
)

# TODO: update this call when the new version of ponyo is available
for run_id in range(num_runs):
    new_experiment_process.embed_shift_template_experiment(
        normalized_compendium,
        normalized_template,
        vae_model_dir,
        project_id,
        scaler_filename,
        local_dir,
        latent_dim,
        run_id,
    )

Instructions for updating:
If using Keras pass *_constraint arguments to layers.



In [7]:
# Prepare the template and simulated experiments for DE analysis.

# The sample-selection metadata file is optional: use None when it is not set
# or does not exist. The explicit `is None` check keeps this cell re-runnable;
# without it a second run would call os.path.exists(None) and raise TypeError.
if template_process_samples_filename is None or not os.path.exists(
    template_process_samples_filename
):
    template_process_samples_filename = None

simulated_dir = os.path.join(local_dir, "pseudo_experiment")

if de_method == "deseq":
    # Process template data
    stats.process_samples_for_DESeq(
        raw_template_filename,
        template_DE_grouping_filename,
        processed_template_filename,
        count_threshold,
        template_process_samples_filename,
    )

    # Process simulated data
    for i in range(num_runs):
        simulated_filename = os.path.join(
            simulated_dir, f"selected_simulated_data_{project_id}_{i}.txt"
        )
        out_simulated_filename = os.path.join(
            simulated_dir, f"selected_simulated_data_{project_id}_{i}_processed.txt"
        )
        stats.process_samples_for_DESeq(
            simulated_filename,
            template_DE_grouping_filename,
            out_simulated_filename,
            count_threshold,
            template_process_samples_filename,
        )
else:
    # Process template data
    stats.process_samples_for_limma(
        raw_template_filename,
        template_DE_grouping_filename,
        processed_template_filename,
        template_process_samples_filename,
    )

    # Process simulated data.
    # Write to "<...>_processed.txt", as the DESeq branch does: the R cell that
    # computes simulated DE statistics reads the "_processed.txt" files for
    # both DE methods, so passing None as the output file here left it
    # without input.
    # NOTE(review): confirm `process_samples_for_limma` treats its third
    # argument as the output path for simulated data as it does for the template.
    for i in range(num_runs):
        simulated_filename = os.path.join(
            simulated_dir, f"selected_simulated_data_{project_id}_{i}.txt"
        )
        out_simulated_filename = os.path.join(
            simulated_dir, f"selected_simulated_data_{project_id}_{i}_processed.txt"
        )
        stats.process_samples_for_limma(
            simulated_filename,
            template_DE_grouping_filename,
            out_simulated_filename,
            template_process_samples_filename,
        )

sample ids are ordered correctly
sample ids are ordered correctly
sample ids are ordered correctly


In [8]:
# Make sure the DE statistics output directory, "<local_dir>/DE_stats/", exists
de_stats_dir = os.path.join(local_dir, "DE_stats")
os.makedirs(de_stats_dir, exist_ok=True)

In [9]:
%%R -i template_DE_grouping_filename -i project_id -i processed_template_filename -i local_dir -i base_dir -i de_method

source(paste0(base_dir, '/generic_expression_patterns_modules/DE_analysis.R'))

# Select the DE routine for the configured method: the DESeq routine when
# de_method is "deseq", the limma routine otherwise
get_DE_stats <- if (de_method == "deseq") get_DE_stats_DESeq else get_DE_stats_limma

# Run DE analysis on the processed template experiment
# File created: "<local_dir>/DE_stats/DE_stats_template_data_<project_id>_real.txt"
get_DE_stats(
    template_DE_grouping_filename,
    project_id,
    processed_template_filename,
    "template",
    local_dir,
    "real"
)

R[write to console]: Loading required package: S4Vectors

R[write to console]: Loading required package: stats4

R[write to console]: Loading required package: BiocGenerics

R[write to console]: Loading required package: parallel

R[write to console]: 
Attaching package: ‘BiocGenerics’


R[write to console]: The following objects are masked from ‘package:parallel’:

    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, parApply, parCapply, parLapply,
    parLapplyLB, parRapply, parSapply, parSapplyLB


R[write to console]: The following object is masked from ‘package:limma’:

    plotMA


R[write to console]: The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs


R[write to console]: The following objects are masked from ‘package:base’:

    anyDuplicated, append, as.data.frame, basename, cbind, colnames,
    dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
    grepl, intersect, is.unsorted, lapply

[1] "Checking sample ordering..."
[1] TRUE


R[write to console]:   the design formula contains one or more numeric variables with integer values,
  specifying a model with increasing fold change for higher values.
  did you mean for this to be a factor? if so, first convert
  this variable to a factor using the factor() function

R[write to console]: -- note: fitType='parametric', but the dispersion trend was not well captured by the
   function: y = a/x + b, and a local regression fit was automatically substituted.
   specify fitType='local' or 'mean' to avoid this message next time.



In [10]:
%%R -i template_DE_grouping_filename -i project_id -i base_dir -i local_dir -i num_runs -i de_method

source(paste0(base_dir, '/generic_expression_patterns_modules/DE_analysis.R'))

# Select the DE routine for the configured method: the DESeq routine when
# de_method is "deseq", the limma routine otherwise
get_DE_stats <- if (de_method == "deseq") get_DE_stats_DESeq else get_DE_stats_limma

# Run DE analysis on every processed simulated experiment
# Files created: "<local_dir>/DE_stats/DE_stats_simulated_data_<project_id>_<n>.txt"
#
# seq_len() gives an empty sequence when num_runs is 0, whereas
# 0:(num_runs-1) would iterate over 0 and -1.
for (i in seq_len(num_runs) - 1L){
    # file.path() inserts the separator itself, so this works whether or not
    # local_dir ends with "/", matching how the Python cells build the path
    # with os.path.join
    simulated_data_filename <- file.path(
        local_dir,
        "pseudo_experiment",
        paste0("selected_simulated_data_", project_id, "_", i, "_processed.txt")
    )
    get_DE_stats(
        template_DE_grouping_filename,
        project_id,
        simulated_data_filename,
        "simulated",
        local_dir,
        i
    )
}

[1] "Checking sample ordering..."
[1] TRUE


R[write to console]:   the design formula contains one or more numeric variables with integer values,
  specifying a model with increasing fold change for higher values.
  did you mean for this to be a factor? if so, first convert
  this variable to a factor using the factor() function

R[write to console]: -- note: fitType='parametric', but the dispersion trend was not well captured by the
   function: y = a/x + b, and a local regression fit was automatically substituted.
   specify fitType='local' or 'mean' to avoid this message next time.



[1] "Checking sample ordering..."
[1] TRUE


R[write to console]:   the design formula contains one or more numeric variables with integer values,
  specifying a model with increasing fold change for higher values.
  did you mean for this to be a factor? if so, first convert
  this variable to a factor using the factor() function



In [11]:
# Rank genes in the template experiment and summarize the simulated experiments
analysis_type = "DE"

# DE statistics file for the real template experiment
template_DE_stats_filename = os.path.join(
    local_dir, "DE_stats", f"DE_stats_template_data_{project_id}_real.txt"
)

# Column names for the log fold change and adjusted p-value differ between
# the two DE methods
if de_method == "deseq":
    logFC_name, pvalue_name = "log2FoldChange", "padj"
else:
    logFC_name, pvalue_name = "logFC", "adj.P.Val"

template_DE_stats, simulated_DE_summary_stats = ranking.process_and_rank_genes_pathways(
    template_DE_stats_filename,
    local_dir,
    num_runs,
    project_id,
    analysis_type,
    col_to_rank_genes,
    logFC_name,
    pvalue_name,
)



In [12]:
# Get summary table
# (per-gene table built from the template DE statistics and the
# simulated-experiment summary statistics; see the output for its columns)
summary_gene_ranks = ranking.generate_summary_table(
    template_DE_stats_filename,
    template_DE_stats,
    simulated_DE_summary_stats,
    col_to_rank_genes,
    local_dir,
    "gene",
    params,
)

# Show the 10 genes with the largest Z score.
# NOTE(review): in the displayed output "Number of experiments (simulated)" is 2
# and the simulated standard deviations are roughly 1e-6 to 1e-3, which gives
# Z scores in the thousands — confirm `num_simulated` is large enough for the
# Z scores to be meaningful.
summary_gene_ranks.sort_values(by="Z score", ascending=False).head(10)

Unnamed: 0,Gene ID,Adj P-value (Real),Rank (Real),abs(log2FoldChange) (Real),log2FoldChange (Real),Median adj p-value (simulated),Rank (simulated),Percentile (simulated),Percent DE (simulated),Mean abs(log2FoldChange) (simulated),Std deviation (simulated),Number of experiments (simulated),Z score
G_304,G_304,0.998894,634.0,0.2411,0.2411,0.999851,509.0,50.850851,0.0,0.027875,1e-06,2,149949.993472
G_409,G_409,0.998894,821.0,0.372228,-0.372228,0.999851,226.0,22.522523,0.0,0.015319,2.8e-05,2,12726.785644
G_718,G_718,0.998894,989.0,0.850727,0.850727,0.999851,62.0,6.106106,0.0,0.007334,0.000189,2,4455.762233
G_425,G_425,0.998894,823.0,0.373348,-0.373348,0.999851,348.0,34.734735,0.0,0.019537,0.000108,2,3286.667271
G_72,G_72,0.998894,399.0,0.134655,-0.134655,0.999851,296.0,29.52953,0.0,0.017578,3.8e-05,2,3105.611932
G_650,G_650,0.998894,644.0,0.250344,0.250344,0.999851,188.0,18.718719,0.0,0.013463,0.000101,2,2342.390442
G_638,G_638,0.998894,957.0,0.637908,-0.637908,0.999851,947.0,94.694695,0.0,0.077306,0.0003,2,1871.061021
G_454,G_454,0.998894,963.0,0.674718,-0.674718,0.999851,932.0,93.193193,0.0,0.071221,0.000349,2,1727.039845
G_822,G_822,0.998894,513.0,0.179884,-0.179884,0.999851,2.0,0.1001,0.0,0.001759,0.000136,2,1313.896804
G_829,G_829,0.998894,994.0,0.938991,-0.938991,0.999851,613.0,61.261261,0.0,0.032964,0.000724,2,1251.903427


In [13]:
# Order genes from the largest to the smallest SOPHIE Z score
summary_gene_ranks_sorted = summary_gene_ranks.sort_values(
    by="Z score", ascending=False
)

In [14]:
# Add ranking based on Z-score
# rank 1 = largest Z score (ascending=False); with pandas' default tie
# handling, tied values share their average rank
summary_gene_ranks_sorted["rank"] = summary_gene_ranks_sorted["Z score"].rank(
    ascending=False
)

In [15]:
# Preview the 10 top-ranked genes according to SOPHIE (largest Z scores)
summary_gene_ranks_sorted.head(10)

Unnamed: 0,Gene ID,Adj P-value (Real),Rank (Real),abs(log2FoldChange) (Real),log2FoldChange (Real),Median adj p-value (simulated),Rank (simulated),Percentile (simulated),Percent DE (simulated),Mean abs(log2FoldChange) (simulated),Std deviation (simulated),Number of experiments (simulated),Z score,rank
G_304,G_304,0.998894,634.0,0.2411,0.2411,0.999851,509.0,50.850851,0.0,0.027875,1e-06,2,149949.993472,1.0
G_409,G_409,0.998894,821.0,0.372228,-0.372228,0.999851,226.0,22.522523,0.0,0.015319,2.8e-05,2,12726.785644,2.0
G_718,G_718,0.998894,989.0,0.850727,0.850727,0.999851,62.0,6.106106,0.0,0.007334,0.000189,2,4455.762233,3.0
G_425,G_425,0.998894,823.0,0.373348,-0.373348,0.999851,348.0,34.734735,0.0,0.019537,0.000108,2,3286.667271,4.0
G_72,G_72,0.998894,399.0,0.134655,-0.134655,0.999851,296.0,29.52953,0.0,0.017578,3.8e-05,2,3105.611932,5.0
G_650,G_650,0.998894,644.0,0.250344,0.250344,0.999851,188.0,18.718719,0.0,0.013463,0.000101,2,2342.390442,6.0
G_638,G_638,0.998894,957.0,0.637908,-0.637908,0.999851,947.0,94.694695,0.0,0.077306,0.0003,2,1871.061021,7.0
G_454,G_454,0.998894,963.0,0.674718,-0.674718,0.999851,932.0,93.193193,0.0,0.071221,0.000349,2,1727.039845,8.0
G_822,G_822,0.998894,513.0,0.179884,-0.179884,0.999851,2.0,0.1001,0.0,0.001759,0.000136,2,1313.896804,9.0
G_829,G_829,0.998894,994.0,0.938991,-0.938991,0.999851,613.0,61.261261,0.0,0.032964,0.000724,2,1251.903427,10.0


## Traditional DE

In [16]:
%%R -i template_DE_grouping_filename -i project_id -i processed_template_filename -i local_dir -i base_dir -i de_method

source(paste0(base_dir, '/generic_expression_patterns_modules/DE_analysis.R'))

# Select the DE routine for the configured method: the DESeq routine when
# de_method is "deseq", the limma routine otherwise
get_DE_stats <- if (de_method == "deseq") get_DE_stats_DESeq else get_DE_stats_limma

# Traditional DE analysis of the processed template experiment.
# This issues the same call, with the same arguments, as the template DE step
# in the SOPHIE section.
# File created: "<local_dir>/DE_stats/DE_stats_template_data_<project_id>_real.txt"
get_DE_stats(
    template_DE_grouping_filename,
    project_id,
    processed_template_filename,
    "template",
    local_dir,
    "real"
)

[1] "Checking sample ordering..."
[1] TRUE


R[write to console]:   the design formula contains one or more numeric variables with integer values,
  specifying a model with increasing fold change for higher values.
  did you mean for this to be a factor? if so, first convert
  this variable to a factor using the factor() function

R[write to console]: -- note: fitType='parametric', but the dispersion trend was not well captured by the
   function: y = a/x + b, and a local regression fit was automatically substituted.
   specify fitType='local' or 'mean' to avoid this message next time.



In [17]:
# Load DE statistics file
# (this path is built from the same components as `template_DE_stats_filename`
# in the SOPHIE section, i.e. the template experiment's DE statistics)
trad_de_stats_filename = os.path.join(
    local_dir, "DE_stats", f"DE_stats_template_data_{project_id}_real.txt"
)

# Tab-separated table; the first column is used as the row index
trad_de_stats = pd.read_csv(trad_de_stats_filename, sep="\t", index_col=0, header=0)

In [18]:
# Sort by log fold change, largest first.
# Use `logFC_name` (set earlier to "log2FoldChange" for DESeq and "logFC" for
# limma) instead of hard-coding the DESeq column name, so this cell also works
# when `de_method` is limma.
# NOTE(review): this sorts by the signed log fold change, so strongly
# down-regulated genes land at the bottom; the SOPHIE summary table works with
# abs(log fold change) — confirm that signed ordering is intended here.
trad_de_stats_sorted = trad_de_stats.sort_values(by=logFC_name, ascending=False)

In [19]:
# Add ranking based on the log fold change (rank 1 = largest value).
# Use `logFC_name` (set earlier to "log2FoldChange" for DESeq and "logFC" for
# limma) instead of hard-coding the DESeq column name, so this cell also works
# when `de_method` is limma.
trad_de_stats_sorted["rank"] = trad_de_stats_sorted[logFC_name].rank(
    ascending=False
)

In [20]:
# Preview the 10 top-ranked genes according to traditional DE (largest log fold change)
trad_de_stats_sorted.head(10)

Unnamed: 0,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,rank
G_513,68.425319,1.030356,1.286037,0.801187,0.423023,0.998894,1.0
G_753,68.934742,0.857123,1.261278,0.679567,0.496779,0.998894,2.0
G_718,71.891717,0.850727,1.145588,0.742612,0.457717,0.998894,3.0
G_135,79.934183,0.832334,0.907726,0.916944,0.359172,0.998894,4.0
G_86,67.862238,0.801011,1.305851,0.613401,0.539611,0.998894,5.0
G_633,76.890754,0.797921,0.985482,0.809676,0.418126,0.998894,6.0
G_705,68.78566,0.787412,1.264556,0.622678,0.533496,0.998894,7.0
G_545,74.512134,0.778864,1.05742,0.73657,0.461384,0.998894,8.0
G_16,74.315754,0.757656,1.061977,0.713439,0.475574,0.998894,9.0
G_555,68.282549,0.747342,1.28594,0.581164,0.56113,0.998894,10.0


## Compare

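1. For the template experiment, compute the difference score: (mean rank of specific genes) - (mean rank of generic genes), once using the SOPHIE ranking and once using the traditional DE ranking.
2. We will need to re-run this notebook for each template experiment and then plot the distribution of the difference scores.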

In [21]:
# Load the pickled ids of the specific and generic genes
# NOTE: unpickling can execute arbitrary code; only load pickle files that come
# from a trusted source.
with open(template_specific_gene_ids_filename, "rb") as specific_fh:
    specific_gene_ids = pickle.load(specific_fh)

with open(generic_gene_ids_filename, "rb") as generic_fh:
    generic_gene_ids = pickle.load(generic_fh)

In [22]:
# Average rank of the specific genes under each ranking scheme
sophie_ranks = summary_gene_ranks_sorted["rank"]
trad_ranks = trad_de_stats_sorted["rank"]

sophie_specific_mean = sophie_ranks.loc[specific_gene_ids].mean()
trad_specific_mean = trad_ranks.loc[specific_gene_ids].mean()

In [23]:
# SOPHIE (Z-score based) ranks of the specific genes
summary_gene_ranks_sorted.loc[specific_gene_ids, "rank"]

G_456    233.0
G_692    487.0
G_800     16.0
G_504    710.0
G_846    229.0
G_833     23.0
G_269    786.0
G_19     227.0
G_387    784.0
G_476    242.0
Name: rank, dtype: float64

In [24]:
# Traditional DE (log fold change based) ranks of the specific genes
trad_de_stats_sorted.loc[specific_gene_ids, "rank"]

G_456     14.0
G_692    745.0
G_800     62.0
G_504    321.0
G_846    729.0
G_833    836.0
G_269    367.0
G_19     306.0
G_387    580.0
G_476    900.0
Name: rank, dtype: float64

In [25]:
# Average rank of the generic genes under each ranking scheme
sophie_ranks = summary_gene_ranks_sorted["rank"]
trad_ranks = trad_de_stats_sorted["rank"]

sophie_generic_mean = sophie_ranks.loc[generic_gene_ids].mean()
trad_generic_mean = trad_ranks.loc[generic_gene_ids].mean()

In [26]:
# SOPHIE (Z-score based) ranks of the generic genes
summary_gene_ranks_sorted.loc[generic_gene_ids, "rank"]

G_368    872.0
G_809    597.0
G_961    744.0
G_664    102.0
G_750    507.0
G_785    614.0
G_554     19.0
G_585    666.0
G_287    336.0
G_746    273.0
G_636    602.0
G_592    539.0
G_494    652.0
G_916     35.0
G_830    904.0
G_169    352.0
G_463    470.0
G_787    819.0
G_466    434.0
G_967    277.0
G_956    157.0
G_61     303.0
G_469    278.0
G_210    632.0
G_983    237.0
G_426    555.0
G_189    781.0
G_706    716.0
G_662    885.0
G_23     354.0
         ...  
G_667    215.0
G_721    275.0
G_910    358.0
G_874    919.0
G_252    702.0
G_179    768.0
G_259    844.0
G_717    505.0
G_515    953.0
G_461    587.0
G_121    579.0
G_251    601.0
G_812    635.0
G_999    812.0
G_219    461.0
G_363    385.0
G_425      4.0
G_875    825.0
G_489    307.0
G_539    570.0
G_938    980.0
G_253    261.0
G_583    322.0
G_770     38.0
G_280    508.0
G_864    894.0
G_724    588.0
G_949    177.0
G_274    967.0
G_352     75.0
Name: rank, Length: 100, dtype: float64

In [27]:
# Traditional DE (log fold change based) ranks of the generic genes
trad_de_stats_sorted.loc[generic_gene_ids, "rank"]

G_368    477.0
G_809    932.0
G_961    578.0
G_664    121.0
G_750    287.0
G_785    730.0
G_554    122.0
G_585    717.0
G_287    156.0
G_746    195.0
G_636    180.0
G_592    201.0
G_494    718.0
G_916    123.0
G_830    465.0
G_169    961.0
G_463    225.0
G_787    629.0
G_466    840.0
G_967    731.0
G_956     41.0
G_61     679.0
G_469     84.0
G_210    301.0
G_983    105.0
G_426    851.0
G_189    277.0
G_706    204.0
G_662    438.0
G_23     959.0
         ...  
G_667    310.0
G_721     16.0
G_910    960.0
G_874    483.0
G_252    598.0
G_179    622.0
G_259    364.0
G_717    390.0
G_515    593.0
G_461    194.0
G_121    777.0
G_251    243.0
G_812    738.0
G_999    482.0
G_219     93.0
G_363     73.0
G_425    914.0
G_875    387.0
G_489    241.0
G_539    240.0
G_938    334.0
G_253    974.0
G_583    165.0
G_770    969.0
G_280    271.0
G_864    564.0
G_724    305.0
G_949    200.0
G_274    490.0
G_352     98.0
Name: rank, Length: 100, dtype: float64

In [28]:
# Difference score: mean rank of specific genes minus mean rank of generic genes.
# Rank 1 is the top of each ranking, so a negative difference means the
# specific genes sit nearer the top, on average, than the generic genes.
diff_sophie = sophie_specific_mean - sophie_generic_mean
diff_trad = trad_specific_mean - trad_generic_mean

print("sophie difference: ", diff_sophie)
print("traditional difference: ", diff_trad)

sophie difference:  -124.15000000000003
traditional difference:  24.319999999999993
