# Evaluation

How often does traditional differential expression (DE) analysis, compared with SOPHIE, prioritize specific genes over generic genes?

1. Simulate 1 template perturbation experiment using the technique above
2. Apply SOPHIE to get ranking of specific and generic genes based on their z-score.
3. Apply traditional DE analysis and get ranking of specific and generic genes based on their log fold change value
4. Compare the difference in ranking between specific and generic genes using SOPHIE vs traditional metrics.

In [1]:
%load_ext autoreload
%autoreload 2
%load_ext rpy2.ipython
import os
import pickle
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from rpy2.robjects import pandas2ri
from ponyo import utils, train_vae_modules, simulate_expression_data
from generic_expression_patterns_modules import (
    process,
    new_experiment_process,  # REMOVE
    stats,
    ranking,
)

# Seed NumPy's global random number generator so that NumPy-based random
# draws repeat across runs of this notebook.
# NOTE(review): only NumPy is seeded here; if the simulation also draws from
# TensorFlow's RNG, that would need its own seed — confirm.
np.random.seed(1)

examples.directory is deprecated; in the future, examples will be found relative to the 'datapath' directory.
  "found relative to the 'datapath' directory.".format(key))







Using TensorFlow backend.


In [2]:
# Parse the notebook's config file (path is relative to the working directory)
config_filename = "config_sophie_vs_trad.tsv"
params = utils.read_config(config_filename)

# Parent of the current working directory; the R cells use it to locate
# generic_expression_patterns_modules/DE_analysis.R
base_dir = os.path.abspath(os.path.join(os.getcwd(), "../"))

In [3]:
# Unpack the config values used throughout this notebook

# ---- Directories ----
# Local directory to store intermediate files
local_dir = params["local_dir"]
# Directory containing trained VAE model
vae_model_dir = params["vae_model_dir"]

# ---- Input data ----
# Dataset name, exactly as given in the config
dataset_name = params["dataset_name"]
# Un-normalized template experiment, and the ID used to select it
raw_template_filename = params["raw_template_filename"]
project_id = params["project_id"]
# Compendium before and after normalization
raw_compendium_filename = params["raw_compendium_filename"]
normalized_compendium_filename = params["normalized_compendium_filename"]
# Scaler transform used to scale compendium data into 0-1 range for training
scaler_filename = params["scaler_filename"]

# ---- Simulation ----
# Number of simulated experiments to generate
num_runs = params["num_simulated"]
# Size of the latent dimension
latent_dim = params["latent_dim"]

# ---- DE analysis ----
# Which DE method to use: "deseq" selects the DESeq2 code path below, any
# other value selects limma.
# We recommend DESeq2 for RNA-seq data and limma for microarray data
de_method = params["DE_method"]
# If using DESeq2, genes below this count threshold are removed
count_threshold = params["count_threshold"]
# Metadata file that specifies which samples to keep for DE analysis (Optional)
template_process_samples_filename = params["template_process_samples_filename"]
# Metadata file that specifies sample grouping for DE analysis
template_DE_grouping_filename = params["template_DE_grouping_filename"]
# Statistic to use to rank genes or pathways by
# TODO: list the valid choices here
col_to_rank_genes = params["rank_genes_by"]

# ---- Ground-truth gene sets ----
# Pickle files saving specific and generic gene ids
template_specific_gene_ids_filename = params["template_specific_gene_ids_filename"]
# NOTE(review): unlike the specific-gene file, this path is hard-coded and
# resolved relative to the working directory — confirm this is intended
generic_gene_ids_filename = "generic_gene_ids.pickle"

In [4]:
# Paths of the files written by this notebook

# Template experiment at each processing stage:
#   mapped     - gene ids mapped to compendium gene ids
#   normalized - normalized template experiment
#   processed  - after samples have been selected for comparison in DE analysis
mapped_template_filename = params["mapped_template_filename"]
normalized_template_filename = params["normalized_template_filename"]
processed_template_filename = params["processed_template_filename"]

# Output summary file
output_filename = params["output_filename"]

## SOPHIE

In [5]:
# Process template
# Takes the raw template experiment, the raw compendium and the saved scaler
# as inputs; mapped_template_filename and normalized_template_filename are the
# files this notebook generates (presumably written by this call — confirm
# against new_experiment_process.process_template_experiment).
new_experiment_process.process_template_experiment(
    raw_template_filename,
    raw_compendium_filename,
    scaler_filename,
    mapped_template_filename,
    normalized_template_filename,
)

(4, 1000)
(360, 1000)


In [6]:
# Simulate `num_runs` experiments from the template experiment, one call to
# embed_shift_template_experiment per run.
# Per the original notes, this step creates the following files in the
# "<local_dir>/pseudo_experiment/" directory:
#   - selected_simulated_data_<project_id>_<n>.txt
#   - selected_simulated_encoded_data_<project_id>_<n>.txt
#   - template_normalized_data_<project_id>_test.txt
# in which "<n>" is an integer in the range of [0, num_runs-1]

# Load the pickled scaler. The context manager closes the file handle, which
# the previous bare open() call leaked. Only unpickle files you trust.
# NOTE(review): `scaler` is not used in this cell (the helper below receives
# `scaler_filename`) — confirm whether this load is still needed.
with open(scaler_filename, "rb") as scaler_fh:
    scaler = pickle.load(scaler_fh)

# Make sure the output directory for simulated experiments exists
os.makedirs(os.path.join(local_dir, "pseudo_experiment"), exist_ok=True)

# TODO: have the helper take filenames so its interface matches the others
normalized_compendium = pd.read_csv(
    normalized_compendium_filename, header=0, sep="\t", index_col=0
)
normalized_template = pd.read_csv(
    normalized_template_filename, header=0, sep="\t", index_col=0
)

# TODO: update this call when the new version of ponyo is available
for run_id in range(num_runs):
    new_experiment_process.embed_shift_template_experiment(
        normalized_compendium,
        normalized_template,
        vae_model_dir,
        project_id,
        scaler_filename,
        local_dir,
        latent_dim,
        run_id,
    )

Instructions for updating:
If using Keras pass *_constraint arguments to layers.



In [7]:
# Prepare the template experiment and every simulated experiment for DE
# analysis, using the DESeq2 helper when de_method is "deseq" and the limma
# helper otherwise.

# The sample-selection metadata file is optional: pass None to the helpers
# when it is unset or missing on disk. The explicit None check avoids the
# TypeError that os.path.exists(None) raises.
if template_process_samples_filename is not None and not os.path.exists(
    template_process_samples_filename
):
    template_process_samples_filename = None

use_deseq = de_method == "deseq"
pseudo_experiment_dir = os.path.join(local_dir, "pseudo_experiment")

# Process template data
if use_deseq:
    stats.process_samples_for_DESeq(
        raw_template_filename,
        template_DE_grouping_filename,
        processed_template_filename,
        count_threshold,
        template_process_samples_filename,
    )
else:
    stats.process_samples_for_limma(
        raw_template_filename,
        template_DE_grouping_filename,
        processed_template_filename,
        template_process_samples_filename,
    )

# Process simulated data
for i in range(num_runs):
    simulated_filename = os.path.join(
        pseudo_experiment_dir,
        f"selected_simulated_data_{project_id}_{i}.txt",
    )
    out_simulated_filename = os.path.join(
        pseudo_experiment_dir,
        f"selected_simulated_data_{project_id}_{i}_processed.txt",
    )
    if use_deseq:
        stats.process_samples_for_DESeq(
            simulated_filename,
            template_DE_grouping_filename,
            out_simulated_filename,
            count_threshold,
            template_process_samples_filename,
        )
    else:
        # Write to the "_processed.txt" file as well: the R cell that runs DE
        # on the simulated experiments reads that file for both DE methods.
        # (None was passed as the output filename here before.)
        stats.process_samples_for_limma(
            simulated_filename,
            template_DE_grouping_filename,
            out_simulated_filename,
            template_process_samples_filename,
        )

sample ids are ordered correctly
sample ids are ordered correctly
sample ids are ordered correctly
sample ids are ordered correctly
sample ids are ordered correctly
sample ids are ordered correctly
sample ids are ordered correctly
sample ids are ordered correctly
sample ids are ordered correctly
sample ids are ordered correctly
sample ids are ordered correctly


In [8]:
# Make sure "<local_dir>/DE_stats/" exists; the DE statistics files are
# created in it
de_stats_dir = os.path.join(local_dir, "DE_stats")
os.makedirs(de_stats_dir, exist_ok=True)

In [9]:
%%R -i template_DE_grouping_filename -i project_id -i processed_template_filename -i local_dir -i base_dir -i de_method

source(paste0(base_dir, '/generic_expression_patterns_modules/DE_analysis.R'))

# Choose the DE routine for the configured method:
# DESeq2 when de_method is "deseq", limma for any other value
run_de_analysis <- if (de_method == "deseq") get_DE_stats_DESeq else get_DE_stats_limma

# Run DE on the processed template experiment
# File created: "<local_dir>/DE_stats/DE_stats_template_data_<project_id>_real.txt"
run_de_analysis(
    template_DE_grouping_filename,
    project_id,
    processed_template_filename,
    "template",
    local_dir,
    "real"
)

R[write to console]: Loading required package: S4Vectors

R[write to console]: Loading required package: stats4

R[write to console]: Loading required package: BiocGenerics

R[write to console]: Loading required package: parallel

R[write to console]: 
Attaching package: ‘BiocGenerics’


R[write to console]: The following objects are masked from ‘package:parallel’:

    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, parApply, parCapply, parLapply,
    parLapplyLB, parRapply, parSapply, parSapplyLB


R[write to console]: The following object is masked from ‘package:limma’:

    plotMA


R[write to console]: The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs


R[write to console]: The following objects are masked from ‘package:base’:

    anyDuplicated, append, as.data.frame, basename, cbind, colnames,
    dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
    grepl, intersect, is.unsorted, lapply

[1] "Checking sample ordering..."
[1] TRUE


R[write to console]:   the design formula contains one or more numeric variables with integer values,
  specifying a model with increasing fold change for higher values.
  did you mean for this to be a factor? if so, first convert
  this variable to a factor using the factor() function

R[write to console]: -- note: fitType='parametric', but the dispersion trend was not well captured by the
   function: y = a/x + b, and a local regression fit was automatically substituted.
   specify fitType='local' or 'mean' to avoid this message next time.



In [10]:
%%R -i template_DE_grouping_filename -i project_id -i base_dir -i local_dir -i num_runs -i de_method

source(paste0(base_dir, '/generic_expression_patterns_modules/DE_analysis.R'))

# Run DE analysis on every processed simulated experiment.
# Files created: "<local_dir>/DE_stats/DE_stats_simulated_data_<project_id>_<n>.txt"

# The DE routine is the same for every run, so select it once:
# DESeq2 when de_method is "deseq", limma for any other value
get_DE_stats <- if (de_method == "deseq") get_DE_stats_DESeq else get_DE_stats_limma

# seq_len() gives an empty sequence when num_runs is 0, whereas
# 0:(num_runs-1) would iterate over 0 and -1. Subtracting 1L keeps the run
# index an integer, as before.
for (i in seq_len(num_runs) - 1L){
    # file.path() inserts the directory separator, so the path matches the
    # os.path.join() paths used on the Python side even when local_dir has no
    # trailing slash
    simulated_data_filename <- file.path(
        local_dir,
        "pseudo_experiment",
        paste0("selected_simulated_data_", project_id, "_", i, "_processed.txt")
    )
    get_DE_stats(
        template_DE_grouping_filename,
        project_id,
        simulated_data_filename,
        "simulated",
        local_dir,
        i
    )
}

[1] "Checking sample ordering..."
[1] TRUE


R[write to console]:   the design formula contains one or more numeric variables with integer values,
  specifying a model with increasing fold change for higher values.
  did you mean for this to be a factor? if so, first convert
  this variable to a factor using the factor() function

R[write to console]: -- note: fitType='parametric', but the dispersion trend was not well captured by the
   function: y = a/x + b, and a local regression fit was automatically substituted.
   specify fitType='local' or 'mean' to avoid this message next time.



[1] "Checking sample ordering..."
[1] TRUE


R[write to console]:   the design formula contains one or more numeric variables with integer values,
  specifying a model with increasing fold change for higher values.
  did you mean for this to be a factor? if so, first convert
  this variable to a factor using the factor() function

R[write to console]: -- note: fitType='parametric', but the dispersion trend was not well captured by the
   function: y = a/x + b, and a local regression fit was automatically substituted.
   specify fitType='local' or 'mean' to avoid this message next time.



[1] "Checking sample ordering..."
[1] TRUE


R[write to console]:   the design formula contains one or more numeric variables with integer values,
  specifying a model with increasing fold change for higher values.
  did you mean for this to be a factor? if so, first convert
  this variable to a factor using the factor() function

R[write to console]: -- note: fitType='parametric', but the dispersion trend was not well captured by the
   function: y = a/x + b, and a local regression fit was automatically substituted.
   specify fitType='local' or 'mean' to avoid this message next time.



[1] "Checking sample ordering..."
[1] TRUE


R[write to console]:   the design formula contains one or more numeric variables with integer values,
  specifying a model with increasing fold change for higher values.
  did you mean for this to be a factor? if so, first convert
  this variable to a factor using the factor() function

R[write to console]: -- note: fitType='parametric', but the dispersion trend was not well captured by the
   function: y = a/x + b, and a local regression fit was automatically substituted.
   specify fitType='local' or 'mean' to avoid this message next time.



[1] "Checking sample ordering..."
[1] TRUE


R[write to console]:   the design formula contains one or more numeric variables with integer values,
  specifying a model with increasing fold change for higher values.
  did you mean for this to be a factor? if so, first convert
  this variable to a factor using the factor() function

R[write to console]: -- note: fitType='parametric', but the dispersion trend was not well captured by the
   function: y = a/x + b, and a local regression fit was automatically substituted.
   specify fitType='local' or 'mean' to avoid this message next time.



[1] "Checking sample ordering..."
[1] TRUE


R[write to console]:   the design formula contains one or more numeric variables with integer values,
  specifying a model with increasing fold change for higher values.
  did you mean for this to be a factor? if so, first convert
  this variable to a factor using the factor() function



[1] "Checking sample ordering..."
[1] TRUE


R[write to console]:   the design formula contains one or more numeric variables with integer values,
  specifying a model with increasing fold change for higher values.
  did you mean for this to be a factor? if so, first convert
  this variable to a factor using the factor() function

R[write to console]: -- note: fitType='parametric', but the dispersion trend was not well captured by the
   function: y = a/x + b, and a local regression fit was automatically substituted.
   specify fitType='local' or 'mean' to avoid this message next time.



[1] "Checking sample ordering..."
[1] TRUE


R[write to console]:   the design formula contains one or more numeric variables with integer values,
  specifying a model with increasing fold change for higher values.
  did you mean for this to be a factor? if so, first convert
  this variable to a factor using the factor() function

R[write to console]: -- note: fitType='parametric', but the dispersion trend was not well captured by the
   function: y = a/x + b, and a local regression fit was automatically substituted.
   specify fitType='local' or 'mean' to avoid this message next time.



[1] "Checking sample ordering..."
[1] TRUE


R[write to console]:   the design formula contains one or more numeric variables with integer values,
  specifying a model with increasing fold change for higher values.
  did you mean for this to be a factor? if so, first convert
  this variable to a factor using the factor() function

R[write to console]: -- note: fitType='parametric', but the dispersion trend was not well captured by the
   function: y = a/x + b, and a local regression fit was automatically substituted.
   specify fitType='local' or 'mean' to avoid this message next time.



[1] "Checking sample ordering..."
[1] TRUE


R[write to console]:   the design formula contains one or more numeric variables with integer values,
  specifying a model with increasing fold change for higher values.
  did you mean for this to be a factor? if so, first convert
  this variable to a factor using the factor() function



In [11]:
# Rank genes in the template experiment and summarize the simulated runs
analysis_type = "DE"

template_DE_stats_basename = f"DE_stats_template_data_{project_id}_real.txt"
template_DE_stats_filename = os.path.join(
    local_dir, "DE_stats", template_DE_stats_basename
)

# The log fold change and adjusted p-value columns are named differently in
# the DESeq2 ("deseq") output than in the limma output
logFC_name, pvalue_name = (
    ("log2FoldChange", "padj") if de_method == "deseq" else ("logFC", "adj.P.Val")
)

template_DE_stats, simulated_DE_summary_stats = ranking.process_and_rank_genes_pathways(
    template_DE_stats_filename,
    local_dir,
    num_runs,
    project_id,
    analysis_type,
    col_to_rank_genes,
    logFC_name,
    pvalue_name,
)



In [12]:
# Get summary table
# Combines the template DE statistics with the summary statistics of the
# simulated experiments into one table of per-gene results ("gene" selects
# gene-level rather than pathway-level output — presumably; confirm against
# ranking.generate_summary_table).
summary_gene_ranks = ranking.generate_summary_table(
    template_DE_stats_filename,
    template_DE_stats,
    simulated_DE_summary_stats,
    col_to_rank_genes,
    local_dir,
    "gene",
    params,
)

# Preview the 10 rows with the largest "Z score"
summary_gene_ranks.sort_values(by="Z score", ascending=False).head(10)

Unnamed: 0,Gene ID,Adj P-value (Real),Rank (Real),abs(log2FoldChange) (Real),log2FoldChange (Real),Median adj p-value (simulated),Rank (simulated),Percentile (simulated),Percent DE (simulated),Mean abs(log2FoldChange) (simulated),Std deviation (simulated),Number of experiments (simulated),Z score
G_137,G_137,0.998503,998.0,0.183802,-0.183802,0.942993,267.0,26.626627,0.0,0.045722,0.030321,10,4.553959
G_696,G_696,0.998503,999.0,0.194072,-0.194072,0.928404,166.0,16.516517,0.0,0.060837,0.033908,10,3.929277
G_564,G_564,0.998503,961.0,0.12941,0.12941,0.970573,20.0,1.901902,0.0,0.03179,0.025275,10,3.862334
G_328,G_328,0.998503,980.0,0.145465,-0.145465,0.936806,58.0,5.705706,0.0,0.039655,0.029553,10,3.580394
G_286,G_286,0.998503,984.0,0.148703,-0.148703,0.89181,231.0,23.023023,0.0,0.054301,0.032582,10,2.89733
G_875,G_875,0.998503,964.0,0.133251,-0.133251,0.89959,191.0,19.019019,0.0,0.053604,0.028587,10,2.786105
G_521,G_521,0.998503,983.0,0.148425,0.148425,0.944842,292.0,29.129129,0.0,0.047234,0.036612,10,2.763869
G_618,G_618,0.998503,977.0,0.142026,-0.142026,0.922467,306.0,30.530531,0.0,0.056194,0.033217,10,2.583978
G_8,G_8,0.998503,968.0,0.134471,0.134471,0.900976,125.0,12.412412,0.0,0.053907,0.031843,10,2.530033
G_913,G_913,0.998503,987.0,0.151699,0.151699,0.946122,107.0,10.610611,0.0,0.052862,0.039358,10,2.511202


In [13]:
# Order the summary table from the largest to the smallest "Z score"
# (sort_values returns a new frame; summary_gene_ranks is left unchanged)
summary_gene_ranks_sorted = summary_gene_ranks.sort_values(
    by="Z score", ascending=False
)

In [14]:
# Add ranking based on Z-score
# ascending=False assigns rank 1 to the largest Z score; tied values share
# their average rank (the pandas default)
summary_gene_ranks_sorted["rank"] = summary_gene_ranks_sorted["Z score"].rank(
    ascending=False
)

In [15]:
# Preview the 10 top-ranked genes by Z score, now with the "rank" column
summary_gene_ranks_sorted.head(10)

Unnamed: 0,Gene ID,Adj P-value (Real),Rank (Real),abs(log2FoldChange) (Real),log2FoldChange (Real),Median adj p-value (simulated),Rank (simulated),Percentile (simulated),Percent DE (simulated),Mean abs(log2FoldChange) (simulated),Std deviation (simulated),Number of experiments (simulated),Z score,rank
G_137,G_137,0.998503,998.0,0.183802,-0.183802,0.942993,267.0,26.626627,0.0,0.045722,0.030321,10,4.553959,1.0
G_696,G_696,0.998503,999.0,0.194072,-0.194072,0.928404,166.0,16.516517,0.0,0.060837,0.033908,10,3.929277,2.0
G_564,G_564,0.998503,961.0,0.12941,0.12941,0.970573,20.0,1.901902,0.0,0.03179,0.025275,10,3.862334,3.0
G_328,G_328,0.998503,980.0,0.145465,-0.145465,0.936806,58.0,5.705706,0.0,0.039655,0.029553,10,3.580394,4.0
G_286,G_286,0.998503,984.0,0.148703,-0.148703,0.89181,231.0,23.023023,0.0,0.054301,0.032582,10,2.89733,5.0
G_875,G_875,0.998503,964.0,0.133251,-0.133251,0.89959,191.0,19.019019,0.0,0.053604,0.028587,10,2.786105,6.0
G_521,G_521,0.998503,983.0,0.148425,0.148425,0.944842,292.0,29.129129,0.0,0.047234,0.036612,10,2.763869,7.0
G_618,G_618,0.998503,977.0,0.142026,-0.142026,0.922467,306.0,30.530531,0.0,0.056194,0.033217,10,2.583978,8.0
G_8,G_8,0.998503,968.0,0.134471,0.134471,0.900976,125.0,12.412412,0.0,0.053907,0.031843,10,2.530033,9.0
G_913,G_913,0.998503,987.0,0.151699,0.151699,0.946122,107.0,10.610611,0.0,0.052862,0.039358,10,2.511202,10.0


## Traditional DE

In [16]:
%%R -i template_DE_grouping_filename -i project_id -i processed_template_filename -i local_dir -i base_dir -i de_method

source(paste0(base_dir, '/generic_expression_patterns_modules/DE_analysis.R'))

# Traditional DE analysis of the template experiment. This issues the same
# call, with the same arguments, as the template DE cell in the SOPHIE
# section, so it writes to the same stats file.

# DESeq2 when de_method is "deseq", limma for any other value
template_de_fn <- if (de_method == "deseq") get_DE_stats_DESeq else get_DE_stats_limma

# File created: "<local_dir>/DE_stats/DE_stats_template_data_<project_id>_real.txt"
template_de_fn(
    template_DE_grouping_filename,
    project_id,
    processed_template_filename,
    "template",
    local_dir,
    "real"
)

[1] "Checking sample ordering..."
[1] TRUE


R[write to console]:   the design formula contains one or more numeric variables with integer values,
  specifying a model with increasing fold change for higher values.
  did you mean for this to be a factor? if so, first convert
  this variable to a factor using the factor() function

R[write to console]: -- note: fitType='parametric', but the dispersion trend was not well captured by the
   function: y = a/x + b, and a local regression fit was automatically substituted.
   specify fitType='local' or 'mean' to avoid this message next time.



In [17]:
# Read the template experiment's DE statistics, using the first column as
# the row index
trad_de_stats_basename = f"DE_stats_template_data_{project_id}_real.txt"
trad_de_stats_filename = os.path.join(local_dir, "DE_stats", trad_de_stats_basename)

trad_de_stats = pd.read_csv(
    trad_de_stats_filename,
    sep="\t",
    header=0,
    index_col=0,
)

In [18]:
# Sort genes from the largest to the smallest log fold change.
# The column is looked up through `logFC_name` (set in the SOPHIE section from
# de_method) instead of a hard-coded "log2FoldChange": the limma output calls
# this column "logFC", so the literal only worked for de_method == "deseq".
# NOTE(review): this orders by the signed value, so strongly down-regulated
# genes land at the bottom, while the SOPHIE summary table reports
# abs(log2FoldChange) — confirm that a signed ranking is intended.
trad_de_stats_sorted = trad_de_stats.sort_values(by=logFC_name, ascending=False)

In [19]:
# Add ranking based on log fold change: rank 1 is the largest value and tied
# values share their average rank (the pandas default).
# `logFC_name` (set in the SOPHIE section from de_method) replaces the
# hard-coded "log2FoldChange", which does not exist in limma output (the
# column is "logFC" there).
trad_de_stats_sorted["rank"] = trad_de_stats_sorted[logFC_name].rank(
    ascending=False
)

In [20]:
# Preview the 10 top-ranked genes by log fold change
trad_de_stats_sorted.head(10)

Unnamed: 0,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,rank
G_862,2816.870147,0.173619,0.101177,1.715991,0.086164,0.998503,1.0
G_187,2913.237654,0.165636,0.087628,1.890223,0.058728,0.998503,2.0
G_548,2890.56162,0.162476,0.08806,1.845048,0.065031,0.998503,3.0
G_938,2894.179951,0.159485,0.093224,1.710759,0.087126,0.998503,4.0
G_537,2815.13977,0.155824,0.104766,1.487351,0.136922,0.998503,5.0
G_200,2849.108244,0.152376,0.202281,0.753289,0.451276,0.998503,6.0
G_913,2857.709181,0.151699,0.099102,1.53073,0.125836,0.998503,7.0
G_322,2737.129054,0.15067,0.127496,1.181759,0.237301,0.998503,8.0
G_521,2685.154759,0.148425,0.151785,0.977858,0.328145,0.998503,9.0
G_807,2814.465233,0.147657,0.108098,1.365951,0.171954,0.998503,10.0


## Compare

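1. For the template experiment, compute the difference score: (mean rank of specific genes) - (mean rank of generic genes). Do this once with the SOPHIE ranking and once with the traditional DE ranking.
2. Re-run this notebook for each template experiment, then plot the distribution of the difference scores.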

In [21]:
def _load_pickle(pickle_filename):
    """Return the object stored in the pickle file `pickle_filename`."""
    with open(pickle_filename, "rb") as pickle_fh:
        return pickle.load(pickle_fh)


# Ids of the specific and generic genes
specific_gene_ids = _load_pickle(template_specific_gene_ids_filename)
generic_gene_ids = _load_pickle(generic_gene_ids_filename)

In [22]:
# Average rank of the specific genes under each ranking
sophie_specific_ranks = summary_gene_ranks_sorted.loc[specific_gene_ids, "rank"]
trad_specific_ranks = trad_de_stats_sorted.loc[specific_gene_ids, "rank"]

sophie_specific_mean = sophie_specific_ranks.mean()
trad_specific_mean = trad_specific_ranks.mean()

In [23]:
# Mean rank of specific genes: SOPHIE first, then traditional DE
print(sophie_specific_mean, trad_specific_mean, sep="\n")

486.6
624.6


In [24]:
# Z-score-based (SOPHIE) rank of each specific gene
summary_gene_ranks_sorted.loc[specific_gene_ids, "rank"]

G_248     247.0
G_584     626.0
G_894     130.0
G_455    1000.0
G_462     846.0
G_841     284.0
G_277     275.0
G_885     945.0
G_590     210.0
G_848     303.0
Name: rank, dtype: float64

In [25]:
# Log-fold-change-based (traditional DE) rank of each specific gene
trad_de_stats_sorted.loc[specific_gene_ids, "rank"]

G_248    868.0
G_584    592.0
G_894    950.0
G_455    566.0
G_462    682.0
G_841    232.0
G_277    149.0
G_885    607.0
G_590    940.0
G_848    660.0
Name: rank, dtype: float64

In [26]:
# Average rank of the generic genes under each ranking
sophie_generic_ranks = summary_gene_ranks_sorted.loc[generic_gene_ids, "rank"]
trad_generic_ranks = trad_de_stats_sorted.loc[generic_gene_ids, "rank"]

sophie_generic_mean = sophie_generic_ranks.mean()
trad_generic_mean = trad_generic_ranks.mean()

In [27]:
# Mean rank of generic genes: SOPHIE first, then traditional DE
print(sophie_generic_mean, trad_generic_mean, sep="\n")

498.06
483.69


In [28]:
# Z-score-based (SOPHIE) rank of each generic gene
summary_gene_ranks_sorted.loc[generic_gene_ids, "rank"]

G_668    793.0
G_229    925.0
G_341    752.0
G_15     294.0
G_314    900.0
G_925    331.0
G_57     540.0
G_198    264.0
G_87      14.0
G_47     559.0
G_567     31.0
G_86     335.0
G_75     614.0
G_246    126.0
G_974    771.0
G_659    315.0
G_322    112.0
G_495    489.0
G_203     60.0
G_458    324.0
G_0      795.0
G_192     65.0
G_954    168.0
G_296    334.0
G_490    114.0
G_62     138.0
G_843    106.0
G_480    441.0
G_810    591.0
G_872    670.0
         ...  
G_200     16.0
G_861    586.0
G_918    532.0
G_806    884.0
G_541    177.0
G_646    722.0
G_466    153.0
G_811    281.0
G_213    768.0
G_821     32.0
G_238    680.0
G_621    630.0
G_76     715.0
G_180     80.0
G_570    178.0
G_261    914.0
G_558    952.0
G_574    799.0
G_698    595.0
G_662    851.0
G_712    505.0
G_794    475.0
G_818    224.0
G_516     38.0
G_329    689.0
G_728    855.0
G_210    868.0
G_840    813.0
G_223    568.0
G_823    109.0
Name: rank, Length: 100, dtype: float64

In [29]:
# Log-fold-change-based (traditional DE) rank of each generic gene
trad_de_stats_sorted.loc[generic_gene_ids, "rank"]

G_668    606.0
G_229    565.0
G_341    649.0
G_15     901.0
G_314    435.0
G_925    852.0
G_57     270.0
G_198    881.0
G_87     995.0
G_47     259.0
G_567    993.0
G_86      92.0
G_75     604.0
G_246    952.0
G_974    792.0
G_659    889.0
G_322      8.0
G_495    758.0
G_203     33.0
G_458    237.0
G_0      468.0
G_192     43.0
G_954    105.0
G_296    305.0
G_490     61.0
G_62      14.0
G_843    922.0
G_480    775.0
G_810    665.0
G_872    274.0
         ...  
G_200      6.0
G_861    224.0
G_918    208.0
G_806    362.0
G_541     76.0
G_646    368.0
G_466    898.0
G_811    779.0
G_213    371.0
G_821     58.0
G_238    560.0
G_621    342.0
G_76     316.0
G_180     73.0
G_570    236.0
G_261    633.0
G_558    537.0
G_574    357.0
G_698    180.0
G_662    937.0
G_712    117.0
G_794    827.0
G_818    896.0
G_516     17.0
G_329    553.0
G_728    746.0
G_210    514.0
G_840    458.0
G_223    701.0
G_823     86.0
Name: rank, Length: 100, dtype: float64

In [30]:
# Difference score = mean rank of specific genes - mean rank of generic genes.
# Rank 1 is the top of each ranking, so a negative score means the specific
# genes were, on average, ranked closer to the top than the generic genes.
diff_sophie = sophie_specific_mean - sophie_generic_mean
diff_trad = trad_specific_mean - trad_generic_mean

for label, difference in (
    ("sophie difference: ", diff_sophie),
    ("traditional difference: ", diff_trad),
):
    print(label, difference)

sophie difference:  -11.45999999999998
traditional difference:  140.91000000000003
