# Identify generic genes and pathways

This notebook is meant to identify common differentially expressed genes in the PAO1 and PA14 compendia

In [1]:
%load_ext autoreload
%load_ext rpy2.ipython
%autoreload 2

import os
import sys
import glob
import pandas as pd
import numpy as np
import seaborn as sns
import pickle
import scipy.stats as ss
from keras.models import load_model
from rpy2.robjects import pandas2ri
from ponyo import utils, simulate_expression_data
from generic_expression_patterns_modules import process, stats, ranking

pandas2ri.activate()

np.random.seed(123)

examples.directory is deprecated; in the future, examples will be found relative to the 'datapath' directory.
  "found relative to the 'datapath' directory.".format(key))
Using TensorFlow backend.


In [2]:
# Read in config variables
base_dir = os.path.abspath(os.path.join(os.getcwd(), "../"))

config_filename = os.path.abspath(
    os.path.join(base_dir, "configs", "config_pseudomonas_pa14_rnaseq.tsv")
)
params = utils.read_config(config_filename)

In [3]:
# Load params
local_dir = params["local_dir"]
dataset_name = params["dataset_name"]
NN_architecture = params["NN_architecture"]
num_runs = params["num_simulated"]
project_id = params["project_id"]
metadata_col_id = params["metadata_colname"]
raw_template_filename = params["raw_template_filename"]
processed_template_filename = params["processed_template_filename"]
normalized_compendium_filename = params["normalized_compendium_filename"]
scaler_filename = params["scaler_filename"]
col_to_rank_genes = params["rank_genes_by"]
logFC_name = params["DE_logFC_name"]
pvalue_name = params["DE_pvalue_name"]
latent_dim = params["latent_dim"]

# Load metadata file with grouping assignments for samples
sample_id_metadata_filename = os.path.join(
    base_dir, dataset_name, "data", "metadata", f"{project_id}_process_samples.tsv"
)

# Load metadata file with grouping assignments for samples
metadata_filename = os.path.join(
    base_dir, dataset_name, "data", "metadata", f"{project_id}_groups.tsv"
)

# Load pickled file
scaler = pickle.load(open(scaler_filename, "rb"))

# Percentile threshold to identify generic genes
percentile_threshold = 80.0

metadata_simulate_filename = "data/metadata/SraRunTable.csv"
metadata_delimiter = ","
experiment_id_colname = "SRA_study"

In [4]:
# Output files
gene_summary_filename = os.path.join(
    base_dir, dataset_name, f"generic_gene_summary_{project_id}.tsv"
)

## Need to customize code from ponyo

The current simulation-related function in ponyo, `get_sample_ids` assumes that the user is using one of two different metadata files (one associated with the pseudomonas compendium and another associated with recount2). The compendium dataset we are using here has a slightly different format for their metadata file.

Here we are temporarily writing our own function customized for this Pa RNA-seq compendia

In [5]:
def get_sample_ids(
    metadata_filename, delimiter, experiment_colname, experiment_id, sample_id_colname
):
    """
    Returns sample ids (found in gene expression df) associated with
    a given list of experiment ids (found in the metadata)

    Arguments
    ----------
    metadata_filename: str
        Metadata file path. An example metadata file can be found
        here: https://github.com/greenelab/ponyo/blob/master/human_tests/data/metadata/recount2_metadata.tsv

    delimiter: str
        Delimiter for metadata file

    experiment_colname: str
        Column header that contains the experiment ids

    experiment_id: str
        Experiment id selected to retrieve sample ids for

    sample_id_colname: str
        Column header that contains sample id that maps expression data
        and metadata

    """

    # Read in metadata
    metadata = pd.read_csv(metadata_filename, header=0, sep=delimiter, index_col=None)

    # Set index column to experiment id column
    metadata.set_index(experiment_colname, inplace=True)

    # Select samples associated with experiment id
    selected_metadata = metadata.loc[experiment_id]
    sample_ids = list(selected_metadata[sample_id_colname])

    return sample_ids

In [6]:
def shift_template_experiment_custom(
    normalized_data_filename,
    NN_architecture,
    latent_dim,
    dataset_name,
    scaler,
    metadata_filename,
    metadata_delimiter,
    experiment_id_colname,
    sample_id_colname,
    selected_experiment_id,
    local_dir,
    base_dir,
    num_runs,
):
    """
    Generate new simulated experiment using the selected_experiment_id as a template
    experiment using the same workflow as `simulate_by_latent_transform`

    This will return a file with a single simulated experiment following the workflow mentioned.
    This function can be run multiple times to generate multiple simulated experiments from a
    single selected_experiment_id.

    Arguments
    ----------
    normalized_data_filename: str
        File containing normalized gene expression data

        ------------------------------| PA0001 | PA0002 |...
        05_PA14000-4-2_5-10-07_S2.CEL | 0.8533 | 0.7252 |...
        54375-4-05.CEL                | 0.7789 | 0.7678 |...
        ...                           | ...    | ...    |...

    NN_architecture: str
        Name of neural network architecture to use.
        Format 'NN_<intermediate layer>_<latent layer>'

    latent_dim: int
        The number of dimensions in the latent space

    dataset_name: str
        Name for analysis directory. Either "Human" or "Pseudomonas"

    scaler: minmax model
        Model used to transform data into a different range

    metadata_filename: str
        Metadata file path. Note: The format of this metadata file
        requires the index column to contain experiment ids.

    metadata_delimiter: str
        Delimiter for metadata file

    experiment_colname: str
        Column header that contains the experiment ids

    sample_id_colname: str
        Column header that contains sample id that maps expression data
        and metadata

    selected_experiment_id: str
        Experiment id selected as template

    local_dir: str
        Parent directory on local machine to store intermediate results

    base_dir: str
        Root directory containing analysis subdirectories

    num_runs: int
        Number of experiments to simulate

    Returns
    --------
    simulated_data_filename: str
        File containing simulated gene expression data

    """

    # Files
    NN_dir = os.path.join(base_dir, dataset_name, "models", NN_architecture)

    model_encoder_filename = glob.glob(os.path.join(NN_dir, "*_encoder_model.h5"))[0]

    weights_encoder_filename = glob.glob(os.path.join(NN_dir, "*_encoder_weights.h5"))[
        0
    ]

    model_decoder_filename = glob.glob(os.path.join(NN_dir, "*_decoder_model.h5"))[0]

    weights_decoder_filename = glob.glob(os.path.join(NN_dir, "*_decoder_weights.h5"))[
        0
    ]

    # Load saved models
    loaded_model = load_model(model_encoder_filename, compile=False)
    loaded_decode_model = load_model(model_decoder_filename, compile=False)

    loaded_model.load_weights(weights_encoder_filename)
    loaded_decode_model.load_weights(weights_decoder_filename)

    # Read data
    normalized_data = pd.read_csv(
        normalized_data_filename, header=0, sep="\t", index_col=0
    )

    # Get corresponding sample ids
    sample_ids = get_sample_ids(
        metadata_filename,
        metadata_delimiter,
        experiment_id_colname,
        selected_experiment_id,
        sample_id_colname,
    )

    # Gene expression data for selected samples
    selected_data_df = normalized_data.loc[sample_ids]

    for run in range(num_runs):
        simulated_data_decoded_df, simulated_data_encoded_df = run_shift_template(
            loaded_model,
            loaded_decode_model,
            normalized_data,
            selected_data_df,
            latent_dim,
        )

        # Un-normalize the data in order to run DE analysis downstream
        simulated_data_scaled = scaler.inverse_transform(simulated_data_decoded_df)

        simulated_data_scaled_df = pd.DataFrame(
            simulated_data_scaled,
            columns=simulated_data_decoded_df.columns,
            index=simulated_data_decoded_df.index,
        )

        # Save
        out_filename = os.path.join(
            local_dir,
            "pseudo_experiment",
            "selected_simulated_data_"
            + selected_experiment_id
            + "_"
            + str(run)
            + ".txt",
        )
        simulated_data_scaled_df.to_csv(out_filename, float_format="%.3f", sep="\t")

        out_encoded_filename = os.path.join(
            local_dir,
            "pseudo_experiment",
            f"selected_simulated_encoded_data_{selected_experiment_id}_{run}.txt",
        )

        simulated_data_encoded_df.to_csv(
            out_encoded_filename, float_format="%.3f", sep="\t"
        )

    # Save template data for visualization validation
    test_filename = os.path.join(
        local_dir,
        "pseudo_experiment",
        "template_normalized_data_" + selected_experiment_id + "_test.txt",
    )
    selected_data_df.to_csv(test_filename, float_format="%.3f", sep="\t")


def run_shift_template(encoder, decoder, normalized_data, selected_data_df, latent_dim):
    """
    This function does the template shifting used in `shift_template_experiment`.

    Arguments
    ---------
    encoder: keras.models.Model
        The encoder half of the VAE. `encoder` takes in a (samples x genes) dataframe of
        gene expression data and encodes it into a latent space

    decoder: keras.models.Model
        The decoder half of the VAE. `decoder` takes a dataframe of means and standard deviations
        and uses them to simulate gene expression data close to the distribution of normalized_data

    normalized_data: pd.DataFrame
        The data to be used to train the VAE

    selected_data_df: pd.DataFrame
        The samples to be shifted in the latent space

    latent_dim: int
        The dimension of the latent space the samples will be shifted in

    Returns
    -------
    simulated_data_decoded_df: pd.DataFrame
        The simulated data created by shifting the samples in the latent space

    simulated_data_encoded_df: pd.DataFrame
        The latent means and standard deviations in the latent space used to simulate the data
    """
    # Encode selected experiment into latent space
    data_encoded = encoder.predict_on_batch(selected_data_df)
    data_encoded_df = pd.DataFrame(data_encoded, index=selected_data_df.index)

    # Get centroid of original data
    centroid = data_encoded_df.mean(axis=0)

    # Add individual vectors(centroid, sample point) to new_centroid

    # Encode original gene expression data into latent space
    data_encoded_all = encoder.predict_on_batch(normalized_data)
    data_encoded_all_df = pd.DataFrame(data_encoded_all, index=normalized_data.index)

    data_encoded_all_df.head()

    # Find a new location in the latent space by sampling from the latent space
    encoded_means = data_encoded_all_df.mean(axis=0)
    encoded_stds = data_encoded_all_df.std(axis=0)

    latent_dim = int(latent_dim)
    new_centroid = np.zeros(latent_dim)

    for j in range(latent_dim):
        new_centroid[j] = np.random.normal(encoded_means[j], encoded_stds[j])

    shift_vec_df = new_centroid - centroid

    simulated_data_encoded_df = data_encoded_df.apply(
        lambda x: x + shift_vec_df, axis=1
    )

    # Decode simulated data into raw gene space
    simulated_data_decoded = decoder.predict_on_batch(simulated_data_encoded_df)

    simulated_data_decoded_df = pd.DataFrame(
        simulated_data_decoded,
        index=simulated_data_encoded_df.index,
        columns=selected_data_df.columns,
    )

    return simulated_data_decoded_df, simulated_data_encoded_df

### Simulate experiments using selected template experiment
Workflow:

1. Get the gene expression data for the selected template experiment
2. Encode this experiment into a latent space using the trained VAE model
3. Linearly shift the encoded template experiment in the latent space
4. Decode the samples. This results in a new experiment
5. Repeat steps 1-4 to get multiple simulated experiments

In [7]:
# Simulate multiple experiments
# This step creates the following files in "<local_dir>/pseudo_experiment/" directory:
#   - selected_simulated_data_SRP012656_<n>.txt
#   - selected_simulated_encoded_data_SRP012656_<n>.txt
#   - template_normalized_data_SRP012656_test.txt
# in which "<n>" is an integer in the range of [0, num_runs-1]
os.makedirs(os.path.join(local_dir, "pseudo_experiment"), exist_ok=True)
shift_template_experiment_custom(
    normalized_compendium_filename,
    NN_architecture,
    latent_dim,
    dataset_name,
    scaler,
    metadata_simulate_filename,
    metadata_delimiter,
    experiment_id_colname,
    metadata_col_id,
    project_id,
    local_dir,
    base_dir,
    num_runs,
)

Instructions for updating:
If using Keras pass *_constraint arguments to layers.



In [8]:
simulated_filename = os.path.join(
    local_dir, "pseudo_experiment", f"selected_simulated_data_{project_id}_1.txt"
)

test = pd.read_csv(simulated_filename, sep="\t", index_col=0, header=0)

In [9]:
test.head()

Unnamed: 0,PA14_55610,PA14_55600,PA14_55590,PA14_55580,PA14_55570,PA14_55560,PA14_55550,PA14_55540,PA14_55530,PA14_55520,...,PA14_19205,PA14_17675,PA14_67975,PA14_36345,PA14_43405,PA14_38825,PA14_24245,PA14_28895,PA14_55117,PA14_59845
SRX1740204,111.777,50.537,10.08,19.057,21.094,19.976,17.336,18.764,16.352,4.021,...,44.504,162.106,633.776,144.577,0.171,29.336,137.091,683.27,155.982,209.853
SRX1740205,103.666,51.088,10.955,26.069,16.42,16.012,12.438,15.369,10.092,2.846,...,41.855,160.967,799.159,103.454,0.135,18.419,116.171,550.327,152.655,87.486
SRX1740206,102.711,49.355,9.203,15.811,18.975,19.545,14.986,18.0,18.599,4.126,...,48.051,176.501,683.735,174.97,0.236,43.901,134.203,682.758,140.616,211.644
SRX1740207,123.578,25.647,10.035,12.35,37.381,10.266,13.395,21.082,54.47,3.969,...,37.466,64.821,314.995,250.59,0.176,38.048,202.545,432.848,146.042,801.247
SRX1740208,127.446,41.818,11.924,28.756,43.823,21.7,17.51,27.53,37.405,6.179,...,44.97,102.648,465.611,257.01,0.473,56.969,189.951,570.901,139.93,400.68


### Process template and simulated data

* Remove samples not required for comparison.
* Make sure ordering of samples matches metadata for proper comparison

In [10]:
if not os.path.exists(sample_id_metadata_filename):
    sample_id_metadata_filename = None

stats.process_samples_for_DESeq(
    raw_template_filename,
    metadata_filename,
    processed_template_filename,
    sample_id_metadata_filename,
)

for i in range(num_runs):
    simulated_filename = os.path.join(
        local_dir, "pseudo_experiment", f"selected_simulated_data_{project_id}_{i}.txt"
    )
    stats.process_samples_for_DESeq(
        simulated_filename,
        metadata_filename,
        None,
        sample_id_metadata_filename,
    )

sample ids are ordered correctly
sample ids are ordered correctly
sample ids are ordered correctly
sample ids are ordered correctly
sample ids are ordered correctly
sample ids are ordered correctly
sample ids are ordered correctly
sample ids are ordered correctly
sample ids are ordered correctly
sample ids are ordered correctly
sample ids are ordered correctly
sample ids are ordered correctly
sample ids are ordered correctly
sample ids are ordered correctly
sample ids are ordered correctly
sample ids are ordered correctly
sample ids are ordered correctly
sample ids are ordered correctly
sample ids are ordered correctly
sample ids are ordered correctly
sample ids are ordered correctly
sample ids are ordered correctly
sample ids are ordered correctly
sample ids are ordered correctly
sample ids are ordered correctly
sample ids are ordered correctly


In [11]:
# Quick check
template_data = pd.read_csv(
    processed_template_filename, header=0, index_col=0, sep="\t"
)

assert template_data.shape[0] == 6

In [12]:
template_data.head()

Unnamed: 0,PA14_55610,PA14_55600,PA14_55590,PA14_55580,PA14_55570,PA14_55560,PA14_55550,PA14_55540,PA14_55530,PA14_55520,...,PA14_19205,PA14_17675,PA14_67975,PA14_36345,PA14_43405,PA14_38825,PA14_24245,PA14_28895,PA14_55117,PA14_59845
SRX1740204,179,76,33,1142,356,104,31,85,75,5,...,48,19,261,275,0,56,135,156,63,261
SRX1740205,160,65,31,1395,435,115,37,101,58,4,...,45,25,389,251,0,116,125,164,58,296
SRX1740206,156,69,34,695,207,56,20,52,55,4,...,38,23,253,341,1,126,133,136,56,306
SRX1740207,230,54,20,474,174,39,15,47,74,5,...,20,24,173,483,0,113,197,102,72,390
SRX1740208,199,64,32,562,169,59,19,52,86,8,...,27,21,165,368,1,95,143,125,71,330


### Differential expression analysis

In [13]:
# Create subdirectory: "<local_dir>/DE_stats/"
os.makedirs(os.path.join(local_dir, "DE_stats"), exist_ok=True)

In [14]:
%%R -i metadata_filename -i project_id -i processed_template_filename -i local_dir -i base_dir

source(paste0(base_dir, '/generic_expression_patterns_modules/DE_analysis.R'))

get_DE_stats_DESeq(metadata_filename,
                   project_id,
                   processed_template_filename,
                   "template",
                   local_dir,
                   "real")

R[write to console]: Loading required package: S4Vectors

R[write to console]: Loading required package: stats4

R[write to console]: Loading required package: BiocGenerics

R[write to console]: Loading required package: parallel

R[write to console]: 
Attaching package: ‘BiocGenerics’


R[write to console]: The following objects are masked from ‘package:parallel’:

    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, parApply, parCapply, parLapply,
    parLapplyLB, parRapply, parSapply, parSapplyLB


R[write to console]: The following object is masked from ‘package:limma’:

    plotMA


R[write to console]: The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs


R[write to console]: The following objects are masked from ‘package:base’:

    anyDuplicated, append, as.data.frame, basename, cbind, colnames,
    dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
    grepl, intersect, is.unsorted, lapply

[1] "Checking sample ordering..."
[1] TRUE


R[write to console]:   the design formula contains one or more numeric variables with integer values,
  specifying a model with increasing fold change for higher values.
  did you mean for this to be a factor? if so, first convert
  this variable to a factor using the factor() function



In [15]:
%%R -i metadata_filename -i project_id -i base_dir -i local_dir -i num_runs -o num_sign_DEGs_simulated

source(paste0(base_dir,'/generic_expression_patterns_modules/DE_analysis.R'))

num_sign_DEGs_simulated <- c()

for (i in 0:(num_runs-1)){
    simulated_data_filename <- paste(
        local_dir,
        "pseudo_experiment/selected_simulated_data_",
        project_id,
        "_",
        i,
        ".txt",
        sep=""
    )

    run_output <- get_DE_stats_DESeq(
        metadata_filename,
        project_id,
        simulated_data_filename,
        "simulated",
        local_dir,
        i
    )
    num_sign_DEGs_simulated <- c(num_sign_DEGs_simulated, run_output)
}

[1] "Checking sample ordering..."
[1] TRUE


R[write to console]:   the design formula contains one or more numeric variables with integer values,
  specifying a model with increasing fold change for higher values.
  did you mean for this to be a factor? if so, first convert
  this variable to a factor using the factor() function



[1] "Checking sample ordering..."
[1] TRUE


R[write to console]:   the design formula contains one or more numeric variables with integer values,
  specifying a model with increasing fold change for higher values.
  did you mean for this to be a factor? if so, first convert
  this variable to a factor using the factor() function



[1] "Checking sample ordering..."
[1] TRUE


R[write to console]:   the design formula contains one or more numeric variables with integer values,
  specifying a model with increasing fold change for higher values.
  did you mean for this to be a factor? if so, first convert
  this variable to a factor using the factor() function



[1] "Checking sample ordering..."
[1] TRUE


R[write to console]:   the design formula contains one or more numeric variables with integer values,
  specifying a model with increasing fold change for higher values.
  did you mean for this to be a factor? if so, first convert
  this variable to a factor using the factor() function



[1] "Checking sample ordering..."
[1] TRUE


R[write to console]:   the design formula contains one or more numeric variables with integer values,
  specifying a model with increasing fold change for higher values.
  did you mean for this to be a factor? if so, first convert
  this variable to a factor using the factor() function



[1] "Checking sample ordering..."
[1] TRUE


R[write to console]:   the design formula contains one or more numeric variables with integer values,
  specifying a model with increasing fold change for higher values.
  did you mean for this to be a factor? if so, first convert
  this variable to a factor using the factor() function



[1] "Checking sample ordering..."
[1] TRUE


R[write to console]:   the design formula contains one or more numeric variables with integer values,
  specifying a model with increasing fold change for higher values.
  did you mean for this to be a factor? if so, first convert
  this variable to a factor using the factor() function



[1] "Checking sample ordering..."
[1] TRUE


R[write to console]:   the design formula contains one or more numeric variables with integer values,
  specifying a model with increasing fold change for higher values.
  did you mean for this to be a factor? if so, first convert
  this variable to a factor using the factor() function



[1] "Checking sample ordering..."
[1] TRUE


R[write to console]:   the design formula contains one or more numeric variables with integer values,
  specifying a model with increasing fold change for higher values.
  did you mean for this to be a factor? if so, first convert
  this variable to a factor using the factor() function



[1] "Checking sample ordering..."
[1] TRUE


R[write to console]:   the design formula contains one or more numeric variables with integer values,
  specifying a model with increasing fold change for higher values.
  did you mean for this to be a factor? if so, first convert
  this variable to a factor using the factor() function



[1] "Checking sample ordering..."
[1] TRUE


R[write to console]:   the design formula contains one or more numeric variables with integer values,
  specifying a model with increasing fold change for higher values.
  did you mean for this to be a factor? if so, first convert
  this variable to a factor using the factor() function



[1] "Checking sample ordering..."
[1] TRUE


R[write to console]:   the design formula contains one or more numeric variables with integer values,
  specifying a model with increasing fold change for higher values.
  did you mean for this to be a factor? if so, first convert
  this variable to a factor using the factor() function



[1] "Checking sample ordering..."
[1] TRUE


R[write to console]:   the design formula contains one or more numeric variables with integer values,
  specifying a model with increasing fold change for higher values.
  did you mean for this to be a factor? if so, first convert
  this variable to a factor using the factor() function



[1] "Checking sample ordering..."
[1] TRUE


R[write to console]:   the design formula contains one or more numeric variables with integer values,
  specifying a model with increasing fold change for higher values.
  did you mean for this to be a factor? if so, first convert
  this variable to a factor using the factor() function



[1] "Checking sample ordering..."
[1] TRUE


R[write to console]:   the design formula contains one or more numeric variables with integer values,
  specifying a model with increasing fold change for higher values.
  did you mean for this to be a factor? if so, first convert
  this variable to a factor using the factor() function



[1] "Checking sample ordering..."
[1] TRUE


R[write to console]:   the design formula contains one or more numeric variables with integer values,
  specifying a model with increasing fold change for higher values.
  did you mean for this to be a factor? if so, first convert
  this variable to a factor using the factor() function



[1] "Checking sample ordering..."
[1] TRUE


R[write to console]:   the design formula contains one or more numeric variables with integer values,
  specifying a model with increasing fold change for higher values.
  did you mean for this to be a factor? if so, first convert
  this variable to a factor using the factor() function



[1] "Checking sample ordering..."
[1] TRUE


R[write to console]:   the design formula contains one or more numeric variables with integer values,
  specifying a model with increasing fold change for higher values.
  did you mean for this to be a factor? if so, first convert
  this variable to a factor using the factor() function



[1] "Checking sample ordering..."
[1] TRUE


R[write to console]:   the design formula contains one or more numeric variables with integer values,
  specifying a model with increasing fold change for higher values.
  did you mean for this to be a factor? if so, first convert
  this variable to a factor using the factor() function



[1] "Checking sample ordering..."
[1] TRUE


R[write to console]:   the design formula contains one or more numeric variables with integer values,
  specifying a model with increasing fold change for higher values.
  did you mean for this to be a factor? if so, first convert
  this variable to a factor using the factor() function



[1] "Checking sample ordering..."
[1] TRUE


R[write to console]:   the design formula contains one or more numeric variables with integer values,
  specifying a model with increasing fold change for higher values.
  did you mean for this to be a factor? if so, first convert
  this variable to a factor using the factor() function



[1] "Checking sample ordering..."
[1] TRUE


R[write to console]:   the design formula contains one or more numeric variables with integer values,
  specifying a model with increasing fold change for higher values.
  did you mean for this to be a factor? if so, first convert
  this variable to a factor using the factor() function



[1] "Checking sample ordering..."
[1] TRUE


R[write to console]:   the design formula contains one or more numeric variables with integer values,
  specifying a model with increasing fold change for higher values.
  did you mean for this to be a factor? if so, first convert
  this variable to a factor using the factor() function



[1] "Checking sample ordering..."
[1] TRUE


R[write to console]:   the design formula contains one or more numeric variables with integer values,
  specifying a model with increasing fold change for higher values.
  did you mean for this to be a factor? if so, first convert
  this variable to a factor using the factor() function



[1] "Checking sample ordering..."
[1] TRUE


R[write to console]:   the design formula contains one or more numeric variables with integer values,
  specifying a model with increasing fold change for higher values.
  did you mean for this to be a factor? if so, first convert
  this variable to a factor using the factor() function



### Rank genes

In [16]:
analysis_type = "DE"
template_DE_stats_filename = os.path.join(
    local_dir, "DE_stats", f"DE_stats_template_data_{project_id}_real.txt"
)
template_DE_stats, simulated_DE_summary_stats = ranking.process_and_rank_genes_pathways(
    template_DE_stats_filename,
    local_dir,
    num_runs,
    project_id,
    analysis_type,
    col_to_rank_genes,
    logFC_name,
    pvalue_name,
)



### Gene summary table

In [17]:
summary_gene_ranks = ranking.generate_summary_table(
    template_DE_stats_filename,
    template_DE_stats,
    simulated_DE_summary_stats,
    col_to_rank_genes,
    local_dir,
    "gene",
    params,
)

summary_gene_ranks.sort_values(by="Z score", ascending=False).head()

Unnamed: 0,Gene ID,Adj P-value (Real),Rank (Real),abs(log2FoldChange) (Real),log2FoldChange (Real),Median adj p-value (simulated),Rank (simulated),Percentile (simulated),Percent DE (simulated),Mean abs(log2FoldChange) (simulated),Std deviation (simulated),Number of experiments (simulated),Z score
PA14_53300,PA14_53300,3.2774899999999997e-44,5890.0,3.820634,-3.820634,0.615595,4244.0,72.037351,0.0,0.2497,0.194925,25,18.319543
PA14_21530,PA14_21530,1.6171430000000001e-28,5887.0,3.273077,-3.273077,0.757436,4379.0,74.329372,0.0,0.222476,0.172918,25,17.641922
PA14_66480,PA14_66480,5.056304e-07,5700.0,1.299171,1.299171,0.646066,1318.0,22.359932,0.0,0.127432,0.066487,25,17.62367
PA14_12590,PA14_12590,0.2669583,5791.0,1.517689,1.517689,0.949002,1069.0,18.132428,0.0,0.11592,0.080223,25,17.473402
PA14_10500,PA14_10500,2.4008669999999998e-30,5884.0,3.173688,-3.173688,0.286293,4426.0,75.127334,0.0,0.254018,0.17393,25,16.786449


In [18]:
# Add gene name as column to summary dataframe
summary_gene_ranks = ranking.add_pseudomonas_gene_name_col(summary_gene_ranks, base_dir)
summary_gene_ranks.sort_values(by="Z score", ascending=False).head()

Unnamed: 0,Gene ID,Adj P-value (Real),Rank (Real),abs(log2FoldChange) (Real),log2FoldChange (Real),Median adj p-value (simulated),Rank (simulated),Percentile (simulated),Percent DE (simulated),Mean abs(log2FoldChange) (simulated),Std deviation (simulated),Number of experiments (simulated),Z score,Gene Name
PA14_53300,PA14_53300,3.2774899999999997e-44,5890.0,3.820634,-3.820634,0.615595,4244.0,72.037351,0.0,0.2497,0.194925,25,18.319543,
PA14_21530,PA14_21530,1.6171430000000001e-28,5887.0,3.273077,-3.273077,0.757436,4379.0,74.329372,0.0,0.222476,0.172918,25,17.641922,
PA14_66480,PA14_66480,5.056304e-07,5700.0,1.299171,1.299171,0.646066,1318.0,22.359932,0.0,0.127432,0.066487,25,17.62367,
PA14_12590,PA14_12590,0.2669583,5791.0,1.517689,1.517689,0.949002,1069.0,18.132428,0.0,0.11592,0.080223,25,17.473402,
PA14_10500,PA14_10500,2.4008669999999998e-30,5884.0,3.173688,-3.173688,0.286293,4426.0,75.127334,0.0,0.254018,0.17393,25,16.786449,


In [19]:
summary_gene_ranks.sort_values(by="Percentile (simulated)", ascending=False).head()

Unnamed: 0,Gene ID,Adj P-value (Real),Rank (Real),abs(log2FoldChange) (Real),log2FoldChange (Real),Median adj p-value (simulated),Rank (simulated),Percentile (simulated),Percent DE (simulated),Mean abs(log2FoldChange) (simulated),Std deviation (simulated),Number of experiments (simulated),Z score,Gene Name
PA14_06940,PA14_06940,0.9948748,57.0,0.005601,0.005601,0.000785,5891.0,100.0,0.44,1.127401,0.700321,25,-1.601835,
PA14_33800,PA14_33800,0.9161133,525.0,0.051189,0.051189,0.16041,5890.0,99.983022,0.24,0.900162,0.557125,25,-1.523848,
PA14_17700,PA14_17700,2.629764e-52,5883.0,3.131006,3.131006,0.001289,5889.0,99.966044,0.4,0.947139,0.650671,25,3.356331,
PA14_13280,PA14_13280,0.05929881,5008.0,0.728369,0.728369,0.038319,5888.0,99.949066,0.2,0.751504,0.399483,25,-0.057912,
PA14_27690,PA14_27690,0.6656786,1929.0,0.192381,0.192381,0.461541,5887.0,99.932088,0.26087,1.025154,0.780311,23,-1.067233,


In [20]:
# Check if there is an NaN values, there should not be
summary_gene_ranks.isna().any()

Gene ID                                 False
Adj P-value (Real)                       True
Rank (Real)                              True
abs(log2FoldChange) (Real)               True
log2FoldChange (Real)                    True
Median adj p-value (simulated)          False
Rank (simulated)                        False
Percentile (simulated)                  False
Percent DE (simulated)                  False
Mean abs(log2FoldChange) (simulated)    False
Std deviation (simulated)               False
Number of experiments (simulated)       False
Z score                                  True
Gene Name                                True
dtype: bool

In [21]:
# Create `gene_summary_filename`
summary_gene_ranks.to_csv(gene_summary_filename, sep="\t")

## Compare gene ranking

We can only compare the ranking between the PAO1 RNA-seq compendium vs GAPE, where we still see good concordance as expected.

When we look for common genes, we do this based on percentiles generated by SOPHIE for both the PAO1 and PA14 compendia to be consistent.

In [22]:
if "pao1" in config_filename:
    # Get generic genes identified by Crow et. al.
    GAPE_filename = params["reference_gene_filename"]
    ref_gene_col = params["reference_gene_name_col"]
    ref_rank_col = params["reference_rank_col"]

    figure_filename = f"gene_ranking_{col_to_rank_genes}.svg"

    corr, shared_ranking = ranking.compare_gene_ranking(
        summary_gene_ranks, GAPE_filename, ref_gene_col, ref_rank_col, figure_filename
    )