# Application: new experiment

This notebook allows users to find generic genes in their experiment of interest using an existing VAE model.

In [1]:
# Load the autoreload extension so that edited modules can be re-imported
# without restarting the kernel
%load_ext autoreload
# Load the rpy2 IPython extension (provides the %%R cell magic)
%load_ext rpy2.ipython
# Mode 2: reload all modules before executing each cell
%autoreload 2

In [2]:
import os
import pandas as pd

from generic_expression_patterns_modules import process, new_experiment_process

examples.directory is deprecated; in the future, examples will be found relative to the 'datapath' directory.
  "found relative to the 'datapath' directory.".format(key))
Using TensorFlow backend.


## User input

User needs to define the following:

1. Template experiment. This is the experiment you are interested in studying
2. Training compendium used to train the VAE, including both the unnormalized, gene-mapped version and the normalized version
3. Scaler transform used to normalize the training compendium
4. Directory containing trained VAE model
5. Experiment id used to label the newly created simulated experiments

## TO DO:
If users want to train a new model instead of reusing an existing one, document the required format of the gene expression data and of the gene ids, and add instructions for training.

In [3]:
# Directory where intermediate files are stored.
# NOTE: this is a machine-specific absolute path; edit it before running
# the notebook on another machine.
local_dir = "/home/alexandra/Documents/Data/Generic_expression_patterns/"

# Template experiment file (the experiment of interest), located in `local_dir`
template_filename = os.path.join(
    local_dir, "Costello_BladderCancer_ResistantCells_Counts_12-8-20.txt"
)

# Gene-mapped training compendium used for the existing VAE model,
# located in `local_dir`
mapped_compendium_filename = os.path.join(
    local_dir, "mapped_recount2_compendium.tsv"
)

# Normalized version of the training compendium (remote file)
normalized_compendium_filename = "https://recount2.s3.amazonaws.com/normalized_recount2_compendium_data.tsv"

# Pickled scaler transform used to normalize the training compendium
scaler_filename = "../human_general_analysis/data/scaler_transform_human.pickle"

# Directory holding the trained VAE model files
vae_model_dir = "../human_general_analysis/models/NN_2500_30"

# Label for the template experiment; also used to label the
# simulated experiments created from it
selected_experiment_id = "costello"

# How many simulated experiments to generate
num_runs = 25

# Latent space dimension passed to the simulation step
# NOTE(review): presumably must match the model in `vae_model_dir` — confirm
latent_dim = 30

In [4]:
# Template experiment needs to be of the form sample x gene.
# The transposed file is written next to the original template file, with
# "_transposed" inserted before the extension. Deriving the name from
# `template_filename` (instead of repeating the absolute path) keeps it in
# sync when `local_dir` or the template file is changed in the cell above.
transposed_template_filename = "{}_transposed{}".format(
    *os.path.splitext(template_filename)
)

new_experiment_process.transpose_save(template_filename, transposed_template_filename)

## Process

In [None]:
# Compare the transposed template experiment against the normalized
# training compendium.
# NOTE(review): presumably this aligns the template's genes to the
# compendium's genes — confirm in new_experiment_process.
mapped_template_experiment = new_experiment_process.compare_match_features(
    transposed_template_filename, normalized_compendium_filename
)

(72, 58528)


In [None]:
# Bring the template experiment into the same range as the normalized
# training dataset of the VAE model, using the saved scaler transform
processed_template_experiment = new_experiment_process.normalize_template_experiment(
    mapped_template_experiment, scaler_filename
)

## Simulate experiments based on template experiment

Embed the template experiment into the learned latent space, then linearly shift it to different locations in that space to create new simulated experiments.

In [None]:
# Load the normalized training compendium: tab-delimited, first row is the
# header and first column is the row index
normalized_data = pd.read_csv(
    normalized_compendium_filename,
    sep="\t",
    header=0,
    index_col=0,
)

# Create `num_runs` simulated experiments from the template experiment:
# the template is embedded into the learned latent space and linearly
# shifted to a different location of that space on each run
for run_id in range(num_runs):
    new_experiment_process.embed_shift_template_experiment(
        normalized_data, processed_template_experiment, vae_model_dir,
        selected_experiment_id, scaler_filename, local_dir, latent_dim, run_id,
    )

## Differential expression analysis

* If data is RNA-seq then use DESeq2
* If data is microarray then use Limma

NOTE: Eventually provide the ability to use other methods

In [None]:
# Modify template and simulated experiments based on comparison
# Drop samples

In [None]:
# NOTE(review): disabled placeholder. This cell is a bare string literal, so
# none of the code inside it runs. `base_dir`, `dataset_name` and `project_id`
# are not defined in the cells shown in this notebook and must be set before
# this code is enabled.
"""# Load metadata file with grouping assignments for samples
metadata_filename = os.path.join(
    base_dir,
    dataset_name,
    "data",
    "metadata",
    f"{project_id}_groups.tsv"
)"""

In [None]:
# Check whether ordering of sample ids is consistent between gene expression data and metadata
#process.compare_and_reorder_samples(processed_template_filename, metadata_filename)

In [None]:
# Create subdirectory: "<local_dir>/DE_stats/"
#os.makedirs(os.path.join(local_dir, "DE_stats"), exist_ok=True)

In [None]:
# NOTE(review): disabled placeholder. This cell is a bare string literal, so
# the %%R magic inside it is not executed. `metadata_filename`, `project_id`,
# `processed_template_filename` and `base_dir` are not defined in the cells
# shown in this notebook. The "SRP012656" in the file-created comment looks
# like a leftover from another analysis — confirm the expected output name.
"""%%R -i metadata_filename -i project_id -i processed_template_filename -i local_dir -i base_dir

source(paste0(base_dir, '/generic_expression_patterns_modules/DE_analysis.R'))

# File created: "<local_dir>/DE_stats/DE_stats_template_data_SRP012656_real.txt"
get_DE_stats_DESeq(metadata_filename,
                   project_id, 
                   processed_template_filename,
                   "template",
                   local_dir,
                   "real")"""

In [None]:
# NOTE(review): disabled placeholder. This cell is a bare string literal, so
# the loop inside it does not run. `project_id` and `metadata_filename` are
# not defined in the cells shown in this notebook; the simulation step above
# is labelled with `selected_experiment_id` instead — confirm which id the
# simulated filenames actually use.
"""# Check whether ordering of sample ids is consistent between gene expression data and metadata
for i in range(num_runs):
    simulated_data_filename = os.path.join(
        local_dir,
        "pseudo_experiment",
        f"selected_simulated_data_{project_id}_{i}.txt"
    )
        
    process.compare_and_reorder_samples(simulated_data_filename, metadata_filename)"""

In [None]:
# NOTE(review): disabled placeholder. This cell is a bare string literal, so
# the %%R magic inside it is not executed. `metadata_filename`, `project_id`
# and `base_dir` are not defined in the cells shown in this notebook.
# The R code builds the simulated filename by plain concatenation (sep = ""),
# so it relies on `local_dir` ending with a "/".
"""%%R -i metadata_filename -i project_id -i base_dir -i local_dir -i num_runs

source(paste0(base_dir, '/generic_expression_patterns_modules/DE_analysis.R'))

# Files created: "<local_dir>/DE_stats/DE_stats_simulated_data_SRP012656_<n>.txt"
for (i in 0:(num_runs-1)){
    simulated_data_filename <- paste(local_dir, 
                                     "pseudo_experiment/selected_simulated_data_",
                                     project_id,
                                     "_", 
                                     i,
                                     ".txt",
                                     sep = "")
    
    get_DE_stats_DESeq(metadata_filename,
                       project_id, 
                       simulated_data_filename,
                       "simulated",
                       local_dir,
                       i)
}"""

In [None]:
# Quick validation

# Look at volcano of template experiment and simulated experiments

## Rank genes

Add description

In [None]:
# NOTE(review): disabled placeholder. This cell is a bare string literal, so
# the code inside it does not run. `project_id` is not defined in the cells
# shown in this notebook.
"""# Concatenate simulated experiments
simulated_DE_stats_all = process.concat_simulated_data(local_dir, num_runs, project_id, 'DE')

print(simulated_DE_stats_all.shape)"""

In [None]:
# Take absolute value of logFC and t statistic
#simulated_DE_stats_all = process.abs_value_stats(simulated_DE_stats_all)

In [None]:
# NOTE(review): disabled placeholder. This cell is a bare string literal, so
# the code inside it does not run. `calc` is not imported and
# `col_to_rank_genes` is not defined in the cells shown in this notebook;
# `simulated_DE_stats_all` is only assigned inside another disabled cell.
"""# Aggregate statistics across all simulated experiments
simulated_DE_summary_stats = calc.aggregate_stats(
    col_to_rank_genes,
    simulated_DE_stats_all,
    'DE'
)"""

In [None]:
# NOTE(review): disabled placeholder. This cell is a bare string literal, so
# the code inside it does not run. `template_DE_stats` is never loaded,
# `calc` is not imported and `col_to_rank_genes` is not defined in the cells
# shown in this notebook.
"""# Take absolute value of logFC and t statistic
template_DE_stats = process.abs_value_stats(template_DE_stats)

# Rank genes in template experiment
template_DE_stats = calc.rank_genes_or_pathways(
    col_to_rank_genes,      
    template_DE_stats,
    True
)"""

In [None]:
# NOTE(review): disabled placeholder. This cell is a bare string literal, so
# the code inside it does not run. `calc` is not imported and
# `col_to_rank_genes` is not defined in the cells shown in this notebook;
# `simulated_DE_summary_stats` is only assigned inside another disabled cell.
"""# Rank genes in simulated experiments
simulated_DE_summary_stats = calc.rank_genes_or_pathways(
    col_to_rank_genes,
    simulated_DE_summary_stats,
    False
)"""

## Summary table

Description of table columns

In [None]:
# NOTE(review): disabled placeholder. This cell is a bare string literal, so
# the code inside it does not run. `col_to_rank_genes` and `params` are not
# defined in the cells shown in this notebook; `template_DE_stats` and
# `simulated_DE_summary_stats` are only assigned inside other disabled cells.
"""# Get summary table
# Description of table columns here
summary_gene_ranks = process.generate_summary_table(
    template_DE_stats,
    simulated_DE_summary_stats,
    col_to_rank_genes,
    local_dir,
    'gene',
    params
)

summary_gene_ranks.head()"""

In [None]:
# Save