# Test shifting template experiments

In [1]:
%load_ext autoreload
%autoreload 2

import os
import sys
import pandas as pd
import umap
import glob
import pickle
from keras.models import load_model

import warnings
warnings.filterwarnings(action='ignore')

from ponyo import utils, train_vae_modules, simulate_expression_data

Using TensorFlow backend.







In [2]:
# Set seeds to get reproducible VAE trained models
# Run this cell before any of the simulation cells below.
# NOTE(review): which random number generators get seeded is decided inside
# ponyo.train_vae_modules.set_all_seeds -- confirm there.
train_vae_modules.set_all_seeds()




In [3]:
# Read in config variables
# `params` is indexed by key in the next cell (sections 1-3 use this config).
# NOTE(review): the filename is relative, so it presumably resolves against the
# kernel's working directory -- confirm how utils.read_config opens it.
config_filename = "config_test_1.tsv"
params = utils.read_config(config_filename)

In [4]:
# Unpack the settings from the first config file into notebook-level names.

# Working directory and expression compendium files
local_dir = params["local_dir"]
raw_compendium_filename = params["raw_compendium_filename"]
normalized_compendium_filename = params["normalized_compendium_filename"]

# Sample metadata and the experiment-id listing
metadata_filename = params["metadata_filename"]
experiment_id_filename = params["experiment_ids_filename"]

# VAE latent size, scaler file and number of samples to simulate
latent_dim = params["latent_dim"]
scaler_filename = params["scaler_transform_filename"]
num_simulated_samples = params["num_simulated"]

# Left disabled: only the switched-off cells in sections 2 and 3 use these names.
# num_simulated_experiments = params["num_simulated_experiments"]
# num_runs = params["num_simulated"]

# How to parse the metadata table
metadata_delimiter = params["metadata_delimiter"]
experiment_id_colname = params["metadata_experiment_colname"]
sample_id_colname = params["metadata_sample_colname"]

# Template experiment identifier and model/log directories
project_id = params["project_id"]
training_stats_dir = params["training_stats_dir"]
vae_model_dir = params["vae_model_dir"]

## 1. Random simulation

In [5]:
# Run simulation
# The result is bound to `simulated_data`. In the recorded run the function
# reported a normalized compendium of 56 samples x 5000 genes and returned
# simulated data with 100 samples x 5000 genes.
# NOTE(review): presumably uses the trained VAE found in `vae_model_dir` to
# generate the samples -- confirm in ponyo.simulate_expression_data.
simulated_data = simulate_expression_data.simulate_by_random_sampling(
    normalized_compendium_filename,
    num_simulated_samples,
    vae_model_dir,
)

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Normalized gene expression data contains 56 samples and 5000 genes

Return: simulated gene expression data containing 100 samples and 5000 genes


## 2. Latent transformation simulation

In [6]:
# Disabled: the call is wrapped in a string literal, so nothing is executed here;
# the cell only evaluates the string and echoes it as its output.
# Remove the surrounding triple quotes to re-enable the call.
"""utils.create_experiment_id_file(metadata_filename,
                                normalized_compendium_filename,
                                experiment_id_filename,
                                config_filename)"""

'utils.create_experiment_id_file(metadata_filename,\n                                normalized_compendium_filename,\n                                experiment_id_filename,\n                                config_filename)'

In [7]:
# Disabled: the call is wrapped in a string literal, so nothing is executed here;
# the cell only evaluates the string and echoes it as its output.
# Before re-enabling: `num_simulated_experiments` is not defined in this notebook
# (its assignment in the parameter-loading cell is commented out), so the call
# would raise NameError until that assignment is restored.
"""# Run simulation
simulated_data = simulate_expression_data.simulate_by_latent_transformation(
    num_simulated_experiments,
    normalized_compendium_filename,
    vae_model_dir,
    latent_dim,
    metadata_filename,
    metadata_delimiter,
    experiment_id_colname,
    sample_id_colname,
    experiment_id_filename,
    local_dir,
)"""

'# Run simulation\nsimulated_data = simulate_expression_data.simulate_by_latent_transformation(\n    num_simulated_experiments,\n    normalized_compendium_filename,\n    vae_model_dir,\n    latent_dim,\n    metadata_filename,\n    metadata_delimiter,\n    experiment_id_colname,\n    sample_id_colname,\n    experiment_id_filename,\n    local_dir,\n)'

## 3. Template shift simulation

In [8]:
# Load pickled file
# scaler = pickle.load(open(scaler_filename, "rb"))

In [9]:
# Disabled: the call is wrapped in a string literal, so nothing is executed here;
# the cell only evaluates the string and echoes it as its output.
# Before re-enabling: at this point in a top-to-bottom run `simulated_data_dir`
# and `num_runs` are not yet defined (they are first assigned in the section 4
# parameter cell; the earlier `num_runs` assignment is commented out), so the
# call would raise NameError on a fresh kernel.
"""# Run simulation
simulate_expression_data.shift_template_experiment(
    normalized_compendium_filename,
    vae_model_dir,
    latent_dim,
    scaler_filename,
    metadata_filename,
    metadata_delimiter,
    experiment_id_colname,
    sample_id_colname,
    project_id,
    local_dir,
    simulated_data_dir,
    num_runs)"""

'# Run simulation\nsimulate_expression_data.shift_template_experiment(\n    normalized_compendium_filename,\n    vae_model_dir,\n    latent_dim,\n    scaler_filename,\n    metadata_filename,\n    metadata_delimiter,\n    experiment_id_colname,\n    sample_id_colname,\n    project_id,\n    local_dir,\n    simulated_data_dir,\n    num_runs)'

## 4. Embed shift simulation

In [10]:
# Read in config variables
# This rebinds `config_filename` and `params`, replacing the values read from
# "config_test_1.tsv" earlier in the notebook; every cell from here on sees the
# second config. Re-run the first config cell before going back to sections 1-3.
config_filename = "config_test_2.tsv"
params = utils.read_config(config_filename)

In [11]:
# Unpack the settings from the second config file. Names shared with the
# earlier parameter cell are rebound to this config's values.

# Working directory and expression compendium files
local_dir = params["local_dir"]
raw_compendium_filename = params["raw_compendium_filename"]
normalized_compendium_filename = params["normalized_compendium_filename"]

# Template experiment at each processing stage: raw -> mapped -> normalized
raw_template_filename = params["raw_template_filename"]
mapped_template_filename = params["mapped_template_filename"]
normalized_template_filename = params["normalized_template_filename"]

# Sample metadata, VAE latent size, scaler file and number of simulation runs
metadata_filename = params["metadata_filename"]
latent_dim = params["latent_dim"]
scaler_filename = params["scaler_transform_filename"]
num_runs = params["num_simulated"]

# How to parse the metadata table
metadata_delimiter = params["metadata_delimiter"]
experiment_id_colname = params["metadata_experiment_colname"]
sample_id_colname = params["metadata_sample_colname"]

# Template experiment identifier, model directory and simulation output directory
project_id = params["project_id"]
vae_model_dir = params["vae_model_dir"]
simulated_data_dir = params["simulated_data_dir"]

### Process template experiment

In order to simulate a new gene expression experiment, we will need to encode this experiment into the learned latent space. This requires that the feature space (i.e. genes) in the template experiment match the features in the compendium used to train the VAE model. These cells process the template experiment to be of the expected input format:

* Template data is expected to be a matrix that is sample x gene
* Template experiment is expected to have the same genes as the compendium experiment. Genes that are in the template experiment but not in the compendium are removed. Genes that are in the compendium but missing in the template experiment are added and the gene expression value is set to the median gene expression value of that gene across the samples in the compendium.
* Template values are expected to be within the same range as the compendium values (i.e. both the template and compendium expression measurements are estimated counts)

In [12]:
# Prepare the template experiment so it matches the compendium's expected input
# format (see the markdown above this cell).
# Arguments are passed positionally; in the recorded run the call printed two
# shapes, (5, 5000) and (56, 5000).
# NOTE(review): presumably `mapped_template_filename` and
# `normalized_template_filename` are output paths written by this call, and the
# printed shapes are template and compendium -- confirm in ponyo.
simulate_expression_data.process_template_experiment(
    raw_template_filename,
    normalized_compendium_filename,
    scaler_filename,
    mapped_template_filename,
    normalized_template_filename,
)

(5, 5000)
(56, 5000)


In [13]:
# Preview the raw template experiment: a tab-separated table whose first column
# is used as the row index and whose first row is used as the column header.
template = pd.read_csv(
    raw_template_filename,
    sep="\t",
    header=0,
    index_col=0,
)
print(template.shape)
template.head()

(5, 5000)


Unnamed: 0,ENSG00000000003.14,ENSG00000000005.5,ENSG00000000419.12,ENSG00000000457.13,ENSG00000000460.16,ENSG00000000938.12,ENSG00000000971.15,ENSG00000001036.13,ENSG00000001084.10,ENSG00000001167.14,...,ENSG00000119953.12,ENSG00000119965.12,ENSG00000119969.14,ENSG00000119973.5,ENSG00000119977.20,ENSG00000119979.16,ENSG00000119986.6,ENSG00000120008.15,ENSG00000120029.12,ENSG00000120049.18
SRR592745,52338,5,449,959,237,5,21814,1784,1228,690,...,587,256,58,0,2602,605,327,2120,1222,77
SRR592746,27719,2,630,429,195,45,14051,832,933,825,...,748,199,62,0,2066,458,635,1331,1236,79
SRR592747,34242,2,1329,487,183,4,17662,1411,1383,1015,...,1320,234,59,0,2698,568,706,1595,1558,45
SRR592748,11768,88,612,860,370,8,8229,1927,451,1922,...,689,349,503,1,3163,586,113,2903,1099,225
SRR592749,16803,713,855,799,447,4,5676,2099,379,2185,...,864,431,797,0,4314,659,128,2332,971,123


### Simulation

In [14]:
# Run simulation
# The recorded run of this cell ended with
#   FileNotFoundError: 'test_results/selected_simulated_data_SRP016140_0.tsv'
# Create the output directory up front so the cell also works on a fresh
# checkout where that directory has not been created yet. `exist_ok=True`
# makes this a no-op when the directory is already there.
# NOTE(review): assumes `simulated_data_dir` is the 'test_results' directory
# from the error above -- confirm against config_test_2.tsv.
os.makedirs(simulated_data_dir, exist_ok=True)

# Arguments are passed positionally, in the order the ponyo function expects.
simulate_expression_data.embed_shift_template_experiment(
    normalized_compendium_filename,
    normalized_template_filename,
    vae_model_dir,
    project_id,
    scaler_filename,
    local_dir,
    latent_dim,
    num_runs,
    simulated_data_dir,
)

FileNotFoundError: [Errno 2] No such file or directory: 'test_results/selected_simulated_data_SRP016140_0.tsv'