# Simulation experiment using noisy data

Run entire simulation experiment multiple times to generate confidence interval.  The simulation experiment can be found in ```functions/pipeline.py```

In [1]:
%load_ext autoreload
%autoreload 2

from joblib import Parallel, delayed
import multiprocessing
import sys
import os
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings(action='ignore')

sys.path.append("../../")
from functions import pipelines, utils

from numpy.random import seed
randomState = 123
seed(randomState)

Using TensorFlow backend.


In [None]:
# Read in config variables
config_file = os.path.abspath(os.path.join(os.getcwd(),"../../configs", "config_Human_experiment.tsv"))
params = utils.read_config(config_file)

In [None]:
# Load parameters
dataset_name = params["dataset_name"]
analysis_name = params["analysis_name"]
NN_architecture = params["NN_architecture"]
num_simulated_experiments = params["num_simulated_samples"]
lst_num_partitions = params["lst_num_experiments"]
use_pca = params["use_pca"]
num_PCs = params["num_PCs"]
local_dir = params["local_dir"]
correction_method = params["correction_method"]

iterations = params["iterations"] 
num_cores = params["num_cores"]

In [None]:
# Additional parameters
file_prefix = "Partition"
corrected = False

In [3]:
# Input
base_dir = os.path.abspath(
      os.path.join(
          os.getcwd(), "../.."))

normalized_data_file = os.path.join(
    base_dir,
    dataset_name,    
    "data",
    "input",
    "recount2_gene_normalized_data.tsv.xz")

experiment_ids_file = os.path.join(
    base_dir,
    dataset_name,
    "data",
    "metadata",
    "recount2_experiment_ids.txt")

In [4]:
# Output files
similarity_uncorrected_file = os.path.join(
    base_dir,
    "results",
    "saved_variables",
    dataset_name +"_experiment_lvl_sim_similarity_uncorrected_"+correction_method+".pickle")

ci_uncorrected_file = os.path.join(
    base_dir,
    "results",
    "saved_variables",
    dataset_name +"_experiment_lvl_sim_ci_uncorrected_"+correction_method+".pickle")

similarity_permuted_file = os.path.join(
    base_dir,
    "results",
    "saved_variables",
    dataset_name +"_experiment_lvl_sim_permuted")

In [5]:
# Run multiple simulations - uncorrected
results = Parallel(n_jobs=num_cores, verbose=100)(
    delayed(
        pipelines.experiment_level_simulation_uncorrected)(i,
                                                           NN_architecture,
                                                           dataset_name,
                                                           analysis_name,
                                                           num_simulated_experiments,
                                                           lst_num_partitions,
                                                           corrected,
                                                           use_pca,
                                                           num_PCs,
                                                           file_prefix,
                                                           normalized_data_file,
                                                           experiment_ids_file,
                                                           local_dir) for i in iterations)

[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done   1 tasks      | elapsed: 68.2min
[Parallel(n_jobs=5)]: Done   2 out of   5 | elapsed: 73.0min remaining: 109.5min
[Parallel(n_jobs=5)]: Done   3 out of   5 | elapsed: 86.6min remaining: 57.7min
[Parallel(n_jobs=5)]: Done   5 out of   5 | elapsed: 112.6min remaining:    0.0s
[Parallel(n_jobs=5)]: Done   5 out of   5 | elapsed: 112.6min finished


In [6]:
base_dir

'/home/alexandra/Documents/Repos/Batch_effects_simulation'

In [7]:
# permuted score
permuted_score = results[0][0]

In [8]:
# Concatenate output dataframes
all_svcca_scores = pd.DataFrame()

for i in iterations:
    all_svcca_scores = pd.concat([all_svcca_scores, results[i][1]], axis=1)

all_svcca_scores

Unnamed: 0_level_0,score,score,score,score,score
number of partitions,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0.999974,0.999974,0.99998,0.999979,0.999984
2,0.919656,0.944906,0.900015,0.920417,0.914635
3,0.843559,0.85899,0.846841,0.867891,0.865735
5,0.742672,0.778355,0.731371,0.735416,0.785546
10,0.602175,0.656925,0.607181,0.646752,0.61376
20,0.694427,0.66785,0.661836,0.693736,0.72108
30,0.684765,0.759777,0.710935,0.699128,0.790501
50,0.746933,0.775648,0.749795,0.724027,0.782352


In [9]:
# Get median for each row (number of experiments)
mean_scores = all_svcca_scores.mean(axis=1).to_frame()
mean_scores.columns = ['score']
mean_scores

Unnamed: 0_level_0,score
number of partitions,Unnamed: 1_level_1
1,0.999978
2,0.919926
3,0.856603
5,0.754672
10,0.625359
20,0.687786
30,0.729021
50,0.755751


In [10]:
# Get standard dev for each row (number of experiments)
import math
std_scores = (all_svcca_scores.std(axis=1)/math.sqrt(10)).to_frame()
std_scores.columns = ['score']
std_scores

Unnamed: 0_level_0,score
number of partitions,Unnamed: 1_level_1
1,1e-06
2,0.00512
3,0.003471
5,0.008019
10,0.007837
20,0.007515
30,0.014058
50,0.007455


In [11]:
# Get confidence interval for each row (number of experiments)
err = std_scores*1.96

In [12]:
# Get boundaries of confidence interval
ymax = mean_scores + err
ymin = mean_scores - err

ci = pd.concat([ymin, ymax], axis=1)
ci.columns = ['ymin', 'ymax']
ci

Unnamed: 0_level_0,ymin,ymax
number of partitions,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.999976,0.999981
2,0.90989,0.929962
3,0.8498,0.863406
5,0.738956,0.770388
10,0.609999,0.640719
20,0.673057,0.702515
30,0.701467,0.756575
50,0.741139,0.770363


In [13]:
mean_scores

Unnamed: 0_level_0,score
number of partitions,Unnamed: 1_level_1
1,0.999978
2,0.919926
3,0.856603
5,0.754672
10,0.625359
20,0.687786
30,0.729021
50,0.755751


In [14]:
# Pickle dataframe of mean scores scores for first run, interval
mean_scores.to_pickle(similarity_uncorrected_file)
ci.to_pickle(ci_uncorrected_file)
np.save(similarity_permuted_file, permuted_score)