# Simulation experiment 

Run entire simulation experiment multiple times to generate confidence interval

In [1]:
%load_ext autoreload
%autoreload 2

from joblib import Parallel, delayed
import multiprocessing
import sys
import os
import pandas as pd

import warnings
warnings.filterwarnings(action='ignore')

sys.path.append("../")
from functions import pipelines

from numpy.random import seed
randomState = 123
seed(randomState)

Using TensorFlow backend.


In [2]:
# Parameters
NN_architecture = 'NN_2500_30'
analysis_name = 'analysis_3'
file_prefix = "Partition"
num_simulated_experiments = 50
lst_num_partitions = [1, 2, 3, 5, 10, 20,
                    30, 50]
corrected = False
use_pca = True
num_PCs = 10

iterations = range(5) 
num_cores = 5

In [3]:
# Input
base_dir = os.path.abspath(
      os.path.join(
          os.getcwd(), "../.."))

local_dir = local_dir = os.path.abspath(
    os.path.join(
        os.getcwd(), "../../../..")) 

normalized_data_file = os.path.join(
    local_dir,
    "Data",
    "Batch_effects",
    "input",
    "recount2_gene_normalized_data.tsv")

experiment_ids_file = os.path.join(
      base_dir,
      "data",
      "metadata",
      "recount2_experiment_ids.txt")

In [4]:
# Output files
local_dir = "/home/alexandra/Documents/"

similarity_uncorrected_file = os.path.join(
    local_dir,
    "Data",
    "Batch_effects",
    "output",
    "analysis_3_similarity_uncorrected.pickle")

ci_uncorrected_file = os.path.join(
    local_dir,
    "Data",
    "Batch_effects",
    "output",
    "analysis_3_ci_uncorrected.pickle")

In [5]:
# Run multiple simulations - uncorrected
results = Parallel(n_jobs=num_cores, verbose=100)(
    delayed(
        pipelines.matched_simulation_experiment_uncorrected)(i,
                                                             NN_architecture,
                                                             analysis_name,
                                                             num_simulated_experiments,
                                                             lst_num_partitions,
                                                             corrected,
                                                             use_pca,
                                                             num_PCs,
                                                             file_prefix,
                                                             normalized_data_file,
                                                             experiment_ids_file) for i in iterations)

[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done   1 tasks      | elapsed: 68.8min
[Parallel(n_jobs=5)]: Done   2 out of   5 | elapsed: 79.5min remaining: 119.2min
[Parallel(n_jobs=5)]: Done   3 out of   5 | elapsed: 95.8min remaining: 63.9min
[Parallel(n_jobs=5)]: Done   5 out of   5 | elapsed: 115.3min remaining:    0.0s
[Parallel(n_jobs=5)]: Done   5 out of   5 | elapsed: 115.3min finished


In [6]:
# Concatenate output dataframes
all_svcca_scores = pd.DataFrame()

for i in iterations:
    all_svcca_scores = pd.concat([all_svcca_scores, results[i][1]], axis=1)

all_svcca_scores

Unnamed: 0_level_0,score,score,score,score,score
number of partitions,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0.99997,0.999976,0.999967,0.999932,0.999927
2,0.904011,0.905521,0.908656,0.912116,0.929931
3,0.855136,0.830856,0.819789,0.852348,0.835572
5,0.760451,0.762769,0.700758,0.744787,0.708403
10,0.608675,0.634671,0.60363,0.597629,0.616578
20,0.69147,0.726645,0.660767,0.689196,0.712778
30,0.679327,0.74702,0.690138,0.701332,0.708917
50,0.716986,0.7853,0.722228,0.700594,0.712252


In [7]:
# Get median for each row (number of experiments)
mean_scores = all_svcca_scores.mean(axis=1).to_frame()
mean_scores.columns = ['score']
mean_scores

Unnamed: 0_level_0,score
number of partitions,Unnamed: 1_level_1
1,0.999954
2,0.912047
3,0.83874
5,0.735434
10,0.612237
20,0.696171
30,0.705347
50,0.727472


In [8]:
# Get standard dev for each row (number of experiments)
import math
std_scores = (all_svcca_scores.std(axis=1)/math.sqrt(10)).to_frame()
std_scores.columns = ['score']
std_scores

Unnamed: 0_level_0,score
number of partitions,Unnamed: 1_level_1
1,7e-06
2,0.003311
3,0.004705
5,0.009211
10,0.004534
20,0.007951
30,0.008176
50,0.010531


In [9]:
# Get confidence interval for each row (number of experiments)
err = std_scores*1.96

In [10]:
# Get boundaries of confidence interval
ymax = mean_scores + err
ymin = mean_scores - err

ci = pd.concat([ymin, ymax], axis=1)
ci.columns = ['ymin', 'ymax']
ci

Unnamed: 0_level_0,ymin,ymax
number of partitions,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.99994,0.999969
2,0.905558,0.918537
3,0.829519,0.847961
5,0.71738,0.753487
10,0.60335,0.621123
20,0.680588,0.711755
30,0.689322,0.721372
50,0.706832,0.748112


In [11]:
mean_scores

Unnamed: 0_level_0,score
number of partitions,Unnamed: 1_level_1
1,0.999954
2,0.912047
3,0.83874
5,0.735434
10,0.612237
20,0.696171
30,0.705347
50,0.727472


In [12]:
# Pickle dataframe of mean scores scores for first run, interval
mean_scores.to_pickle(similarity_uncorrected_file)
ci.to_pickle(ci_uncorrected_file)