# Simulation experiment 

Run the entire simulation experiment multiple times to generate a confidence interval for the similarity scores.

In [1]:
%load_ext autoreload
%autoreload 2

from joblib import Parallel, delayed
import multiprocessing
import sys
import os
import pandas as pd

import warnings
# Suppress all warnings for the rest of the session (no category or module
# filter is given, so every warning is ignored).
warnings.filterwarnings(action='ignore')

# Add the parent directory to the import path; presumably this is where the
# project-local `functions` package lives — verify against the repo layout.
sys.path.append("../")
from functions import pipelines

from numpy.random import seed
# Seed NumPy's global random number generator with a fixed value.
# NOTE(review): the simulations run in joblib workers; confirm whether this
# seed actually reaches them or whether the pipeline seeds itself per run.
randomState = 123
seed(randomState)

Using TensorFlow backend.


In [None]:
# Parameters
# Identifiers for the model architecture, the analysis, and the file prefix
NN_architecture = 'NN_2500_30'
analysis_name = 'analysis_1'
file_prefix = "Partition"

# Size of the simulation and the partition counts to evaluate
num_simulated_experiments = 600
lst_num_partitions = [
    1, 2, 3, 5, 10,
    20, 30, 50, 70, 100,
    200, 300, 400, 500, 600,
]

# Processing switches
corrected = False
use_pca = True
num_PCs = 10

# One entry per repetition of the whole experiment, and the worker count
iterations = range(0, 10)
num_cores = 5

In [None]:
# Input
# Base directory: two levels above the current working directory
base_dir = os.path.abspath(os.path.join(os.getcwd(), "..", ".."))

# Normalized training data file that is passed to the simulation pipeline
normalized_data_file = os.path.join(
    base_dir, "data", "input", "train_set_normalized.pcl")

In [2]:
# Output files
# Root of the local data directory. The original hard-coded location is kept
# as the default so existing runs are unaffected; set the
# BATCH_EFFECTS_LOCAL_DIR environment variable to run on another machine
# without editing the notebook.
local_dir = os.environ.get(
    "BATCH_EFFECTS_LOCAL_DIR", "/home/alexandra/Documents/")

# Directory that receives both pickles written at the end of the notebook
output_dir = os.path.join(local_dir, "Data", "Batch_effects", "output")

# Destination for the mean similarity scores
similarity_uncorrected_file = os.path.join(
    output_dir, "analysis_1_similarity_uncorrected.pickle")

# Destination for the confidence-interval bounds
ci_uncorrected_file = os.path.join(
    output_dir, "analysis_1_ci_uncorrected.pickle")

In [3]:
# Run multiple simulations - uncorrected
# One pipeline run is launched per value in `iterations`, spread over
# `num_cores` joblib workers. `results` holds one return value per run, in
# the same order as `iterations`.
#
# The arguments are passed positionally, so their order must match the
# signature of `pipelines.matched_simulation_experiment_uncorrected` (not
# visible from this notebook — verify when changing it). Every argument is
# taken from the parameter/input cells above so the notebook survives
# Restart & Run All: the previous names `num_simulated_samples` and
# `lst_num_experiments` were not defined anywhere in the notebook.
results = Parallel(n_jobs=num_cores, verbose=100)(
    delayed(pipelines.matched_simulation_experiment_uncorrected)(
        i,
        NN_architecture,
        analysis_name,
        num_simulated_experiments,
        lst_num_partitions,
        corrected,
        use_pca,
        num_PCs,
        file_prefix,
        normalized_data_file)
    for i in iterations)

[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done   1 tasks      | elapsed: 69.8min
[Parallel(n_jobs=5)]: Done   2 out of  10 | elapsed: 74.8min remaining: 299.1min
[Parallel(n_jobs=5)]: Done   3 out of  10 | elapsed: 75.4min remaining: 176.0min
[Parallel(n_jobs=5)]: Done   4 out of  10 | elapsed: 75.6min remaining: 113.5min
[Parallel(n_jobs=5)]: Done   5 out of  10 | elapsed: 77.9min remaining: 77.9min
[Parallel(n_jobs=5)]: Done   6 out of  10 | elapsed: 148.9min remaining: 99.3min
[Parallel(n_jobs=5)]: Done   7 out of  10 | elapsed: 150.4min remaining: 64.5min
[Parallel(n_jobs=5)]: Done   8 out of  10 | elapsed: 152.9min remaining: 38.2min
[Parallel(n_jobs=5)]: Done  10 out of  10 | elapsed: 154.0min remaining:    0.0s
[Parallel(n_jobs=5)]: Done  10 out of  10 | elapsed: 154.0min finished


In [4]:
# Concatenate output dataframes
# Element [1] of each pipeline result is used as that run's score table
# (presumably one row per number of partitions — confirm against the
# pipeline's return value). The tables are placed side by side, one column
# per run, with a single concat call instead of growing a frame in a loop.
# Iterating over `results` directly also avoids indexing it with the values
# of `iterations`, which is only correct when they are 0..n-1.
score_frames = [run_output[1] for run_output in results]
all_svcca_scores = (
    pd.concat(score_frames, axis=1) if score_frames else pd.DataFrame())

all_svcca_scores

Unnamed: 0_level_0,score,score,score,score,score,score,score,score,score,score
number of partitions,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,0.999993,0.999993,0.999993,0.999993,0.999993,0.999993,0.999993,0.999994,0.999993,0.999993
2,0.904416,0.903941,0.904135,0.901354,0.905342,0.901204,0.911813,0.900921,0.905612,0.903674
3,0.812157,0.806322,0.809117,0.812685,0.813655,0.822197,0.815723,0.804353,0.816948,0.812147
5,0.631277,0.629378,0.647738,0.636986,0.627587,0.628475,0.625668,0.632649,0.625189,0.640328
10,0.233339,0.254049,0.250521,0.232487,0.245954,0.241014,0.255585,0.24297,0.232637,0.26127
20,0.310943,0.334621,0.368095,0.32578,0.323755,0.325563,0.334329,0.358671,0.314837,0.321335
30,0.343801,0.35958,0.39989,0.367667,0.384197,0.364188,0.358929,0.384039,0.366256,0.364958
50,0.437817,0.421561,0.497509,0.445692,0.451782,0.459051,0.404102,0.465499,0.433092,0.406299
70,0.513537,0.477599,0.550758,0.525372,0.47795,0.525419,0.522733,0.497372,0.511791,0.477668
100,0.593415,0.568589,0.624866,0.605094,0.577356,0.609951,0.581596,0.620496,0.605294,0.604247


In [5]:
# Mean score across the repeated runs for each row (number of partitions)
mean_scores = all_svcca_scores.mean(axis=1).to_frame(name='score')
mean_scores

Unnamed: 0_level_0,score
number of partitions,Unnamed: 1_level_1
1,0.999993
2,0.904241
3,0.81253
5,0.632527
10,0.244983
20,0.331793
30,0.36935
50,0.44224
70,0.50802
100,0.59909


In [6]:
# Standard error of the mean for each row (number of partitions).
# `DataFrame.sem` divides the sample standard deviation (ddof=1) by the
# square root of the number of non-missing values in the row, so the sample
# size follows the number of runs actually present instead of a hard-coded 10.
# The variable keeps its historical name `std_scores` because later cells
# refer to it.
import math  # no longer used by this cell; kept in case other cells rely on it
std_scores = all_svcca_scores.sem(axis=1).to_frame()
std_scores.columns = ['score']
std_scores

Unnamed: 0_level_0,score
number of partitions,Unnamed: 1_level_1
1,9.37988e-08
2,0.0009970343
3,0.001639038
5,0.002273829
10,0.003258212
20,0.005798362
30,0.005037687
50,0.008980395
70,0.00785685
100,0.005852482


In [7]:
# Half-width of the confidence interval for each row (number of partitions).
# 1.96 is the two-sided 95% critical value of the standard normal distribution.
err = 1.96 * std_scores

In [8]:
# Lower and upper bounds of the confidence interval around the mean
ymin = mean_scores.sub(err)
ymax = mean_scores.add(err)

# Both bounds side by side, one row per number of partitions
ci = pd.concat([ymin, ymax], axis=1)
ci.columns = ['ymin', 'ymax']
ci

Unnamed: 0_level_0,ymin,ymax
number of partitions,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.999993,0.999993
2,0.901986,0.906497
3,0.808823,0.816238
5,0.627384,0.637671
10,0.237612,0.252353
20,0.318677,0.344909
30,0.357955,0.380746
50,0.421927,0.462554
70,0.490248,0.525792
100,0.585852,0.612329


In [9]:
# Display the mean scores once more before they are written to disk
mean_scores

Unnamed: 0_level_0,score
number of partitions,Unnamed: 1_level_1
1,0.999993
2,0.904241
3,0.81253
5,0.632527
10,0.244983
20,0.331793
30,0.36935
50,0.44224
70,0.50802
100,0.59909


In [10]:
# Write the mean scores and their confidence-interval bounds to the
# uncorrected output pickles
pd.to_pickle(mean_scores, similarity_uncorrected_file)
pd.to_pickle(ci, ci_uncorrected_file)