# Simulation experiment 

Run the entire simulation experiment multiple times to generate a confidence interval.

In [1]:
%load_ext autoreload
%autoreload 2

from joblib import Parallel, delayed
import multiprocessing
import sys
import os
import pandas as pd

import warnings
warnings.filterwarnings(action='ignore')

sys.path.append("../")
from functions import pipelines

from numpy.random import seed
# Seed NumPy's global random number generator in this notebook process.
# NOTE(review): the simulations below run in joblib worker processes; confirm
# that the pipeline seeds each worker, as this call alone may not reach them.
randomState = 123
seed(randomState)

Using TensorFlow backend.


In [None]:
# Parameters

# Parallel execution: one simulation run per entry in `iterations`,
# spread over `num_cores` joblib workers
num_cores = 5
iterations = range(10)

# Simulation settings
NN_architecture = 'NN_2500_30'
analysis_name = 'analysis_1'
file_prefix = 'Partition_corrected'
num_simulated_experiments = 600
lst_num_partitions = [
    1, 2, 3, 5, 10,
    20, 30, 50, 70, 100,
    200, 300, 400, 500, 600,
]

# Processing options handed to the pipeline
corrected = True
use_pca = True
num_PCs = 10

In [None]:
# Input files

# Base directory of the repo: two levels above the current working directory
base_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))

# Normalized training-set file that is passed to the simulation pipeline
normalized_data_file = os.path.join(base_dir, "data", "input", "train_set_normalized.pcl")

In [2]:
# Output files
# The local output root defaults to the original author's directory but can be
# overridden through the BATCH_EFFECTS_LOCAL_DIR environment variable, so the
# notebook can run on another machine without editing this cell.
local_dir = os.environ.get("BATCH_EFFECTS_LOCAL_DIR", "/home/alexandra/Documents/")

# Both result files are written to the same output folder
output_dir = os.path.join(local_dir, "Data", "Batch_effects", "output")

# Mean similarity scores across simulation runs
similarity_corrected_file = os.path.join(
    output_dir,
    "analysis_1_similarity_corrected.pickle")

# Lower/upper confidence-interval bounds around the mean scores
ci_corrected_file = os.path.join(
    output_dir,
    "analysis_1_ci_corrected.pickle")

In [3]:
# Run multiple simulations - corrected
# One pipeline call is dispatched per entry of `iterations`, using `num_cores`
# joblib workers; `results` holds one return value per run.
# The experiment count and the partition list are the ones defined in the
# Parameters cell (`num_simulated_experiments`, `lst_num_partitions`), so this
# cell runs on a fresh kernel without relying on leftover variables.
# NOTE(review): the literal "Partition" is passed here while `file_prefix` is
# 'Partition_corrected' -- confirm which prefix the pipeline expects.
results = Parallel(n_jobs=num_cores, verbose=100)(
    delayed(
        pipelines.matched_simulation_experiment_corrected)(i,
                                                           NN_architecture,
                                                           analysis_name,
                                                           num_simulated_experiments,
                                                           lst_num_partitions,
                                                           corrected,
                                                           use_pca,
                                                           num_PCs,
                                                           "Partition",
                                                           normalized_data_file) for i in iterations)

[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done   1 tasks      | elapsed: 136.4min
[Parallel(n_jobs=5)]: Done   2 out of  10 | elapsed: 142.2min remaining: 568.7min
[Parallel(n_jobs=5)]: Done   3 out of  10 | elapsed: 145.8min remaining: 340.2min
[Parallel(n_jobs=5)]: Done   4 out of  10 | elapsed: 147.8min remaining: 221.7min
[Parallel(n_jobs=5)]: Done   5 out of  10 | elapsed: 152.7min remaining: 152.7min
[Parallel(n_jobs=5)]: Done   6 out of  10 | elapsed: 269.8min remaining: 179.9min
[Parallel(n_jobs=5)]: Done   7 out of  10 | elapsed: 284.3min remaining: 121.8min
[Parallel(n_jobs=5)]: Done   8 out of  10 | elapsed: 292.2min remaining: 73.1min
[Parallel(n_jobs=5)]: Done  10 out of  10 | elapsed: 298.2min remaining:    0.0s
[Parallel(n_jobs=5)]: Done  10 out of  10 | elapsed: 298.2min finished


In [4]:
# Concatenate output dataframes
# Element [1] of every run's result is that run's score dataframe. All runs are
# placed side by side (one column per run) with a single concat call, iterating
# over `results` directly rather than indexing it with the values of `iterations`.
all_svcca_scores = pd.concat(
    [run_result[1] for run_result in results],
    axis=1)

all_svcca_scores

Unnamed: 0_level_0,score,score,score,score,score,score,score,score,score,score
number of partitions,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,0.999993,0.999993,0.999993,0.999993,0.999992,0.999992,0.999994,0.999994,0.999994,0.999993
2,0.999376,0.998992,0.998131,0.99822,0.999251,0.998505,0.998478,0.998957,0.997771,0.993747
3,0.997635,0.997181,0.996988,0.998199,0.997656,0.997673,0.994915,0.996409,0.997353,0.99267
5,0.996891,0.995693,0.992918,0.995211,0.996081,0.993301,0.993216,0.994982,0.992973,0.994998
10,0.992408,0.988902,0.985932,0.98721,0.989342,0.989414,0.985331,0.988674,0.986597,0.987743
20,0.980302,0.977038,0.9756,0.979158,0.978035,0.978245,0.973155,0.979184,0.976622,0.976444
30,0.9627,0.968008,0.963185,0.967206,0.972544,0.96736,0.95764,0.965343,0.961807,0.961831
50,0.939892,0.947227,0.952761,0.952086,0.951011,0.95109,0.941838,0.949349,0.944116,0.928592
70,0.93076,0.933912,0.920607,0.934099,0.934509,0.920892,0.913025,0.930231,0.929355,0.899084
100,0.914509,0.902742,0.877174,0.89696,0.900836,0.905216,0.887849,0.904017,0.905647,0.893075


In [5]:
# Mean score across simulation runs for each row (number of partitions)
mean_scores = all_svcca_scores.mean(axis=1).to_frame(name='score')
mean_scores

Unnamed: 0_level_0,score
number of partitions,Unnamed: 1_level_1
1,0.999993
2,0.998143
3,0.996668
5,0.994626
10,0.988155
20,0.977378
30,0.964762
50,0.945796
70,0.924647
100,0.898803


In [6]:
# Get standard error of the mean for each row (number of partitions)
import math

# Per-row standard deviation divided by sqrt(number of runs). The run count is
# read from the score table (one column per run) so that it stays correct when
# `iterations` is changed.
num_runs = all_svcca_scores.shape[1]
std_scores = (all_svcca_scores.std(axis=1) / math.sqrt(num_runs)).to_frame()
std_scores.columns = ['score']
std_scores

Unnamed: 0_level_0,score
number of partitions,Unnamed: 1_level_1
1,1.70249e-07
2,0.0005146682
3,0.0005294477
5,0.000451992
10,0.0006524848
20,0.0006558771
30,0.001328901
50,0.002374167
70,0.003624151
100,0.003342457


In [7]:
# Error margin of the confidence interval for each row (number of partitions):
# `std_scores` scaled by 1.96, the normal-approximation multiplier for a
# two-sided 95% interval
err = std_scores.mul(1.96)

In [8]:
# Lower and upper bounds of the confidence interval: mean score -/+ error margin
ymin = mean_scores.sub(err)
ymax = mean_scores.add(err)

# Place both bounds side by side, each under its own column label
ci = pd.concat(
    [ymin.rename(columns={'score': 'ymin'}),
     ymax.rename(columns={'score': 'ymax'})],
    axis=1)
ci

Unnamed: 0_level_0,ymin,ymax
number of partitions,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.999993,0.999994
2,0.996979,0.999307
3,0.99547,0.997865
5,0.993604,0.995649
10,0.986679,0.989631
20,0.975895,0.978862
30,0.961756,0.967768
50,0.940426,0.951167
70,0.916449,0.932845
100,0.891242,0.906363


In [9]:
# Display the mean scores once more before they are written to disk
mean_scores

Unnamed: 0_level_0,score
number of partitions,Unnamed: 1_level_1
1,0.999993
2,0.998143
3,0.996668
5,0.994626
10,0.988155
20,0.977378
30,0.964762
50,0.945796
70,0.924647
100,0.898803


In [10]:
# Pickle the dataframe of mean scores and the dataframe of confidence-interval bounds
# Create the destination folders first so that a missing output directory does
# not cause the results of the long-running simulation to be lost at save time.
for output_file in (similarity_corrected_file, ci_corrected_file):
    os.makedirs(os.path.dirname(os.path.abspath(output_file)), exist_ok=True)

mean_scores.to_pickle(similarity_corrected_file)
ci.to_pickle(ci_corrected_file)