# Simulation experiment 

Run entire simulation experiment multiple times to generate confidence interval

In [1]:
%load_ext autoreload
%autoreload 2

from joblib import Parallel, delayed
import multiprocessing
import sys
import os
import pandas as pd

import warnings
warnings.filterwarnings(action='ignore')

sys.path.append("../")
from functions import pipelines

from numpy.random import seed
randomState = 123
seed(randomState)

Using TensorFlow backend.


In [None]:
# Parameters
NN_architecture = 'NN_2500_30'
analysis_name = 'analysis_0'
file_prefix = 'Experiment_corrected'
num_simulated_samples = 6000
lst_num_experiments = [1, 2, 5, 10, 20,
                     50, 100, 500, 1000, 2000, 3000, 6000]
corrected = True
use_pca = True
num_PCs = 10

iterations = range(5) 
num_cores = 5

In [None]:
# Input files
base_dir = os.path.abspath(
  os.path.join(
      os.getcwd(), "../.."))    # base dir on repo


normalized_data_file = os.path.join(
  base_dir,
  "data",
  "input",
  "train_set_normalized.pcl")

In [2]:
# Output files
local_dir = "/home/alexandra/Documents/"

similarity_corrected_file = os.path.join(
    local_dir,
    "Data",
    "Batch_effects",
    "output",
    "analysis_0_similarity_corrected.pickle")

ci_corrected_file = os.path.join(
    local_dir,
    "Data",
    "Batch_effects",
    "output",
    "analysis_0_ci_corrected.pickle")

In [3]:
# Run multiple simulations - corrected
results = Parallel(n_jobs=num_cores, verbose=100)(
    delayed(
        pipelines.simple_simulation_experiment_corrected)(i,
                                                          NN_architecture,
                                                          analysis_name,
                                                          num_simulated_samples,
                                                          lst_num_experiments,
                                                          corrected,
                                                          use_pca,
                                                          num_PCs,
                                                          "Experiment",
                                                          normalized_data_file) for i in iterations)

[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done   1 tasks      | elapsed: 172.7min
[Parallel(n_jobs=5)]: Done   2 out of   5 | elapsed: 172.9min remaining: 259.3min
[Parallel(n_jobs=5)]: Done   3 out of   5 | elapsed: 173.0min remaining: 115.3min
[Parallel(n_jobs=5)]: Done   5 out of   5 | elapsed: 173.1min remaining:    0.0s
[Parallel(n_jobs=5)]: Done   5 out of   5 | elapsed: 173.1min finished


In [4]:
# Concatenate output dataframes
all_svcca_scores = pd.DataFrame()

for i in iterations:
    all_svcca_scores = pd.concat([all_svcca_scores, results[i][1]], axis=1)

all_svcca_scores

Unnamed: 0_level_0,score,score,score,score,score
number of experiments,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0.999994,0.999994,0.999994,0.999994,0.999994
2,0.999899,0.999839,0.999895,0.999872,0.999871
5,0.99971,0.999536,0.999801,0.999652,0.999522
10,0.999456,0.99912,0.999263,0.999263,0.999348
20,0.998397,0.99846,0.998373,0.997999,0.998348
50,0.996317,0.995634,0.996009,0.995757,0.995707
100,0.991482,0.991723,0.992104,0.991381,0.991266
500,0.957287,0.956864,0.958002,0.958039,0.956359
1000,0.912567,0.913328,0.91139,0.913698,0.913698
2000,0.812957,0.813232,0.814414,0.813201,0.815573


In [5]:
# Get median for each row (number of experiments)
mean_scores = all_svcca_scores.mean(axis=1).to_frame()
mean_scores.columns = ['score']
mean_scores

Unnamed: 0_level_0,score
number of experiments,Unnamed: 1_level_1
1,0.999994
2,0.999875
5,0.999644
10,0.99929
20,0.998315
50,0.995885
100,0.991591
500,0.95731
1000,0.912936
2000,0.813875


In [6]:
# Get standard dev for each row (number of experiments)
import math
std_scores = (all_svcca_scores.std(axis=1)/math.sqrt(10)).to_frame()
std_scores.columns = ['score']
std_scores

Unnamed: 0_level_0,score
number of experiments,Unnamed: 1_level_1
1,2.990026e-08
2,7.571199e-06
5,3.730238e-05
10,3.912328e-05
20,5.743105e-05
50,8.851082e-05
100,0.0001051169
500,0.0002298867
1000,0.0003099305
2000,0.0003494309


In [7]:
# Get confidence interval for each row (number of experiments)
# z-score for 95% confidence interval
err = std_scores*1.96

In [8]:
# Get boundaries of confidence interval
ymax = mean_scores + err
ymin = mean_scores - err

ci = pd.concat([ymin, ymax], axis=1)
ci.columns = ['ymin', 'ymax']
ci

Unnamed: 0_level_0,ymin,ymax
number of experiments,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.999994,0.999994
2,0.99986,0.99989
5,0.999571,0.999717
10,0.999213,0.999367
20,0.998203,0.998428
50,0.995711,0.996058
100,0.991385,0.991797
500,0.95686,0.957761
1000,0.912329,0.913544
2000,0.81319,0.81456


In [9]:
mean_scores

Unnamed: 0_level_0,score
number of experiments,Unnamed: 1_level_1
1,0.999994
2,0.999875
5,0.999644
10,0.99929
20,0.998315
50,0.995885
100,0.991591
500,0.95731
1000,0.912936
2000,0.813875


In [10]:
# Pickle dataframe of mean scores scores for first run, interval
mean_scores.to_pickle(similarity_corrected_file)
ci.to_pickle(ci_corrected_file)