# Simulation experiment 

Run the entire simulation experiment multiple times to generate a confidence interval for the mean similarity (SVCCA) score at each number of experiments.

In [1]:
%load_ext autoreload
%autoreload 2

from joblib import Parallel, delayed
import multiprocessing
import sys
import os
import pandas as pd

import warnings
warnings.filterwarnings(action='ignore')

sys.path.append("../")
from functions import pipelines

from numpy.random import seed
# Seed NumPy's global random number generator in this (parent) kernel process.
# NOTE(review): the simulations below run in joblib worker processes; whether
# this seed has any effect inside them is not visible here -- confirm in `pipelines`.
randomState = 123
seed(randomState)

Using TensorFlow backend.


In [None]:
# Parameters
# Settings forwarded unchanged to the simulation pipeline on every run
NN_architecture = 'NN_2500_30'
analysis_name = 'analysis_0'
num_simulated_samples = 6000
lst_num_experiments = [
    1, 2, 5, 10, 20, 50,
    100, 500, 1000, 2000, 3000, 6000,
]
corrected = False
use_pca = True
num_PCs = 10

# Settings for repeating the experiment: one pipeline call per element of
# `iterations`, distributed over `num_cores` parallel workers
num_cores = 5
iterations = range(0, 10)

In [None]:
# Input file
# Base directory: two levels above the current working directory
base_dir = os.path.abspath(os.path.join(os.getcwd(), "../.."))

# Data file that is passed to every simulation run
input_dir = os.path.join(base_dir, "data", "input")
normalized_data_file = os.path.join(input_dir, "train_set_normalized.pcl")

In [2]:
# Output files
# NOTE: `local_dir` is a machine-specific absolute path; change it when
# running this notebook on a different machine.
local_dir = "/home/alexandra/Documents/"

# Directory shared by both output files
output_dir = os.path.join(
    local_dir,
    "Data",
    "Batch_effects",
    "output")

# File names are derived from `analysis_name` (set in the Parameters cell) so
# that changing the analysis does not overwrite another analysis' results.
# With analysis_name = 'analysis_0' the paths are the same as before.
similarity_uncorrected_file = os.path.join(
    output_dir,
    analysis_name + "_similarity_uncorrected.pickle")

ci_uncorrected_file = os.path.join(
    output_dir,
    analysis_name + "_ci_uncorrected.pickle")

In [3]:
# Run multiple simulations - uncorrected
# One pipeline call is issued per element of `iterations` and the calls are
# distributed over `num_cores` joblib workers; `results` collects one return
# value per call.
run_simulation = delayed(pipelines.simple_simulation_experiment_uncorrected)

results = Parallel(n_jobs=num_cores, verbose=100)(
    run_simulation(
        iteration,
        NN_architecture,
        analysis_name,
        num_simulated_samples,
        lst_num_experiments,
        corrected,
        use_pca,
        num_PCs,
        "Experiment",
        normalized_data_file,
    )
    for iteration in iterations
)

[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done   1 tasks      | elapsed:  3.8min
[Parallel(n_jobs=5)]: Done   2 out of  10 | elapsed:  3.9min remaining: 15.5min
[Parallel(n_jobs=5)]: Done   3 out of  10 | elapsed:  3.9min remaining:  9.1min
[Parallel(n_jobs=5)]: Done   4 out of  10 | elapsed:  4.0min remaining:  6.0min
[Parallel(n_jobs=5)]: Done   5 out of  10 | elapsed:  4.0min remaining:  4.0min
[Parallel(n_jobs=5)]: Done   6 out of  10 | elapsed:  7.5min remaining:  5.0min
[Parallel(n_jobs=5)]: Done   7 out of  10 | elapsed:  7.6min remaining:  3.3min
[Parallel(n_jobs=5)]: Done   8 out of  10 | elapsed:  7.7min remaining:  1.9min
[Parallel(n_jobs=5)]: Done  10 out of  10 | elapsed:  7.8min remaining:    0.0s
[Parallel(n_jobs=5)]: Done  10 out of  10 | elapsed:  7.8min finished


In [4]:
# Concatenate output dataframes
# Element 1 of each run's result is that run's score dataframe. Place the runs
# side by side (one column per run) with a single concat call rather than
# growing a dataframe inside a loop, and iterate over `results` directly so the
# code does not rely on the values in `iterations` being 0-based positions.
all_svcca_scores = pd.concat(
    [run_output[1] for run_output in results],
    axis=1)

all_svcca_scores

Unnamed: 0_level_0,score,score,score,score,score,score,score,score,score,score
number of experiments,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,0.999994,0.999994,0.999994,0.999994,0.999994,0.999994,0.999994,0.999994,0.999994,0.999994
2,0.901761,0.900751,0.902341,0.901195,0.901537,0.90196,0.900591,0.901738,0.901833,0.900161
5,0.607209,0.608213,0.606863,0.611047,0.610162,0.609071,0.607958,0.609651,0.609955,0.607965
10,0.13008,0.133383,0.131133,0.132351,0.13317,0.131531,0.127556,0.128726,0.136425,0.125894
20,0.048666,0.046363,0.039898,0.046762,0.047476,0.043265,0.046188,0.038261,0.040468,0.040145
50,0.15264,0.151998,0.1448,0.153213,0.144034,0.15168,0.146229,0.148812,0.144961,0.156377
100,0.228591,0.235202,0.232042,0.233949,0.238646,0.236454,0.23485,0.242122,0.239245,0.238676
500,0.697937,0.704116,0.743546,0.726046,0.71595,0.718,0.741065,0.701284,0.73496,0.733642
1000,0.939947,0.943675,0.937064,0.934701,0.939832,0.93313,0.941193,0.936527,0.940996,0.937253
2000,0.964691,0.966896,0.966457,0.964828,0.965858,0.963524,0.966225,0.966083,0.965869,0.96136


In [5]:
# Get mean for each row (number of experiments), averaging across all runs
mean_scores = all_svcca_scores.mean(axis=1).to_frame(name='score')
mean_scores

Unnamed: 0_level_0,score
number of experiments,Unnamed: 1_level_1
1,0.999994
2,0.901387
5,0.608809
10,0.131025
20,0.043749
50,0.149474
100,0.235978
500,0.721654
1000,0.938432
2000,0.965179


In [6]:
# Get standard error of the mean for each row (number of experiments):
# sample standard deviation across runs divided by sqrt(number of runs)
import math

# `all_svcca_scores` has one column per run, so its column count is the number
# of runs; using it keeps the divisor correct when `iterations` is changed.
num_runs = all_svcca_scores.shape[1]
std_scores = (all_svcca_scores.std(axis=1) / math.sqrt(num_runs)).to_frame(name='score')
std_scores

Unnamed: 0_level_0,score
number of experiments,Unnamed: 1_level_1
1,3.635618e-08
2,0.0002185911
5,0.0004355264
10,0.0009769144
20,0.001196621
50,0.001356729
100,0.001243188
500,0.005298866
1000,0.001026616
2000,0.0005282517


In [7]:
# Get confidence interval for each row (number of experiments)
# Half-width of the 95% interval: the per-row error estimate scaled by the
# normal critical value (z = 1.96).
# NOTE(review): this is a normal approximation; with a small number of runs a
# Student-t critical value would give a wider interval -- confirm the intent.
z_95 = 1.96
err = z_95 * std_scores

In [8]:
# Get boundaries of confidence interval
# Lower and upper bounds are the mean minus/plus the interval half-width
ymin = mean_scores - err
ymax = mean_scores + err

# Put both bounds next to each other, one row per number of experiments
bounds = [ymin, ymax]
ci = pd.concat(bounds, axis=1)
ci.columns = ['ymin', 'ymax']
ci

Unnamed: 0_level_0,ymin,ymax
number of experiments,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.999994,0.999994
2,0.900958,0.901815
5,0.607956,0.609663
10,0.12911,0.13294
20,0.041404,0.046094
50,0.146815,0.152134
100,0.233541,0.238414
500,0.711269,0.73204
1000,0.93642,0.940444
2000,0.964144,0.966215


In [9]:
# Display the mean scores once more (same dataframe as computed above) before saving
mean_scores

Unnamed: 0_level_0,score
number of experiments,Unnamed: 1_level_1
1,0.999994
2,0.901387
5,0.608809
10,0.131025
20,0.043749
50,0.149474
100,0.235978
500,0.721654
1000,0.938432
2000,0.965179


In [10]:
# Pickle the dataframe of mean scores (averaged across runs) and the dataframe
# of confidence-interval bounds (ymin, ymax)
mean_scores.to_pickle(similarity_uncorrected_file)
ci.to_pickle(ci_uncorrected_file)