# Simulation experiment 

Run the entire simulation experiment multiple times to generate a confidence interval for the mean SVCCA similarity score at each number of experiments.

In [1]:
%load_ext autoreload
%autoreload 2

from joblib import Parallel, delayed
import multiprocessing
import sys
import os
import pandas as pd

import warnings
warnings.filterwarnings(action='ignore')

sys.path.append("../")
from functions import pipelines

from numpy.random import seed
# Seed NumPy's global random number generator in this kernel process.
# NOTE(review): the simulations below run in joblib worker processes; confirm
# that this seed (or the run index passed to each job) makes them reproducible.
randomState = 123
seed(randomState)

Using TensorFlow backend.


In [2]:
# Output files: both pickles are written to <local_dir>/Data/Batch_effects/output.
# NOTE(review): local_dir is an absolute, machine-specific path; adjust it before
# running this notebook on another machine.
local_dir = "/home/alexandra/Documents/"
output_dir = os.path.join(local_dir, "Data", "Batch_effects", "output")

# Destination for the similarity scores averaged across simulation runs
similarity_corrected_file = os.path.join(
    output_dir, "analysis_0_similarity_corrected.pickle")

# Destination for the confidence interval bounds of those scores
ci_corrected_file = os.path.join(
    output_dir, "analysis_0_ci_corrected.pickle")

In [3]:
# Run multiple simulations - corrected
# Each job calls the corrected simulation pipeline with its run index as the
# only argument; joblib spreads the jobs over `num_cores` workers and returns
# the outputs as a list in the same order as `iterations`.
iterations = range(10)
num_cores = 5

simulation_jobs = (
    delayed(pipelines.simple_simulation_experiment_corrected)(run_index)
    for run_index in iterations
)
results = Parallel(n_jobs=num_cores, verbose=100)(simulation_jobs)

[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done   1 tasks      | elapsed: 175.0min
[Parallel(n_jobs=5)]: Done   2 out of  10 | elapsed: 175.5min remaining: 702.1min
[Parallel(n_jobs=5)]: Done   3 out of  10 | elapsed: 175.6min remaining: 409.7min
[Parallel(n_jobs=5)]: Done   4 out of  10 | elapsed: 175.6min remaining: 263.4min
[Parallel(n_jobs=5)]: Done   5 out of  10 | elapsed: 175.6min remaining: 175.6min
[Parallel(n_jobs=5)]: Done   6 out of  10 | elapsed: 346.4min remaining: 231.0min
[Parallel(n_jobs=5)]: Done   7 out of  10 | elapsed: 347.2min remaining: 148.8min
[Parallel(n_jobs=5)]: Done   8 out of  10 | elapsed: 347.3min remaining: 86.8min
[Parallel(n_jobs=5)]: Done  10 out of  10 | elapsed: 347.4min remaining:    0.0s
[Parallel(n_jobs=5)]: Done  10 out of  10 | elapsed: 347.4min finished


In [4]:
# Concatenate output dataframes side by side (one score column per run).
# Element 1 of each entry in `results` is that run's score dataframe.
# A single concat call avoids repeatedly growing a dataframe inside a loop, and
# iterating over `results` directly does not assume that the values in
# `iterations` are valid 0-based positions into `results`.
all_svcca_scores = pd.concat(
    [run_output[1] for run_output in results],
    axis=1,
)

all_svcca_scores

Unnamed: 0_level_0,score,score,score,score,score,score,score,score,score,score
number of experiments,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,0.999994,0.999994,0.999994,0.999994,0.999994,0.999994,0.999994,0.999994,0.999994,0.999994
2,0.999953,0.999915,0.999896,0.999929,0.999933,0.999942,0.999924,0.999909,0.999824,0.999881
5,0.999693,0.999602,0.999824,0.999727,0.999732,0.999604,0.999577,0.999594,0.999688,0.9997
10,0.999264,0.999346,0.999365,0.999021,0.999294,0.999432,0.999325,0.999061,0.999159,0.999335
20,0.99862,0.99821,0.998483,0.998338,0.998619,0.998443,0.998328,0.998637,0.998311,0.998288
50,0.996094,0.995584,0.996036,0.996325,0.99585,0.995551,0.995817,0.995422,0.995613,0.996018
100,0.991772,0.991819,0.991569,0.99111,0.991703,0.991514,0.992052,0.991981,0.991859,0.992232
500,0.955082,0.957772,0.95662,0.95652,0.957889,0.955485,0.956831,0.956774,0.958116,0.956258
1000,0.91121,0.91252,0.910659,0.91401,0.911961,0.912539,0.911706,0.912865,0.911963,0.912539
2000,0.80807,0.813846,0.81263,0.81559,0.81669,0.818141,0.813086,0.813489,0.814668,0.816973


In [5]:
# Get mean score across simulation runs for each row (number of experiments).
# The result is a one-column dataframe named 'score'; it is used below as the
# centre of the confidence interval.
mean_scores = all_svcca_scores.mean(axis=1).to_frame(name='score')
mean_scores

Unnamed: 0_level_0,score
number of experiments,Unnamed: 1_level_1
1,0.999994
2,0.999911
5,0.999674
10,0.99926
20,0.998428
50,0.995831
100,0.991761
500,0.956735
1000,0.912197
2000,0.814318


In [6]:
# Get standard error of the mean for each row (number of experiments):
# the sample standard deviation across runs divided by sqrt(number of runs).
# NOTE: despite its name, `std_scores` holds the standard error, not the
# standard deviation.
import math

# Derive the number of runs from the data (one column per run) instead of
# hard-coding it, so the value stays correct if `iterations` changes.
num_runs = all_svcca_scores.shape[1]
std_scores = (all_svcca_scores.std(axis=1) / math.sqrt(num_runs)).to_frame()
std_scores.columns = ['score']
std_scores

Unnamed: 0_level_0,score
number of experiments,Unnamed: 1_level_1
1,3.69057e-08
2,1.175788e-05
5,2.483047e-05
10,4.29054e-05
20,4.934878e-05
50,9.097945e-05
100,9.964975e-05
500,0.0003145969
1000,0.0002936697
2000,0.0009027824


In [7]:
# Get the half-width of the confidence interval for each row (number of experiments).
# 2.262 matches the two-sided 95% Student's t critical value for 9 degrees of
# freedom (10 runs - 1); it must be updated if the number of runs changes.
t_critical = 2.262
err = std_scores * t_critical

In [8]:
# Get boundaries of confidence interval: mean -/+ the interval half-width
ymin = mean_scores.sub(err)
ymax = mean_scores.add(err)

# Put the lower and upper bounds side by side and label the two columns
ci = pd.concat([ymin, ymax], axis=1)
ci.columns = ['ymin', 'ymax']
ci

Unnamed: 0_level_0,ymin,ymax
number of experiments,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.999994,0.999994
2,0.999884,0.999937
5,0.999618,0.99973
10,0.999163,0.999357
20,0.998316,0.998539
50,0.995625,0.996037
100,0.991536,0.991987
500,0.956023,0.957446
1000,0.911533,0.912862
2000,0.812276,0.81636


In [9]:
# Display the mean scores again (same dataframe as computed above)
mean_scores

Unnamed: 0_level_0,score
number of experiments,Unnamed: 1_level_1
1,0.999994
2,0.999911
5,0.999674
10,0.99926
20,0.998428
50,0.995831
100,0.991761
500,0.956735
1000,0.912197
2000,0.814318


In [10]:
# Pickle the dataframe of mean scores and the dataframe of confidence interval bounds.
# Create the parent directories first so that saving does not fail on a missing
# folder after the long-running simulations above.
for output_file in (similarity_corrected_file, ci_corrected_file):
    output_parent = os.path.dirname(output_file)
    if output_parent:
        os.makedirs(output_parent, exist_ok=True)

mean_scores.to_pickle(similarity_corrected_file)
ci.to_pickle(ci_corrected_file)