# Simulation experiment 

Run entire simulation experiment multiple times to generate confidence interval

In [1]:
%load_ext autoreload
%autoreload 2

from joblib import Parallel, delayed
import multiprocessing
import sys
import os
import pandas as pd

import warnings
warnings.filterwarnings(action='ignore')

sys.path.append("../../")
from functions import pipelines

from numpy.random import seed
randomState = 123
seed(randomState)

Using TensorFlow backend.


In [2]:
# Parameters
dataset_name = "Human_analysis"
analysis_name = 'analysis_2'
NN_architecture = 'NN_2500_30'
file_prefix = 'Experiment_corrected'
num_simulated_samples = 500
lst_num_experiments = [1, 2, 5, 10, 20,
                     50, 100, 250, 500]
corrected = True
use_pca = True
num_PCs = 10

iterations = range(5) 
num_cores = 5

In [3]:
# Input
base_dir = os.path.abspath(
  os.path.join(
      os.getcwd(), "../.."))    # base dir on repo


normalized_data_file = os.path.join(
    base_dir,
    dataset_name,
    "data",
    "input",
    "recount2_gene_normalized_data.tsv.xz")

In [4]:
# Output files
similarity_corrected_file = os.path.join(
    base_dir,
    "results",
    "saved_variables",
    "analysis_2_similarity_corrected.pickle")

ci_corrected_file = os.path.join(
    base_dir,
    "results",
    "saved_variables",
    "analysis_2_ci_corrected.pickle")

In [5]:
# Run multiple simulations - corrected
results = Parallel(n_jobs=num_cores, verbose=100)(
    delayed(
        pipelines.simple_simulation_experiment_corrected)(i,
                                                          NN_architecture,
                                                          dataset_name,
                                                          analysis_name,
                                                          num_simulated_samples,
                                                          lst_num_experiments,
                                                          corrected,
                                                          use_pca,
                                                          num_PCs,
                                                          file_prefix,
                                                          normalized_data_file) for i in iterations)

[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done   1 tasks      | elapsed: 78.6min
[Parallel(n_jobs=5)]: Done   2 out of   5 | elapsed: 78.7min remaining: 118.0min
[Parallel(n_jobs=5)]: Done   3 out of   5 | elapsed: 78.8min remaining: 52.5min
[Parallel(n_jobs=5)]: Done   5 out of   5 | elapsed: 78.9min remaining:    0.0s
[Parallel(n_jobs=5)]: Done   5 out of   5 | elapsed: 78.9min finished


In [6]:
# Concatenate output dataframes
all_svcca_scores = pd.DataFrame()

for i in iterations:
    all_svcca_scores = pd.concat([all_svcca_scores, results[i][1]], axis=1)

all_svcca_scores

Unnamed: 0_level_0,score,score,score,score,score
number of experiments,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0.999994,0.999992,0.999992,0.999994,0.999993
2,0.99937,0.998756,0.999143,0.99851,0.999115
5,0.995805,0.996246,0.994421,0.995813,0.995819
10,0.991519,0.992937,0.991083,0.992514,0.991567
20,0.982634,0.980989,0.980753,0.979502,0.981058
50,0.951955,0.950794,0.947115,0.940517,0.951328
100,0.894665,0.901452,0.895667,0.896929,0.889448
250,0.708209,0.705842,0.693866,0.701911,0.70656
500,0.125087,0.120792,0.109209,0.11965,0.116131


In [7]:
# Get median for each row (number of experiments)
mean_scores = all_svcca_scores.mean(axis=1).to_frame()
mean_scores.columns = ['score']
mean_scores

Unnamed: 0_level_0,score
number of experiments,Unnamed: 1_level_1
1,0.999993
2,0.998979
5,0.995621
10,0.991924
20,0.980987
50,0.948342
100,0.895632
250,0.703277
500,0.118174


In [8]:
# Get standard dev for each row (number of experiments)
import math
std_scores = (all_svcca_scores.std(axis=1)/math.sqrt(10)).to_frame()
std_scores.columns = ['score']
std_scores

Unnamed: 0_level_0,score
number of experiments,Unnamed: 1_level_1
1,3.10724e-07
2,0.0001080976
5,0.0002203053
10,0.0002435935
20,0.0003528163
50,0.001506035
100,0.001366905
250,0.001817413
500,0.001879934


In [9]:
# Get confidence interval for each row (number of experiments)
# z-score for 95% confidence interval
err = std_scores*1.96

In [10]:
# Get boundaries of confidence interval
ymax = mean_scores + err
ymin = mean_scores - err

ci = pd.concat([ymin, ymax], axis=1)
ci.columns = ['ymin', 'ymax']
ci

Unnamed: 0_level_0,ymin,ymax
number of experiments,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.999992,0.999994
2,0.998767,0.999191
5,0.995189,0.996053
10,0.991447,0.992402
20,0.980296,0.981679
50,0.94539,0.951293
100,0.892953,0.898311
250,0.699715,0.70684
500,0.114489,0.121859


In [11]:
mean_scores

Unnamed: 0_level_0,score
number of experiments,Unnamed: 1_level_1
1,0.999993
2,0.998979
5,0.995621
10,0.991924
20,0.980987
50,0.948342
100,0.895632
250,0.703277
500,0.118174


In [12]:
# Pickle dataframe of mean scores scores for first run, interval
mean_scores.to_pickle(similarity_corrected_file)
ci.to_pickle(ci_corrected_file)