# Simulation experiment using noise-corrected data

Run entire simulation experiment multiple times to generate confidence interval.  The simulation experiment can be found in ```functions/pipeline.py```

In [1]:
%load_ext autoreload
%autoreload 2

from joblib import Parallel, delayed
import multiprocessing
import sys
import os
import pandas as pd

import warnings
warnings.filterwarnings(action='ignore')

sys.path.append("../../")
from functions import pipelines, utils

from numpy.random import seed
randomState = 123
seed(randomState)

Using TensorFlow backend.


In [2]:
# Read in config variables
config_file = os.path.abspath(os.path.join(os.getcwd(),"../../configs", "config_Pa_sample_combat.tsv"))
params = utils.read_config(config_file)

In [3]:
# Load parameters
dataset_name = params["dataset_name"]
analysis_name = params["analysis_name"]
NN_architecture = params["NN_architecture"]
num_simulated_samples = params["num_simulated_samples"]
lst_num_experiments = params["lst_num_experiments"]
use_pca = params["use_pca"]
num_PCs = params["num_PCs"]
local_dir = params["local_dir"]
correction_method = params["correction_method"]

iterations = params["iterations"] 
num_cores = params["num_cores"]

In [4]:
# Additional parameters
file_prefix = "Experiment_corrected"
corrected = True

In [5]:
# Input files
base_dir = os.path.abspath(
  os.path.join(
      os.getcwd(), "../.."))    # base dir on repo


normalized_data_file = os.path.join(
    base_dir,
    dataset_name,
    "data",
    "input",
    "train_set_normalized.pcl")

In [6]:
# Output files
similarity_corrected_file = os.path.join(
    base_dir,
    "results",
    "saved_variables",
    dataset_name +"_sample_lvl_sim_similarity_corrected_"+correction_method+".pickle")

ci_corrected_file = os.path.join(
    base_dir,
    "results",
    "saved_variables",
    dataset_name +"_sample_lvl_sim_ci_corrected_"+correction_method+".pickle")

In [7]:
# Run multiple simulations
results = Parallel(n_jobs=num_cores, verbose=100)(
    delayed(
        pipelines.sample_level_simulation_corrected)(i,
                                                     NN_architecture,
                                                     dataset_name,
                                                     analysis_name,
                                                     num_simulated_samples,
                                                     lst_num_experiments,
                                                     corrected,
                                                     correction_method,
                                                     use_pca,
                                                     num_PCs,
                                                     file_prefix,
                                                     normalized_data_file,
                                                     local_dir) for i in iterations)

[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done   1 tasks      | elapsed: 298.8min
[Parallel(n_jobs=5)]: Done   2 out of   5 | elapsed: 299.3min remaining: 448.9min
[Parallel(n_jobs=5)]: Done   3 out of   5 | elapsed: 299.5min remaining: 199.7min
[Parallel(n_jobs=5)]: Done   5 out of   5 | elapsed: 299.6min remaining:    0.0s
[Parallel(n_jobs=5)]: Done   5 out of   5 | elapsed: 299.6min finished


In [8]:
# Concatenate output dataframes
all_svcca_scores = pd.DataFrame()

for i in iterations:
    all_svcca_scores = pd.concat([all_svcca_scores, results[i][1]], axis=1)

all_svcca_scores

Unnamed: 0_level_0,score,score,score,score,score
number of experiments,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0.999994,0.999994,0.999994,0.999994,0.999994
2,0.999965,0.999845,0.999907,0.999911,0.999893
5,0.999565,0.999646,0.999567,0.999615,0.999647
10,0.999161,0.999192,0.999237,0.99935,0.99903
20,0.998525,0.998451,0.997966,0.997951,0.998262
50,0.995947,0.99569,0.994992,0.994909,0.995061
100,0.990992,0.990982,0.990718,0.990388,0.990735
500,0.95307,0.951418,0.953585,0.95148,0.952577
1000,0.904443,0.905647,0.905939,0.905173,0.907068
2000,0.802144,0.803865,0.80182,0.807029,0.806779


In [9]:
# Get median for each row (number of experiments)
mean_scores = all_svcca_scores.mean(axis=1).to_frame()
mean_scores.columns = ['score']
mean_scores

Unnamed: 0_level_0,score
number of experiments,Unnamed: 1_level_1
1,0.999994
2,0.999904
5,0.999608
10,0.999194
20,0.998231
50,0.99532
100,0.990763
500,0.952426
1000,0.905654
2000,0.804327


In [10]:
# Get standard dev for each row (number of experiments)
import math
std_scores = (all_svcca_scores.std(axis=1)/math.sqrt(10)).to_frame()
std_scores.columns = ['score']
std_scores

Unnamed: 0_level_0,score
number of experiments,Unnamed: 1_level_1
1,3.870667e-08
2,1.352275e-05
5,1.283589e-05
10,3.679959e-05
20,8.4324e-05
50,0.0001476908
100,7.813236e-05
500,0.0003038355
1000,0.0003073755
2000,0.0007838177


In [11]:
# Get confidence interval for each row (number of experiments)
# z-score for 95% confidence interval
err = std_scores*1.96

In [12]:
# Get boundaries of confidence interval
ymax = mean_scores + err
ymin = mean_scores - err

ci = pd.concat([ymin, ymax], axis=1)
ci.columns = ['ymin', 'ymax']
ci

Unnamed: 0_level_0,ymin,ymax
number of experiments,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.999994,0.999994
2,0.999878,0.999931
5,0.999583,0.999633
10,0.999122,0.999266
20,0.998066,0.998396
50,0.99503,0.995609
100,0.99061,0.990916
500,0.951831,0.953022
1000,0.905051,0.906256
2000,0.802791,0.805864


In [13]:
mean_scores

Unnamed: 0_level_0,score
number of experiments,Unnamed: 1_level_1
1,0.999994
2,0.999904
5,0.999608
10,0.999194
20,0.998231
50,0.99532
100,0.990763
500,0.952426
1000,0.905654
2000,0.804327


In [14]:
# Pickle dataframe of mean scores scores for first run, interval
mean_scores.to_pickle(similarity_corrected_file)
ci.to_pickle(ci_corrected_file)