# Simulation experiment using noise-corrected data 

Run entire simulation experiment multiple times to generate confidence interval.  The simulation experiment can be found in ```functions/pipeline.py```

In [1]:
%load_ext autoreload
%autoreload 2

from joblib import Parallel, delayed
import multiprocessing
import sys
import os
import pandas as pd

import warnings
warnings.filterwarnings(action='ignore')

sys.path.append("../../")
from functions import pipelines, utils

from numpy.random import seed
randomState = 123
seed(randomState)

Using TensorFlow backend.


In [None]:
# Read in config variables
config_file = os.path.abspath(os.path.join(os.getcwd(),"../../configs", "config_Pa_experiment_combat.tsv"))
params = utils.read_config(config_file)

In [None]:
# Load parameters
dataset_name = params["dataset_name"]
analysis_name = params["analysis_name"]
NN_architecture = params["NN_architecture"]
num_simulated_experiments = params["num_simulated_samples"]
lst_num_partitions = params["lst_num_experiments"]
use_pca = params["use_pca"]
num_PCs = params["num_PCs"]
local_dir = params["local_dir"]
correction_method = params["correction_method"]

iterations = params["iterations"] 
num_cores = params["num_cores"]

In [None]:
# Additional parameters
file_prefix = "Partition_corrected"
corrected = True

In [3]:
# Input files
base_dir = os.path.abspath(
  os.path.join(
      os.getcwd(), "../.."))    # base dir on repo

normalized_data_file = os.path.join(
    base_dir,
    dataset_name,
    "data",
    "input",
    "train_set_normalized.pcl")

experiment_ids_file = os.path.join(
    base_dir,
    dataset_name,
    "data",
    "metadata",
    "experiment_ids.txt")

In [4]:
# Output files
similarity_corrected_file = os.path.join(
    base_dir,
    "results",
    "saved_variables",
    dataset_name +"_experiment_lvl_sim_similarity_corrected_"+correction_method+".pickle")

ci_corrected_file = os.path.join(
    base_dir,
    "results",
    "saved_variables",
    dataset_name +"_experiment_lvl_sim_ci_corrected_"+correction_method+".pickle")

In [5]:
# Run multiple simulations - corrected
results = Parallel(n_jobs=num_cores, verbose=100)(
    delayed(
        pipelines.experiment_level_simulation_corrected)(i,
                                                         NN_architecture,
                                                         dataset_name,
                                                         analysis_name,
                                                         num_simulated_experiments,
                                                         lst_num_partitions,
                                                         corrected,
                                                         correction_method,
                                                         use_pca,
                                                         num_PCs,
                                                         file_prefix,
                                                         normalized_data_file,
                                                         experiment_ids_file,
                                                         local_dir) for i in iterations)

[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done   1 tasks      | elapsed: 137.3min
[Parallel(n_jobs=5)]: Done   2 out of   5 | elapsed: 146.8min remaining: 220.3min
[Parallel(n_jobs=5)]: Done   3 out of   5 | elapsed: 148.0min remaining: 98.7min
[Parallel(n_jobs=5)]: Done   5 out of   5 | elapsed: 151.6min remaining:    0.0s
[Parallel(n_jobs=5)]: Done   5 out of   5 | elapsed: 151.6min finished


In [6]:
# Concatenate output dataframes
all_svcca_scores = pd.DataFrame()

for i in iterations:
    all_svcca_scores = pd.concat([all_svcca_scores, results[i][1]], axis=1)

all_svcca_scores

Unnamed: 0_level_0,score,score,score,score,score
number of partitions,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0.999993,0.999993,0.999993,0.999993,0.999993
2,0.998853,0.998547,0.998894,0.999077,0.998011
3,0.997205,0.997114,0.997435,0.99736,0.997048
5,0.991418,0.995909,0.99544,0.994917,0.995684
10,0.990376,0.986824,0.990779,0.986882,0.987215
20,0.957855,0.971843,0.975595,0.977124,0.983426
30,0.940494,0.960633,0.96544,0.96684,0.932293
50,0.932856,0.933557,0.940622,0.9518,0.944854
70,0.935345,0.923692,0.930796,0.926984,0.925988
100,0.852335,0.892757,0.899173,0.899614,0.895168


In [7]:
# Get median for each row (number of experiments)
mean_scores = all_svcca_scores.mean(axis=1).to_frame()
mean_scores.columns = ['score']
mean_scores

Unnamed: 0_level_0,score
number of partitions,Unnamed: 1_level_1
1,0.999993
2,0.998677
3,0.997232
5,0.994674
10,0.988415
20,0.973168
30,0.95314
50,0.940738
70,0.928561
100,0.887809


In [8]:
# Get standard dev for each row (number of experiments)
import math
std_scores = (all_svcca_scores.std(axis=1)/math.sqrt(10)).to_frame()
std_scores.columns = ['score']
std_scores

Unnamed: 0_level_0,score
number of partitions,Unnamed: 1_level_1
1,5.724409e-08
2,0.0001321586
3,5.149498e-05
5,0.0005871956
10,0.0006276996
20,0.003012487
30,0.004974081
50,0.002515056
70,0.001447743
100,0.006335547


In [9]:
# Get confidence interval for each row (number of experiments)
err = std_scores*1.96

In [10]:
# Get boundaries of confidence interval
ymax = mean_scores + err
ymin = mean_scores - err

ci = pd.concat([ymin, ymax], axis=1)
ci.columns = ['ymin', 'ymax']
ci

Unnamed: 0_level_0,ymin,ymax
number of partitions,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.999993,0.999993
2,0.998417,0.998936
3,0.997131,0.997333
5,0.993523,0.995825
10,0.987185,0.989645
20,0.967264,0.979073
30,0.943391,0.962889
50,0.935808,0.945667
70,0.925723,0.931399
100,0.875392,0.900227


In [11]:
mean_scores

Unnamed: 0_level_0,score
number of partitions,Unnamed: 1_level_1
1,0.999993
2,0.998677
3,0.997232
5,0.994674
10,0.988415
20,0.973168
30,0.95314
50,0.940738
70,0.928561
100,0.887809


In [12]:
# Pickle dataframe of mean scores scores for first run, interval
mean_scores.to_pickle(similarity_corrected_file)
ci.to_pickle(ci_corrected_file)