# Simulation experiment 

Run entire simulation experiment multiple times to generate confidence interval

In [14]:
%load_ext autoreload
%autoreload 2

from joblib import Parallel, delayed
import multiprocessing
import sys
import os
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings(action='ignore')

sys.path.append("../../")
from functions import pipelines

from numpy.random import seed
randomState = 123
seed(randomState)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# Parameters
dataset_name = "Pseudomonas_analysis"
analysis_name = 'analysis_1'
NN_architecture = 'NN_2500_30'
file_prefix = "Partition"
num_simulated_experiments = 600
lst_num_partitions = [1, 2, 3, 5, 10, 20,
                    30, 50, 70, 100, 200, 300, 400, 500, 600]
corrected = False
use_pca = True
num_PCs = 10

iterations = range(5) 
num_cores = 5

In [3]:
# Input
base_dir = os.path.abspath(
      os.path.join(
          os.getcwd(), "../.."))

normalized_data_file = os.path.join(
    base_dir,
    dataset_name,    
    "data",
    "input",
    "train_set_normalized.pcl")

experiment_ids_file = os.path.join(
    base_dir,
    dataset_name,
    "data",
    "metadata",
    "experiment_ids.txt")

In [4]:
# Output files
similarity_uncorrected_file = os.path.join(
    base_dir,
    "results",
    "saved_variables",
    "analysis_1_similarity_uncorrected.pickle")

ci_uncorrected_file = os.path.join(
    base_dir,
    "results",
    "saved_variables",
    "analysis_1_ci_uncorrected.pickle")

similarity_permuted_file = os.path.join(
    base_dir,
    "results",
    "saved_variables",
    "analysis_1_permuted")

In [5]:
# Run multiple simulations - uncorrected
results = Parallel(n_jobs=num_cores, verbose=100)(
    delayed(
        pipelines.matched_simulation_experiment_uncorrected)(i,
                                                             NN_architecture,
                                                             dataset_name,
                                                             analysis_name,
                                                             num_simulated_experiments,
                                                             lst_num_partitions,
                                                             corrected,
                                                             use_pca,
                                                             num_PCs,
                                                             file_prefix,
                                                             normalized_data_file,
                                                             experiment_ids_file) for i in iterations)

[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done   1 tasks      | elapsed: 77.5min
[Parallel(n_jobs=5)]: Done   2 out of   5 | elapsed: 77.5min remaining: 116.3min
[Parallel(n_jobs=5)]: Done   3 out of   5 | elapsed: 78.9min remaining: 52.6min
[Parallel(n_jobs=5)]: Done   5 out of   5 | elapsed: 82.6min remaining:    0.0s
[Parallel(n_jobs=5)]: Done   5 out of   5 | elapsed: 82.6min finished


In [6]:
# permuted score
permuted_score = results[0][0]

In [7]:
# Concatenate output dataframes
all_svcca_scores = pd.DataFrame()

for i in iterations:
    all_svcca_scores = pd.concat([all_svcca_scores, results[i][1]], axis=1)

all_svcca_scores

Unnamed: 0_level_0,score,score,score,score,score
number of partitions,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0.999993,0.999993,0.999993,0.999993,0.999993
2,0.900624,0.901185,0.900755,0.902631,0.901901
3,0.80949,0.826829,0.818413,0.808014,0.813203
5,0.627909,0.634846,0.633973,0.632571,0.640819
10,0.222736,0.254938,0.23827,0.255387,0.241524
20,0.321593,0.331814,0.325496,0.344085,0.349099
30,0.36503,0.391315,0.376604,0.355648,0.384198
50,0.439416,0.419361,0.461672,0.416198,0.467807
70,0.484678,0.493653,0.533759,0.5363,0.517343
100,0.540044,0.556955,0.651603,0.616367,0.602392


In [8]:
# Get median for each row (number of experiments)
mean_scores = all_svcca_scores.mean(axis=1).to_frame()
mean_scores.columns = ['score']
mean_scores

Unnamed: 0_level_0,score
number of partitions,Unnamed: 1_level_1
1,0.999993
2,0.901419
3,0.81519
5,0.634023
10,0.242571
20,0.334417
30,0.374559
50,0.440891
70,0.513147
100,0.593472


In [9]:
# Get standard dev for each row (number of experiments)
import math
std_scores = (all_svcca_scores.std(axis=1)/math.sqrt(10)).to_frame()
std_scores.columns = ['score']
std_scores

Unnamed: 0_level_0,score
number of partitions,Unnamed: 1_level_1
1,5.777318e-08
2,0.0002659367
3,0.002417582
5,0.001469388
10,0.004272685
20,0.003741202
30,0.00454492
50,0.007469618
70,0.00736373
100,0.01429254


In [10]:
# Get confidence interval for each row (number of experiments)
err = std_scores*1.96

In [11]:
# Get boundaries of confidence interval
ymax = mean_scores + err
ymin = mean_scores - err

ci = pd.concat([ymin, ymax], axis=1)
ci.columns = ['ymin', 'ymax']
ci

Unnamed: 0_level_0,ymin,ymax
number of partitions,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.999993,0.999993
2,0.900898,0.90194
3,0.810452,0.819929
5,0.631143,0.636903
10,0.234196,0.250945
20,0.327084,0.34175
30,0.365651,0.383467
50,0.42625,0.455531
70,0.498714,0.52758
100,0.565459,0.621486


In [12]:
mean_scores

Unnamed: 0_level_0,score
number of partitions,Unnamed: 1_level_1
1,0.999993
2,0.901419
3,0.81519
5,0.634023
10,0.242571
20,0.334417
30,0.374559
50,0.440891
70,0.513147
100,0.593472


In [15]:
# Pickle dataframe of mean scores scores for first run, interval
mean_scores.to_pickle(similarity_uncorrected_file)
ci.to_pickle(ci_uncorrected_file)
np.save(similarity_permuted_file, permuted_score)