# Simulation experiment 

Run entire simulation experiment multiple times to generate confidence interval

In [1]:
%load_ext autoreload
%autoreload 2

from joblib import Parallel, delayed
import multiprocessing
import sys
import os
import pandas as pd

import warnings
warnings.filterwarnings(action='ignore')

sys.path.append("../../")
from functions import pipelines

from numpy.random import seed
randomState = 123
seed(randomState)

Using TensorFlow backend.


In [2]:
# Parameters
dataset_name = "Pseudomonas_analysis"
analysis_name = 'analysis_1'
NN_architecture = 'NN_2500_30'
file_prefix = 'Partition_corrected'
num_simulated_experiments = 600
lst_num_partitions = [1, 2, 3, 5, 10, 20,
                    30, 50, 70, 100, 200, 300, 400, 500, 600]
corrected = True
use_pca = True
num_PCs = 10

iterations = range(5) 
num_cores = 5

In [3]:
# Input files
base_dir = os.path.abspath(
  os.path.join(
      os.getcwd(), "../.."))    # base dir on repo

normalized_data_file = os.path.join(
    base_dir,
    dataset_name,
    "data",
    "input",
    "train_set_normalized.pcl")

experiment_ids_file = os.path.join(
    base_dir,
    dataset_name,
    "data",
    "metadata",
    "experiment_ids.txt")

In [4]:
# Output files
similarity_corrected_file = os.path.join(
    base_dir,
    "results",
    "saved_variables",
    "analysis_1_similarity_corrected.pickle")

ci_corrected_file = os.path.join(
    base_dir,
    "results",
    "saved_variables",
    "analysis_1_ci_corrected.pickle")

In [5]:
# Run multiple simulations - corrected
results = Parallel(n_jobs=num_cores, verbose=100)(
    delayed(
        pipelines.matched_simulation_experiment_corrected)(i,
                                                           NN_architecture,
                                                           dataset_name,
                                                           analysis_name,
                                                           num_simulated_experiments,
                                                           lst_num_partitions,
                                                           corrected,
                                                           use_pca,
                                                           num_PCs,
                                                           file_prefix,
                                                           normalized_data_file,
                                                           experiment_ids_file) for i in iterations)

[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done   1 tasks      | elapsed: 140.3min
[Parallel(n_jobs=5)]: Done   2 out of   5 | elapsed: 147.6min remaining: 221.4min
[Parallel(n_jobs=5)]: Done   3 out of   5 | elapsed: 148.2min remaining: 98.8min
[Parallel(n_jobs=5)]: Done   5 out of   5 | elapsed: 153.3min remaining:    0.0s
[Parallel(n_jobs=5)]: Done   5 out of   5 | elapsed: 153.3min finished


In [6]:
# Concatenate output dataframes
all_svcca_scores = pd.DataFrame()

for i in iterations:
    all_svcca_scores = pd.concat([all_svcca_scores, results[i][1]], axis=1)

all_svcca_scores

Unnamed: 0_level_0,score,score,score,score,score
number of partitions,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0.999993,0.999993,0.999992,0.999993,0.999993
2,0.997859,0.998244,0.998998,0.999306,0.99858
3,0.996675,0.997391,0.998265,0.997937,0.997002
5,0.992267,0.994618,0.994066,0.995776,0.99327
10,0.986732,0.986416,0.989203,0.990114,0.992153
20,0.966491,0.97774,0.973883,0.978544,0.978156
30,0.963603,0.970735,0.961566,0.962049,0.960981
50,0.944515,0.940228,0.942617,0.945481,0.948233
70,0.917413,0.928954,0.924565,0.924247,0.929358
100,0.871467,0.899031,0.908933,0.896913,0.906193


In [7]:
# Get median for each row (number of experiments)
mean_scores = all_svcca_scores.mean(axis=1).to_frame()
mean_scores.columns = ['score']
mean_scores

Unnamed: 0_level_0,score
number of partitions,Unnamed: 1_level_1
1,0.999993
2,0.998598
3,0.997454
5,0.993999
10,0.988924
20,0.974962
30,0.963787
50,0.944215
70,0.924908
100,0.896507


In [8]:
# Get standard dev for each row (number of experiments)
import math
std_scores = (all_svcca_scores.std(axis=1)/math.sqrt(10)).to_frame()
std_scores.columns = ['score']
std_scores

Unnamed: 0_level_0,score
number of partitions,Unnamed: 1_level_1
1,1.341109e-07
2,0.0001826246
3,0.0002064779
5,0.0004205537
10,0.0007586091
20,0.001609888
30,0.001266259
50,0.0009524265
70,0.001523958
100,0.004695125


In [9]:
# Get confidence interval for each row (number of experiments)
err = std_scores*1.96

In [10]:
# Get boundaries of confidence interval
ymax = mean_scores + err
ymin = mean_scores - err

ci = pd.concat([ymin, ymax], axis=1)
ci.columns = ['ymin', 'ymax']
ci

Unnamed: 0_level_0,ymin,ymax
number of partitions,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.999993,0.999993
2,0.99824,0.998955
3,0.997049,0.997859
5,0.993175,0.994824
10,0.987437,0.990411
20,0.971807,0.978118
30,0.961305,0.966268
50,0.942348,0.946082
70,0.921921,0.927895
100,0.887305,0.90571


In [11]:
mean_scores

Unnamed: 0_level_0,score
number of partitions,Unnamed: 1_level_1
1,0.999993
2,0.998598
3,0.997454
5,0.993999
10,0.988924
20,0.974962
30,0.963787
50,0.944215
70,0.924908
100,0.896507


In [12]:
# Pickle dataframe of mean scores scores for first run, interval
mean_scores.to_pickle(similarity_corrected_file)
ci.to_pickle(ci_corrected_file)