# Simulation experiment 

Run entire simulation experiment multiple times to generate confidence interval

In [31]:
%load_ext autoreload
%autoreload 2

from joblib import Parallel, delayed
import multiprocessing
import sys
import os
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings(action='ignore')

sys.path.append("../../")
from functions import pipelines

from numpy.random import seed
randomState = 123
seed(randomState)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# Parameters
dataset_name = "Pseudomonas_analysis"
analysis_name = 'analysis_0'
NN_architecture = 'NN_2500_30'
file_prefix = "Experiment"
num_simulated_samples = 6000
lst_num_experiments = [1, 2, 5, 10, 20,
                     50, 100, 500, 1000, 2000, 3000, 6000]
corrected = False
use_pca = True
num_PCs = 10

iterations = range(5) 
num_cores = 5

In [3]:
# Input file
base_dir = os.path.abspath(
      os.path.join(
          os.getcwd(), "../.."))

normalized_data_file = os.path.join(
    base_dir,
    dataset_name,    
    "data",
    "input",
    "train_set_normalized.pcl")

In [35]:
# Output files
similarity_uncorrected_file = os.path.join(
    base_dir,
    "results",
    "saved_variables",
    "analysis_0_similarity_uncorrected.pickle")

ci_uncorrected_file = os.path.join(
    base_dir,
    "results",
    "saved_variables",
    "analysis_0_ci_uncorrected.pickle")

similarity_permuted_file = os.path.join(
    base_dir,
    "results",
    "saved_variables",
    "analysis_0_permuted")

In [5]:
# Run multiple simulations
results = Parallel(n_jobs=num_cores, verbose=100)(
    delayed(
        pipelines.simple_simulation_experiment_uncorrected)(i,
                                                            NN_architecture,
                                                            dataset_name,
                                                            analysis_name,
                                                            num_simulated_samples,
                                                            lst_num_experiments,
                                                            corrected,
                                                            use_pca,
                                                            num_PCs,
                                                            file_prefix,
                                                            normalized_data_file) for i in iterations)

[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done   1 tasks      | elapsed: 58.4min
[Parallel(n_jobs=5)]: Done   2 out of   5 | elapsed: 58.5min remaining: 87.7min
[Parallel(n_jobs=5)]: Done   3 out of   5 | elapsed: 58.5min remaining: 39.0min
[Parallel(n_jobs=5)]: Done   5 out of   5 | elapsed: 58.6min remaining:    0.0s
[Parallel(n_jobs=5)]: Done   5 out of   5 | elapsed: 58.6min finished


In [22]:
# permuted score
permuted_score = results[0][0]

In [23]:
# Concatenate output dataframes
all_svcca_scores = pd.DataFrame()

for i in iterations:
    all_svcca_scores = pd.concat([all_svcca_scores, results[i][1]], axis=1)

all_svcca_scores

Unnamed: 0_level_0,score,score,score,score,score
number of experiments,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0.999994,0.999994,0.999994,0.999994,0.999994
2,0.900081,0.901643,0.901607,0.901429,0.900972
5,0.606001,0.610072,0.607779,0.607527,0.611455
10,0.129481,0.132649,0.134811,0.128567,0.130044
20,0.040371,0.043376,0.046324,0.046866,0.042082
50,0.142682,0.149132,0.14786,0.155034,0.139217
100,0.246116,0.235181,0.233653,0.241531,0.23486
500,0.716181,0.74124,0.722165,0.7103,0.701948
1000,0.892126,0.937884,0.942321,0.936079,0.935772
2000,0.959631,0.966316,0.966471,0.962416,0.964505


In [24]:
# Get mean svcca score for each row (number of experiments)
mean_scores = all_svcca_scores.mean(axis=1).to_frame()
mean_scores.columns = ['score']
mean_scores

Unnamed: 0_level_0,score
number of experiments,Unnamed: 1_level_1
1,0.999994
2,0.901147
5,0.608567
10,0.13111
20,0.043804
50,0.146785
100,0.238268
500,0.718367
1000,0.928836
2000,0.963868


In [25]:
# Get standard dev for each row (number of experiments)
import math
std_scores = (all_svcca_scores.std(axis=1)/math.sqrt(10)).to_frame()
std_scores.columns = ['score']
std_scores

Unnamed: 0_level_0,score
number of experiments,Unnamed: 1_level_1
1,4.726433e-08
2,0.0002063907
5,0.0006872232
10,0.0008111632
20,0.0008754871
50,0.001928907
100,0.001693193
500,0.004682489
1000,0.006541999
2000,0.0009116058


In [26]:
# Get confidence interval for each row (number of experiments)
# z-score for 95% confidence interval 
err = std_scores*1.96

In [27]:
# Get boundaries of confidence interval
ymax = mean_scores + err
ymin = mean_scores - err

ci = pd.concat([ymin, ymax], axis=1)
ci.columns = ['ymin', 'ymax']
ci

Unnamed: 0_level_0,ymin,ymax
number of experiments,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.999994,0.999994
2,0.900742,0.901551
5,0.60722,0.609914
10,0.129521,0.1327
20,0.042088,0.04552
50,0.143004,0.150566
100,0.23495,0.241587
500,0.709189,0.727544
1000,0.916014,0.941659
2000,0.962081,0.965654


In [28]:
mean_scores

Unnamed: 0_level_0,score
number of experiments,Unnamed: 1_level_1
1,0.999994
2,0.901147
5,0.608567
10,0.13111
20,0.043804
50,0.146785
100,0.238268
500,0.718367
1000,0.928836
2000,0.963868


In [36]:
# Pickle dataframe of mean scores scores for first run, interval
mean_scores.to_pickle(similarity_uncorrected_file)
ci.to_pickle(ci_uncorrected_file)
np.save(similarity_permuted_file, permuted_score)