# Simulation experiment using noisy data 

Run entire simulation experiment multiple times to generate confidence interval.  The simulation experiment can be found in ```functions/pipeline.py```

In [1]:
%load_ext autoreload
%autoreload 2

from joblib import Parallel, delayed
import multiprocessing
import sys
import os
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings(action='ignore')

sys.path.append("../../")
from functions import pipelines

from numpy.random import seed
randomState = 123
seed(randomState)

Using TensorFlow backend.


In [2]:
# Parameters
dataset_name = "Pseudomonas_analysis"
analysis_name = 'Pa_sample_lvl_sim'
NN_architecture = 'NN_2500_30'
file_prefix = "Experiment"
num_simulated_samples = 6000
lst_num_experiments = [1, 2, 5, 10, 20,
                     50, 100, 500, 1000, 2000, 3000, 6000]

corrected = False
use_pca = True
num_PCs = 10
local_dir = os.path.abspath(
      os.path.join(
          os.getcwd(), "../../../../"))

iterations = range(5) 
num_cores = 5

In [3]:
# Input file
base_dir = os.path.abspath(
      os.path.join(
          os.getcwd(), "../.."))

normalized_data_file = os.path.join(
    base_dir,
    dataset_name,    
    "data",
    "input",
    "train_set_normalized.pcl")

In [4]:
# Output files
similarity_uncorrected_file = os.path.join(
    base_dir,
    "results",
    "saved_variables",
    "Pa_sample_lvl_sim_similarity_uncorrected.pickle")

ci_uncorrected_file = os.path.join(
    base_dir,
    "results",
    "saved_variables",
    "Pa_sample_lvl_sim_ci_uncorrected.pickle")

similarity_permuted_file = os.path.join(
    base_dir,
    "results",
    "saved_variables",
    "Pa_sample_lvl_sim_permuted")

In [5]:
# Run multiple simulations
results = Parallel(n_jobs=num_cores, verbose=100)(
    delayed(
        pipelines.sample_level_simulation_uncorrected)(i,
                                                       NN_architecture,
                                                       dataset_name,
                                                       analysis_name,
                                                       num_simulated_samples,
                                                       lst_num_experiments,
                                                       corrected,
                                                       use_pca,
                                                       num_PCs,
                                                       file_prefix,
                                                       normalized_data_file,
                                                       local_dir) for i in iterations)

[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done   1 tasks      | elapsed: 57.4min
[Parallel(n_jobs=5)]: Done   2 out of   5 | elapsed: 57.4min remaining: 86.2min
[Parallel(n_jobs=5)]: Done   3 out of   5 | elapsed: 57.5min remaining: 38.3min
[Parallel(n_jobs=5)]: Done   5 out of   5 | elapsed: 57.6min remaining:    0.0s
[Parallel(n_jobs=5)]: Done   5 out of   5 | elapsed: 57.6min finished


In [6]:
# permuted score
permuted_score = results[0][0]

In [7]:
# Concatenate output dataframes
all_svcca_scores = pd.DataFrame()

for i in iterations:
    all_svcca_scores = pd.concat([all_svcca_scores, results[i][1]], axis=1)

all_svcca_scores

Unnamed: 0_level_0,score,score,score,score,score
number of experiments,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0.999994,0.999994,0.999994,0.999994,0.999994
2,0.900042,0.902843,0.900469,0.900302,0.900899
5,0.60897,0.608523,0.609727,0.609673,0.610966
10,0.132896,0.132388,0.131792,0.132078,0.124027
20,0.046526,0.048585,0.051452,0.040847,0.042529
50,0.143543,0.136679,0.158847,0.148668,0.136564
100,0.244587,0.237877,0.239256,0.241794,0.24216
500,0.768671,0.734189,0.736753,0.71899,0.757317
1000,0.939169,0.939292,0.937404,0.935116,0.940601
2000,0.96607,0.965912,0.965399,0.964399,0.965082


In [8]:
# Get mean svcca score for each row (number of experiments)
mean_scores = all_svcca_scores.mean(axis=1).to_frame()
mean_scores.columns = ['score']
mean_scores

Unnamed: 0_level_0,score
number of experiments,Unnamed: 1_level_1
1,0.999994
2,0.900911
5,0.609572
10,0.130636
20,0.045988
50,0.14486
100,0.241135
500,0.743184
1000,0.938316
2000,0.965372


In [9]:
# Get standard dev for each row (number of experiments)
import math
std_scores = (all_svcca_scores.std(axis=1)/math.sqrt(10)).to_frame()
std_scores.columns = ['score']
std_scores

Unnamed: 0_level_0,score
number of experiments,Unnamed: 1_level_1
1,2.412816e-08
2,0.0003554485
5,0.0002932667
10,0.001175554
20,0.00137196
50,0.002947946
100,0.0008298621
500,0.00623895
1000,0.0006702862
2000,0.0002127304


In [10]:
# Get confidence interval for each row (number of experiments)
# z-score for 95% confidence interval 
err = std_scores*1.96

In [11]:
# Get boundaries of confidence interval
ymax = mean_scores + err
ymin = mean_scores - err

ci = pd.concat([ymin, ymax], axis=1)
ci.columns = ['ymin', 'ymax']
ci

Unnamed: 0_level_0,ymin,ymax
number of experiments,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.999994,0.999994
2,0.900215,0.901608
5,0.608997,0.610147
10,0.128332,0.13294
20,0.043299,0.048677
50,0.139082,0.150638
100,0.239508,0.242761
500,0.730956,0.755412
1000,0.937003,0.93963
2000,0.964955,0.965789


In [12]:
mean_scores

Unnamed: 0_level_0,score
number of experiments,Unnamed: 1_level_1
1,0.999994
2,0.900911
5,0.609572
10,0.130636
20,0.045988
50,0.14486
100,0.241135
500,0.743184
1000,0.938316
2000,0.965372


In [13]:
# Pickle dataframe of mean scores scores for first run, interval
mean_scores.to_pickle(similarity_uncorrected_file)
ci.to_pickle(ci_uncorrected_file)
np.save(similarity_permuted_file, permuted_score)