# Simulation experiment 

Run entire simulation experiment multiple times to generate confidence interval

In [1]:
%load_ext autoreload
%autoreload 2

from joblib import Parallel, delayed
import multiprocessing
import sys
import os
import pandas as pd

import warnings
warnings.filterwarnings(action='ignore')

sys.path.append("../")
from functions import pipelines

from numpy.random import seed
randomState = 123
seed(randomState)

Using TensorFlow backend.


In [2]:
# Parameters
NN_architecture = 'NN_2500_30'
analysis_name = 'analysis_2'
file_prefix = "Experiment"
num_simulated_samples = 500
lst_num_experiments = [1, 2, 5, 10, 20,
                     50, 100, 250, 500]
corrected = False
use_pca = True
num_PCs = 10

iterations = range(5) 
num_cores = 5

In [3]:
# Input file
local_dir = "/home/alexandra/Documents/"

normalized_data_file = os.path.join(
  local_dir,
  "Data",
  "Batch_effects",
  "input",
  "recount2_gene_normalized_data.tsv")

In [4]:
# Output files
local_dir = "/home/alexandra/Documents/"

similarity_corrected_file = os.path.join(
    local_dir,
    "Data",
    "Batch_effects",
    "output",
    "saved variables",
    "analysis_2_similarity_uncorrected.pickle")

ci_corrected_file = os.path.join(
    local_dir,
    "Data",
    "Batch_effects",
    "output",
    "saved variables",
    "analysis_2_ci_uncorrected.pickle")

In [5]:
# Run multiple simulations - corrected
results = Parallel(n_jobs=num_cores, verbose=100)(
    delayed(
        pipelines.simple_simulation_experiment_uncorrected)(i,
                                                            NN_architecture,
                                                            analysis_name,
                                                            num_simulated_samples,
                                                            lst_num_experiments,
                                                            corrected,
                                                            use_pca,
                                                            num_PCs,
                                                            file_prefix,
                                                            normalized_data_file) for i in iterations)

[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done   1 tasks      | elapsed: 45.7min
[Parallel(n_jobs=5)]: Done   2 out of   5 | elapsed: 46.0min remaining: 69.0min
[Parallel(n_jobs=5)]: Done   3 out of   5 | elapsed: 46.0min remaining: 30.7min
[Parallel(n_jobs=5)]: Done   5 out of   5 | elapsed: 46.1min remaining:    0.0s
[Parallel(n_jobs=5)]: Done   5 out of   5 | elapsed: 46.1min finished


In [6]:
# Concatenate output dataframes
all_svcca_scores = pd.DataFrame()

for i in iterations:
    all_svcca_scores = pd.concat([all_svcca_scores, results[i][1]], axis=1)

all_svcca_scores

Unnamed: 0_level_0,score,score,score,score,score
number of experiments,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0.999986,0.999987,0.999988,0.999988,0.999988
2,0.902295,0.901002,0.900913,0.902847,0.899995
5,0.636932,0.624227,0.642637,0.630139,0.629634
10,0.210695,0.202146,0.200061,0.195011,0.200457
20,0.124954,0.125155,0.132709,0.129363,0.126525
50,0.173259,0.196031,0.18311,0.193683,0.172567
100,0.240609,0.276275,0.281203,0.286517,0.262628
250,0.51275,0.540659,0.547269,0.550609,0.54059
500,0.782829,0.812267,0.751485,0.794607,0.796706


In [7]:
# Get median for each row (number of experiments)
mean_scores = all_svcca_scores.mean(axis=1).to_frame()
mean_scores.columns = ['score']
mean_scores

Unnamed: 0_level_0,score
number of experiments,Unnamed: 1_level_1
1,0.999987
2,0.90141
5,0.632714
10,0.201674
20,0.127741
50,0.18373
100,0.269447
250,0.538375
500,0.787579


In [8]:
# Get standard dev for each row (number of experiments)
import math
std_scores = (all_svcca_scores.std(axis=1)/math.sqrt(10)).to_frame()
std_scores.columns = ['score']
std_scores

Unnamed: 0_level_0,score
number of experiments,Unnamed: 1_level_1
1,3.036567e-07
2,0.0003628309
5,0.002260503
10,0.00180327
20,0.001039756
50,0.003482093
100,0.005818731
250,0.004731731
500,0.007189577


In [9]:
# Get confidence interval for each row (number of experiments)
err = std_scores*1.96

In [10]:
# Get boundaries of confidence interval
ymax = mean_scores + err
ymin = mean_scores - err

ci = pd.concat([ymin, ymax], axis=1)
ci.columns = ['ymin', 'ymax']
ci

Unnamed: 0_level_0,ymin,ymax
number of experiments,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.999987,0.999988
2,0.900699,0.902121
5,0.628283,0.637145
10,0.19814,0.205209
20,0.125703,0.129779
50,0.176905,0.190555
100,0.258042,0.280851
250,0.529101,0.54765
500,0.773487,0.801671


In [11]:
mean_scores

Unnamed: 0_level_0,score
number of experiments,Unnamed: 1_level_1
1,0.999987
2,0.90141
5,0.632714
10,0.201674
20,0.127741
50,0.18373
100,0.269447
250,0.538375
500,0.787579


In [12]:
# Pickle dataframe of mean scores scores for first run, interval
mean_scores.to_pickle(similarity_corrected_file)
ci.to_pickle(ci_corrected_file)