# Simulation experiment using noise-corrected data

Run entire simulation experiment multiple times to generate confidence interval.  The simulation experiment can be found in ```functions/pipeline.py```

In [1]:
%load_ext autoreload
%autoreload 2

from joblib import Parallel, delayed
import multiprocessing
import sys
import os
import pandas as pd

import warnings
warnings.filterwarnings(action='ignore')

sys.path.append("../../")
from functions import pipelines

from numpy.random import seed
randomState = 123
seed(randomState)

Using TensorFlow backend.


In [2]:
# Parameters
dataset_name = "Human_analysis"
analysis_name = 'Human_experiment_lvl_sim'
NN_architecture = 'NN_2500_30'
file_prefix = 'Partition_corrected'
num_simulated_experiments = 50
lst_num_partitions = [1, 2, 3, 5, 10, 20,
                    30, 50]

corrected = True
use_pca = True
num_PCs = 10
local_dir = os.path.abspath(
      os.path.join(
          os.getcwd(), "../../../../"))

iterations = range(5) 
num_cores = 5

In [3]:
# Input files
base_dir = os.path.abspath(
  os.path.join(
      os.getcwd(), "../.."))    # base dir on repo

normalized_data_file = os.path.join(
    base_dir,
    dataset_name,
    "data",
    "input",
    "recount2_gene_normalized_data.tsv.xz")

experiment_ids_file = os.path.join(
    base_dir,
    dataset_name,
    "data",
    "metadata",
    "recount2_experiment_ids.txt")

In [4]:
# Output files
similarity_corrected_file = os.path.join(
    base_dir,
    "results",
    "saved_variables",
    "Human_experiment_lvl_sim_similarity_corrected.pickle")

ci_corrected_file = os.path.join(
    base_dir,
    "results",
    "saved_variables",
    "Human_experiment_lvl_sim_ci_corrected.pickle")

In [5]:
# Run multiple simulations - corrected
results = Parallel(n_jobs=num_cores, verbose=100)(
    delayed(
        pipelines.experiment_level_simulation_corrected)(i,
                                                         NN_architecture,
                                                         dataset_name,
                                                         analysis_name,
                                                         num_simulated_experiments,
                                                         lst_num_partitions,
                                                         corrected,
                                                         use_pca,
                                                         num_PCs,
                                                         file_prefix,
                                                         normalized_data_file,
                                                         experiment_ids_file,
                                                         local_dir) for i in iterations)

[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done   1 tasks      | elapsed: 94.1min
[Parallel(n_jobs=5)]: Done   2 out of   5 | elapsed: 118.2min remaining: 177.3min
[Parallel(n_jobs=5)]: Done   3 out of   5 | elapsed: 130.2min remaining: 86.8min
[Parallel(n_jobs=5)]: Done   5 out of   5 | elapsed: 155.8min remaining:    0.0s
[Parallel(n_jobs=5)]: Done   5 out of   5 | elapsed: 155.8min finished


In [6]:
# Concatenate output dataframes
all_svcca_scores = pd.DataFrame()

for i in iterations:
    all_svcca_scores = pd.concat([all_svcca_scores, results[i][1]], axis=1)

all_svcca_scores

Unnamed: 0_level_0,score,score,score,score,score
number of partitions,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0.999974,0.999979,0.999982,0.999981,0.999971
2,0.984059,0.983115,0.986332,0.969235,0.966313
3,0.940388,0.978723,0.959813,0.93587,0.934461
5,0.916243,0.93463,0.92507,0.939123,0.878514
10,0.840947,0.794982,0.840782,0.80418,0.844614
20,0.710517,0.709098,0.70634,0.664788,0.643032
30,0.578963,0.563119,0.641844,0.597141,0.534693
50,0.318836,0.304694,0.333847,0.321613,0.294273


In [7]:
# Get median for each row (number of experiments)
mean_scores = all_svcca_scores.mean(axis=1).to_frame()
mean_scores.columns = ['score']
mean_scores

Unnamed: 0_level_0,score
number of partitions,Unnamed: 1_level_1
1,0.999977
2,0.977811
3,0.949851
5,0.918716
10,0.825101
20,0.686755
30,0.583152
50,0.314653


In [8]:
# Get standard dev for each row (number of experiments)
import math
std_scores = (all_svcca_scores.std(axis=1)/math.sqrt(10)).to_frame()
std_scores.columns = ['score']
std_scores

Unnamed: 0_level_0,score
number of partitions,Unnamed: 1_level_1
1,2e-06
2,0.002939
3,0.006031
5,0.007636
10,0.007454
20,0.0098
30,0.012657
50,0.004871


In [9]:
# Get confidence interval for each row (number of experiments)
err = std_scores*1.96

In [10]:
# Get boundaries of confidence interval
ymax = mean_scores + err
ymin = mean_scores - err

ci = pd.concat([ymin, ymax], axis=1)
ci.columns = ['ymin', 'ymax']
ci

Unnamed: 0_level_0,ymin,ymax
number of partitions,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.999974,0.99998
2,0.97205,0.983571
3,0.938031,0.961671
5,0.903749,0.933683
10,0.81049,0.839711
20,0.667547,0.705963
30,0.558345,0.607959
50,0.305106,0.324199


In [11]:
mean_scores

Unnamed: 0_level_0,score
number of partitions,Unnamed: 1_level_1
1,0.999977
2,0.977811
3,0.949851
5,0.918716
10,0.825101
20,0.686755
30,0.583152
50,0.314653


In [12]:
# Pickle dataframe of mean scores scores for first run, interval
mean_scores.to_pickle(similarity_corrected_file)
ci.to_pickle(ci_corrected_file)