# Simulation experiment 

Run entire simulation experiment multiple times to generate confidence interval

In [1]:
%load_ext autoreload
%autoreload 2

from joblib import Parallel, delayed
import multiprocessing
import sys
import os
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings(action='ignore')

sys.path.append("../../")
from functions import pipelines

from numpy.random import seed
randomState = 123
seed(randomState)

Using TensorFlow backend.


In [2]:
# Parameters
dataset_name = "Human_analysis"
analysis_name = 'analysis_2'
NN_architecture = 'NN_2500_30'
file_prefix = "Experiment"
num_simulated_samples = 500
lst_num_experiments = [1, 2, 5, 10, 20,
                     50, 100, 250, 500]
corrected = False
use_pca = True
num_PCs = 10

iterations = range(5) 
num_cores = 5

In [3]:
# Input file
base_dir = os.path.abspath(
      os.path.join(
          os.getcwd(), "../.."))

normalized_data_file = os.path.join(
    base_dir,
    dataset_name,    
    "data",
    "input",
    "recount2_gene_normalized_data.tsv.xz")

In [4]:
# Output files
similarity_uncorrected_file = os.path.join(
    base_dir,
    "results",
    "saved_variables",
    "analysis_2_similarity_uncorrected.pickle")

ci_uncorrected_file = os.path.join(
    base_dir,
    "results",
    "saved_variables",
    "analysis_2_ci_uncorrected.pickle")

similarity_permuted_file = os.path.join(
    base_dir,
    "results",
    "saved_variables",
    "analysis_2_permuted")

In [5]:
# Run multiple simulations - corrected
results = Parallel(n_jobs=num_cores, verbose=100)(
    delayed(
        pipelines.simple_simulation_experiment_uncorrected)(i,
                                                            NN_architecture,
                                                            dataset_name,
                                                            analysis_name,
                                                            num_simulated_samples,
                                                            lst_num_experiments,
                                                            corrected,
                                                            use_pca,
                                                            num_PCs,
                                                            file_prefix,
                                                            normalized_data_file) for i in iterations)

[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done   1 tasks      | elapsed: 45.9min
[Parallel(n_jobs=5)]: Done   2 out of   5 | elapsed: 46.1min remaining: 69.1min
[Parallel(n_jobs=5)]: Done   3 out of   5 | elapsed: 46.3min remaining: 30.8min
[Parallel(n_jobs=5)]: Done   5 out of   5 | elapsed: 46.3min remaining:    0.0s
[Parallel(n_jobs=5)]: Done   5 out of   5 | elapsed: 46.3min finished


In [6]:
# permuted score
permuted_score = results[0][0]

In [7]:
# Concatenate output dataframes
all_svcca_scores = pd.DataFrame()

for i in iterations:
    all_svcca_scores = pd.concat([all_svcca_scores, results[i][1]], axis=1)

all_svcca_scores

Unnamed: 0_level_0,score,score,score,score,score
number of experiments,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0.999993,0.999992,0.999993,0.999993,0.999991
2,0.902023,0.901112,0.904359,0.910918,0.905008
5,0.628811,0.635681,0.628711,0.645128,0.628005
10,0.2077,0.222427,0.199267,0.191023,0.216045
20,0.12556,0.13678,0.143462,0.111999,0.142949
50,0.157211,0.172216,0.150251,0.168941,0.189001
100,0.253974,0.258027,0.26283,0.255281,0.276705
250,0.576567,0.551856,0.538013,0.553895,0.575574
500,0.907872,0.865173,0.849864,0.891416,0.874866


In [8]:
# Get median for each row (number of experiments)
mean_scores = all_svcca_scores.mean(axis=1).to_frame()
mean_scores.columns = ['score']
mean_scores

Unnamed: 0_level_0,score
number of experiments,Unnamed: 1_level_1
1,0.999993
2,0.904684
5,0.633267
10,0.207292
20,0.13215
50,0.167524
100,0.261364
250,0.559181
500,0.877838


In [9]:
# Get standard dev for each row (number of experiments)
import math
std_scores = (all_svcca_scores.std(axis=1)/math.sqrt(10)).to_frame()
std_scores.columns = ['score']
std_scores

Unnamed: 0_level_0,score
number of experiments,Unnamed: 1_level_1
1,2.837118e-07
2,0.001213575
5,0.002317344
10,0.003984076
20,0.00422942
50,0.004716974
100,0.002916544
250,0.005245624
500,0.007138276


In [10]:
# Get confidence interval for each row (number of experiments)
err = std_scores*1.96

In [11]:
# Get boundaries of confidence interval
ymax = mean_scores + err
ymin = mean_scores - err

ci = pd.concat([ymin, ymax], axis=1)
ci.columns = ['ymin', 'ymax']
ci

Unnamed: 0_level_0,ymin,ymax
number of experiments,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.999992,0.999993
2,0.902305,0.907062
5,0.628725,0.637809
10,0.199484,0.215101
20,0.12386,0.14044
50,0.158279,0.176769
100,0.255647,0.26708
250,0.5489,0.569463
500,0.863847,0.891829


In [12]:
mean_scores

Unnamed: 0_level_0,score
number of experiments,Unnamed: 1_level_1
1,0.999993
2,0.904684
5,0.633267
10,0.207292
20,0.13215
50,0.167524
100,0.261364
250,0.559181
500,0.877838


In [14]:
# Pickle dataframe of mean scores scores for first run, interval
mean_scores.to_pickle(similarity_uncorrected_file)
ci.to_pickle(ci_uncorrected_file)
np.save(similarity_permuted_file, permuted_score)