# Add batch effects

Say we are interested in identifying genes that differentiate between disease and normal states.  However, our dataset includes samples from different tissues or time points, so some of the variation in gene expression is due to these other conditions rather than to disease state.  These non-relevant variations in the data are called *batch effects*.  

We want to model these batch effects.  To do this we will:
1. Partition our simulated data into n batches
2. For each partition we will randomly shift the expression data.  We randomly generate a binary vector of length=number of genes (*offset vector*).  This vector serves as the direction in which we shift.  We also draw a random scalar that tells us how big a step to take in that direction (*stretch factor*).  We shift our partitioned data by: batch effect partition = partitioned data + stretch factor * offset vector
3. Repeat this for each partition
4. Append all batch effect partitions together


In [1]:
%load_ext autoreload
%autoreload 2

import os
import pandas as pd
import numpy as np
import random
import glob
import umap
import pickle
import seaborn as sns
import warnings
warnings.filterwarnings(action='once')

from ggplot import *
from sklearn.decomposition import PCA
from numpy.random import seed
# Fixed seed for the notebook. Seeding NumPy's global RNG here makes the
# np.random.* calls made further down repeatable from run to run (given the
# same cell execution order).
randomState = 123
seed(randomState)

You can access Timestamp as pandas.Timestamp
  pd.tslib.Timestamp,
  from pandas.lib import Timestamp
  return f(*args, **kwds)


In [2]:
# Parameters
# Name of the experiment; used as a sub-directory name for inputs and outputs.
analysis_name = 'experiment_0'

# Name of the model sub-directory that holds the saved UMAP model.
NN_architecture = 'NN_2500_30'

num_PCs = 100
num_simulations = 10

# Numbers of batches to simulate; one output file is written per entry.
num_batches = [
    1, 2, 5, 10, 20, 50,
    100, 500, 1000, 2000, 3000, 6000,
]
# Denser alternative sweep, kept for reference:
#num_batches = [1,2,3,4,5,6,7,8,9,10,15,20,50,100,500,800,1000,2000,3000,4000,5000,6000]

In [3]:
# Create directories
# The base directory is resolved two levels above the current working directory.
base_dir = os.path.abspath(os.path.join(os.getcwd(),"../.."))

# <base_dir>/data/batch_simulated/<analysis_name> receives the batch files.
new_dir = os.path.join(base_dir, "data", "batch_simulated")
analysis_dir = os.path.join(new_dir, analysis_name)

# Report whether the directory is new, then make sure it exists either way.
status_template = (
    'directory already exists: {}'
    if os.path.exists(analysis_dir)
    else 'creating new directory: {}'
)
print(status_template.format(analysis_dir))
os.makedirs(analysis_dir, exist_ok=True)

directory already exists: /home/alexandra/Documents/Repos/Batch_effects_simulation/data/batch_simulated/experiment_0


In [4]:
# Load arguments
# Simulated expression data produced for this analysis (tab-separated text).
simulated_data_file = os.path.join(
    base_dir,
    "data",
    "simulated",
    analysis_name,
    "simulated_data.txt")

# Pickled UMAP model stored under the chosen NN architecture.
# (The original line assigned the name twice in a chained assignment; a single
# assignment is equivalent.)
umap_model_file = os.path.join(
    base_dir,
    "models",
    NN_architecture,
    "umap_model.pkl")

In [5]:
# Read in UMAP model
# The context manager closes the file even if unpickling raises.
# NOTE(review): pickle.load executes arbitrary code from the file; only load
# model files that come from a trusted source.
with open(umap_model_file, 'rb') as infile:
    umap_model = pickle.load(infile)

In [6]:
# Read in data
# Tab-separated table: the first row supplies the column labels and the first
# column supplies the row index.
simulated_data = pd.read_table(simulated_data_file, sep='\t', index_col=0, header=0)

# Preview the first ten rows (shown as the cell's output in a notebook).
simulated_data.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5539,5540,5541,5542,5543,5544,5545,5546,5547,5548
0,0.690384,0.642501,0.454786,0.65065,0.374857,0.41402,0.353704,0.566726,0.447497,0.165201,...,0.375559,0.600063,0.562897,0.640035,0.661894,0.325466,0.576836,0.56733,0.708088,0.615353
1,0.69106,0.655274,0.527149,0.680636,0.371842,0.443242,0.37422,0.533293,0.502785,0.165815,...,0.290522,0.591411,0.609883,0.585498,0.596831,0.162511,0.459705,0.531669,0.710235,0.698571
2,0.826005,0.694632,0.510346,0.642764,0.611429,0.517733,0.344205,0.631357,0.663116,0.199852,...,0.589395,0.581802,0.59635,0.671666,0.753091,0.16051,0.52174,0.511157,0.747738,0.728167
3,0.600721,0.564944,0.417176,0.594936,0.382868,0.440063,0.387032,0.466111,0.402363,0.223858,...,0.34552,0.547836,0.470917,0.460431,0.564647,0.256127,0.509963,0.34822,0.579351,0.583487
4,0.621544,0.615939,0.473489,0.599652,0.401605,0.481008,0.364476,0.444714,0.447605,0.215759,...,0.450535,0.532127,0.588728,0.547413,0.578212,0.234255,0.424777,0.501985,0.704726,0.717408
5,0.735929,0.732768,0.54555,0.632979,0.460602,0.437013,0.325476,0.572898,0.682289,0.203184,...,0.414369,0.576,0.692773,0.643043,0.746655,0.215016,0.509911,0.638528,0.775373,0.74601
6,0.643262,0.647871,0.449706,0.552101,0.407846,0.421758,0.371682,0.551611,0.530684,0.185147,...,0.3928,0.503936,0.558234,0.505509,0.600357,0.213806,0.478426,0.4432,0.64138,0.686138
7,0.5172,0.567826,0.363922,0.48392,0.346046,0.44053,0.442479,0.474701,0.372429,0.156235,...,0.368144,0.488229,0.461496,0.446372,0.48005,0.364032,0.477898,0.388174,0.528046,0.492165
8,0.746649,0.745975,0.622174,0.748726,0.455695,0.499827,0.26072,0.625723,0.640932,0.206239,...,0.421102,0.636195,0.681262,0.676396,0.637278,0.241293,0.594934,0.663525,0.725401,0.708356
9,0.58071,0.58618,0.419463,0.592984,0.368153,0.40553,0.403313,0.445573,0.449738,0.212873,...,0.41954,0.534721,0.509361,0.491707,0.524512,0.257988,0.408664,0.42976,0.594038,0.572804


In [7]:
# Add batch effects
# TODO: add multiple simulation runs
#
# For every entry i in num_batches, the simulated samples are split at random
# into i equally sized batches. Each batch is shifted by
# stretch_factor * offset direction, clipped to 1.0, and the batches are
# written together to Batch_<i>.txt.
num_simulated_samples = simulated_data.shape[0]
num_genes = simulated_data.shape[1]

# Binary mask over genes: 1 marks a gene that gets shifted. Each gene is
# selected with probability 1/4. A dedicated RandomState is used so the
# initial mask depends only on randomState, not on the global RNG state.
subset_genes_to_change = np.random.RandomState(randomState).choice(
    [0, 1], size=(num_genes), p=[3./4, 1./4])

for i in num_batches:
    print('Creating simulated data with {} batches..'.format(i))

    batch_file = os.path.join(
        base_dir,
        "data",
        "batch_simulated",
        analysis_name,
        "Batch_"+str(i)+".txt")

    # Truncating division: when i does not divide the number of samples
    # evenly, the leftover samples are not assigned to any batch.
    num_samples_per_batch = int(num_simulated_samples/i)

    if i == 1:
        # A single batch has no batch effect: save the data unchanged.
        simulated_data.to_csv(batch_file, sep='\t')

    else:
        # Collect the shifted batches and concatenate once at the end.
        # (DataFrame.append was removed in pandas 2.0 and copied the whole
        # accumulated frame on every iteration.)
        batch_frames = []

        simulated_data_draw = simulated_data
        for j in range(i):
            # Step size for this batch
            stretch_factor = np.random.uniform(1.0, 1.5)

            # Randomly select samples (without replacement) from those not
            # yet assigned to a batch
            batch_df = simulated_data_draw.sample(
                n=num_samples_per_batch, frac=None, replace=False)
            batch_df.columns = batch_df.columns.astype(str)

            # Remove the selected samples from the pool
            simulated_data_draw = simulated_data_draw.drop(batch_df.index)

            # Add batch effect. A 1-D array with one entry per gene is
            # broadcast across the rows of batch_df by position, so no tiled
            # copy of the mask is needed and the result does not depend on
            # how the columns are labelled.
            offset_vector = subset_genes_to_change * stretch_factor
            batch_df = batch_df + offset_vector

            # If any exceed 1 then set to 1 since gene expression is normalized
            batch_df = batch_df.clip(upper=1.0)

            batch_frames.append(batch_df)

            # Select a new direction (i.e. a new subset of genes to change)
            # for the next batch; shuffles the mask in place.
            np.random.shuffle(subset_genes_to_change)

        # Save
        batch_data_df = pd.concat(batch_frames)
        batch_data_df.to_csv(batch_file, sep='\t')

Creating simulated data with 1 batches..
Creating simulated data with 2 batches..
partition 0
size of simulated data is (6000, 5549)
partition 1
size of simulated data is (3000, 5549)
Creating simulated data with 5 batches..
partition 0
size of simulated data is (6000, 5549)
partition 1
size of simulated data is (4800, 5549)
partition 2
size of simulated data is (3600, 5549)
partition 3
size of simulated data is (2400, 5549)
partition 4
size of simulated data is (1200, 5549)
Creating simulated data with 10 batches..
partition 0
size of simulated data is (6000, 5549)
partition 1
size of simulated data is (5400, 5549)
partition 2
size of simulated data is (4800, 5549)
partition 3
size of simulated data is (4200, 5549)
partition 4
size of simulated data is (3600, 5549)
partition 5
size of simulated data is (3000, 5549)
partition 6
size of simulated data is (2400, 5549)
partition 7
size of simulated data is (1800, 5549)
partition 8
size of simulated data is (1200, 5549)
partition 9
size of

partition 67
size of simulated data is (1980, 5549)
partition 68
size of simulated data is (1920, 5549)
partition 69
size of simulated data is (1860, 5549)
partition 70
size of simulated data is (1800, 5549)
partition 71
size of simulated data is (1740, 5549)
partition 72
size of simulated data is (1680, 5549)
partition 73
size of simulated data is (1620, 5549)
partition 74
size of simulated data is (1560, 5549)
partition 75
size of simulated data is (1500, 5549)
partition 76
size of simulated data is (1440, 5549)
partition 77
size of simulated data is (1380, 5549)
partition 78
size of simulated data is (1320, 5549)
partition 79
size of simulated data is (1260, 5549)
partition 80
size of simulated data is (1200, 5549)
partition 81
size of simulated data is (1140, 5549)
partition 82
size of simulated data is (1080, 5549)
partition 83
size of simulated data is (1020, 5549)
partition 84
size of simulated data is (960, 5549)
partition 85
size of simulated data is (900, 5549)
partition 86
s

partition 126
size of simulated data is (4488, 5549)
partition 127
size of simulated data is (4476, 5549)
partition 128
size of simulated data is (4464, 5549)
partition 129
size of simulated data is (4452, 5549)
partition 130
size of simulated data is (4440, 5549)
partition 131
size of simulated data is (4428, 5549)
partition 132
size of simulated data is (4416, 5549)
partition 133
size of simulated data is (4404, 5549)
partition 134
size of simulated data is (4392, 5549)
partition 135
size of simulated data is (4380, 5549)
partition 136
size of simulated data is (4368, 5549)
partition 137
size of simulated data is (4356, 5549)
partition 138
size of simulated data is (4344, 5549)
partition 139
size of simulated data is (4332, 5549)
partition 140
size of simulated data is (4320, 5549)
partition 141
size of simulated data is (4308, 5549)
partition 142
size of simulated data is (4296, 5549)
partition 143
size of simulated data is (4284, 5549)
partition 144
size of simulated data is (4272,

partition 282
size of simulated data is (2616, 5549)
partition 283
size of simulated data is (2604, 5549)
partition 284
size of simulated data is (2592, 5549)
partition 285
size of simulated data is (2580, 5549)
partition 286
size of simulated data is (2568, 5549)
partition 287
size of simulated data is (2556, 5549)
partition 288
size of simulated data is (2544, 5549)
partition 289
size of simulated data is (2532, 5549)
partition 290
size of simulated data is (2520, 5549)
partition 291
size of simulated data is (2508, 5549)
partition 292
size of simulated data is (2496, 5549)
partition 293
size of simulated data is (2484, 5549)
partition 294
size of simulated data is (2472, 5549)
partition 295
size of simulated data is (2460, 5549)
partition 296
size of simulated data is (2448, 5549)
partition 297
size of simulated data is (2436, 5549)
partition 298
size of simulated data is (2424, 5549)
partition 299
size of simulated data is (2412, 5549)
partition 300
size of simulated data is (2400,

partition 437
size of simulated data is (756, 5549)
partition 438
size of simulated data is (744, 5549)
partition 439
size of simulated data is (732, 5549)
partition 440
size of simulated data is (720, 5549)
partition 441
size of simulated data is (708, 5549)
partition 442
size of simulated data is (696, 5549)
partition 443
size of simulated data is (684, 5549)
partition 444
size of simulated data is (672, 5549)
partition 445
size of simulated data is (660, 5549)
partition 446
size of simulated data is (648, 5549)
partition 447
size of simulated data is (636, 5549)
partition 448
size of simulated data is (624, 5549)
partition 449
size of simulated data is (612, 5549)
partition 450
size of simulated data is (600, 5549)
partition 451
size of simulated data is (588, 5549)
partition 452
size of simulated data is (576, 5549)
partition 453
size of simulated data is (564, 5549)
partition 454
size of simulated data is (552, 5549)
partition 455
size of simulated data is (540, 5549)
partition 45

partition 96
size of simulated data is (5424, 5549)
partition 97
size of simulated data is (5418, 5549)
partition 98
size of simulated data is (5412, 5549)
partition 99
size of simulated data is (5406, 5549)
partition 100
size of simulated data is (5400, 5549)
partition 101
size of simulated data is (5394, 5549)
partition 102
size of simulated data is (5388, 5549)
partition 103
size of simulated data is (5382, 5549)
partition 104
size of simulated data is (5376, 5549)
partition 105
size of simulated data is (5370, 5549)
partition 106
size of simulated data is (5364, 5549)
partition 107
size of simulated data is (5358, 5549)
partition 108
size of simulated data is (5352, 5549)
partition 109
size of simulated data is (5346, 5549)
partition 110
size of simulated data is (5340, 5549)
partition 111
size of simulated data is (5334, 5549)
partition 112
size of simulated data is (5328, 5549)
partition 113
size of simulated data is (5322, 5549)
partition 114
size of simulated data is (5316, 554

partition 251
size of simulated data is (4494, 5549)
partition 252
size of simulated data is (4488, 5549)
partition 253
size of simulated data is (4482, 5549)
partition 254
size of simulated data is (4476, 5549)
partition 255
size of simulated data is (4470, 5549)
partition 256
size of simulated data is (4464, 5549)
partition 257
size of simulated data is (4458, 5549)
partition 258
size of simulated data is (4452, 5549)
partition 259
size of simulated data is (4446, 5549)
partition 260
size of simulated data is (4440, 5549)
partition 261
size of simulated data is (4434, 5549)
partition 262
size of simulated data is (4428, 5549)
partition 263
size of simulated data is (4422, 5549)
partition 264
size of simulated data is (4416, 5549)
partition 265
size of simulated data is (4410, 5549)
partition 266
size of simulated data is (4404, 5549)
partition 267
size of simulated data is (4398, 5549)
partition 268
size of simulated data is (4392, 5549)
partition 269
size of simulated data is (4386,

partition 407
size of simulated data is (3558, 5549)
partition 408
size of simulated data is (3552, 5549)
partition 409
size of simulated data is (3546, 5549)
partition 410
size of simulated data is (3540, 5549)
partition 411
size of simulated data is (3534, 5549)
partition 412
size of simulated data is (3528, 5549)
partition 413
size of simulated data is (3522, 5549)
partition 414
size of simulated data is (3516, 5549)
partition 415
size of simulated data is (3510, 5549)
partition 416
size of simulated data is (3504, 5549)
partition 417
size of simulated data is (3498, 5549)
partition 418
size of simulated data is (3492, 5549)
partition 419
size of simulated data is (3486, 5549)
partition 420
size of simulated data is (3480, 5549)
partition 421
size of simulated data is (3474, 5549)
partition 422
size of simulated data is (3468, 5549)
partition 423
size of simulated data is (3462, 5549)
partition 424
size of simulated data is (3456, 5549)
partition 425
size of simulated data is (3450,

partition 563
size of simulated data is (2622, 5549)
partition 564
size of simulated data is (2616, 5549)
partition 565
size of simulated data is (2610, 5549)
partition 566
size of simulated data is (2604, 5549)
partition 567
size of simulated data is (2598, 5549)
partition 568
size of simulated data is (2592, 5549)
partition 569
size of simulated data is (2586, 5549)
partition 570
size of simulated data is (2580, 5549)
partition 571
size of simulated data is (2574, 5549)
partition 572
size of simulated data is (2568, 5549)
partition 573
size of simulated data is (2562, 5549)
partition 574
size of simulated data is (2556, 5549)
partition 575
size of simulated data is (2550, 5549)
partition 576
size of simulated data is (2544, 5549)
partition 577
size of simulated data is (2538, 5549)
partition 578
size of simulated data is (2532, 5549)
partition 579
size of simulated data is (2526, 5549)
partition 580
size of simulated data is (2520, 5549)
partition 581
size of simulated data is (2514,

partition 719
size of simulated data is (1686, 5549)
partition 720
size of simulated data is (1680, 5549)
partition 721
size of simulated data is (1674, 5549)
partition 722
size of simulated data is (1668, 5549)
partition 723
size of simulated data is (1662, 5549)
partition 724
size of simulated data is (1656, 5549)
partition 725
size of simulated data is (1650, 5549)
partition 726
size of simulated data is (1644, 5549)
partition 727
size of simulated data is (1638, 5549)
partition 728
size of simulated data is (1632, 5549)
partition 729
size of simulated data is (1626, 5549)
partition 730
size of simulated data is (1620, 5549)
partition 731
size of simulated data is (1614, 5549)
partition 732
size of simulated data is (1608, 5549)
partition 733
size of simulated data is (1602, 5549)
partition 734
size of simulated data is (1596, 5549)
partition 735
size of simulated data is (1590, 5549)
partition 736
size of simulated data is (1584, 5549)
partition 737
size of simulated data is (1578,

partition 876
size of simulated data is (744, 5549)
partition 877
size of simulated data is (738, 5549)
partition 878
size of simulated data is (732, 5549)
partition 879
size of simulated data is (726, 5549)
partition 880
size of simulated data is (720, 5549)
partition 881
size of simulated data is (714, 5549)
partition 882
size of simulated data is (708, 5549)
partition 883
size of simulated data is (702, 5549)
partition 884
size of simulated data is (696, 5549)
partition 885
size of simulated data is (690, 5549)
partition 886
size of simulated data is (684, 5549)
partition 887
size of simulated data is (678, 5549)
partition 888
size of simulated data is (672, 5549)
partition 889
size of simulated data is (666, 5549)
partition 890
size of simulated data is (660, 5549)
partition 891
size of simulated data is (654, 5549)
partition 892
size of simulated data is (648, 5549)
partition 893
size of simulated data is (642, 5549)
partition 894
size of simulated data is (636, 5549)
partition 89

partition 36
size of simulated data is (5892, 5549)
partition 37
size of simulated data is (5889, 5549)
partition 38
size of simulated data is (5886, 5549)
partition 39
size of simulated data is (5883, 5549)
partition 40
size of simulated data is (5880, 5549)
partition 41
size of simulated data is (5877, 5549)
partition 42
size of simulated data is (5874, 5549)
partition 43
size of simulated data is (5871, 5549)
partition 44
size of simulated data is (5868, 5549)
partition 45
size of simulated data is (5865, 5549)
partition 46
size of simulated data is (5862, 5549)
partition 47
size of simulated data is (5859, 5549)
partition 48
size of simulated data is (5856, 5549)
partition 49
size of simulated data is (5853, 5549)
partition 50
size of simulated data is (5850, 5549)
partition 51
size of simulated data is (5847, 5549)
partition 52
size of simulated data is (5844, 5549)
partition 53
size of simulated data is (5841, 5549)
partition 54
size of simulated data is (5838, 5549)
partition 55

partition 193
size of simulated data is (5421, 5549)
partition 194
size of simulated data is (5418, 5549)
partition 195
size of simulated data is (5415, 5549)
partition 196
size of simulated data is (5412, 5549)
partition 197
size of simulated data is (5409, 5549)
partition 198
size of simulated data is (5406, 5549)
partition 199
size of simulated data is (5403, 5549)
partition 200
size of simulated data is (5400, 5549)
partition 201
size of simulated data is (5397, 5549)
partition 202
size of simulated data is (5394, 5549)
partition 203
size of simulated data is (5391, 5549)
partition 204
size of simulated data is (5388, 5549)
partition 205
size of simulated data is (5385, 5549)
partition 206
size of simulated data is (5382, 5549)
partition 207
size of simulated data is (5379, 5549)
partition 208
size of simulated data is (5376, 5549)
partition 209
size of simulated data is (5373, 5549)
partition 210
size of simulated data is (5370, 5549)
partition 211
size of simulated data is (5367,

partition 350
size of simulated data is (4950, 5549)
partition 351
size of simulated data is (4947, 5549)
partition 352
size of simulated data is (4944, 5549)
partition 353
size of simulated data is (4941, 5549)
partition 354
size of simulated data is (4938, 5549)
partition 355
size of simulated data is (4935, 5549)
partition 356
size of simulated data is (4932, 5549)
partition 357
size of simulated data is (4929, 5549)
partition 358
size of simulated data is (4926, 5549)
partition 359
size of simulated data is (4923, 5549)
partition 360
size of simulated data is (4920, 5549)
partition 361
size of simulated data is (4917, 5549)
partition 362
size of simulated data is (4914, 5549)
partition 363
size of simulated data is (4911, 5549)
partition 364
size of simulated data is (4908, 5549)
partition 365
size of simulated data is (4905, 5549)
partition 366
size of simulated data is (4902, 5549)
partition 367
size of simulated data is (4899, 5549)
partition 368
size of simulated data is (4896,

partition 505
size of simulated data is (4485, 5549)
partition 506
size of simulated data is (4482, 5549)
partition 507
size of simulated data is (4479, 5549)
partition 508
size of simulated data is (4476, 5549)
partition 509
size of simulated data is (4473, 5549)
partition 510
size of simulated data is (4470, 5549)
partition 511
size of simulated data is (4467, 5549)
partition 512
size of simulated data is (4464, 5549)
partition 513
size of simulated data is (4461, 5549)
partition 514
size of simulated data is (4458, 5549)
partition 515
size of simulated data is (4455, 5549)
partition 516
size of simulated data is (4452, 5549)
partition 517
size of simulated data is (4449, 5549)
partition 518
size of simulated data is (4446, 5549)
partition 519
size of simulated data is (4443, 5549)
partition 520
size of simulated data is (4440, 5549)
partition 521
size of simulated data is (4437, 5549)
partition 522
size of simulated data is (4434, 5549)
partition 523
size of simulated data is (4431,

partition 660
size of simulated data is (4020, 5549)
partition 661
size of simulated data is (4017, 5549)
partition 662
size of simulated data is (4014, 5549)
partition 663
size of simulated data is (4011, 5549)
partition 664
size of simulated data is (4008, 5549)
partition 665
size of simulated data is (4005, 5549)
partition 666
size of simulated data is (4002, 5549)
partition 667
size of simulated data is (3999, 5549)
partition 668
size of simulated data is (3996, 5549)
partition 669
size of simulated data is (3993, 5549)
partition 670
size of simulated data is (3990, 5549)
partition 671
size of simulated data is (3987, 5549)
partition 672
size of simulated data is (3984, 5549)
partition 673
size of simulated data is (3981, 5549)
partition 674
size of simulated data is (3978, 5549)
partition 675
size of simulated data is (3975, 5549)
partition 676
size of simulated data is (3972, 5549)
partition 677
size of simulated data is (3969, 5549)
partition 678
size of simulated data is (3966,

partition 816
size of simulated data is (3552, 5549)
partition 817
size of simulated data is (3549, 5549)
partition 818
size of simulated data is (3546, 5549)
partition 819
size of simulated data is (3543, 5549)
partition 820
size of simulated data is (3540, 5549)
partition 821
size of simulated data is (3537, 5549)
partition 822
size of simulated data is (3534, 5549)
partition 823
size of simulated data is (3531, 5549)
partition 824
size of simulated data is (3528, 5549)
partition 825
size of simulated data is (3525, 5549)
partition 826
size of simulated data is (3522, 5549)
partition 827
size of simulated data is (3519, 5549)
partition 828
size of simulated data is (3516, 5549)
partition 829
size of simulated data is (3513, 5549)
partition 830
size of simulated data is (3510, 5549)
partition 831
size of simulated data is (3507, 5549)
partition 832
size of simulated data is (3504, 5549)
partition 833
size of simulated data is (3501, 5549)
partition 834
size of simulated data is (3498,

partition 971
size of simulated data is (3087, 5549)
partition 972
size of simulated data is (3084, 5549)
partition 973
size of simulated data is (3081, 5549)
partition 974
size of simulated data is (3078, 5549)
partition 975
size of simulated data is (3075, 5549)
partition 976
size of simulated data is (3072, 5549)
partition 977
size of simulated data is (3069, 5549)
partition 978
size of simulated data is (3066, 5549)
partition 979
size of simulated data is (3063, 5549)
partition 980
size of simulated data is (3060, 5549)
partition 981
size of simulated data is (3057, 5549)
partition 982
size of simulated data is (3054, 5549)
partition 983
size of simulated data is (3051, 5549)
partition 984
size of simulated data is (3048, 5549)
partition 985
size of simulated data is (3045, 5549)
partition 986
size of simulated data is (3042, 5549)
partition 987
size of simulated data is (3039, 5549)
partition 988
size of simulated data is (3036, 5549)
partition 989
size of simulated data is (3033,

partition 1124
size of simulated data is (2628, 5549)
partition 1125
size of simulated data is (2625, 5549)
partition 1126
size of simulated data is (2622, 5549)
partition 1127
size of simulated data is (2619, 5549)
partition 1128
size of simulated data is (2616, 5549)
partition 1129
size of simulated data is (2613, 5549)
partition 1130
size of simulated data is (2610, 5549)
partition 1131
size of simulated data is (2607, 5549)
partition 1132
size of simulated data is (2604, 5549)
partition 1133
size of simulated data is (2601, 5549)
partition 1134
size of simulated data is (2598, 5549)
partition 1135
size of simulated data is (2595, 5549)
partition 1136
size of simulated data is (2592, 5549)
partition 1137
size of simulated data is (2589, 5549)
partition 1138
size of simulated data is (2586, 5549)
partition 1139
size of simulated data is (2583, 5549)
partition 1140
size of simulated data is (2580, 5549)
partition 1141
size of simulated data is (2577, 5549)
partition 1142
size of simul

partition 1277
size of simulated data is (2169, 5549)
partition 1278
size of simulated data is (2166, 5549)
partition 1279
size of simulated data is (2163, 5549)
partition 1280
size of simulated data is (2160, 5549)
partition 1281
size of simulated data is (2157, 5549)
partition 1282
size of simulated data is (2154, 5549)
partition 1283
size of simulated data is (2151, 5549)
partition 1284
size of simulated data is (2148, 5549)
partition 1285
size of simulated data is (2145, 5549)
partition 1286
size of simulated data is (2142, 5549)
partition 1287
size of simulated data is (2139, 5549)
partition 1288
size of simulated data is (2136, 5549)
partition 1289
size of simulated data is (2133, 5549)
partition 1290
size of simulated data is (2130, 5549)
partition 1291
size of simulated data is (2127, 5549)
partition 1292
size of simulated data is (2124, 5549)
partition 1293
size of simulated data is (2121, 5549)
partition 1294
size of simulated data is (2118, 5549)
partition 1295
size of simul

partition 1430
size of simulated data is (1710, 5549)
partition 1431
size of simulated data is (1707, 5549)
partition 1432
size of simulated data is (1704, 5549)
partition 1433
size of simulated data is (1701, 5549)
partition 1434
size of simulated data is (1698, 5549)
partition 1435
size of simulated data is (1695, 5549)
partition 1436
size of simulated data is (1692, 5549)
partition 1437
size of simulated data is (1689, 5549)
partition 1438
size of simulated data is (1686, 5549)
partition 1439
size of simulated data is (1683, 5549)
partition 1440
size of simulated data is (1680, 5549)
partition 1441
size of simulated data is (1677, 5549)
partition 1442
size of simulated data is (1674, 5549)
partition 1443
size of simulated data is (1671, 5549)
partition 1444
size of simulated data is (1668, 5549)
partition 1445
size of simulated data is (1665, 5549)
partition 1446
size of simulated data is (1662, 5549)
partition 1447
size of simulated data is (1659, 5549)
partition 1448
size of simul

partition 1583
size of simulated data is (1251, 5549)
partition 1584
size of simulated data is (1248, 5549)
partition 1585
size of simulated data is (1245, 5549)
partition 1586
size of simulated data is (1242, 5549)
partition 1587
size of simulated data is (1239, 5549)
partition 1588
size of simulated data is (1236, 5549)
partition 1589
size of simulated data is (1233, 5549)
partition 1590
size of simulated data is (1230, 5549)
partition 1591
size of simulated data is (1227, 5549)
partition 1592
size of simulated data is (1224, 5549)
partition 1593
size of simulated data is (1221, 5549)
partition 1594
size of simulated data is (1218, 5549)
partition 1595
size of simulated data is (1215, 5549)
partition 1596
size of simulated data is (1212, 5549)
partition 1597
size of simulated data is (1209, 5549)
partition 1598
size of simulated data is (1206, 5549)
partition 1599
size of simulated data is (1203, 5549)
partition 1600
size of simulated data is (1200, 5549)
partition 1601
size of simul

partition 1737
size of simulated data is (789, 5549)
partition 1738
size of simulated data is (786, 5549)
partition 1739
size of simulated data is (783, 5549)
partition 1740
size of simulated data is (780, 5549)
partition 1741
size of simulated data is (777, 5549)
partition 1742
size of simulated data is (774, 5549)
partition 1743
size of simulated data is (771, 5549)
partition 1744
size of simulated data is (768, 5549)
partition 1745
size of simulated data is (765, 5549)
partition 1746
size of simulated data is (762, 5549)
partition 1747
size of simulated data is (759, 5549)
partition 1748
size of simulated data is (756, 5549)
partition 1749
size of simulated data is (753, 5549)
partition 1750
size of simulated data is (750, 5549)
partition 1751
size of simulated data is (747, 5549)
partition 1752
size of simulated data is (744, 5549)
partition 1753
size of simulated data is (741, 5549)
partition 1754
size of simulated data is (738, 5549)
partition 1755
size of simulated data is (735,

partition 1892
size of simulated data is (324, 5549)
partition 1893
size of simulated data is (321, 5549)
partition 1894
size of simulated data is (318, 5549)
partition 1895
size of simulated data is (315, 5549)
partition 1896
size of simulated data is (312, 5549)
partition 1897
size of simulated data is (309, 5549)
partition 1898
size of simulated data is (306, 5549)
partition 1899
size of simulated data is (303, 5549)
partition 1900
size of simulated data is (300, 5549)
partition 1901
size of simulated data is (297, 5549)
partition 1902
size of simulated data is (294, 5549)
partition 1903
size of simulated data is (291, 5549)
partition 1904
size of simulated data is (288, 5549)
partition 1905
size of simulated data is (285, 5549)
partition 1906
size of simulated data is (282, 5549)
partition 1907
size of simulated data is (279, 5549)
partition 1908
size of simulated data is (276, 5549)
partition 1909
size of simulated data is (273, 5549)
partition 1910
size of simulated data is (270,

partition 48
size of simulated data is (5904, 5549)
partition 49
size of simulated data is (5902, 5549)
partition 50
size of simulated data is (5900, 5549)
partition 51
size of simulated data is (5898, 5549)
partition 52
size of simulated data is (5896, 5549)
partition 53
size of simulated data is (5894, 5549)
partition 54
size of simulated data is (5892, 5549)
partition 55
size of simulated data is (5890, 5549)
partition 56
size of simulated data is (5888, 5549)
partition 57
size of simulated data is (5886, 5549)
partition 58
size of simulated data is (5884, 5549)
partition 59
size of simulated data is (5882, 5549)
partition 60
size of simulated data is (5880, 5549)
partition 61
size of simulated data is (5878, 5549)
partition 62
size of simulated data is (5876, 5549)
partition 63
size of simulated data is (5874, 5549)
partition 64
size of simulated data is (5872, 5549)
partition 65
size of simulated data is (5870, 5549)
partition 66
size of simulated data is (5868, 5549)
partition 67

partition 205
size of simulated data is (5590, 5549)
partition 206
size of simulated data is (5588, 5549)
partition 207
size of simulated data is (5586, 5549)
partition 208
size of simulated data is (5584, 5549)
partition 209
size of simulated data is (5582, 5549)
partition 210
size of simulated data is (5580, 5549)
partition 211
size of simulated data is (5578, 5549)
partition 212
size of simulated data is (5576, 5549)
partition 213
size of simulated data is (5574, 5549)
partition 214
size of simulated data is (5572, 5549)
partition 215
size of simulated data is (5570, 5549)
partition 216
size of simulated data is (5568, 5549)
partition 217
size of simulated data is (5566, 5549)
partition 218
size of simulated data is (5564, 5549)
partition 219
size of simulated data is (5562, 5549)
partition 220
size of simulated data is (5560, 5549)
partition 221
size of simulated data is (5558, 5549)
partition 222
size of simulated data is (5556, 5549)
partition 223
size of simulated data is (5554,

partition 361
size of simulated data is (5278, 5549)
partition 362
size of simulated data is (5276, 5549)
partition 363
size of simulated data is (5274, 5549)
partition 364
size of simulated data is (5272, 5549)
partition 365
size of simulated data is (5270, 5549)
partition 366
size of simulated data is (5268, 5549)
partition 367
size of simulated data is (5266, 5549)
partition 368
size of simulated data is (5264, 5549)
partition 369
size of simulated data is (5262, 5549)
partition 370
size of simulated data is (5260, 5549)
partition 371
size of simulated data is (5258, 5549)
partition 372
size of simulated data is (5256, 5549)
partition 373
size of simulated data is (5254, 5549)
partition 374
size of simulated data is (5252, 5549)
partition 375
size of simulated data is (5250, 5549)
partition 376
size of simulated data is (5248, 5549)
partition 377
size of simulated data is (5246, 5549)
partition 378
size of simulated data is (5244, 5549)
partition 379
size of simulated data is (5242,

partition 516
size of simulated data is (4968, 5549)
partition 517
size of simulated data is (4966, 5549)
partition 518
size of simulated data is (4964, 5549)
partition 519
size of simulated data is (4962, 5549)
partition 520
size of simulated data is (4960, 5549)
partition 521
size of simulated data is (4958, 5549)
partition 522
size of simulated data is (4956, 5549)
partition 523
size of simulated data is (4954, 5549)
partition 524
size of simulated data is (4952, 5549)
partition 525
size of simulated data is (4950, 5549)
partition 526
size of simulated data is (4948, 5549)
partition 527
size of simulated data is (4946, 5549)
partition 528
size of simulated data is (4944, 5549)
partition 529
size of simulated data is (4942, 5549)
partition 530
size of simulated data is (4940, 5549)
partition 531
size of simulated data is (4938, 5549)
partition 532
size of simulated data is (4936, 5549)
partition 533
size of simulated data is (4934, 5549)
partition 534
size of simulated data is (4932,

partition 673
size of simulated data is (4654, 5549)
partition 674
size of simulated data is (4652, 5549)
partition 675
size of simulated data is (4650, 5549)
partition 676
size of simulated data is (4648, 5549)
partition 677
size of simulated data is (4646, 5549)
partition 678
size of simulated data is (4644, 5549)
partition 679
size of simulated data is (4642, 5549)
partition 680
size of simulated data is (4640, 5549)
partition 681
size of simulated data is (4638, 5549)
partition 682
size of simulated data is (4636, 5549)
partition 683
size of simulated data is (4634, 5549)
partition 684
size of simulated data is (4632, 5549)
partition 685
size of simulated data is (4630, 5549)
partition 686
size of simulated data is (4628, 5549)
partition 687
size of simulated data is (4626, 5549)
partition 688
size of simulated data is (4624, 5549)
partition 689
size of simulated data is (4622, 5549)
partition 690
size of simulated data is (4620, 5549)
partition 691
size of simulated data is (4618,

partition 828
size of simulated data is (4344, 5549)
partition 829
size of simulated data is (4342, 5549)
partition 830
size of simulated data is (4340, 5549)
partition 831
size of simulated data is (4338, 5549)
partition 832
size of simulated data is (4336, 5549)
partition 833
size of simulated data is (4334, 5549)
partition 834
size of simulated data is (4332, 5549)
partition 835
size of simulated data is (4330, 5549)
partition 836
size of simulated data is (4328, 5549)
partition 837
size of simulated data is (4326, 5549)
partition 838
size of simulated data is (4324, 5549)
partition 839
size of simulated data is (4322, 5549)
partition 840
size of simulated data is (4320, 5549)
partition 841
size of simulated data is (4318, 5549)
partition 842
size of simulated data is (4316, 5549)
partition 843
size of simulated data is (4314, 5549)
partition 844
size of simulated data is (4312, 5549)
partition 845
size of simulated data is (4310, 5549)
partition 846
size of simulated data is (4308,

partition 984
size of simulated data is (4032, 5549)
partition 985
size of simulated data is (4030, 5549)
partition 986
size of simulated data is (4028, 5549)
partition 987
size of simulated data is (4026, 5549)
partition 988
size of simulated data is (4024, 5549)
partition 989
size of simulated data is (4022, 5549)
partition 990
size of simulated data is (4020, 5549)
partition 991
size of simulated data is (4018, 5549)
partition 992
size of simulated data is (4016, 5549)
partition 993
size of simulated data is (4014, 5549)
partition 994
size of simulated data is (4012, 5549)
partition 995
size of simulated data is (4010, 5549)
partition 996
size of simulated data is (4008, 5549)
partition 997
size of simulated data is (4006, 5549)
partition 998
size of simulated data is (4004, 5549)
partition 999
size of simulated data is (4002, 5549)
partition 1000
size of simulated data is (4000, 5549)
partition 1001
size of simulated data is (3998, 5549)
partition 1002
size of simulated data is (39

partition 1138
size of simulated data is (3724, 5549)
partition 1139
size of simulated data is (3722, 5549)
partition 1140
size of simulated data is (3720, 5549)
partition 1141
size of simulated data is (3718, 5549)
partition 1142
size of simulated data is (3716, 5549)
partition 1143
size of simulated data is (3714, 5549)
partition 1144
size of simulated data is (3712, 5549)
partition 1145
size of simulated data is (3710, 5549)
partition 1146
size of simulated data is (3708, 5549)
partition 1147
size of simulated data is (3706, 5549)
partition 1148
size of simulated data is (3704, 5549)
partition 1149
size of simulated data is (3702, 5549)
partition 1150
size of simulated data is (3700, 5549)
partition 1151
size of simulated data is (3698, 5549)
partition 1152
size of simulated data is (3696, 5549)
partition 1153
size of simulated data is (3694, 5549)
partition 1154
size of simulated data is (3692, 5549)
partition 1155
size of simulated data is (3690, 5549)
partition 1156
size of simul

partition 1290
size of simulated data is (3420, 5549)
partition 1291
size of simulated data is (3418, 5549)
partition 1292
size of simulated data is (3416, 5549)
partition 1293
size of simulated data is (3414, 5549)
partition 1294
size of simulated data is (3412, 5549)
partition 1295
size of simulated data is (3410, 5549)
partition 1296
size of simulated data is (3408, 5549)
partition 1297
size of simulated data is (3406, 5549)
partition 1298
size of simulated data is (3404, 5549)
partition 1299
size of simulated data is (3402, 5549)
partition 1300
size of simulated data is (3400, 5549)
partition 1301
size of simulated data is (3398, 5549)
partition 1302
size of simulated data is (3396, 5549)
partition 1303
size of simulated data is (3394, 5549)
partition 1304
size of simulated data is (3392, 5549)
partition 1305
size of simulated data is (3390, 5549)
partition 1306
size of simulated data is (3388, 5549)
partition 1307
size of simulated data is (3386, 5549)
partition 1308
size of simul

partition 1442
size of simulated data is (3116, 5549)
partition 1443
size of simulated data is (3114, 5549)
partition 1444
size of simulated data is (3112, 5549)
partition 1445
size of simulated data is (3110, 5549)
partition 1446
size of simulated data is (3108, 5549)
partition 1447
size of simulated data is (3106, 5549)
partition 1448
size of simulated data is (3104, 5549)
partition 1449
size of simulated data is (3102, 5549)
partition 1450
size of simulated data is (3100, 5549)
partition 1451
size of simulated data is (3098, 5549)
partition 1452
size of simulated data is (3096, 5549)
partition 1453
size of simulated data is (3094, 5549)
partition 1454
size of simulated data is (3092, 5549)
partition 1455
size of simulated data is (3090, 5549)
partition 1456
size of simulated data is (3088, 5549)
partition 1457
size of simulated data is (3086, 5549)
partition 1458
size of simulated data is (3084, 5549)
partition 1459
size of simulated data is (3082, 5549)
partition 1460
size of simul

partition 1594
size of simulated data is (2812, 5549)
partition 1595
size of simulated data is (2810, 5549)
partition 1596
size of simulated data is (2808, 5549)
partition 1597
size of simulated data is (2806, 5549)
partition 1598
size of simulated data is (2804, 5549)
partition 1599
size of simulated data is (2802, 5549)
partition 1600
size of simulated data is (2800, 5549)
partition 1601
size of simulated data is (2798, 5549)
partition 1602
size of simulated data is (2796, 5549)
partition 1603
size of simulated data is (2794, 5549)
partition 1604
size of simulated data is (2792, 5549)
partition 1605
size of simulated data is (2790, 5549)
partition 1606
size of simulated data is (2788, 5549)
partition 1607
size of simulated data is (2786, 5549)
partition 1608
size of simulated data is (2784, 5549)
partition 1609
size of simulated data is (2782, 5549)
partition 1610
size of simulated data is (2780, 5549)
partition 1611
size of simulated data is (2778, 5549)
partition 1612
size of simul

partition 1746
size of simulated data is (2508, 5549)
partition 1747
size of simulated data is (2506, 5549)
partition 1748
size of simulated data is (2504, 5549)
partition 1749
size of simulated data is (2502, 5549)
partition 1750
size of simulated data is (2500, 5549)
partition 1751
size of simulated data is (2498, 5549)
partition 1752
size of simulated data is (2496, 5549)
partition 1753
size of simulated data is (2494, 5549)
partition 1754
size of simulated data is (2492, 5549)
partition 1755
size of simulated data is (2490, 5549)
partition 1756
size of simulated data is (2488, 5549)
partition 1757
size of simulated data is (2486, 5549)
partition 1758
size of simulated data is (2484, 5549)
partition 1759
size of simulated data is (2482, 5549)
partition 1760
size of simulated data is (2480, 5549)
partition 1761
size of simulated data is (2478, 5549)
partition 1762
size of simulated data is (2476, 5549)
partition 1763
size of simulated data is (2474, 5549)
partition 1764
size of simul

partition 1899
size of simulated data is (2202, 5549)
partition 1900
size of simulated data is (2200, 5549)
partition 1901
size of simulated data is (2198, 5549)
partition 1902
size of simulated data is (2196, 5549)
partition 1903
size of simulated data is (2194, 5549)
partition 1904
size of simulated data is (2192, 5549)
partition 1905
size of simulated data is (2190, 5549)
partition 1906
size of simulated data is (2188, 5549)
partition 1907
size of simulated data is (2186, 5549)
partition 1908
size of simulated data is (2184, 5549)
partition 1909
size of simulated data is (2182, 5549)
partition 1910
size of simulated data is (2180, 5549)
partition 1911
size of simulated data is (2178, 5549)
partition 1912
size of simulated data is (2176, 5549)
partition 1913
size of simulated data is (2174, 5549)
partition 1914
size of simulated data is (2172, 5549)
partition 1915
size of simulated data is (2170, 5549)
partition 1916
size of simulated data is (2168, 5549)
partition 1917
size of simul

partition 2052
size of simulated data is (1896, 5549)
partition 2053
size of simulated data is (1894, 5549)
partition 2054
size of simulated data is (1892, 5549)
partition 2055
size of simulated data is (1890, 5549)
partition 2056
size of simulated data is (1888, 5549)
partition 2057
size of simulated data is (1886, 5549)
partition 2058
size of simulated data is (1884, 5549)
partition 2059
size of simulated data is (1882, 5549)
partition 2060
size of simulated data is (1880, 5549)
partition 2061
size of simulated data is (1878, 5549)
partition 2062
size of simulated data is (1876, 5549)
partition 2063
size of simulated data is (1874, 5549)
partition 2064
size of simulated data is (1872, 5549)
partition 2065
size of simulated data is (1870, 5549)
partition 2066
size of simulated data is (1868, 5549)
partition 2067
size of simulated data is (1866, 5549)
partition 2068
size of simulated data is (1864, 5549)
partition 2069
size of simulated data is (1862, 5549)
partition 2070
size of simul

partition 2204
size of simulated data is (1592, 5549)
partition 2205
size of simulated data is (1590, 5549)
partition 2206
size of simulated data is (1588, 5549)
partition 2207
size of simulated data is (1586, 5549)
partition 2208
size of simulated data is (1584, 5549)
partition 2209
size of simulated data is (1582, 5549)
partition 2210
size of simulated data is (1580, 5549)
partition 2211
size of simulated data is (1578, 5549)
partition 2212
size of simulated data is (1576, 5549)
partition 2213
size of simulated data is (1574, 5549)
partition 2214
size of simulated data is (1572, 5549)
partition 2215
size of simulated data is (1570, 5549)
partition 2216
size of simulated data is (1568, 5549)
partition 2217
size of simulated data is (1566, 5549)
partition 2218
size of simulated data is (1564, 5549)
partition 2219
size of simulated data is (1562, 5549)
partition 2220
size of simulated data is (1560, 5549)
partition 2221
size of simulated data is (1558, 5549)
partition 2222
size of simul

partition 2356
size of simulated data is (1288, 5549)
partition 2357
size of simulated data is (1286, 5549)
partition 2358
size of simulated data is (1284, 5549)
partition 2359
size of simulated data is (1282, 5549)
partition 2360
size of simulated data is (1280, 5549)
partition 2361
size of simulated data is (1278, 5549)
partition 2362
size of simulated data is (1276, 5549)
partition 2363
size of simulated data is (1274, 5549)
partition 2364
size of simulated data is (1272, 5549)
partition 2365
size of simulated data is (1270, 5549)
partition 2366
size of simulated data is (1268, 5549)
partition 2367
size of simulated data is (1266, 5549)
partition 2368
size of simulated data is (1264, 5549)
partition 2369
size of simulated data is (1262, 5549)
partition 2370
size of simulated data is (1260, 5549)
partition 2371
size of simulated data is (1258, 5549)
partition 2372
size of simulated data is (1256, 5549)
partition 2373
size of simulated data is (1254, 5549)
partition 2374
size of simul

partition 2508
size of simulated data is (984, 5549)
partition 2509
size of simulated data is (982, 5549)
partition 2510
size of simulated data is (980, 5549)
partition 2511
size of simulated data is (978, 5549)
partition 2512
size of simulated data is (976, 5549)
partition 2513
size of simulated data is (974, 5549)
partition 2514
size of simulated data is (972, 5549)
partition 2515
size of simulated data is (970, 5549)
partition 2516
size of simulated data is (968, 5549)
partition 2517
size of simulated data is (966, 5549)
partition 2518
size of simulated data is (964, 5549)
partition 2519
size of simulated data is (962, 5549)
partition 2520
size of simulated data is (960, 5549)
partition 2521
size of simulated data is (958, 5549)
partition 2522
size of simulated data is (956, 5549)
partition 2523
size of simulated data is (954, 5549)
partition 2524
size of simulated data is (952, 5549)
partition 2525
size of simulated data is (950, 5549)
partition 2526
size of simulated data is (948,

partition 2664
size of simulated data is (672, 5549)
partition 2665
size of simulated data is (670, 5549)
partition 2666
size of simulated data is (668, 5549)
partition 2667
size of simulated data is (666, 5549)
partition 2668
size of simulated data is (664, 5549)
partition 2669
size of simulated data is (662, 5549)
partition 2670
size of simulated data is (660, 5549)
partition 2671
size of simulated data is (658, 5549)
partition 2672
size of simulated data is (656, 5549)
partition 2673
size of simulated data is (654, 5549)
partition 2674
size of simulated data is (652, 5549)
partition 2675
size of simulated data is (650, 5549)
partition 2676
size of simulated data is (648, 5549)
partition 2677
size of simulated data is (646, 5549)
partition 2678
size of simulated data is (644, 5549)
partition 2679
size of simulated data is (642, 5549)
partition 2680
size of simulated data is (640, 5549)
partition 2681
size of simulated data is (638, 5549)
partition 2682
size of simulated data is (636,

partition 2819
size of simulated data is (362, 5549)
partition 2820
size of simulated data is (360, 5549)
partition 2821
size of simulated data is (358, 5549)
partition 2822
size of simulated data is (356, 5549)
partition 2823
size of simulated data is (354, 5549)
partition 2824
size of simulated data is (352, 5549)
partition 2825
size of simulated data is (350, 5549)
partition 2826
size of simulated data is (348, 5549)
partition 2827
size of simulated data is (346, 5549)
partition 2828
size of simulated data is (344, 5549)
partition 2829
size of simulated data is (342, 5549)
partition 2830
size of simulated data is (340, 5549)
partition 2831
size of simulated data is (338, 5549)
partition 2832
size of simulated data is (336, 5549)
partition 2833
size of simulated data is (334, 5549)
partition 2834
size of simulated data is (332, 5549)
partition 2835
size of simulated data is (330, 5549)
partition 2836
size of simulated data is (328, 5549)
partition 2837
size of simulated data is (326,

KeyboardInterrupt: 

## Plot batch data using UMAP

In [None]:
"""
# Plot generated data 

for i in num_batches:
    batch_data_file = os.path.join(
        base_dir,
        "data",
        "batch_simulated",
        analysis_name,
        "Batch_"+str(i)+".txt")
    
    batch_data = pd.read_table(
        batch_data_file,
        header=0,
        sep='\t',
        index_col=0)
    
    # UMAP embedding of decoded batch data
    batch_data_UMAPencoded = umap_model.transform(batch_data)
    batch_data_UMAPencoded_df = pd.DataFrame(data=batch_data_UMAPencoded,
                                             index=batch_data.index,
                                             columns=['1','2'])
    
        
    g = ggplot(aes(x='1',y='2'), data=batch_data_UMAPencoded_df) + \
                geom_point(alpha=0.5) + \
                scale_color_brewer(type='qual', palette='Set2') + \
                ggtitle("{} Batches".format(i))
    
    print(g)"""

## Plot batch data using PCA

In [None]:
"""
# Plot generated data 

for i in num_batches:
    batch_data_file = os.path.join(
        base_dir,
        "data",
        "batch_simulated",
        analysis_name,
        "Batch_"+str(i)+".txt")
    
    batch_data = pd.read_table(
        batch_data_file,
        header=0,
        sep='\t',
        index_col=0)
    
    # PCA projection    
    pca = PCA(n_components=num_PCs)
    batch_data_PCAencoded = pca.fit_transform(batch_data)
    
    # Encode data using PCA model    
    batch_data_PCAencoded_df = pd.DataFrame(batch_data_PCAencoded,
                                         index=batch_data.index
                                         )
    
    g = sns.pairplot(batch_data_PCAencoded_df)
    g.fig.suptitle("Batch {}".format(i))
       
    # Select pairwise PC's to plot
    pc1 = 0
    pc2 = 2
    
    # Encode data using PCA model    
    batch_data_PCAencoded_df = pd.DataFrame(batch_data_PCAencoded[:,[pc1,pc2]],
                                         index=batch_data.index,
                                         columns=['PC {}'.format(pc1), 'PC {}'.format(pc2)])
    
    g = ggplot(aes(x='PC {}'.format(pc1),y='PC {}'.format(pc2)), data=batch_data_PCAencoded_df)  + \
                geom_point(alpha=0.5) + \
                scale_color_brewer(type='qual', palette='Set2') + \
                ggtitle("{} Batches".format(i))
    print(g)

"""