# Add batch effects

Say we are interested in identifying genes that differentiate between disease and normal states.  However, our dataset includes samples from different tissues or time points, and some of the variation in gene expression is due to these other conditions rather than to disease state.  These non-relevant variations in the data are called *batch effects*.  

We want to model these batch effects.  To do this we will:
1. Partition our simulated data into n batches
2. For each partition we will randomly shift the expression data.  We randomly generate a vector of length=number of genes, drawn from a normal distribution with mean 0 and standard deviation 0.2 (called the *stretch factor* in the code).  Each entry of this vector gives both the direction and the size of the shift applied to the corresponding gene, and the same vector is added to every sample in the partition.  We shift our partitioned data by: batch effect partition = partitioned data + stretch factor
3. Repeat this for each partition
4. Append all batch effect partitions together


In [1]:
%load_ext autoreload
%autoreload 2

import os
import ast
import pandas as pd
import numpy as np
import random
import glob
import umap
import pickle
import seaborn as sns
import warnings
warnings.filterwarnings(action='ignore')

from sklearn.decomposition import PCA
from numpy.random import seed

# Fix NumPy's global random state so that the random draws made later in
# this notebook are reproducible from a fresh kernel.
randomState = 123
np.random.seed(randomState)

In [2]:
# Load config file
#
# Each non-blank line has the form "<name> <value>". The value is parsed
# according to which of the lists below contains the name; any name that is
# not listed is parsed as an int.
config_file = "config_exp_2.txt"

d = {}
float_params = ["learning_rate", "kappa", "epsilon_std"]
str_params = ["analysis_name", "NN_architecture"]
lst_params = ["num_batches"]
with open(config_file) as f:
    for line in f:
        # Skip blank lines (e.g. a trailing empty line at the end of the file)
        if not line.strip():
            continue

        # Split on the first run of whitespace only, so that a value which
        # itself contains spaces (e.g. the list "[1, 2, 5]") stays in one piece
        (name, val) = line.split(maxsplit=1)
        val = val.strip()

        if name in float_params:
            d[name] = float(val)
        elif name in str_params:
            d[name] = str(val)
        elif name in lst_params:
            # literal_eval only evaluates Python literals, not arbitrary code
            d[name] = ast.literal_eval(val)
        else:
            d[name] = int(val)

In [3]:
# Parameters
# Pull the settings used by this notebook out of the parsed config dictionary
analysis_name, NN_architecture, num_PCs, num_batches = (
    d[key]
    for key in ("analysis_name", "NN_architecture", "num_PCs", "num_batches")
)

In [4]:
# Create directories
# base_dir is two levels above the current working directory
base_dir = os.path.abspath(os.path.join(os.getcwd(), "../.."))
new_dir = os.path.join(base_dir, "data", "batch_simulated")
analysis_dir = os.path.join(new_dir, analysis_name)

# Report whether the output directory is already there, then make sure it exists
status_msg = (
    'directory already exists: {}'
    if os.path.exists(analysis_dir)
    else 'creating new directory: {}'
)
print(status_msg.format(analysis_dir))
os.makedirs(analysis_dir, exist_ok=True)

directory already exists: /home/alexandra/Documents/Repos/Batch_effects_simulation/data/batch_simulated/experiment_2


In [5]:
# Load arguments
# Compressed, tab-separated simulated data for this analysis
simulated_data_file = os.path.join(
    base_dir, "data", "simulated", analysis_name, "simulated_data.txt.xz")

# Pickled UMAP model stored under the NN architecture's model directory
umap_model_file = os.path.join(
    base_dir, "models", NN_architecture, "umap_model.pkl")

In [6]:
# Read in UMAP model
# NOTE: pickle.load can execute arbitrary code while unpickling, so only load
# model files that come from a trusted source.
# The context manager closes the file even if unpickling raises.
with open(umap_model_file, 'rb') as infile:
    umap_model = pickle.load(infile)

In [7]:
# Read in data
# Tab-separated, xz-compressed table: the first row holds the column labels
# and the first column holds the row index.
simulated_data = pd.read_csv(
    simulated_data_file,
    sep='\t',
    header=0,
    index_col=0,
    compression='xz')

simulated_data.head(10)

Unnamed: 0,5340,339,244,1567,1827,4981,2310,3929,1498,3226,...,2787,2526,3299,3097,5330,2854,494,5089,3662,1920
0,0.535602,0.503128,0.285015,0.182251,0.338972,0.563961,0.32429,0.469941,0.185681,0.09072,...,0.544562,0.256521,0.326031,0.611043,0.412167,0.438068,0.280573,0.609067,0.402821,0.158164
1,0.602998,0.314449,0.170274,0.150126,0.393875,0.425789,0.359611,0.367097,0.162651,0.060858,...,0.558114,0.28427,0.29087,0.574704,0.281427,0.427374,0.250592,0.675274,0.470036,0.133792
2,0.517498,0.419739,0.182155,0.13146,0.32451,0.41385,0.320531,0.416758,0.152202,0.088437,...,0.79233,0.273688,0.418673,0.734889,0.254353,0.468174,0.249746,0.694683,0.476522,0.128183
3,0.397841,0.457606,0.323778,0.249936,0.297673,0.476715,0.381297,0.46833,0.215541,0.132116,...,0.590046,0.380182,0.329754,0.520238,0.422373,0.406375,0.207356,0.60175,0.407103,0.174844
4,0.535997,0.465947,0.277286,0.222062,0.385123,0.421842,0.330086,0.415129,0.177301,0.128083,...,0.557488,0.298452,0.395627,0.590282,0.290962,0.399933,0.272622,0.597588,0.440435,0.206779
5,0.49329,0.396501,0.249366,0.159639,0.375957,0.402583,0.275842,0.381382,0.261653,0.081041,...,0.688526,0.281114,0.452581,0.688948,0.270457,0.273857,0.142955,0.579213,0.546091,0.433394
6,0.433612,0.351818,0.241003,0.179525,0.336945,0.345377,0.321877,0.382335,0.167349,0.141031,...,0.629592,0.270185,0.363605,0.600325,0.337987,0.378784,0.203119,0.582426,0.467594,0.176376
7,0.451588,0.457565,0.33658,0.272774,0.264118,0.369123,0.27505,0.424156,0.250198,0.247813,...,0.390592,0.25181,0.273909,0.448068,0.424536,0.366531,0.295605,0.435115,0.335833,0.287866
8,0.619654,0.442385,0.281102,0.209915,0.4173,0.559756,0.303963,0.474562,0.233043,0.112126,...,0.427282,0.366223,0.458586,0.647082,0.318879,0.373176,0.29556,0.716585,0.48356,0.206377
9,0.434846,0.361911,0.255196,0.199091,0.345327,0.390971,0.34298,0.392987,0.176265,0.135802,...,0.615511,0.281425,0.373171,0.545738,0.388045,0.386622,0.230204,0.596523,0.427771,0.195655


In [8]:
%%time
# Add batch effects
num_simulated_samples = simulated_data.shape[0]
num_genes = simulated_data.shape[1]

# Create an array of the simulated data indices
simulated_ind = np.array(simulated_data.index)

for i in num_batches:
    print('Creating simulated data with {} batches..'.format(i))
    
    batch_file = os.path.join(
            base_dir,
            "data",
            "batch_simulated",
            analysis_name,
            "Batch_"+str(i)+".txt.xz")
    
    num_samples_per_batch = int(num_simulated_samples/i)
    
    if i == 1:        
        simulated_data.to_csv(batch_file, sep='\t', compression='xz')
        
    else:  
        batch_data = simulated_data.copy()
        
        # Shuffle indices
        np.random.shuffle(simulated_ind)
        
        for j in range(i):
            #print(j)
            
            # Partition indices to batch
            partition = np.array_split(simulated_ind, i)
            
            #print("before")
            #print(batch_data.loc[partition[j].tolist()].head())
            
            #print("indices to change: {}".format(partition))
            
            # Scalar to shift gene expressiond data
            stretch_factor = np.random.normal(0.0, 0.2, [1,num_genes])
            
            #print(stretch_factor)
            
            # Tile stretch_factor to be able to add to batches
            num_samples_per_batch = len(partition[j])
            stretch_factor_tile = pd.DataFrame(
                pd.np.tile(
                    stretch_factor,
                    (num_samples_per_batch, 1)),
                index=batch_data.loc[partition[j].tolist()].index,
                columns=batch_data.loc[partition[j].tolist()].columns)
            
            #print(stretch_factor_tile.head())
            
            # Add batch effects
            batch_data.loc[partition[j].tolist()] = batch_data.loc[partition[j].tolist()] + stretch_factor_tile
            
            #print("after")
            #print(batch_data.loc[partition[j].tolist()].head())


        # Should we re-normalize from 0-1 range?
        #from sklearn import preprocessing
        #batch_data = preprocessing.MinMaxScaler().fit_transform(batch_data)
        #batch_data_df = pd.DataFrame(batch_data,
        #                        columns=batch_data.columns,
        #                        index=batch_data.index)
            
        #print(batch_data)
            
        # Save
        batch_data.to_csv(batch_file, sep='\t', compression='xz')

Creating simulated data with 1 batches..
Creating simulated data with 2 batches..
          5340       339       244      1567      1827      4981      2310  \
0     0.550836  0.814025  0.184151  0.075284  0.277809  0.677065  0.344313   
1     0.618232  0.625346  0.069410  0.043159  0.332712  0.538892  0.379633   
2     0.532732  0.730636  0.081290  0.024494  0.263348  0.526954  0.340554   
3     0.413075  0.768503  0.222913  0.142969  0.236510  0.589818  0.401319   
4     0.452779  0.463684  0.280652  0.008873  0.606729  0.392505  0.448977   
5     0.508524  0.707397  0.148502  0.052672  0.314794  0.515687  0.295865   
6     0.350394  0.349554  0.244368 -0.033664  0.558551  0.316041  0.440767   
7     0.466821  0.768462  0.235716  0.165807  0.202955  0.482227  0.295073   
8     0.634888  0.753282  0.180238  0.102948  0.356137  0.672860  0.323986   
9     0.351629  0.359647  0.258561 -0.014098  0.566933  0.361635  0.461871   
10    0.522516  0.463468  0.313727 -0.003993  0.583738  0.40

Creating simulated data with 5 batches..
          5340       339       244      1567      1827      4981      2310  \
0     0.734993  0.376865  0.322638  0.189829  0.548529  0.614390  0.454417   
1     0.802388  0.188185  0.207897  0.157703  0.603432  0.476217  0.489738   
2     0.456803  0.622307 -0.112146 -0.058830  0.244503  0.318615  0.205899   
3     0.353741  0.388164  0.466762  0.335247  0.128270  0.209884  0.482114   
4     0.491897  0.396505  0.420271  0.307373  0.215720  0.155011  0.430903   
5     0.293266  0.315727  0.643425 -0.017857  0.437478  0.240251  0.434758   
6     0.372917  0.554385 -0.053298 -0.010766  0.256937  0.250143  0.207244   
7     0.390893  0.660133  0.042280  0.082483  0.184110  0.273888  0.160418   
8     0.558959  0.644953 -0.013198  0.019625  0.337292  0.464521  0.189331   
9     0.374152  0.564478 -0.039105  0.008801  0.265319  0.295736  0.228348   
10    0.649496  0.767851  0.232281  0.321066  0.290050  0.447277  0.632835   
11    0.624166  0.29682

Creating simulated data with 10 batches..
          5340       339       244      1567      1827      4981      2310  \
0     0.852002  0.256326  0.365427  0.141854 -0.183736  0.317496  0.416933   
1     1.033101  0.433759 -0.099550  0.166265  0.555479  0.278243  0.656962   
2     0.499758  0.770518  0.080260  0.293027  0.105078  0.259118  0.566211   
3     0.458081  0.336048  0.469060  0.229315  0.135218  0.597514  0.381623   
4     0.776204  0.585678  0.245831  0.243450  0.251434  0.326681 -0.246066   
5     0.923394  0.515811 -0.020458  0.175778  0.537560  0.255038  0.573193   
6     0.673818  0.471549  0.209547  0.200913  0.203256  0.250217 -0.254275   
7     0.691794  0.577296  0.305125  0.294162  0.130429  0.273962 -0.301102   
8     0.720262  0.505383  0.401756  0.127621  0.720908  0.637115  0.381953   
9     0.535455  0.424909  0.375849  0.116797  0.648936  0.468330  0.420970   
10    1.035837  0.585041  0.040538  0.225336  0.523736  0.282973  0.585952   
11    0.664982  0.5428

Creating simulated data with 20 batches..
          5340       339       244      1567      1827      4981      2310  \
0     0.413008 -0.228712  0.331575  0.493963  0.410190  0.174112  0.333653   
1     0.480404 -0.417392  0.216834  0.461837  0.465093  0.035939  0.368973   
2     0.636689  0.542579  0.022205  0.235657  0.185265  0.284119  0.412719   
3     0.165773  0.587777  0.199136  0.068213  0.108254  0.506821  0.333919   
4     0.227763  0.233052  0.055649  0.185837  0.516031  0.673357  0.258715   
5     0.548862  0.391270  0.264442 -0.117292  0.178834  0.231676  0.016414   
6     0.125378  0.118923  0.019366  0.143299  0.467853  0.596893  0.250506   
7     0.219520  0.587736  0.211938  0.091050  0.074699  0.399230  0.227672   
8     0.675225  0.437155  0.296179 -0.067016  0.220177  0.388849  0.044535   
9     0.554038  0.484751  0.095246  0.303288  0.206082  0.261240  0.435168   
10    0.483140 -0.266109  0.356922  0.520907  0.433351  0.040668  0.297964   
11    0.480347  0.4178

Creating simulated data with 50 batches..
          5340       339       244      1567      1827      4981      2310  \
0     0.288077  0.429043  0.038275  0.255398  0.409141  0.687135  0.192859   
1     0.702949  0.539439  0.157556  0.082981  0.597423  0.444703  0.569443   
2     0.617450  0.644730  0.169437  0.064316  0.528058  0.432764  0.530363   
3     0.688849  0.094144  0.257024  0.044692  0.147009  0.689408  0.405207   
4     0.708649  0.733066  0.394091  0.371321  0.287491  0.808674  0.254528   
5     0.593242  0.621491  0.236648  0.092494  0.579504  0.421498  0.485674   
6     0.936915  0.530425  0.240637  0.423657  0.473960  0.301780  0.270786   
7     0.232126  0.866384  0.322612  0.207772  0.503376  0.493726  0.215030   
8     0.877006  0.500693  0.037228  0.068386  0.485464  0.554307  0.492270   
9     0.711531  0.537476  0.549503  0.148497  0.523679  0.661979  0.184289   
10    0.392541  0.527557  0.356590 -0.278738  0.497985  0.581350  0.230385   
11    0.067581  0.4723

Creating simulated data with 100 batches..
          5340       339       244      1567      1827      4981      2310  \
0     0.424058  0.710764  0.504008  0.226678  0.255002  0.677604  0.273644   
1     0.885868  0.569142  0.326096  0.053610  0.523392  0.689581  0.319209   
2     0.356844  0.614235 -0.245977 -0.120985  0.391437  0.348214  0.538925   
3     0.123776  0.420213  0.305618  0.278443  0.265575  0.546880  0.287673   
4     0.474253  0.371112  0.375686  0.291420  0.447693  0.489329  0.264048   
5     0.431134  0.528605  0.371011  0.552398  0.815593  0.267160  0.415236   
6     0.287262  0.443683  0.299945  0.494938  0.504664  0.608016  0.118992   
7     0.389431  0.589670  0.458226  0.665533  0.703754  0.233700  0.414444   
8     0.707212  0.304093  0.182002 -0.022247  0.252225  0.190564  0.145385   
9     0.501646  0.563098  0.335451  0.219707  0.387026  0.271349  0.373090   
10    0.731546  0.718554  0.191758 -0.113128  0.373687  0.800229 -0.024918   
11    0.515038  0.560

Creating simulated data with 500 batches..
          5340       339       244      1567      1827      4981      2310  \
0     0.549366  0.340838  0.300199  0.033657  0.383411  0.448259  0.607532   
1     0.765755  0.401021  0.369941  0.123012  0.595188  0.553599  0.302382   
2     0.620235  0.425728  0.209008  0.496704  0.236689  0.043427  0.219171   
3     0.606662  0.235762  0.335451  0.239673  0.337046  0.350943  0.369841   
4     0.912902  0.592194  0.102724  0.135135  0.644172  0.704835  0.472428   
5     0.432669  0.220369  0.466038  0.356890  0.264778  0.188363  0.378787   
6     0.566091  0.361764  0.287271  0.174964  0.338838  0.220962  0.203498   
7     0.212160  0.456653  0.142935  0.130142  0.192281  0.126040  0.122663   
8     0.875828  0.247742  0.086247  0.572707  0.584277  0.693185  0.053976   
9     0.474165  0.647057 -0.056397  0.369026  0.553206  0.556851  0.421506   
10    0.396981  0.436066  0.285833  0.225765  0.005603  0.345497  0.249173   
11    0.680950  0.228

Creating simulated data with 1000 batches..
          5340       339       244      1567      1827      4981      2310  \
0     0.310532  0.502055  0.133213  0.015800  0.141596  0.476744  0.332617   
1     0.485254  0.454015  0.134227  0.128104  0.569634  0.506233  0.123839   
2     0.521281  0.211984  0.066191 -0.229221  0.276625  0.682011  0.405029   
3     0.251152  0.593989  0.827349 -0.021346  0.381875  0.739296  0.587906   
4     0.845076  0.548605  0.578434  0.077107  0.262718  0.348283  0.318127   
5     0.492299  0.724497  0.246644  0.320719  0.270801  0.039936  0.262146   
6     0.575603  0.440026  0.387787  0.210613  0.598147  0.145515  0.059217   
7     0.924536  0.597564  0.170882  0.566720  0.162887  0.529373 -0.067061   
8     0.236064  0.592335  0.401958  0.144713  0.427207  0.638982  0.311080   
9     0.280985  0.258009  0.168965  0.167191  0.449586  0.417983  0.423912   
10    0.781875  0.610150 -0.178317  0.353261  0.288189  0.091873  0.312870   
11    0.096960  0.10

Creating simulated data with 2000 batches..
          5340       339       244      1567      1827      4981      2310  \
0     0.376154  0.458797  0.074676  0.154291  0.088300  0.705907  0.357483   
1     0.587359  0.466014 -0.035239  0.042996  0.358381  0.804603  0.366540   
2     0.602367  0.623259  0.061550  0.052490  0.457888  0.442361  0.313866   
3     0.154564  0.587856  0.223439 -0.040730  0.526176  0.252445  0.440736   
4     0.894884  0.443338  0.337080  0.458640  0.183520  0.132394  0.053418   
5     0.642182  0.323335  0.287807  0.150471  0.303197  0.214360  0.188002   
6     0.482025  0.683389  0.272799  0.138815  0.603187  0.585805  0.408870   
7     0.501799  0.379053  0.313655  0.402311  0.098480  0.110836  0.166133   
8     0.483493  0.373533  0.331309  0.372385  0.472952  0.789796  0.075286   
9     0.714018  0.457485  0.223102 -0.053477  0.389485  0.460928  0.568485   
10    0.748001  0.247057  0.160301  0.064545  0.549821  0.573858  0.332152   
11    0.227648  0.29

Creating simulated data with 3000 batches..
          5340       339       244      1567      1827      4981      2310  \
0     0.534601  0.268258  0.233548  0.157061  0.549253  0.416968  0.564894   
1     0.955086  0.403452  0.351471  0.353596  0.393927  0.311641 -0.129239   
2     0.380392  0.427009  0.320446  0.094213  0.125364  0.456815  0.131354   
3     0.180087  0.235259  0.363919  0.336010  0.508571  0.493588  0.365369   
4     0.400083  0.807385  0.452155  0.192652  0.535616  0.668131  0.166651   
5     0.247914  0.531502 -0.046853  0.159633  0.439224  0.418776  0.466865   
6     0.450433  0.084321  0.331347 -0.205022  0.650575  0.249522  0.299046   
7     0.375437 -0.035750  0.362146  0.034173  0.278821  0.250565  0.281571   
8     0.733105  0.541346  0.296577 -0.109207  0.431198  0.573861  0.001191   
9     0.545742  0.657218  0.430287  0.197614  0.179219  0.235317  0.268772   
10    0.686842  0.688215  0.378467  0.442412  0.378865  0.378861  0.555336   
11    0.498541  0.40

Creating simulated data with 6000 batches..
          5340       339       244      1567      1827      4981      2310  \
0     0.445183  0.605422  0.324201  0.111966  0.271504  0.695618  0.348892   
1     0.730834  0.192184  0.303172  0.411912  0.537544  0.218619 -0.069246   
2     0.581064  0.452797 -0.084438 -0.042465  0.195273  0.149398  0.115964   
3     0.519765  0.340451  0.404710  0.254529  0.273402  0.432087  0.451812   
4     0.671526  0.544765 -0.156744  0.384379  0.143037  0.251499  0.145746   
5     0.753691  0.858954  0.111716  0.181368  0.633195  0.557350  0.133056   
6     0.363206  0.042586  0.429421  0.161582  0.535001  0.233120  0.343464   
7     0.595353  0.239911  0.192425  0.030548  0.205596  0.497548 -0.083077   
8     0.742427  0.312498 -0.001200  0.178271  0.334233  0.601592  0.303066   
9     0.472849  0.234271  0.273362  0.147852  0.405487  0.213789  0.279110   
10    0.383503  0.295221  0.309366  0.030667  0.156372  0.395495 -0.020376   
11    0.482823  0.53

CPU times: user 1h 53min 4s, sys: 17.4 s, total: 1h 53min 21s
Wall time: 1h 53min 19s
