In [1]:
#-------------------------------------------------------------------------------------------------------------------------------
# By Alexandra Lee (July 2018) 
#
# Take the average of the encoded gene expression for the two experimental conditions
# Take the difference of the averages -- this will be the offset for the latent space
#-------------------------------------------------------------------------------------------------------------------------------
import os
import pandas as pd
import numpy as np

# Seed NumPy's global random number generator with a fixed value so that
# repeated runs of this notebook start from the same random state.
from numpy.random import seed

randomState = 123
seed(randomState)

In [2]:
# All paths are resolved relative to the parent of the current working directory.
base_dir = os.path.dirname(os.getcwd())

# Input files: encoded data for condition A (treat) and condition B (control)
encodedA_file = os.path.join(base_dir, "encoded", "train_treat_1layer_10latent_encoded.txt")
encodedB_file = os.path.join(base_dir, "encoded", "train_control_1layer_10latent_encoded.txt")

# Output file: destination for the latent-space offset
out_file = os.path.join(base_dir, "data", "train_offset_1layer_10latent.txt")

In [3]:
# Load the tab-separated encoded tables for both conditions. The first line
# of each file supplies the column labels and the first column the row labels.
encodedA_data = pd.read_csv(encodedA_file, sep="\t", header=0, index_col=0)
encodedB_data = pd.read_csv(encodedB_file, sep="\t", header=0, index_col=0)

# Preview the first five rows of condition A
encodedA_data.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
GSM1608067_Biofilm+12hrcipro_rep1.CEL,0.0,0.133183,0.0,2.542186,0.0,0.0,0.0,2.80557,0.0,4.663349
GSM1608068_Biofilm+12hrcipro_rep2.CEL,1.140755,1.716108,0.487591,0.0,0.0,0.908535,1.170017,4.757957,0.0,2.691764
GSM1608069_Biofilm+12hrcipro_rep3.CEL,1.626078,0.0,0.0,2.95332,0.0,0.0,0.0,4.11003,0.913531,1.11789
GSM1244967_PAO1-22-replicate-01.CEL,0.0,4.258472,0.0,3.946286,3.798057,7.482199,0.0,0.0,0.0,0.0
GSM1244969_PAO1-22-replicate-03.CEL,0.0,4.217313,0.0,3.896841,3.889508,7.628132,0.0,0.0,0.0,0.0


In [4]:
# Mean of each encoded feature (column) taken over all samples (rows),
# computed separately for condition A and condition B.
train_A_mean = encodedA_data.mean(axis="index")
train_B_mean = encodedB_data.mean(axis="index")

# Show the per-feature means for condition A
train_A_mean

0    0.230569
1    1.714602
2    0.040633
3    2.771958
4    1.258668
5    3.255904
6    0.115666
7    2.419205
8    0.076128
9    2.729467
dtype: float64

In [5]:
# Display the per-column means held in train_B_mean (condition B)
train_B_mean

0    2.281682
1    0.776898
2    1.004865
3    0.581459
4    3.173047
5    2.529348
6    0.970227
7    1.973405
8    0.924208
9    1.517139
dtype: float64

In [6]:
# Latent-space offset: per-feature mean of condition A minus that of condition B
train_offset_latent = train_A_mean.sub(train_B_mean)

# Turn the offset Series into a single-row DataFrame (one column per feature)
train_offset_latent_df = train_offset_latent.to_frame().T
train_offset_latent_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,-2.051113,0.937703,-0.964232,2.190499,-1.914379,0.726556,-0.854561,0.4458,-0.84808,1.212327


In [7]:
# Save the single-row offset table to out_file as tab-separated text
train_offset_latent_df.to_csv(path_or_buf=out_file, sep="\t")