In [1]:
#-------------------------------------------------------------------------------------------------------------------------------
# By Alexandra Lee (July 2018) 
#
# Take the average of the encoded gene expression for the two experimental conditions
# Take the difference of the averages -- this will be the offset for the latent space
#-------------------------------------------------------------------------------------------------------------------------------
import os
import pandas as pd
import numpy as np

# Pin NumPy's global random number generator to a fixed seed so that any
# stochastic step run in this kernel is reproducible between runs.
from numpy.random import seed

randomState = 123
seed(randomState)

In [2]:
# Every path is resolved relative to the parent of the current working directory.
base_dir = os.path.dirname(os.getcwd())

# Input files: encoded (latent-space) samples for the two conditions.
# Per the file names, A is the "treat" set and B is the "control" set.
encoded_dir = os.path.join(base_dir, "encoded", "cipro_treatment")
encodedA_file = os.path.join(encoded_dir, "train_treat_2layer_10latent_encoded.txt")
encodedB_file = os.path.join(encoded_dir, "train_control_2layer_10latent_encoded.txt")

# Output file: destination of the latent-space offset computed below
out_file = os.path.join(base_dir, "data", "cipro_treatment", "train_offset_2layer_10latent.txt")

In [3]:
# Load both encoded tables with identical parsing options: tab-delimited text,
# first row used as the column header, first column used as the row index
# (the sample file names, as seen in the preview below).
encodedA_data, encodedB_data = (
    pd.read_table(encoded_path, header=0, sep='\t', index_col=0)
    for encoded_path in (encodedA_file, encodedB_file)
)

# Preview the first five rows of condition A
encodedA_data.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
GSM1608067_Biofilm+12hrcipro_rep1.CEL,0.286802,3.508864,0.0,0.0,3.707403,0.0,0.108762,0.835352,0.0,0.379854
GSM1608068_Biofilm+12hrcipro_rep2.CEL,2.841053,3.48401,0.0,0.0,0.294664,0.0,0.260758,0.875666,0.0,0.224994
GSM1608069_Biofilm+12hrcipro_rep3.CEL,0.0,0.260629,0.0,0.0,3.269254,0.0,0.969558,0.0,0.0,0.0
GSM1244967_PAO1-22-replicate-01.CEL,6.010061,0.0,7.605605,1.371147,3.42183,5.601919,0.0,3.339226,0.0,0.0
GSM1244969_PAO1-22-replicate-03.CEL,6.035924,0.0,7.688664,1.378435,3.342392,5.749014,0.0,3.414607,0.0,0.0


In [4]:
# Column-wise mean of the encoded data: one average per latent dimension,
# taken over all samples (rows) of each condition.
train_A_mean, train_B_mean = (
    encodedA_data.mean(axis="index"),
    encodedB_data.mean(axis="index"),
)

# Display the per-dimension means for condition A
train_A_mean

0    3.822224
1    2.670032
2    2.537466
3    0.528063
4    3.808728
5    2.047496
6    0.124221
7    2.653390
8    0.008043
9    0.064670
dtype: float64

In [5]:
# Display the per-dimension means for condition B (the "control" encoding, per the input file name)
train_B_mean

0    0.782199
1    2.479994
2    0.695804
3    0.962233
4    0.999819
5    3.063112
6    0.403031
7    2.981466
8    0.917664
9    1.417317
dtype: float64

In [6]:
# Offset vector in the latent space: mean encoding of condition A minus mean
# encoding of condition B, computed per latent dimension (aligned on label).
train_offset_latent = train_A_mean.sub(train_B_mean)

# Turn the Series into a one-row DataFrame with one column per latent dimension
train_offset_latent_df = train_offset_latent.to_frame().T
train_offset_latent_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,3.040025,0.190039,1.841662,-0.43417,2.808909,-1.015616,-0.278809,-0.328075,-0.909621,-1.352648


In [7]:
# output
# Write the one-row offset table as tab-separated text; the row label is kept
# as the first column, exactly as before.
# to_csv does not create missing folders, so make sure the destination
# directory exists first (isdir/makedirs works on both Python 2 and 3).
out_dir = os.path.dirname(out_file)
if out_dir and not os.path.isdir(out_dir):
    os.makedirs(out_dir)

train_offset_latent_df.to_csv(out_file, sep='\t')