In [1]:
#-------------------------------------------------------------------------------------------------------------------------------
# By Alexandra Lee (July 2018) 
#
# Take the average of the encoded gene expression for the two experimental conditions
# Take the difference of the averages -- this will be the offset for the latent space
#-------------------------------------------------------------------------------------------------------------------------------
import os
import pandas as pd
import numpy as np
# Fix the seed of NumPy's global random number generator.
# NOTE(review): none of the cells visible in this notebook draw random numbers,
# so this presumably exists only as a reproducibility safeguard -- confirm.
np.random.seed(123)

In [2]:
# All paths are resolved relative to the parent of the current working directory
base_dir = os.path.dirname(os.getcwd())

# Inputs: encoded (latent-space) samples for condition A ("treat") and condition B ("control")
encoded_dir = os.path.join(base_dir, "encoded")
encodedA_file = os.path.join(encoded_dir, "tybalt_2layer_10_train_treat_encoded.txt")
encodedB_file = os.path.join(encoded_dir, "tybalt_2layer_10_train_control_encoded.txt")

# Output: destination of the latent-space offset computed below
out_file = os.path.join(base_dir, "data", "train_offset_latent_2layer.txt")

In [3]:
# Load both tab-delimited encoded matrices: the first row supplies the column
# labels and the first column supplies the row (sample) labels
read_opts = dict(header=0, sep='\t', index_col=0)
encodedA_data = pd.read_table(encodedA_file, **read_opts)
encodedB_data = pd.read_table(encodedB_file, **read_opts)

# Preview the first five rows of condition A
encodedA_data.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
GSM1608067_Biofilm+12hrcipro_rep1.CEL,0.660123,0.0,0.0,0.071671,3.941779,1.282997,0.0,0.132733,2.99605,0.0
GSM1608068_Biofilm+12hrcipro_rep2.CEL,0.338981,0.0,0.0,0.0,0.201274,2.710099,0.13865,0.882745,1.059752,0.0
GSM1608069_Biofilm+12hrcipro_rep3.CEL,1.798442,1.173534,0.0,0.0,4.861267,0.110092,0.358521,0.507635,0.105169,0.0
GSM1244967_PAO1-22-replicate-01.CEL,0.0,0.0,6.093919,6.591572,0.0,5.25609,0.0,0.0,0.0,0.0
GSM1244968_PAO1-22-replicate-02.CEL,0.0,0.0,6.181135,6.376476,0.0,5.081738,0.0,0.0,0.0,0.0


In [4]:
# Relabel the latent-feature columns with integer positions (0..n-1) so that the
# downstream sort_index() orders them numerically. String labels would sort
# lexicographically ('10' before '2') once there are more than 10 features.
# The number of features is taken from the data rather than hard-coded to 10.
n_latent = encodedA_data.shape[1]
if encodedB_data.shape[1] != n_latent:
    # The offset is a per-feature difference, so both conditions must have the same width
    raise ValueError(
        "Encoded inputs differ in number of latent features: "
        "{} vs {}".format(n_latent, encodedB_data.shape[1])
    )
encodedA_data.columns = list(range(n_latent))
encodedB_data.columns = list(range(n_latent))

In [5]:
# Per-feature mean of the encoded values, taken over all samples (rows) of each condition
train_A_mean, train_B_mean = (
    condition.mean(axis=0) for condition in (encodedA_data, encodedB_data)
)

# Offset in latent space = mean(A) - mean(B), ordered by feature label
offset_series = train_A_mean.sub(train_B_mean).sort_index()

# Wrap the Series in a one-column DataFrame (same index) and display it
train_offset_latent = offset_series.to_frame()
train_offset_latent

Unnamed: 0,0
0,-2.511202
1,-0.39449
2,3.145393
3,0.158878
4,0.643251
5,2.180199
6,-1.946401
7,-1.163344
8,-0.303066
9,-0.996416


In [6]:
# Write the offset as a tab-separated file. The destination directory is created
# first so that a missing output folder does not make the final step fail after
# all the computation has already been done.
out_dir = os.path.dirname(out_file)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)
train_offset_latent.to_csv(out_file, sep='\t')