In [1]:
#-------------------------------------------------------------------------------------------------------------------------------
# By Alexandra Lee (July 2018) 
#
# Take the average of the encoded gene expression for the two experimental conditions
# Take the difference of the averages -- this will be the offset for the latent space
#-------------------------------------------------------------------------------------------------------------------------------
import os
import pandas as pd
import numpy as np
np.random.seed(123)

In [2]:
# Every path is resolved relative to the parent of the current working directory.
parent_dir = os.path.dirname(os.getcwd())

# Inputs: encoded samples for condition A (treat) and condition B (control)
encodedA_file = os.path.join(parent_dir, "encoded", "tybalt_1layer_10_train_treat_encoded.txt")
encodedB_file = os.path.join(parent_dir, "encoded", "tybalt_1layer_10_train_control_encoded.txt")

# Output: destination of the latent-space offset table
out_file = os.path.join(parent_dir, "data", "train_offset_latent.txt")

In [3]:
# Both inputs are tab-separated tables: the first row holds the column labels
# and the first column holds the row (sample) labels.
read_opts = dict(header=0, sep='\t', index_col=0)

encodedA_data = pd.read_table(encodedA_file, **read_opts)
encodedB_data = pd.read_table(encodedB_file, **read_opts)

# Preview the first five rows of condition A
encodedA_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
GSM1608067_Biofilm+12hrcipro_rep1.CEL,1.093034,1.793227,0.0,0.0,2.693321,0.0,0.045586,0.0,0.0,4.305523
GSM1608068_Biofilm+12hrcipro_rep2.CEL,0.398129,0.93154,1.273853,0.0,1.610256,0.0,2.701606,0.501457,1.756625,1.961129
GSM1608069_Biofilm+12hrcipro_rep3.CEL,0.0,0.0,0.0,0.273615,0.785737,0.268187,0.149403,0.0,0.611165,5.167446
GSM1244967_PAO1-22-replicate-01.CEL,7.09647,0.0,0.0,0.0,1.668224,1.143485,0.0,5.619364,0.0,0.0
GSM1244968_PAO1-22-replicate-02.CEL,7.13513,0.0,0.0,0.0,1.617799,1.108371,0.0,5.271164,0.0,0.0


In [4]:
# Relabel the feature columns with integer positions 0..n-1 for downstream sorting.
# Integer (not string) labels keep sort_index() numeric, so column 10 sorts after
# column 9 instead of after column 1. The width is taken from the data itself so
# the cell works for any number of encoded features.
if encodedA_data.shape[1] != encodedB_data.shape[1]:
    # A width mismatch would otherwise surface later as silent NaNs when the
    # two per-column means are subtracted.
    raise ValueError(
        "Encoded inputs have different numbers of columns: "
        "{} vs {}".format(encodedA_data.shape[1], encodedB_data.shape[1])
    )
encodedA_data.columns = list(range(encodedA_data.shape[1]))
encodedB_data.columns = list(range(encodedB_data.shape[1]))

In [5]:
# Column-wise mean over all rows (samples) of each encoded condition
train_A_mean = encodedA_data.mean(axis="index")
train_B_mean = encodedB_data.mean(axis="index")

# Offset = mean(A) - mean(B) per encoded column, ordered by column label and
# held as a single-column DataFrame
train_offset_latent = train_A_mean.sub(train_B_mean).sort_index().to_frame()
train_offset_latent

Unnamed: 0,0
0,3.901045
1,-1.435007
2,-0.244977
3,-1.464938
4,-0.647329
5,-1.569835
6,-0.695702
7,0.642414
8,-1.293433
9,0.980941


In [6]:
# output
# Create the destination directory first so to_csv does not fail with
# FileNotFoundError when it is missing. The guard skips the call when out_file
# has no directory component (os.makedirs("") would raise).
out_dir = os.path.dirname(out_file)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)

# Write the offset table as tab-separated text, including the index column
train_offset_latent.to_csv(out_file, sep='\t')