In [1]:
#-------------------------------------------------------------------------------------------------------------------------------
# By Alexandra Lee (July 2018) 
#
# Take the average of the encoded gene expression for the two experimental conditions
# Take the difference of the averages -- this will be the offset for the latent space
#-------------------------------------------------------------------------------------------------------------------------------
import os
import pandas as pd
import numpy as np

from numpy.random import seed

# Fix NumPy's global random state so any later random draws are repeatable
randomState = 123
seed(randomState)

In [2]:
# Input files: latent-space encodings of the two sample groups.
# Every path is resolved relative to the parent of the current working directory.
base_dir = os.path.dirname(os.getcwd())
encoded_dir = os.path.join(base_dir, "encoded", "PA1673_full_old")
lowest_file = os.path.join(encoded_dir, "train_lowest_2layer_10latent_encoded.txt")
highest_file = os.path.join(encoded_dir, "train_highest_2layer_10latent_encoded.txt")

# Output file: destination for the latent-space offset
out_file = os.path.join(base_dir, "data", "PA1673_full_old", "train_offset_2layer_10latent.txt")

In [3]:
# Load both encoded tables: tab-separated, first row holds the column labels,
# first column holds the sample identifiers (used as the row index)
lowest_data = pd.read_csv(lowest_file, sep='\t', header=0, index_col=0)
highest_data = pd.read_csv(highest_file, sep='\t', header=0, index_col=0)

# Preview the first five rows of the "lowest" table
lowest_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
120330-10978D_23_(Pae_G1a).CEL,1.671844,0.467009,4.866182,0.0,0.26107,0.308768,0.0,4.166525,5.016794,0.565274
120330-10978D_M4_(Pae_G1a).CEL,1.5322,0.66962,4.986362,0.0,0.328975,0.523619,0.0,3.473647,2.467519,0.396457
DC2.CEL,0.0,0.0,2.914689,1.430126,0.0,1.288556,0.0,1.425554,0.220153,3.024789
Glu_6Hour_(Pae_G1a).CEL,2.009348,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
GSM1133191_Weiqing_He_D-Glu-2_022410.CEL,0.0,3.918891,6.816535,0.404191,0.0,1.561648,0.0,5.540529,0.0,2.271347


In [4]:
# Preview the first five rows of the "highest" table
highest_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
GSM1143378_B1_Pae_G1a_.CEL,0.0,0.466845,0.0,4.826356,0.666511,0.0,4.576054,2.510457,2.08544,0.0
GSM1143380_B3_Pae_G1a_.CEL,0.0,0.478316,0.0,4.660741,0.697484,0.0,4.762185,2.584435,2.099928,0.0
GSM1237737_M9_PAOSX_1_51059600653060050411410396750419.CEL,0.324242,0.324177,2.567198,2.85355,0.0,0.0,2.727763,5.772913,0.080004,1.169128
GSM1237738_M9_PAOSX_3_51059600653060050411410396750359.CEL,0.685869,0.0,2.379802,2.815059,0.0,0.0,1.912213,6.478403,0.136612,1.693112
GSM1237739_M9_PAOSX_4_51059600653061050411410396750526.CEL,0.448335,0.0,1.983744,3.282622,0.0,0.0,0.8865,5.969297,0.0,1.762904


In [5]:
# Collapse each table to one mean value per latent feature (column),
# averaging over all samples (rows)
train_lowest_mean = lowest_data.mean()
train_highest_mean = highest_data.mean()

train_lowest_mean

0    0.458656
1    2.057539
2    1.706508
3    0.887277
4    0.821577
5    2.726570
6    0.226416
7    1.244016
8    0.563766
9    3.009767
dtype: float64

In [6]:
train_highest_mean

0    0.475770
1    0.531352
2    1.589102
3    2.001403
4    2.100722
5    0.116668
6    2.354716
7    2.167669
8    2.394753
9    0.440092
dtype: float64

In [7]:
# Offset vector in latent space: mean encoding of the "highest" samples minus
# mean encoding of the "lowest" samples, one value per latent feature.
# Series subtraction aligns on labels, so a feature present in only one of the
# two inputs would silently turn into NaN -- fail loudly instead.
unmatched_features = train_highest_mean.index.symmetric_difference(train_lowest_mean.index)
if len(unmatched_features) > 0:
    raise ValueError(
        "Latent features differ between the highest and lowest encodings: "
        + ", ".join(str(feature) for feature in unmatched_features)
    )

train_offset_latent = train_highest_mean - train_lowest_mean

# Reshape into a single-row frame (one column per latent feature) for display and export
train_offset_latent_df = train_offset_latent.to_frame().T
train_offset_latent_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.017114,-1.526186,-0.117406,1.114125,1.279145,-2.609903,2.1283,0.923653,1.830987,-2.569676


In [8]:
# Write the single-row offset table as tab-separated text.
# Create the destination directory first if it does not exist yet, so that
# to_csv does not fail on a missing folder.
out_dir = os.path.dirname(out_file)
if out_dir and not os.path.isdir(out_dir):
    os.makedirs(out_dir)
train_offset_latent_df.to_csv(out_file, sep='\t')