In [1]:
# ------------------------------------------------------------------------------
# By Alexandra Lee (July 2018)
#
# Apply offset vector
#
# Original space: the offset vector is added to every sample of the test set
#   under condition A, so that the gene expression profile of those samples
#   looks as if they had been measured under condition B.
#
# Latent space: the same addition is performed on the *encoded* test set under
#   condition A, again to make the samples look like condition B.
# ------------------------------------------------------------------------------
import os

import numpy as np
import pandas as pd

# Fix the global numpy random state so repeated runs start from the same seed
np.random.seed(123)

In [2]:
# Input files: both are looked up in the "data" folder that sits in the parent
# of the current working directory
parent_dir = os.path.dirname(os.getcwd())
data_dir = os.path.join(parent_dir, "data")
test_file = os.path.join(data_dir, "test_control.txt")
offset_file = os.path.join(data_dir, "train_offset_original.txt")

# Set to True when the offset is being applied in the latent space
latent = False

# Output file: written to the "output" folder in the same parent directory
out_file = os.path.join(parent_dir, "output", "estimated_test_control_original.txt")

In [3]:
# Read the tab-separated test file (first row = column names, first column =
# row labels), then flip it so the file's columns become the row index
raw_test = pd.read_table(test_file, sep = '\t', header = 0, index_col = 0)
test_data = raw_test.T

# Remember the column labels of the transposed table; they are put back on the
# result once the offset has been applied
header = test_data.columns

test_data.head(5)

Unnamed: 0,GSM356955.CEL,GSM954576_Nomura_PA01-1_Pae_G1a_.CEL,GSM954578_Nomura_PA01-3_Pae_G1a_.CEL,GSM954579_Nomura_PA01-4_Pae_G1a_.CEL,GSM92182.CEL
PA0001,0.523823,0.449574,0.650631,0.607604,0.687206
PA0002,0.594183,0.569119,0.594249,0.613629,0.709832
PA0003,0.376242,0.356904,0.396833,0.362308,0.432323
PA0004,0.74709,0.606993,0.511396,0.528051,0.731554
PA0005,0.439185,0.313796,0.341081,0.363008,0.400733


In [4]:
# Read the offset vector (tab-separated, first column used as the index).
# In latent mode the file's first row is treated as a header; otherwise the
# file is read without a header row.
header_row = 0 if latent else None
offset_data = pd.read_table(offset_file, sep = '\t', header = header_row, index_col = 0)

if latent:
    # Cast index labels to strings so they match the index of test_data
    offset_data.index = list(map(str, offset_data.index))

offset_data

Unnamed: 0_level_0,1
0,Unnamed: 1_level_1
PA0001,-0.071212
PA0002,-0.119662
PA0003,-0.020354
PA0004,-0.089397
PA0005,-0.203709
PA0006,0.069165
PA0007,0.061485
PA0008,-0.174466
PA0009,-0.156391
PA0010,0.012799


In [5]:
# Give the single offset column and every column of the test data the same
# label. The original test-data column labels were saved in `header` when the
# data was loaded, so they can be reattached afterwards.
shared_label = 'gene_exp'
offset_data.columns = [shared_label]
test_data.columns = [shared_label] * len(test_data.columns)

test_data

Unnamed: 0,gene_exp,gene_exp.1,gene_exp.2,gene_exp.3,gene_exp.4
PA0001,0.523823,0.449574,0.650631,0.607604,0.687206
PA0002,0.594183,0.569119,0.594249,0.613629,0.709832
PA0003,0.376242,0.356904,0.396833,0.362308,0.432323
PA0004,0.747090,0.606993,0.511396,0.528051,0.731554
PA0005,0.439185,0.313796,0.341081,0.363008,0.400733
PA0006,0.466252,0.452389,0.380707,0.380931,0.306312
PA0007,0.493666,0.540796,0.574916,0.535510,0.419512
PA0008,0.543735,0.346749,0.296616,0.333461,0.484630
PA0009,0.622464,0.372734,0.348616,0.320784,0.623610
PA0010,0.244651,0.185079,0.165047,0.227256,0.134044


In [6]:
# Apply offset
#
# The offset table holds a single column of per-feature shifts. Pull it out as
# a Series and broadcast it along the row index, so each column (sample) of
# test_data receives the same shift for a given row label. Passing a Series is
# what makes `axis = 'index'` meaningful; with a DataFrame operand pandas aligns
# on both row and column labels and the result then depends on the two frames
# sharing column names.
offset_vector = offset_data.iloc[:, 0]

# Row labels that exist on only one side would silently turn into NaN rows in
# the sum, so stop early with a clear message instead.
unmatched = test_data.index.symmetric_difference(offset_vector.index)
if len(unmatched) > 0:
    raise ValueError(
        "Offset and test data do not share the same row labels; "
        "{} unmatched label(s), e.g. {}".format(len(unmatched), list(unmatched[:5]))
    )

estimated_data = test_data.add(offset_vector, axis = 'index')

# Reattach the column labels saved in `header` when the test data was loaded,
# then transpose back so those labels become the row index of the output
estimated_data.columns = header
estimated_data = estimated_data.transpose()

estimated_data

Unnamed: 0,PA0001,PA0002,PA0003,PA0004,PA0005,PA0006,PA0007,PA0008,PA0009,PA0010,...,PA5561,PA5562,PA5563,PA5564,PA5565,PA5566,PA5567,PA5568,PA5569,PA5570
GSM356955.CEL,0.452611,0.474521,0.355888,0.657693,0.235476,0.535417,0.555152,0.369269,0.466073,0.25745,...,0.45192,0.349782,0.422279,0.560079,0.60135,0.682988,0.625504,0.590902,0.495578,0.611328
GSM954576_Nomura_PA01-1_Pae_G1a_.CEL,0.378362,0.449457,0.33655,0.517596,0.110087,0.521554,0.602281,0.172283,0.216343,0.197878,...,0.529324,0.331799,0.430517,0.390584,0.344039,0.306129,0.343396,0.081946,0.163921,0.167468
GSM954578_Nomura_PA01-3_Pae_G1a_.CEL,0.579418,0.474587,0.376479,0.421999,0.137372,0.449872,0.636402,0.12215,0.192225,0.177845,...,0.640131,0.362761,0.569909,0.383985,0.391314,0.363135,0.316152,0.21226,0.18601,0.361947
GSM954579_Nomura_PA01-4_Pae_G1a_.CEL,0.536392,0.493967,0.341953,0.438653,0.159299,0.450096,0.596996,0.158995,0.164393,0.240054,...,0.613852,0.349994,0.565448,0.332412,0.331696,0.351778,0.264343,0.145413,0.165584,0.299842
GSM92182.CEL,0.615994,0.590169,0.411968,0.642157,0.197024,0.375477,0.480997,0.310164,0.467219,0.146843,...,0.565762,0.292682,0.516426,0.473506,0.627288,0.202279,0.311071,0.287141,0.559191,0.721533


In [7]:
# Output estimated gene expression values as a tab-separated file.
# to_csv does not create missing folders, so make sure the destination
# directory exists before writing.
out_dir = os.path.dirname(out_file)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)

estimated_data.to_csv(out_file, sep='\t')