In [1]:
#-------------------------------------------------------------------------------------------------------------------------------
# By Alexandra Lee (July 2018) 
#
# Apply offset vector
#
# In original space: Add offset vector to each sample in the test set condition A to transform the gene expression 
# profile of the test samples to look like the samples are under condition B
#
# In latent space:  Add offset vector to each sample in the encoded test set condition A to transform the gene 
# expression profile of the test samples to look like the samples are under condition B
#-------------------------------------------------------------------------------------------------------------------------------
import os
import pandas as pd
import numpy as np
# Seed NumPy's global random number generator so any random draws are repeatable.
# NOTE(review): none of the cells shown here appear to draw random numbers, so this
# presumably only matters if stochastic code is added later -- confirm before removing.
np.random.seed(123)

In [2]:
# Input/output locations, all resolved against the parent of the current working directory
base_dir = os.path.dirname(os.getcwd())

# Inputs: test samples to be shifted, and the offset vector to shift them by
test_file = os.path.join(base_dir, "encoded", "tybalt_2layer_10_test_control_encoded.txt")
offset_file = os.path.join(base_dir, "data", "train_offset_latent_2layer.txt")

# True  -> the offset is applied in the latent (encoded) space
# False -> the offset is applied in the original gene expression space
latent = True

# Output: where the shifted ("estimated") samples are written
out_file = os.path.join(base_dir, "encoded", "estimated_test_control_encoded_2layer.txt")

In [3]:
# Load the tab-separated test set (first row = column names, first column = row labels)
# and flip it so that the file's rows become the columns of test_data.
test_data = pd.read_table(
    test_file,
    sep='\t',
    header=0,
    index_col=0,
).T

# Keep the column labels of the transposed frame so they can be restored later
header = test_data.columns

# Preview the first five rows
test_data.head()

Unnamed: 0,GSM356955.CEL,GSM954576_Nomura_PA01-1_Pae_G1a_.CEL,GSM954578_Nomura_PA01-3_Pae_G1a_.CEL,GSM954579_Nomura_PA01-4_Pae_G1a_.CEL,GSM92182.CEL
0,1.557081,0.405962,0.245053,0.361329,4.61073
1,0.0,0.0,0.0,0.080056,0.391221
2,0.0,0.749353,0.261679,0.176546,0.050018
3,0.0,5.579066,7.001842,6.583868,6.894863
4,4.812786,3.664676,1.163499,1.589735,0.194969


In [4]:
# Load the offset vector (tab-separated, first column used as the row labels).
# The latent-space file is read with its first row as a header; otherwise the file
# is read as having no header row.
offset_header_row = 0 if latent else None
offset_data = pd.read_table(offset_file, header=offset_header_row, sep='\t', index_col=0)

if latent:
    # Cast the row labels to strings so they match the labels of test_data
    offset_data.index = list(map(str, offset_data.index))

offset_data

Unnamed: 0,0
0,-2.511202
1,-0.39449
2,3.145393
3,0.158878
4,0.643251
5,2.180199
6,-1.946401
7,-1.163344
8,-0.303066
9,-0.996416


In [5]:
# Relabel the offset column and every column of test_data with one common name.
# The real column labels of test_data were saved in `header` beforehand.
common_label = 'gene_exp'
n_columns = test_data.shape[1]

offset_data.columns = [common_label]
test_data.columns = [common_label for _ in range(n_columns)]

test_data

Unnamed: 0,gene_exp,gene_exp.1,gene_exp.2,gene_exp.3,gene_exp.4
0,1.557081,0.405962,0.245053,0.361329,4.61073
1,0.0,0.0,0.0,0.080056,0.391221
2,0.0,0.749353,0.261679,0.176546,0.050018
3,0.0,5.579066,7.001842,6.583868,6.894863
4,4.812786,3.664676,1.163499,1.589735,0.194969
5,0.0,4.965723,5.724838,5.530672,2.03205
6,0.296684,0.0,0.0,0.0,2.15295
7,0.318213,1.020203,2.427797,2.284974,2.695515
8,4.430712,3.017876,2.109365,2.120986,0.0
9,0.455239,0.0,0.0,0.0,3.091622


In [6]:
# Apply offset
#
# Add the offset value of each row (latent feature / gene) to that row in every sample.
# The offset is passed as a Series, not a DataFrame: `axis='index'` only takes effect for
# Series input, where it aligns the Series with the row labels of test_data and broadcasts
# it across all columns. This does not depend on how the columns are currently labelled.
offset_vector = offset_data.iloc[:, 0]

# Fail loudly on a label mismatch: pandas would otherwise align on the union of the
# labels and silently fill the unmatched rows with NaN, which would then be written out.
unmatched_labels = set(test_data.index) ^ set(offset_vector.index)
if unmatched_labels:
    raise ValueError(
        "Row labels of the test data and the offset vector do not match "
        "({} unmatched label(s), e.g. {!r})".format(
            len(unmatched_labels), list(unmatched_labels)[:5]
        )
    )

estimated_data = test_data.add(offset_vector, axis='index')

# Restore the sample names saved in `header`, then flip back to one sample per row
estimated_data.columns = header
estimated_data = estimated_data.transpose()

estimated_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
GSM356955.CEL,-0.95412,-0.39449,3.145393,0.158878,5.456038,2.180199,-1.649717,-0.845131,4.127646,-0.541177
GSM954576_Nomura_PA01-1_Pae_G1a_.CEL,-2.10524,-0.39449,3.894746,5.737945,4.307927,7.145921,-1.946401,-0.143141,2.71481,-0.996416
GSM954578_Nomura_PA01-3_Pae_G1a_.CEL,-2.266149,-0.39449,3.407073,7.160721,1.80675,7.905036,-1.946401,1.264453,1.806299,-0.996416
GSM954579_Nomura_PA01-4_Pae_G1a_.CEL,-2.149872,-0.314433,3.32194,6.742746,2.232987,7.710871,-1.946401,1.12163,1.81792,-0.996416
GSM92182.CEL,2.099529,-0.003269,3.195411,7.053741,0.83822,4.212248,0.206549,1.532171,-0.303066,2.095206


In [7]:
# Write the estimated gene expression values to a tab-separated file
estimated_data.to_csv(path_or_buf=out_file, sep="\t")