In [1]:
#-------------------------------------------------------------------------------------------------------------------------------
# By Alexandra Lee (July 2018) 
#
# Apply offset vector
#
# In original space: Add offset vector to each sample in the test set condition A to transform the gene expression 
# profile of the test samples to look like the samples are under condition B
#
# In latent space:  Add offset vector to each sample in the encoded test set condition A to transform the gene 
# expression profile of the test samples to look like the samples are under condition B
#-------------------------------------------------------------------------------------------------------------------------------
import os
import pandas as pd
import numpy as np

# Seed NumPy's global random number generator with a fixed value so that any
# NumPy-based randomness in this notebook is repeatable between runs.
from numpy.random import seed

randomState = 123
seed(randomState)

In [2]:
# Arguments for this run.
# Every path is resolved relative to the parent of the current working directory.
base_dir = os.path.dirname(os.getcwd())

# Inputs: the encoded test samples and the offset vector
test_file = os.path.join(
    base_dir, "encoded", "PA1673_full_old", "test_lowest_2layer_10latent_encoded.txt"
)
offset_file = os.path.join(
    base_dir, "data", "PA1673_full_old", "train_offset_2layer_10latent.txt"
)

# Fraction of the offset vector that is added to each sample
# (used as a plain multiplier, so 0.554 applies 55.4% of the offset)
percentage = 0.554

# Output: destination for the shifted samples
out_file = os.path.join(
    base_dir, "encoded", "PA1673_full_old", "estimated_test_mid2_2layer_10latent_encoded.txt"
)

In [3]:
# Load the test samples from the tab-delimited file: the first row supplies the
# column labels and the first column supplies the row index (sample names).
test_data = pd.read_csv(
    test_file,
    sep="\t",
    header=0,
    index_col=0,
)

test_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
120330-10978D_24_(Pae_G1a).CEL,1.492632,0.50935,4.600537,0.0,0.420127,0.575989,0.0,3.453617,4.806202,0.761389
120330-10978D_M1_(Pae_G1a).CEL,1.399847,0.76328,5.651806,0.0,0.084455,0.274443,0.0,4.036065,3.995601,0.87495
120330-10978D_M3_(Pae_G1a).CEL,1.009503,1.274258,6.116884,0.0,0.018261,0.1687,0.0,3.331357,3.921042,0.888572
GSM1027586_062508plcHR.CEL,0.0,2.770251,6.559667,0.247611,0.0,1.45736,0.0,3.013114,4.776258,0.0
GSM1027589_071008gbdR.CEL,0.299402,2.893673,7.522685,0.38938,0.0,0.297789,0.0,3.137895,3.857933,0.117549
GSM1191071_WT-pyr-2.CEL,0.0,0.0,7.887179,0.0,0.345404,0.0,1.246244,0.0,0.0,0.0
GSM1267105_HZI1971_Pae_G1a.CEL,0.0,0.546383,6.117712,4.700137,0.0,0.10152,0.0,2.37865,0.0,1.686517
GSM1267106_HZI1972_Pae_G1a.CEL,0.0,0.45209,6.298626,4.342052,0.0,0.138227,0.0,2.629239,0.0,1.724521
GSM1267107_HZI1973_Pae_G1a.CEL,0.0,0.283211,6.151817,4.41038,0.0,0.143809,0.0,2.722233,0.0,1.726472
GSM1421002_EXdnr_control1.CEL,0.0,4.490083,0.0,0.060323,0.0,3.712572,1.118783,0.081938,0.0,4.513785


In [4]:
# Load the offset vector from the tab-delimited file: the first row supplies the
# column labels and the first column supplies the row index.
# (An earlier version of this step read the file without a header row and
# transposed the result instead.)
offset_data = pd.read_csv(
    offset_file,
    sep="\t",
    header=0,
    index_col=0,
)

offset_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.017114,-1.526186,-0.117406,1.114125,1.279145,-2.609903,2.1283,0.923653,1.830987,-2.569676


In [5]:
# Apply offset
# Shift every sample (row) of test_data by `percentage` times the offset vector.
# The addition is done on the raw arrays, so the offset is matched to the test
# columns by position, not by label. Guard against an offset of the wrong shape,
# which NumPy would otherwise either broadcast or add element-wise without complaint.
n_features = test_data.shape[1]
if offset_data.shape != (1, n_features):
    raise ValueError(
        "offset_data must be a single row with {} columns, got shape {}".format(
            n_features, offset_data.shape
        )
    )

# A (1, n_features) array broadcasts across all rows of the test matrix
scaled_offset = percentage * offset_data.values

# Keep both the sample names and the original column labels on the result
estimated_data = pd.DataFrame(
    test_data.values + scaled_offset,
    index=test_data.index,
    columns=test_data.columns,
)

estimated_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
120330-10978D_24_(Pae_G1a).CEL,1.502113,-0.336158,4.535495,0.617225,1.128773,-0.869897,1.179078,3.965321,5.820569,-0.662211
120330-10978D_M1_(Pae_G1a).CEL,1.409328,-0.082228,5.586763,0.617225,0.793102,-1.171443,1.179078,4.547769,5.009968,-0.54865
120330-10978D_M3_(Pae_G1a).CEL,1.018984,0.428751,6.051841,0.617225,0.726908,-1.277186,1.179078,3.843061,4.935409,-0.535028
GSM1027586_062508plcHR.CEL,0.009481,1.924743,6.494624,0.864837,0.708646,0.011474,1.179078,3.524818,5.790625,-1.4236
GSM1027589_071008gbdR.CEL,0.308882,2.048166,7.457642,1.006605,0.708646,-1.148097,1.179078,3.649598,4.8723,-1.306051
GSM1191071_WT-pyr-2.CEL,0.009481,-0.845507,7.822137,0.617225,1.054051,-1.445886,2.425322,0.511704,1.014367,-1.4236
GSM1267105_HZI1971_Pae_G1a.CEL,0.009481,-0.299124,6.052669,5.317363,0.708646,-1.344366,1.179078,2.890354,1.014367,0.262917
GSM1267106_HZI1972_Pae_G1a.CEL,0.009481,-0.393418,6.233583,4.959278,0.708646,-1.307659,1.179078,3.140943,1.014367,0.300921
GSM1267107_HZI1973_Pae_G1a.CEL,0.009481,-0.562296,6.086774,5.027606,0.708646,-1.302077,1.179078,3.233937,1.014367,0.302872
GSM1421002_EXdnr_control1.CEL,0.009481,3.644576,-0.065043,0.677548,0.708646,2.266686,2.297861,0.593641,1.014367,3.090185


In [6]:
# Write the estimated gene expression values to out_file as tab-delimited text
estimated_data.to_csv(path_or_buf=out_file, sep="\t")