In [1]:
#-------------------------------------------------------------------------------------------------------------------------------
# By Alexandra Lee (July 2018) 
#
# Apply offset vector
#
# In original space: Add offset vector to each sample in the test set condition A to transform the gene expression 
# profile of the test samples to look like the samples are under condition B
#
# In latent space:  Add offset vector to each sample in the encoded test set condition A to transform the gene 
# expression profile of the test samples to look like the samples are under condition B
#-------------------------------------------------------------------------------------------------------------------------------
import os
import pandas as pd
import numpy as np

randomState = 123
from numpy.random import seed
seed(randomState)

In [2]:
# load arguments
test_file = os.path.join(os.path.dirname(os.getcwd()), "data", "cipro_treatment", "test_control.txt")
offset_file = os.path.join(os.path.dirname(os.getcwd()), "data", "cipro_treatment", "train_offset_original.txt")

# output files
out_file = os.path.join(os.path.dirname(os.getcwd()), "output", "cipro_treatment", "estimated_test_control_original.txt")

In [3]:
# read in data
test_data = pd.read_table(test_file, header = 0, sep = '\t', index_col = 0)

test_data

Unnamed: 0,PA0001,PA0002,PA0003,PA0004,PA0005,PA0006,PA0007,PA0008,PA0009,PA0010,...,PA5561,PA5562,PA5563,PA5564,PA5565,PA5566,PA5567,PA5568,PA5569,PA5570
GSM356955.CEL,0.523823,0.594183,0.376242,0.74709,0.439185,0.466252,0.493666,0.543735,0.622464,0.244651,...,0.313673,0.56457,0.48715,0.701037,0.635983,0.599291,0.714874,0.796487,0.674756,0.638105
GSM954576_Nomura_PA01-1_Pae_G1a_.CEL,0.449574,0.569119,0.356904,0.606993,0.313796,0.452389,0.540796,0.346749,0.372734,0.185079,...,0.391077,0.546586,0.495389,0.531542,0.378673,0.222432,0.432766,0.287531,0.343099,0.194245
GSM954578_Nomura_PA01-3_Pae_G1a_.CEL,0.650631,0.594249,0.396833,0.511396,0.341081,0.380707,0.574916,0.296616,0.348616,0.165047,...,0.501884,0.577548,0.634781,0.524943,0.425947,0.279438,0.405522,0.417845,0.365188,0.388724
GSM954579_Nomura_PA01-4_Pae_G1a_.CEL,0.607604,0.613629,0.362308,0.528051,0.363008,0.380931,0.53551,0.333461,0.320784,0.227256,...,0.475606,0.564781,0.63032,0.47337,0.36633,0.268081,0.353713,0.350998,0.344761,0.326619
GSM92182.CEL,0.687206,0.709832,0.432323,0.731554,0.400733,0.306312,0.419512,0.48463,0.62361,0.134044,...,0.427516,0.50747,0.581297,0.614464,0.661921,0.118582,0.400441,0.492726,0.738369,0.74831


In [4]:
# read offset
offset_data = pd.read_table(offset_file, header = 0, sep = '\t', index_col = 0)

offset_data

Unnamed: 0,PA0001,PA0002,PA0003,PA0004,PA0005,PA0006,PA0007,PA0008,PA0009,PA0010,...,PA5561,PA5562,PA5563,PA5564,PA5565,PA5566,PA5567,PA5568,PA5569,PA5570
0,-0.099782,-0.121703,-0.037494,-0.043734,-0.216924,0.072742,0.082047,-0.161538,-0.160335,0.014055,...,0.059468,-0.17019,-0.124185,-0.108054,0.036396,0.100216,-0.005987,-0.147975,-0.177429,-0.075753


In [5]:
# Apply offset
estimated_data = test_data.values + offset_data.values
estimated_data = pd.DataFrame(estimated_data, index = test_data.index)

estimated_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5539,5540,5541,5542,5543,5544,5545,5546,5547,5548
GSM356955.CEL,0.424041,0.47248,0.338748,0.703356,0.222261,0.538994,0.575714,0.382197,0.462129,0.258707,...,0.373141,0.39438,0.362965,0.592983,0.67238,0.699508,0.708887,0.648512,0.497327,0.562353
GSM954576_Nomura_PA01-1_Pae_G1a_.CEL,0.349791,0.447416,0.31941,0.563259,0.096872,0.525131,0.622843,0.185211,0.212399,0.199135,...,0.450545,0.376397,0.371204,0.423488,0.415069,0.322648,0.426779,0.139555,0.16567,0.118492
GSM954578_Nomura_PA01-3_Pae_G1a_.CEL,0.550848,0.472546,0.359339,0.467662,0.124157,0.453449,0.656964,0.135078,0.188281,0.179102,...,0.561352,0.407358,0.510595,0.416889,0.462344,0.379654,0.399536,0.26987,0.187759,0.312971
GSM954579_Nomura_PA01-4_Pae_G1a_.CEL,0.507822,0.491927,0.324813,0.484316,0.146084,0.453674,0.617558,0.171924,0.160449,0.241311,...,0.535073,0.394591,0.506135,0.365316,0.402726,0.368297,0.347726,0.203023,0.167332,0.250866
GSM92182.CEL,0.587424,0.588129,0.394828,0.68782,0.183809,0.379054,0.501559,0.323092,0.463275,0.1481,...,0.486983,0.33728,0.457112,0.506411,0.698317,0.218798,0.394454,0.344751,0.56094,0.672557


In [6]:
# Output estimated gene experession values
estimated_data.to_csv(out_file, sep='\t')