In [1]:
#-------------------------------------------------------------------------------------------------------------------------------
# By Alexandra Lee (July 2018) 
#
# Generate input datasets
# Use map_file to group samples into phenotype groups (condition A and B) based on experimental design annotations
# Then group samples into training and test sets
#
# Generate offset vector using gene expression data in the original space (train_offset_original):
# average gene expression for condition A - average gene expression for condition B using all genes/dimensions
#-------------------------------------------------------------------------------------------------------------------------------
import os
import pandas as pd
import numpy as np
# Seed NumPy's global random number generator so that any NumPy-based random
# operations run after this point are repeatable across kernel restarts.
np.random.seed(123)

In [2]:
# load arguments
# Parent of the current working directory; the "metadata" folder is resolved
# relative to it.
base_dir = os.path.dirname(os.getcwd())

# Normalized gene expression file (tab-delimited .pcl).
# The default is the original author's local copy. To run on another machine,
# set the PSEUDOMONAS_DATA_FILE environment variable to the location of an
# unzipped copy instead of editing this cell. The copy shipped in the repo at
# <base_dir>/data/all-pseudomonas-gene-normalized.pcl is zipped, so it must be
# extracted before it can be used here.
data_file = os.environ.get(
    'PSEUDOMONAS_DATA_FILE',
    'C:/Users/alexj/Documents/UPenn/CGreene/Pseudomonas_scratch/data/all-pseudomonas-gene-normalized.pcl',
)

# Sample annotation table (experiment, sample ID, phenotype group, train/test).
map_file = os.path.join(base_dir, "metadata", "mapping_phosphate.txt")

In [3]:
# read in data
# Parse the tab-delimited expression file: the first row supplies the column
# names and the first column becomes the row index.
data = pd.read_csv(data_file, sep='\t', header=0, index_col=0)

# Flip the table so that each row is a sample and each column is a gene.
X = data.T
X.head()

Gene_symbol,PA0001,PA0002,PA0003,PA0004,PA0005,PA0006,PA0007,PA0008,PA0009,PA0010,...,PA5561,PA5562,PA5563,PA5564,PA5565,PA5566,PA5567,PA5568,PA5569,PA5570
0.1_12hr_CSV86(Pae_G1a).CEL,0.472897,0.396658,0.253776,0.0,0.17564,0.554385,0.41137,0.382222,0.310144,0.642522,...,0.358597,0.390048,0.457406,0.684082,0.338351,0.608325,0.643496,0.276075,0.112773,0.14517
0.1_2hr_CSV86(Pae_G1a).CEL,0.262346,0.086216,0.359853,0.439214,0.269749,0.768433,0.212505,0.062043,0.567695,0.467073,...,0.358504,0.414206,0.389879,0.477693,0.0,0.479385,0.154471,0.140891,0.167505,0.15706
0.1_6hr_CSV86(Pae_G1a).CEL,0.473658,0.244862,0.33075,0.097697,0.387226,0.328319,0.22882,0.330039,0.318081,0.512864,...,0.180744,0.380741,0.173501,0.251571,0.182793,0.528301,0.504985,0.499782,0.061106,0.365612
0.1_7hr_CSV86(Pae_G1a).CEL,0.439273,0.343402,0.192698,0.274677,0.628979,0.553796,0.431391,0.36348,0.385721,0.094584,...,0.346837,0.153927,0.067349,0.319723,0.282442,0.490655,0.531415,0.15388,0.132333,0.260087
0.1_9hr_CSV86(Pae_G1a).CEL,0.220827,0.145525,0.437803,0.293201,0.63512,0.462893,0.488733,0.309584,0.318646,0.591914,...,0.237726,0.301945,0.070222,0.513605,0.114277,0.360259,0.386868,0.223995,0.105343,0.102088


In [4]:
# read in metadata file containing grouping of each sample into training/test and phenotypic group
# Tab-delimited; the first row supplies the column names and no column is used
# as the index, so rows get a default integer index.
grp = pd.read_csv(map_file, sep='\t', header=0)
grp.head()

Unnamed: 0,Experiment ID,Sample ID,Group,Dataset
0,E-GEOD-30967,GSM767700.CEL,A,Train
1,E-GEOD-30967,GSM767701.CEL,A,Train
2,E-GEOD-30967,GSM767704.CEL,B,Train
3,E-GEOD-30967,GSM767705.CEL,B,Train
4,E-GEOD-30967,GSM767702.CEL,A,Test


In [5]:
# Group samples into condition A and B based on mapping file provided

# ***Group samples into training and test sets using the 'Dataset' column of the mapping file***
def split_samples(expression, mapping):
    """Partition the rows of ``expression`` into train/test x A/B sample sets.

    Parameters
    ----------
    expression : pandas.DataFrame
        Expression table indexed by sample name (one row per sample).
    mapping : pandas.DataFrame
        Annotation table with 'Sample ID', 'Group' and 'Dataset' columns.

    Returns
    -------
    dict
        Maps ('Train' | 'Test', 'A' | 'B') to a DataFrame holding the matching
        rows of ``expression``, in the order the samples appear in ``mapping``.
        A set that receives no mapping rows is an empty frame that keeps the
        columns of ``expression``.
    """
    selected = {
        ('Train', 'A'): [],
        ('Train', 'B'): [],
        ('Test', 'A'): [],
        ('Test', 'B'): [],
    }

    for _, row in mapping.iterrows():
        # Any 'Dataset' value other than 'Train' is treated as test, and any
        # 'Group' value other than 'A' is treated as B.
        split = 'Train' if row['Dataset'] == 'Train' else 'Test'
        group = 'A' if row['Group'] == 'A' else 'B'
        sample = str(row['Sample ID'])

        # Literal prefix match on the sample name. A regex match would treat
        # characters such as '.', '(' and ')' in CEL file names as
        # metacharacters and could select the wrong rows.
        is_sample = expression.index.str.startswith(sample)
        selected[(split, group)].append(expression[is_sample])

    # Concatenate once per set rather than growing a frame row by row:
    # DataFrame.append is removed in pandas 2.0 and repeated appends are quadratic.
    return {
        key: pd.concat(frames) if frames else expression.iloc[0:0]
        for key, frames in selected.items()
    }


sample_sets = split_samples(X, grp)
train_A = sample_sets[('Train', 'A')]
train_B = sample_sets[('Train', 'B')]
test_A = sample_sets[('Test', 'A')]
test_B = sample_sets[('Test', 'B')]
train_A.head(5)

Gene_symbol,PA0001,PA0002,PA0003,PA0004,PA0005,PA0006,PA0007,PA0008,PA0009,PA0010,...,PA5561,PA5562,PA5563,PA5564,PA5565,PA5566,PA5567,PA5568,PA5569,PA5570
GSM767700.CEL,0.51003,0.62291,0.273744,0.489565,0.352052,0.348162,0.64842,0.348422,0.185635,0.169545,...,0.356518,0.496107,0.557342,0.422293,0.628904,0.163315,0.392724,0.504344,0.562259,0.669543
GSM767701.CEL,0.538774,0.629058,0.225741,0.46533,0.346793,0.298327,0.611559,0.362063,0.176506,0.157606,...,0.368671,0.470055,0.590991,0.385859,0.635533,0.20078,0.413012,0.539918,0.592608,0.685391


In [6]:
# Per-gene mean expression over the training samples of each condition
# (axis=0 averages down the rows, giving one value per column).
train_A_mean, train_B_mean = (
    samples.mean(axis=0) for samples in (train_A, train_B)
)

# Offset vector in the original gene space: mean(condition A) - mean(condition B)
train_offset_original = train_A_mean.sub(train_B_mean)

In [7]:
# Output training and test sets
# Everything is written as tab-delimited text into the "data" folder that sits
# in the parent of the current working directory.
output_dir = os.path.join(os.path.dirname(os.getcwd()), "data")

# (file name, table) pairs, written in this order; the offset vector goes last.
# NOTE(review): train_offset_original is a Series, and the default `header`
# behaviour of Series.to_csv differs between pandas versions — confirm the
# downstream reader expects the layout produced by the installed version.
outputs = [
    ("train_A.txt", train_A),
    ("train_B.txt", train_B),
    ("test_A.txt", test_A),
    ("test_B.txt", test_B),
    ("train_offset_original.txt", train_offset_original),
]

for file_name, table in outputs:
    table.to_csv(os.path.join(output_dir, file_name), sep='\t')