# Process Pseudomonas data
This notebook does the following:

1. Selects template experiment from the Pseudomonas compendium
2. Normalizes the Pseudomonas compendium
3. Trains a VAE on the normalized data

In [1]:
%load_ext autoreload
%autoreload 2

import os
import sys
import pandas as pd
import numpy as np
from sklearn import preprocessing
import pickle

from ponyo import utils, train_vae_modules, simulate_expression_data
from generic_expression_patterns_modules import process, calc

# Seed NumPy's global random number generator so that any NumPy-based
# randomness triggered from this notebook is repeatable across runs
np.random.seed(123)

Using TensorFlow backend.


In [2]:
# Locate and parse the configuration file for this analysis.
# The base directory is the parent of the current working directory.
base_dir = os.path.abspath(os.path.join(os.getcwd(), "../"))

config_file = os.path.abspath(
    os.path.join(base_dir, "configs", "config_pseudomonas_1183.tsv")
)
params = utils.read_config(config_file)

In [3]:
# Unpack the configuration values used in this notebook

# General settings
local_dir = params["local_dir"]
dataset_name = params["dataset_name"]
NN_architecture = params["NN_architecture"]

# Template experiment selection
project_id = params["project_id"]
metadata_colname = params["metadata_colname"]

# Input/output file paths
template_data_file = params["template_data_file"]
original_compendium_file = params["compendium_data_file"]
normalized_data_file = params["normalized_compendium_data_file"]
shared_genes_file = params["shared_genes_file"]
scaler_file = params["scaler_transform_file"]

### Download Pseudomonas compendium
The compendium is downloaded from https://raw.githubusercontent.com/greenelab/adage/master/Data_collection_processing/Pa_compendium_02.22.2014.pcl

In [4]:
# Read compendium
# Expected layout once loaded: one row per sample, one column per gene
EXPECTED_COMPENDIUM_SHAPE = (950, 5549)

original_compendium = pd.read_csv(
    original_compendium_file, header=0, index_col=0, sep="\t"
)

# If the file is stored in the opposite orientation, transpose it so that
# rows are samples and columns are genes.
# NOTE(review): presumably the raw download is gene x sample -- confirm.
if original_compendium.shape != EXPECTED_COMPENDIUM_SHAPE:
    original_compendium = original_compendium.T

# Fail loudly (with the offending shape) if the data is not what we expect
assert original_compendium.shape == EXPECTED_COMPENDIUM_SHAPE, (
    "Unexpected compendium shape {}; expected {}".format(
        original_compendium.shape, EXPECTED_COMPENDIUM_SHAPE
    )
)

print(original_compendium.shape)
original_compendium.head()

(950, 5549)


Unnamed: 0,PA0001,PA0002,PA0003,PA0004,PA0005,PA0006,PA0007,PA0008,PA0009,PA0010,...,PA5561,PA5562,PA5563,PA5564,PA5565,PA5566,PA5567,PA5568,PA5569,PA5570
05_PA14000-4-2_5-10-07_S2.CEL,9.62,10.576,9.296,9.87,8.512,7.904,7.039,10.21,9.785,5.486,...,7.741,9.73,10.516,10.64,9.747,5.769,9.224,11.512,12.53,11.805
54375-4-05.CEL,9.328,10.782,9.17,10.269,7.238,7.664,6.855,9.632,9.404,5.684,...,7.128,9.688,10.2,9.457,9.318,5.524,7.911,10.828,11.598,11.269
AKGlu_plus_nt_7-8-09_s1.CEL,9.369,10.596,9.715,9.487,7.804,7.682,6.714,9.498,9.523,5.766,...,7.343,9.718,10.42,10.165,10.305,5.807,8.576,10.858,12.256,11.31
anaerobic_NO3_1.CEL,9.083,9.897,8.068,7.31,6.724,7.141,8.492,7.741,7.64,5.268,...,7.375,8.288,9.437,8.937,9.418,5.956,7.481,7.688,9.206,9.396
anaerobic_NO3_2.CEL,8.855,9.931,8.167,7.527,6.864,7.155,8.492,7.717,7.268,5.427,...,7.425,8.589,9.314,8.685,9.273,5.729,7.699,7.414,9.363,9.425


### Select template experiment

We manually selected bioproject [E-GEOD-9989](https://www.ebi.ac.uk/arrayexpress/experiments/E-GEOD-9989/?query=George+O%27Toole), which contains 2 samples (3 replicates each) of PA14 WT that are grown on CFBE41o- cells and are either treated with tobramycin or left untreated.

Another bioproject we selected is [E-MEXP-1183](https://www.ebi.ac.uk/arrayexpress/experiments/E-MEXP-1183/), which contains a total of 10 samples. For now, we use only the 4 WT samples that measure the effect of the acyl-HSL signal.

In [5]:
# Look up the ids of the samples that belong to the template experiment;
# they are used below as row labels into the compendium.
# NOTE(review): how `dataset_name` and `metadata_colname` are used is internal
# to ponyo's `get_sample_ids` -- confirm against its documentation.
sample_ids = simulate_expression_data.get_sample_ids(project_id, dataset_name, metadata_colname)

In [6]:
# Get samples from experiment id
# Select the compendium rows whose labels are in `sample_ids`; these rows
# form the template experiment. A missing label raises KeyError here.
template_data = original_compendium.loc[sample_ids]
print(template_data.shape)
# Last expression in the cell, so the notebook renders a preview of the frame
template_data.head()

(10, 5549)


Unnamed: 0,PA0001,PA0002,PA0003,PA0004,PA0005,PA0006,PA0007,PA0008,PA0009,PA0010,...,PA5561,PA5562,PA5563,PA5564,PA5565,PA5566,PA5567,PA5568,PA5569,PA5570
MSC_01.CEL,9.808,11.11,10.192,9.907,8.006,8.187,7.311,9.901,10.122,5.474,...,8.422,9.934,10.339,10.395,10.418,5.587,8.364,11.088,12.213,11.737
MSC_02.CEL,9.7,10.598,9.895,9.755,8.289,7.961,7.312,9.991,9.756,5.469,...,7.92,9.753,10.304,10.428,10.152,5.689,7.92,11.233,12.361,11.975
MSC_03.CEL,9.644,10.894,9.862,9.85,8.13,7.771,7.401,10.438,9.913,5.582,...,7.181,9.925,10.682,10.743,10.18,5.902,8.023,11.504,12.246,11.355
MSC_04.CEL,9.652,10.793,9.724,9.889,8.144,7.834,7.104,9.967,9.78,5.466,...,7.507,9.788,10.046,10.336,9.838,5.585,7.845,11.093,12.182,11.73
MSC_05.CEL,9.757,10.658,9.536,9.652,8.292,8.028,7.309,10.247,9.901,5.466,...,7.948,9.779,10.264,10.444,10.025,5.613,7.886,11.417,12.196,11.911


In [7]:
if project_id == "E-MEXP-1183":
    # This experiment has 10 samples; keep only the 4 of interest by
    # dropping the other 6.
    sample_ids_to_drop = [
        "MSC_05.CEL",
        "MSC_06.CEL",
        "MSC_07.CEL",
        "MSC_08.CEL",
        "MSC_09.CEL",
        "MSC_10.CEL",
    ]

    # errors="ignore" keeps this cell safe to re-run: on a second execution
    # the labels are already gone and a plain drop() would raise KeyError.
    # The row-count assertion below still catches a wrong or misspelled id.
    template_data = template_data.drop(sample_ids_to_drop, errors="ignore")

    assert template_data.shape[0] == 4, (
        "Expected 4 template samples after dropping, found {}".format(
            template_data.shape[0]
        )
    )

Note: We are training the VAE on the full compendium, i.e. using all the samples (including those that are being dropped from the template experiment). However, only the subset of samples kept in the template experiment is used in the DE analysis, in order to ensure that the samples being compared have consistent backgrounds.

So there is an inconsistency between the samples used to learn the low-dimensional representation and those used to calculate DE statistics. This inconsistency could affect the DE statistics if the low-dimensional space is significantly different when these extra samples are included versus excluded. However, these few samples will likely not affect the space.

### Normalize compendium 

In [8]:
# Rescale each gene (column) to the 0-1 range across all samples
scaler = preprocessing.MinMaxScaler()
original_data_scaled = scaler.fit_transform(original_compendium)

# The scaled values come back without labels, so re-attach the original
# sample (row) and gene (column) labels
original_data_scaled_df = pd.DataFrame(
    original_data_scaled,
    index=original_compendium.index,
    columns=original_compendium.columns,
)

print(original_data_scaled_df.shape)
original_data_scaled_df.head()

(950, 5549)


Unnamed: 0,PA0001,PA0002,PA0003,PA0004,PA0005,PA0006,PA0007,PA0008,PA0009,PA0010,...,PA5561,PA5562,PA5563,PA5564,PA5565,PA5566,PA5567,PA5568,PA5569,PA5570
05_PA14000-4-2_5-10-07_S2.CEL,0.853241,0.725263,0.640628,0.811398,0.694432,0.533942,0.158691,0.889579,0.885095,0.176594,...,0.466929,0.702729,0.790944,0.89321,0.789901,0.164209,0.970345,0.887367,0.900485,0.880045
54375-4-05.CEL,0.778714,0.767817,0.61493,0.907752,0.39884,0.460883,0.11377,0.761419,0.801707,0.222662,...,0.35207,0.694483,0.733248,0.63902,0.681045,0.11028,0.619557,0.747547,0.749919,0.805455
AKGlu_plus_nt_7-8-09_s1.CEL,0.789178,0.729395,0.726086,0.718908,0.530162,0.466362,0.079346,0.731707,0.827752,0.24174,...,0.392355,0.700373,0.773416,0.791147,0.931489,0.172573,0.797221,0.753679,0.85622,0.811161
anaerobic_NO3_1.CEL,0.716182,0.585003,0.390169,0.19319,0.279582,0.301674,0.513428,0.342129,0.415627,0.125872,...,0.398351,0.419596,0.593938,0.527288,0.70642,0.205371,0.504675,0.105683,0.363489,0.544809
anaerobic_NO3_2.CEL,0.657989,0.592026,0.410361,0.245593,0.312065,0.305936,0.513428,0.336807,0.334209,0.162866,...,0.40772,0.478696,0.571481,0.473141,0.669627,0.155404,0.562917,0.049673,0.388853,0.548845


### Save data files

In [9]:
# Save data
# NOTE: the compendium is written back to the same path it was read from, so
# the input file is overwritten with the in-memory (sample x gene) table,
# rounded to 3 decimal places.
original_compendium.to_csv(
    original_compendium_file, float_format="%.3f", sep="\t")

template_data.to_csv(template_data_file, float_format="%.5f", sep="\t")

original_data_scaled_df.to_csv(
    normalized_data_file, float_format="%.3f", sep="\t")

# Save scaler transform
# `with` guarantees the file is closed even if pickling raises
with open(scaler_file, "wb") as scaler_fh:
    pickle.dump(scaler, scaler_fh)

# Save shared genes
# In this case all genes are used
shared_genes = list(original_compendium.columns)

with open(shared_genes_file, "wb") as shared_genes_fh:
    pickle.dump(shared_genes, shared_genes_fh)

### Train VAE 

In [10]:
# Setup directories
# Top-level VAE output directories
output_dirs = [
    os.path.join(base_dir, dataset_name, "models"),
    os.path.join(base_dir, dataset_name, "logs"),
]

# One subdirectory per output directory for the chosen NN architecture
architecture_dirs = [
    os.path.join(each_dir, NN_architecture) for each_dir in output_dirs
]

# Create whatever does not exist yet, parents before children, reporting each
# directory that is created. exist_ok=True guards against the directory
# appearing between the existence check and the makedirs call.
for each_dir in output_dirs + architecture_dirs:
    if not os.path.exists(each_dir):
        print('creating new directory: {}'.format(each_dir))
        os.makedirs(each_dir, exist_ok=True)

In [11]:
# Train VAE on new compendium data
# NOTE: the training call below is commented out, so running this notebook
# as-is does not train a model. Uncomment it to run the training.
#train_vae_modules.train_vae(config_file,
#                            normalized_data_file)