# Create compression-only dataset
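This notebook creates a "compression-only" version of the template experiment: the original expression data is only passed through the trained VAE (encoded into the latent space and decoded back), with no other manipulation. The compressed expression data is saved for use in a heatmap plot, and limma differential expression statistics are computed for both the compression-only data and the original template experiment.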

In [1]:
%load_ext autoreload
%load_ext rpy2.ipython
%autoreload 2

import os
import glob
import pandas as pd
from sklearn import preprocessing
from keras.models import load_model
from ponyo import utils
from rpy2.robjects import pandas2ri
pandas2ri.activate()

Using TensorFlow backend.


In [2]:
# Locate the experiment config file relative to the current working directory, then parse it
config_path_parts = (os.getcwd(), "../configs", "config_Pa_experiment_limma.tsv")
config_file = os.path.abspath(os.path.join(*config_path_parts))
params = utils.read_config(config_file)

In [3]:
# Pull the settings this notebook needs out of the parsed config
local_dir, dataset_name, NN_architecture = (
    params[key] for key in ("local_dir", "dataset_name", "NN_architecture")
)

# Identifier of the template experiment; used to build the file names below
experiment_id = "E-GEOD-51409"

# Parent of the current working directory
base_dir = os.path.abspath(os.path.join(os.getcwd(), "../"))

In [4]:
# Input files
# Expression data for the full original compendium
original_compendium_file = os.path.join(local_dir, "input", "Pa_compendium_02.22.2014.pcl")

# Expression data for the selected template experiment
selected_original_data_file = os.path.join(
    local_dir,
    "pseudo_experiment",
    "selected_original_data_{}_example.txt".format(experiment_id),
)

# Load VAE encoder and decoder models
NN_dir = os.path.join(base_dir, dataset_name, "models", NN_architecture)


def _find_model_file(suffix):
    """Return the path of the saved file in `NN_dir` whose name ends with `suffix`.

    The first match reported by `glob.glob` is returned.

    Raises:
        FileNotFoundError: if no file in `NN_dir` matches, so that a missing
            model is reported with the pattern that was searched instead of
            an opaque `IndexError: list index out of range`.
    """
    pattern = os.path.join(NN_dir, "*" + suffix)
    matches = glob.glob(pattern)
    if not matches:
        raise FileNotFoundError(
            "No file matching '{}' was found".format(pattern)
        )
    return matches[0]


model_encoder_file = _find_model_file("_encoder_model.h5")
weights_encoder_file = _find_model_file("_encoder_weights.h5")
model_decoder_file = _find_model_file("_decoder_model.h5")
weights_decoder_file = _find_model_file("_decoder_weights.h5")

# Load each saved model, then load its separately saved weights file
loaded_model = load_model(model_encoder_file)
loaded_decode_model = load_model(model_decoder_file)

loaded_model.load_weights(weights_encoder_file)
loaded_decode_model.load_weights(weights_decoder_file)

Instructions for updating:
Colocations handled automatically by placer.




In [5]:
# Output files
# Expression data of the template experiment after passing through the VAE
selected_compressed_data_file = os.path.join(
    local_dir,
    "pseudo_experiment",
    "selected_compressed_only_data_{}_example.txt".format(experiment_id),
)

# DE statistics computed on the compressed-only data
DE_stats_compressed_only_file = os.path.join(
    local_dir,
    "pseudo_experiment",
    "output_original",
    "DE_stats_compressed_only_data_{}_example.txt".format(experiment_id),
)

# DE statistics computed on the original template experiment
DE_stats_original_file = os.path.join(
    local_dir,
    "pseudo_experiment",
    "output_original",
    "DE_stats_original_data_{}_example.txt".format(experiment_id),
)

## Normalize expression data

In [6]:
# Read compendium
# The table is transposed after loading so that rows are samples and columns are genes
original_compendium = pd.read_csv(
    original_compendium_file, sep="\t", header=0, index_col=0
).transpose()

print(original_compendium.shape)
original_compendium.head()

(950, 5549)


Unnamed: 0,PA0001,PA0002,PA0003,PA0004,PA0005,PA0006,PA0007,PA0008,PA0009,PA0010,...,PA5561,PA5562,PA5563,PA5564,PA5565,PA5566,PA5567,PA5568,PA5569,PA5570
05_PA14000-4-2_5-10-07_S2.CEL,9.62009,10.575783,9.296287,9.870074,8.512268,7.903954,7.039473,10.209826,9.784684,5.485688,...,7.740609,9.730384,10.516061,10.639916,9.746849,5.768592,9.224442,11.512176,12.529719,11.804896
54375-4-05.CEL,9.327996,10.781977,9.169988,10.269239,7.237999,7.663758,6.855194,9.631573,9.404465,5.684067,...,7.127736,9.687607,10.199612,9.457152,9.318372,5.523898,7.911031,10.828271,11.597643,11.26852
AKGlu_plus_nt_7-8-09_s1.CEL,9.368599,10.596248,9.714517,9.487155,7.804147,7.681754,6.714411,9.497601,9.523126,5.766331,...,7.343241,9.717993,10.419979,10.164667,10.305005,5.806817,8.57573,10.85825,12.255953,11.309662
anaerobic_NO3_1.CEL,9.083292,9.89705,8.068471,7.310218,6.723634,7.141148,8.492302,7.740717,7.640251,5.267993,...,7.37474,8.287819,9.437053,8.936576,9.418147,5.956482,7.481406,7.687985,9.205525,9.395773
anaerobic_NO3_2.CEL,8.854901,9.931392,8.167126,7.526595,6.864015,7.154523,8.492109,7.716687,7.268094,5.427256,...,7.425398,8.588969,9.313851,8.684602,9.272818,5.729479,7.699086,7.414436,9.363494,9.424762


In [7]:
# Read template experiment (already stored as sample x gene, so no transpose is needed)
original_template = pd.read_csv(
    selected_original_data_file, sep="\t", header=0, index_col=0
)

print(original_template.shape)
original_template.head()

(6, 5549)


Unnamed: 0,PA0001,PA0002,PA0003,PA0004,PA0005,PA0006,PA0007,PA0008,PA0009,PA0010,...,PA5561,PA5562,PA5563,PA5564,PA5565,PA5566,PA5567,PA5568,PA5569,PA5570
GSM1244967_PAO1-22-replicate-01.CEL,9.049,9.927,8.886,8.805,5.988,7.764,8.35,7.774,7.774,5.511,...,9.356,7.378,9.645,7.429,7.655,6.132,6.241,7.684,9.073,10.695
GSM1244968_PAO1-22-replicate-02.CEL,8.833,9.917,9.01,8.901,6.096,7.749,8.32,8.301,7.601,5.707,...,9.486,6.894,9.723,7.789,7.605,6.231,6.303,7.815,8.896,10.714
GSM1244969_PAO1-22-replicate-03.CEL,8.885,9.907,8.738,8.629,6.27,7.989,8.305,7.954,7.695,5.68,...,9.08,7.297,9.656,7.346,7.659,5.703,6.39,7.864,9.196,10.672
GSM1244970_PAO1-37-replicate-01.CEL,8.778,9.872,8.755,8.662,7.213,8.426,8.67,8.653,7.738,5.695,...,9.153,7.928,9.573,8.285,8.581,5.96,6.266,7.824,10.908,12.177
GSM1244971_PAO1-37-replicate-02.CEL,9.061,9.828,8.342,8.842,6.466,7.97,8.432,8.227,7.877,5.785,...,9.337,8.013,9.553,8.591,8.629,5.779,6.831,8.053,10.93,12.152


In [8]:
# Fit a per-gene min-max scaler on the compendium, then map the compendium onto the 0-1 range
scaler = preprocessing.MinMaxScaler().fit(original_compendium)
original_compendium_scaled = scaler.transform(original_compendium)

In [9]:
# Normalize the template experiment with the scaler fitted on the compendium,
# so both datasets share the same per-gene scaling
original_data_scaled = scaler.transform(original_template)

# Re-attach the sample and gene labels that the scaler drops
original_data_scaled_df = pd.DataFrame(
    data=original_data_scaled,
    index=original_template.index,
    columns=original_template.columns,
)

print(original_data_scaled_df.shape)
original_data_scaled_df.head()

(6, 5549)


Unnamed: 0,PA0001,PA0002,PA0003,PA0004,PA0005,PA0006,PA0007,PA0008,PA0009,PA0010,...,PA5561,PA5562,PA5563,PA5564,PA5565,PA5566,PA5567,PA5568,PA5569,PA5570
GSM1244967_PAO1-22-replicate-01.CEL,0.707566,0.591265,0.556941,0.554245,0.108772,0.49136,0.478806,0.349432,0.444937,0.182447,...,0.769591,0.240954,0.631923,0.203227,0.259084,0.24414,0.173356,0.104847,0.341998,0.725569
GSM1244968_PAO1-22-replicate-02.CEL,0.652424,0.5892,0.58223,0.577429,0.133831,0.486795,0.471482,0.466295,0.407078,0.228044,...,0.793953,0.145933,0.646164,0.280591,0.246396,0.265929,0.189921,0.131629,0.3134,0.728212
GSM1244969_PAO1-22-replicate-03.CEL,0.665699,0.587134,0.526757,0.51174,0.174203,0.559844,0.46782,0.389347,0.427648,0.221763,...,0.717869,0.225052,0.633931,0.185391,0.260099,0.14972,0.213166,0.141646,0.36187,0.722368
GSM1244970_PAO1-37-replicate-01.CEL,0.638383,0.579904,0.530224,0.51971,0.393,0.692856,0.556928,0.544351,0.437058,0.225253,...,0.731549,0.348933,0.618777,0.38718,0.494078,0.206284,0.180035,0.133469,0.63847,0.93179
GSM1244971_PAO1-37-replicate-02.CEL,0.710629,0.570815,0.445995,0.563181,0.219679,0.554061,0.498825,0.449885,0.467477,0.24619,...,0.76603,0.36562,0.615125,0.452939,0.506259,0.166447,0.330992,0.180285,0.642025,0.928312


## Encode and decode data

In [10]:
# Pass original data through VAE
# Step 1: encode the normalized template experiment into the latent space
data_encoded = loaded_model.predict_on_batch(original_data_scaled_df)
data_encoded_df = pd.DataFrame(data=data_encoded, index=original_data_scaled_df.index)

# Step 2: decode the latent representation back into (0-1 scaled) gene space
data_decoded = loaded_decode_model.predict_on_batch(data_encoded_df)

# Re-attach sample ids and gene ids to the decoded matrix
vae_data = pd.DataFrame(
    data=data_decoded,
    columns=original_data_scaled_df.columns,
    index=data_encoded_df.index,
)

print(vae_data.shape)
vae_data.head()

(6, 5549)


Unnamed: 0,PA0001,PA0002,PA0003,PA0004,PA0005,PA0006,PA0007,PA0008,PA0009,PA0010,...,PA5561,PA5562,PA5563,PA5564,PA5565,PA5566,PA5567,PA5568,PA5569,PA5570
GSM1244967_PAO1-22-replicate-01.CEL,0.734141,0.598737,0.566224,0.552679,0.22672,0.597481,0.609478,0.441315,0.416802,0.243473,...,0.776762,0.266689,0.627713,0.294586,0.330802,0.217841,0.171984,0.114785,0.470254,0.820475
GSM1244968_PAO1-22-replicate-02.CEL,0.731168,0.59415,0.565731,0.552488,0.230806,0.595234,0.600931,0.438601,0.418153,0.244625,...,0.776605,0.264595,0.62739,0.295795,0.329285,0.222204,0.174291,0.115836,0.470258,0.818813
GSM1244969_PAO1-22-replicate-03.CEL,0.73607,0.600233,0.566864,0.554141,0.22728,0.597275,0.608869,0.448008,0.41988,0.244112,...,0.777392,0.270692,0.628244,0.293663,0.332397,0.215415,0.171846,0.115293,0.469749,0.821917
GSM1244970_PAO1-37-replicate-01.CEL,0.701721,0.63207,0.52484,0.570625,0.256711,0.628166,0.586948,0.490252,0.426348,0.243146,...,0.76785,0.317778,0.635052,0.372441,0.44245,0.212265,0.18982,0.12913,0.575529,0.874276
GSM1244971_PAO1-37-replicate-02.CEL,0.70474,0.635653,0.52642,0.574255,0.252254,0.632168,0.589849,0.493577,0.42957,0.242564,...,0.77481,0.313648,0.638207,0.369951,0.441371,0.206443,0.183555,0.122005,0.572265,0.877861


In [11]:
# Undo the 0-1 normalization so the decoded values are back in the
# original expression range for the DE analysis
vae_data_scaled = scaler.inverse_transform(vae_data)

vae_data_scaled_df = pd.DataFrame(
    data=vae_data_scaled, index=vae_data.index, columns=vae_data.columns
)

In [12]:
# Preview the first rows of the rescaled, VAE-compressed expression data
vae_data_scaled_df.head(n=5)

Unnamed: 0,PA0001,PA0002,PA0003,PA0004,PA0005,PA0006,PA0007,PA0008,PA0009,PA0010,...,PA5561,PA5562,PA5563,PA5564,PA5565,PA5566,PA5567,PA5568,PA5569,PA5570
GSM1244967_PAO1-22-replicate-01.CEL,9.153097,9.963171,8.931516,8.798515,6.496345,8.112652,8.885253,8.188353,7.645434,5.77332,...,9.394267,7.509082,9.621944,7.854126,7.937604,6.012509,6.235865,7.732611,9.866838,11.37704
GSM1244968_PAO1-22-replicate-02.CEL,9.141452,9.940964,8.929097,8.797726,6.513957,8.10527,8.850241,8.176112,7.651608,5.778271,...,9.393431,7.498415,9.620173,7.85975,7.931629,6.032332,6.244499,7.73775,9.866862,11.365098
GSM1244969_PAO1-22-replicate-03.CEL,9.160655,9.970416,8.934652,8.804568,6.498758,8.111978,8.882757,8.218536,7.659501,5.776067,...,9.397628,7.529473,9.624851,7.849832,7.943892,6.001487,6.235348,7.735096,9.863709,11.387399
GSM1244970_PAO1-37-replicate-01.CEL,9.026104,10.124539,8.728597,8.872827,6.625604,8.213467,8.792966,8.409038,7.689057,5.771916,...,9.346709,7.769309,9.66214,8.216413,8.377556,5.987173,6.302623,7.802777,10.51843,11.763679
GSM1244971_PAO1-37-replicate-02.CEL,9.037931,10.141888,8.736346,8.887856,6.606396,8.226615,8.804849,8.424031,7.703783,5.769412,...,9.383851,7.748273,9.679419,8.204826,8.373307,5.960722,6.279173,7.767926,10.498225,11.789443


In [13]:
# Write the compressed-only expression data (tab-delimited) for use in the heatmap plot
# and as input to the DE analysis below
vae_data_scaled_df.to_csv(path_or_buf=selected_compressed_data_file, sep="\t")

## DE analysis

In [14]:
%%R
# One-time setup, intentionally left commented out: uncomment the lines below
# to install limma through Bioconductor if it is not already available.
#if (!requireNamespace("BiocManager", quietly = TRUE))
#  install.packages("BiocManager")

#BiocManager::install("limma")

NULL


In [15]:
%%R
suppressPackageStartupMessages(library("limma"))

In [16]:
# Metadata file passed to the DE analysis (read there with sample ids as row names)
metadata_file = os.path.join(local_dir, "pseudo_experiment", "metadata_deg_temp.txt")

In [17]:
%%R -i metadata_file -i experiment_id -i selected_compressed_data_file -i DE_stats_compressed_only_file -i selected_original_data_file -i DE_stats_original_file
get_DE_stats <- function(metadata_file, 
                         experiment_id, 
                         expression_file,
                         out_file){
    # Run a limma differential expression (DE) analysis on one expression
    # file and write the per-gene statistics to `out_file`.
    #
    # Args:
    #   metadata_file:   tab-delimited file read with its first column as row
    #                    names (sample ids); the first remaining column is
    #                    used as the group label of each sample.
    #   experiment_id:   not used by the body; kept so existing callers that
    #                    pass it keep working.
    #   expression_file: tab-delimited file read with its first column as row
    #                    names; it is transposed after reading because lmFit
    #                    expects a gene x sample matrix.
    #   out_file:        path the tab-delimited table of DE statistics is
    #                    written to.
    #
    # NOTE: the contrast is hard-coded to `group37 - group22`, so the group
    # labels in the metadata must be 22 and 37 (as for E-GEOD-51409).

    # Read in data
    expression_data <- t(as.matrix(read.csv(expression_file, sep="\t", header=TRUE, row.names=1)))
    metadata <- as.matrix(read.csv(metadata_file, sep="\t", header=TRUE, row.names=1))

    # The group labels are matched to samples purely by position, so the
    # metadata rows must be in the same order as the expression columns.
    # all.equal() returns a description string (not FALSE) on mismatch,
    # hence the isTRUE() wrapper.
    print("Checking sample ordering...")
    same_order <- isTRUE(all.equal(colnames(expression_data), rownames(metadata)))
    print(same_order)
    if (!same_order) {
        stop("Sample order in '", expression_file,
             "' does not match the rows of '", metadata_file,
             "'; group labels would be assigned to the wrong samples.")
    }

    group <- interaction(metadata[,1])

    # One column per group, no intercept
    mm <- model.matrix(~0 + group)

    # lmFit expects input array to have structure: gene x sample
    # lmFit fits a linear model for each gene
    fit <- lmFit(expression_data, mm)

    # Comparisons between groups (log fold-changes) are obtained as contrasts
    # of the fitted linear models.
    # For experiment E-GEOD-51409, we are comparing the expression profile
    # of samples grown in 37 degrees versus those grown in 22 degrees
    contr <- makeContrasts(group37 - group22, levels = colnames(coef(fit)))

    # Estimate contrast for each gene
    contrast_fit <- contrasts.fit(fit, contr)

    # Empirical Bayes smoothing of standard errors (shrinks standard errors
    # that are much larger or smaller than those from other genes towards
    # the average standard error)
    contrast_fit <- eBayes(contrast_fit)

    # Statistics for every gene (n = Inf), sorted by p-value;
    # topTable already returns a data frame
    all_genes <- topTable(contrast_fit, sort.by = "P", n = Inf)

    # Save summary statistics of DEGs
    write.table(all_genes, file = out_file, row.names = TRUE, sep = "\t", quote = FALSE)
}

# DE statistics for the VAE compressed-only data
get_DE_stats(metadata_file = metadata_file,
             experiment_id = experiment_id,
             expression_file = selected_compressed_data_file,
             out_file = DE_stats_compressed_only_file)

# DE statistics for the original template experiment
get_DE_stats(metadata_file = metadata_file,
             experiment_id = experiment_id,
             expression_file = selected_original_data_file,
             out_file = DE_stats_original_file)

[1] "Checking sample ordering..."
[1] TRUE
[1] "Checking sample ordering..."
[1] TRUE
