# Create compression-only dataset
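This notebook creates a compression-only version of the template experiment: the expression data is encoded and then decoded by the trained VAE, with no other modification. The compressed experiment is saved for use in a heatmap plot and is analyzed for differential expression with limma.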

In [1]:
%load_ext autoreload
%load_ext rpy2.ipython
%autoreload 2

import os
import glob
import pandas as pd
from sklearn import preprocessing
from keras.models import load_model
from ponyo import utils
from rpy2.robjects import pandas2ri
pandas2ri.activate()

Using TensorFlow backend.


In [2]:
# Locate the experiment configuration relative to the notebook's working
# directory and parse it into `params`, which later cells index by key
config_file = os.path.abspath(
    os.path.join(
        os.getcwd(),
        "../configs",
        "config_Pa_experiment_limma.tsv",
    )
)
params = utils.read_config(config_file)

In [3]:
# Pull the settings this notebook needs out of the parsed config
local_dir = params["local_dir"]
dataset_name = params["dataset_name"]
NN_architecture = params["NN_architecture"]

# Experiment whose expression data is compressed in this notebook
experiment_id = "E-GEOD-51409"

# Parent of the current working directory; model paths are built from it
base_dir = os.path.abspath(os.path.join(os.getcwd(), "../"))

In [4]:
# Input files
# Expression data of the template experiment (read as tab-separated text below)
selected_original_data_file = os.path.join(
    local_dir,
    "pseudo_experiment",
    "selected_original_data_{}_example.txt".format(experiment_id),
)

# Load VAE encoder and decoder models
NN_dir = os.path.join(
    base_dir,
    dataset_name,
    "models",
    NN_architecture)


def _find_model_file(suffix):
    """Return the path of the file in `NN_dir` whose name ends in `suffix`.

    Candidates are sorted so the choice is reproducible when more than one
    file matches the pattern.

    Raises:
        FileNotFoundError: if no file in `NN_dir` matches `*<suffix>`.
    """
    pattern = os.path.join(NN_dir, "*" + suffix)
    matches = sorted(glob.glob(pattern))
    if not matches:
        # A bare `glob(...)[0]` would fail with an uninformative IndexError
        raise FileNotFoundError(
            "No file matching '{}' was found".format(pattern))
    return matches[0]


model_encoder_file = _find_model_file("_encoder_model.h5")
weights_encoder_file = _find_model_file("_encoder_weights.h5")
model_decoder_file = _find_model_file("_decoder_model.h5")
weights_decoder_file = _find_model_file("_decoder_weights.h5")

# Rebuild each network from its saved model file, then load its saved weights
loaded_model = load_model(model_encoder_file)
loaded_decode_model = load_model(model_decoder_file)

loaded_model.load_weights(weights_encoder_file)
loaded_decode_model.load_weights(weights_decoder_file)

Instructions for updating:
Colocations handled automatically by placer.




In [5]:
# Output files
# Expression data after it has been passed through the VAE
selected_compressed_data_file = os.path.join(
    local_dir,
    "pseudo_experiment",
    "selected_compressed_only_data_{}_example.txt".format(experiment_id),
)

# Differential expression statistics computed on the compressed data
DE_stats_compressed_only_file = os.path.join(
    local_dir,
    "pseudo_experiment",
    "output_original",
    "DE_stats_compressed_only_data_{}_example.txt".format(experiment_id),
)

## Normalize expression data

In [6]:
# Load the template experiment's expression data
# (tab-separated; first row holds the column labels, first column the row labels)
original_template = pd.read_csv(
    selected_original_data_file,
    sep="\t",
    header=0,
    index_col=0,
)

print(original_template.shape)
original_template.head()

(6, 5549)


Unnamed: 0,PA0001,PA0002,PA0003,PA0004,PA0005,PA0006,PA0007,PA0008,PA0009,PA0010,...,PA5561,PA5562,PA5563,PA5564,PA5565,PA5566,PA5567,PA5568,PA5569,PA5570
GSM1244967_PAO1-22-replicate-01.CEL,9.049,9.927,8.886,8.805,5.988,7.764,8.35,7.774,7.774,5.511,...,9.356,7.378,9.645,7.429,7.655,6.132,6.241,7.684,9.073,10.695
GSM1244968_PAO1-22-replicate-02.CEL,8.833,9.917,9.01,8.901,6.096,7.749,8.32,8.301,7.601,5.707,...,9.486,6.894,9.723,7.789,7.605,6.231,6.303,7.815,8.896,10.714
GSM1244969_PAO1-22-replicate-03.CEL,8.885,9.907,8.738,8.629,6.27,7.989,8.305,7.954,7.695,5.68,...,9.08,7.297,9.656,7.346,7.659,5.703,6.39,7.864,9.196,10.672
GSM1244970_PAO1-37-replicate-01.CEL,8.778,9.872,8.755,8.662,7.213,8.426,8.67,8.653,7.738,5.695,...,9.153,7.928,9.573,8.285,8.581,5.96,6.266,7.824,10.908,12.177
GSM1244971_PAO1-37-replicate-02.CEL,9.061,9.828,8.342,8.842,6.466,7.97,8.432,8.227,7.877,5.785,...,9.337,8.013,9.553,8.591,8.629,5.779,6.831,8.053,10.93,12.152


In [7]:
# Rescale every gene (column) to the 0-1 range
# NOTE(review): the scaler is fit on this template experiment alone, so each
# gene's min/max come from these samples only -- confirm this matches the
# normalization that was applied to the data the VAE was trained on
scaler = preprocessing.MinMaxScaler()
original_data_scaled = scaler.fit_transform(original_template)

# Re-attach the sample and gene labels that the scaler drops
original_data_scaled_df = pd.DataFrame(
    data=original_data_scaled,
    index=original_template.index,
    columns=original_template.columns,
)

print(original_data_scaled_df.shape)
original_data_scaled_df.head()

(6, 5549)


Unnamed: 0,PA0001,PA0002,PA0003,PA0004,PA0005,PA0006,PA0007,PA0008,PA0009,PA0010,...,PA5561,PA5562,PA5563,PA5564,PA5565,PA5566,PA5567,PA5568,PA5569,PA5570
GSM1244967_PAO1-22-replicate-01.CEL,0.957597,0.292899,0.814371,0.647059,0.0,0.022157,0.094538,0.0,0.849854,0.0,...,0.725159,0.432529,0.753943,0.066667,0.048828,0.809434,0.296782,0.0,0.087021,0.015282
GSM1244968_PAO1-22-replicate-02.CEL,0.194346,0.263314,1.0,1.0,0.088163,0.0,0.031513,0.599545,0.597668,0.715328,...,1.0,0.0,1.0,0.355823,0.0,0.996226,0.370679,0.355014,0.0,0.027907
GSM1244969_PAO1-22-replicate-03.CEL,0.378092,0.233728,0.592814,0.0,0.230204,0.354505,0.0,0.204778,0.734694,0.616788,...,0.141649,0.360143,0.788644,0.0,0.052734,0.0,0.474374,0.487805,0.147493,0.0
GSM1244970_PAO1-37-replicate-01.CEL,0.0,0.130178,0.618263,0.121324,1.0,1.0,0.766807,1.0,0.797376,0.671533,...,0.295983,0.924039,0.526814,0.754217,0.953125,0.484906,0.326579,0.379404,0.989184,1.0
GSM1244971_PAO1-37-replicate-02.CEL,1.0,0.0,0.0,0.783088,0.390204,0.32644,0.266807,0.515358,1.0,1.0,...,0.684989,1.0,0.463722,1.0,1.0,0.143396,1.0,1.0,1.0,0.983389


## Encode and decode data

In [8]:
# Compress the template experiment with the VAE.
# Step 1: encode the scaled samples into the latent space
data_encoded = loaded_model.predict_on_batch(original_data_scaled_df)
data_encoded_df = pd.DataFrame(
    data=data_encoded,
    index=original_data_scaled_df.index,
)

# Step 2: decode the latent representation back into gene space
# (values are still on the scaled range here; they are rescaled in the next cell)
data_decoded = loaded_decode_model.predict_on_batch(data_encoded_df)

# Re-attach sample ids (rows) and gene ids (columns) to the decoded array
vae_data = pd.DataFrame(
    data=data_decoded,
    index=data_encoded_df.index,
    columns=original_data_scaled_df.columns,
)

print(vae_data.shape)
vae_data.head()

(6, 5549)


Unnamed: 0,PA0001,PA0002,PA0003,PA0004,PA0005,PA0006,PA0007,PA0008,PA0009,PA0010,...,PA5561,PA5562,PA5563,PA5564,PA5565,PA5566,PA5567,PA5568,PA5569,PA5570
GSM1244967_PAO1-22-replicate-01.CEL,0.420443,0.389077,0.385479,0.368508,0.330612,0.434929,0.461819,0.333977,0.283591,0.240681,...,0.396966,0.396095,0.422606,0.354316,0.366556,0.373445,0.428717,0.322171,0.413957,0.371422
GSM1244968_PAO1-22-replicate-02.CEL,0.380524,0.290166,0.382522,0.286285,0.363396,0.464386,0.413961,0.342846,0.254519,0.248737,...,0.432642,0.365742,0.365608,0.32634,0.323375,0.464993,0.462104,0.305956,0.34644,0.307586
GSM1244969_PAO1-22-replicate-03.CEL,0.480756,0.499102,0.408941,0.46108,0.336411,0.419291,0.48387,0.347434,0.335924,0.227753,...,0.393407,0.439087,0.481618,0.384788,0.398557,0.317675,0.429422,0.357631,0.469276,0.428998
GSM1244970_PAO1-37-replicate-01.CEL,0.600752,0.547715,0.450149,0.522247,0.45532,0.472994,0.364187,0.512215,0.453983,0.256614,...,0.48672,0.541609,0.56624,0.546081,0.590758,0.313036,0.506269,0.534598,0.703639,0.707507
GSM1244971_PAO1-37-replicate-02.CEL,0.632919,0.597903,0.460818,0.572523,0.459251,0.489724,0.35027,0.545943,0.494845,0.234108,...,0.510917,0.575514,0.592957,0.596281,0.644547,0.261214,0.503706,0.546009,0.727365,0.741654


In [9]:
# Undo the 0-1 scaling so the decoded values are back in the original
# expression range before running the DE analysis
vae_data_scaled = scaler.inverse_transform(vae_data)

# inverse_transform output is wrapped back into a labelled DataFrame
vae_data_scaled_df = pd.DataFrame(
    data=vae_data_scaled,
    index=vae_data.index,
    columns=vae_data.columns,
)

In [10]:
# Preview the first rows of the rescaled, VAE-compressed expression data
vae_data_scaled_df.head(n=5)

Unnamed: 0,PA0001,PA0002,PA0003,PA0004,PA0005,PA0006,PA0007,PA0008,PA0009,PA0010,...,PA5561,PA5562,PA5563,PA5564,PA5565,PA5566,PA5567,PA5568,PA5569,PA5570
GSM1244967_PAO1-22-replicate-01.CEL,8.896985,9.959508,8.5995,8.729235,6.392999,8.043447,8.524826,8.067566,7.385544,5.576946,...,9.200765,7.33723,9.539966,7.787124,7.980353,5.900925,6.351694,7.802881,9.737989,11.23099
GSM1244968_PAO1-22-replicate-02.CEL,8.885688,9.926077,8.597525,8.70687,6.433159,8.06339,8.502046,8.075361,7.3656,5.579154,...,9.21764,7.303265,9.521898,7.752294,7.936136,5.949446,6.379705,7.796898,9.600658,11.134917
GSM1244969_PAO1-22-replicate-03.CEL,8.914054,9.996696,8.615172,8.754414,6.400104,8.03286,8.535322,8.079394,7.421443,5.573405,...,9.199081,7.385339,9.558673,7.825061,8.013123,5.871367,6.352285,7.815966,9.850508,11.317642
GSM1244970_PAO1-37-replicate-01.CEL,8.948012,10.013127,8.642699,8.77105,6.545767,8.069217,8.478354,8.224236,7.502432,5.581312,...,9.243218,7.50006,9.585498,8.025871,8.209936,5.868909,6.41676,7.881267,10.327201,11.736798
GSM1244971_PAO1-37-replicate-02.CEL,8.957116,10.030091,8.649826,8.784726,6.550583,8.080543,8.471728,8.253884,7.530464,5.575145,...,9.254663,7.538,9.593967,8.08837,8.265017,5.841444,6.414609,7.885478,10.375461,11.78819


In [11]:
# Write the compressed expression data to disk as tab-separated text;
# it is used for the heatmap plot and read by the DE analysis below
vae_data_scaled_df.to_csv(path_or_buf=selected_compressed_data_file, sep="\t")

## DE analysis

In [12]:
%%R
# One-time setup: uncomment the lines below to install limma from Bioconductor
#if (!requireNamespace("BiocManager", quietly = TRUE))
#  install.packages("BiocManager")

#BiocManager::install("limma")

NULL


In [13]:
%%R
suppressPackageStartupMessages(library("limma"))

In [14]:
# Sample metadata used by the DE analysis (passed into the R cell below)
metadata_file = os.path.join(local_dir, "pseudo_experiment", "metadata_deg_temp.txt")

In [15]:
%%R -i metadata_file -i experiment_id -i selected_compressed_data_file -i DE_stats_compressed_only_file
# Run a limma differential expression analysis on one expression file and
# write the per-gene statistics to disk.
#
# Arguments:
#   metadata_file   - tab-separated file; row names are compared against the
#                     sample names of the expression data and the first
#                     column is used as the group label of each sample
#   experiment_id   - id of the experiment being analysed (not used in the
#                     body; kept so existing calls keep working)
#   expression_file - tab-separated expression file with samples as rows;
#                     it is transposed to gene x sample after reading
#   out_file        - path the table of DE statistics is written to
#
# Stops with an error if the sample names of the expression data and the row
# names of the metadata do not match in order.
get_DE_stats <- function(metadata_file, 
                         experiment_id, 
                         expression_file,
                         out_file){
    # Read in data; the expression matrix is transposed to gene x sample
    expression_data <- t(as.matrix(read.csv(expression_file, sep="\t", header=TRUE, row.names=1)))
    metadata <- as.matrix(read.csv(metadata_file, sep="\t", header=TRUE, row.names=1))

    # The group labels below are taken from the metadata by position, so the
    # metadata rows must be in the same order as the columns of the
    # expression matrix. Abort instead of fitting with mislabelled samples.
    print("Checking sample ordering...")
    samples_match <- isTRUE(all.equal(colnames(expression_data), rownames(metadata)))
    print(samples_match)
    if (!samples_match) {
        stop("Sample order in the expression data does not match the metadata")
    }

    group <- interaction(metadata[,1])

    # One column per group, no intercept
    mm <- model.matrix(~0 + group)

    ## DEGs of simulated data
    # lmFit expects input array to have structure: gene x sample
    # lmFit fits a linear model using weighted least squares for each gene:
    fit <- lmFit(expression_data, mm)

    # Comparisons between groups (log fold-changes) are obtained as contrasts of these fitted linear models:
    # Samples are grouped based on experimental condition
    # The variability of gene expression is compared between these groups
    # For experiment E-GEOD-51409, we are comparing the expression profile
    # of samples grown in 37 degrees versus those grown in 22 degrees
    contr <- makeContrasts(group37 - group22, levels = colnames(coef(fit)))

    # Estimate contrast for each gene
    tmp <- contrasts.fit(fit, contr)

    # Empirical Bayes smoothing of standard errors (shrinks standard errors 
    # that are much larger or smaller than those from other genes towards the average standard error)
    tmp <- eBayes(tmp)

    # Statistics for every gene, sorted by p-value
    top.table <- topTable(tmp, sort.by = "P", n = Inf)
    all_genes <- as.data.frame(top.table)

    # Find all DEGs based on Bonferroni corrected p-value cutoff; the number
    # of tests is taken from the data instead of being hard-coded
    threshold <- 0.05 / nrow(all_genes)
    sign_DEGs <- all_genes[all_genes[, "P.Value"] < threshold, ]
    print(paste("Number of significant DEGs:", nrow(sign_DEGs)))

    # Save summary statistics of all genes (not only the significant ones)
    write.table(all_genes, file = out_file, row.names = T, sep = "\t", quote = F)

}

# Run the DE analysis on the VAE-compressed expression data
get_DE_stats(metadata_file = metadata_file,
             experiment_id = experiment_id,
             expression_file = selected_compressed_data_file,
             out_file = DE_stats_compressed_only_file)

[1] "Checking sample ordering..."
[1] TRUE
