In [43]:
# Download the cortex archive and the sparse expression matrix from Dropbox.
# The URLs end in "?dl=0", and wget keeps that suffix in the saved file names
# (e.g. 'cortex.zip?dl=0'); the later unzip/mv cell relies on those names.
!wget https://www.dropbox.com/s/tdp11r39faou5ae/cortex.zip?dl=0
!wget https://www.dropbox.com/s/o6nveqyk1k7f95z/expression_sparse.txt?dl=0

--2018-04-29 13:49:06--  https://www.dropbox.com/s/tdp11r39faou5ae/cortex.zip?dl=0
Resolving www.dropbox.com (www.dropbox.com)... 162.125.9.1, 2620:100:601b:1::a27d:801
Connecting to www.dropbox.com (www.dropbox.com)|162.125.9.1|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://dl.dropboxusercontent.com/content_link/Aj5Z73aKMsoZ3Q0NCjMJBQW7H5baX5nurASBNcZBeplsc3EdE2xxUu0WCOeOWXJS/file [following]
--2018-04-29 13:49:07--  https://dl.dropboxusercontent.com/content_link/Aj5Z73aKMsoZ3Q0NCjMJBQW7H5baX5nurASBNcZBeplsc3EdE2xxUu0WCOeOWXJS/file
Resolving dl.dropboxusercontent.com (dl.dropboxusercontent.com)... 162.125.9.6, 2620:100:601f:6::a27d:906
Connecting to dl.dropboxusercontent.com (dl.dropboxusercontent.com)|162.125.9.6|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2556102 (2.4M) [application/zip]
Saving to: ‘cortex.zip?dl=0’


2018-04-29 13:49:07 (17.7 MB/s) - ‘cortex.zip?dl=0’ saved [2556102/2556102]

--2018-04-29 13:49:0

In [0]:
from __future__ import print_function

import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm
from sklearn.preprocessing import scale

from keras.layers import Input, Dense, Lambda
from keras.models import Model
from keras import backend as K
from keras import metrics

############################################################################################
# vae: build a variational autoencoder using the input data
# 
# This function is heavily derived from - 
# 
# Inputs:
#   - counts_mat: matrix of counts with genes as columns and individual cells as rows
#   - loss_fun: loss function for autoencoder
#   - latent_dim: number of latent dimensions to project data into
#   - intermediate_dim: number of dimensions in the hidden layer
#   - batch_size: batch size used for training autoencoder
#   - epochs: epochs for training autoencoder
#   - epsilon_std: standard deviation of the Gaussian noise used when sampling latent points
#   - nbshape: shape parameter for negative binomial loss. Defaults to 1
# 
# Outputs:
#   - x_encoded: Encoded transformation of input
#   - z_mean: parameter for the mean of distribution in the latent space
#
############################################################################################

def vae(counts_mat, loss_fun, latent_dim, intermediate_dim, 
        batch_size, epochs, epsilon_std, nbshape=1):
    """Train a variational autoencoder on ``counts_mat`` and encode the data.

    The input is first passed through ``preprocess(counts_mat, loss_fun)``.
    A one-hidden-layer encoder maps each row to the mean and log-variance of
    a ``latent_dim``-dimensional Gaussian, a latent point is sampled from it,
    and a one-hidden-layer decoder reconstructs the input from that point.
    The training loss is the batch mean of (reconstruction loss + KL term).

    Args:
        counts_mat: 2-D matrix of counts; rows are samples (cells) and
            columns are features (genes).
        loss_fun: name of the reconstruction loss: 'poisson',
            'negative_binomial', 'gaussian' or 'bernoulli'. Any other value
            falls back to binary cross-entropy.
        latent_dim: number of latent dimensions.
        intermediate_dim: width of the hidden layer in encoder and decoder.
        batch_size: batch size used for both training and prediction.
        epochs: number of training epochs.
        epsilon_std: standard deviation of the Gaussian noise drawn when
            sampling latent points.
        nbshape: shape parameter forwarded to ``negative_binomial_loss``;
            only used when ``loss_fun == 'negative_binomial'``.

    Returns:
        A tuple ``(x_encoded, z_mean)`` where ``x_encoded`` is the output of
        ``encoder.predict`` on the preprocessed training data (the latent
        means) and ``z_mean`` is the symbolic Keras tensor of the latent-mean
        layer.
    """
    x_train = preprocess(counts_mat, loss_fun)
    original_dim = x_train.shape[1]

    # encoder network: map inputs to latent distribution parameters
    x = Input(shape=(original_dim,))
    h = Dense(intermediate_dim, activation='relu')(x)
    z_mean = Dense(latent_dim)(h)
    z_log_var = Dense(latent_dim)(h)

    def sampling(args):
        """Draw z = mean + exp(log_var / 2) * epsilon, epsilon ~ N(0, epsilon_std)."""
        z_mean, z_log_var = args
        epsilon = K.random_normal(shape=(K.shape(z_mean)[0], latent_dim),
                                  mean=0.0, stddev=epsilon_std)
        return z_mean + K.exp(z_log_var / 2) * epsilon

    # sample latent points from the distribution parameterised by the encoder
    z = Lambda(sampling)([z_mean, z_log_var])

    # decoder network: map sampled latent points back to reconstructed inputs
    # NOTE(review): the sigmoid output restricts reconstructions to (0, 1);
    # confirm this is intended for the 'gaussian', 'poisson' and
    # 'negative_binomial' losses, whose targets are not limited to that range.
    decoder_h = Dense(intermediate_dim, activation='relu')
    decoder_mean = Dense(original_dim, activation='sigmoid')
    h_decoded = decoder_h(z)
    x_decoded_mean = decoder_mean(h_decoded)

    # end-to-end autoencoder: input -> reconstruction
    vae_model = Model(x, x_decoded_mean)

    # Reconstruction term, selected by loss_fun and multiplied by
    # original_dim as in the original code.
    if loss_fun == 'poisson':
        reconstruction_loss = original_dim * metrics.poisson(x, x_decoded_mean)
    elif loss_fun == 'negative_binomial':
        reconstruction_loss = original_dim * negative_binomial_loss(x, x_decoded_mean, nbshape)
    elif loss_fun == 'gaussian':
        reconstruction_loss = original_dim * metrics.mean_squared_error(x, x_decoded_mean)
    else:
        # 'bernoulli' and any unrecognised loss_fun both use cross entropy
        reconstruction_loss = original_dim * metrics.binary_crossentropy(x, x_decoded_mean)

    # KL divergence regularisation term
    kl_loss = - 0.5 * K.sum(1 + z_log_var - K.square(z_mean) - K.exp(z_log_var), axis=-1)

    vae_loss = K.mean(reconstruction_loss + kl_loss)

    # the loss is attached to the model itself, so compile() and fit() take
    # no separate loss or target arguments
    vae_model.add_loss(vae_loss)
    vae_model.compile(optimizer='rmsprop')
    vae_model.summary()

    # fit the model with the given data
    vae_model.fit(x_train, shuffle=True, epochs=epochs, batch_size=batch_size, validation_split=0.1)

    # encoder: from inputs to the latent mean; it shares the layers trained
    # above, so it is built once here and used to project the training data
    encoder = Model(x, z_mean)
    x_encoded = encoder.predict(x_train, batch_size=batch_size)

    return x_encoded, z_mean

############################################################################################
# preprocess: preprocess counts matrix to make it suitable for particular loss function
#
# Inputs:
#   - counts_mat: matrix of counts with genes as columns and individual cells as rows
#   - loss_fun: loss function for autoencoder
#
# Outputs:
#   - x_train: preprocessed matrix of counts for training the autoencoder
############################################################################################

def preprocess(counts_mat, loss_fun):
    """Transform a counts matrix into the form expected by ``loss_fun``.

    Args:
        counts_mat: 2-D counts matrix; rows are samples (cells) and columns
            are features (genes). It is never modified by this function.
        loss_fun: name of the reconstruction loss the data is prepared for.

    Returns:
        - 'gaussian': a new matrix, log2(1 + counts-per-million) with every
          row then centred and scaled to unit variance.
        - 'bernoulli': a new matrix in which every entry greater than 0 is
          replaced by 1.
        - anything else: ``counts_mat`` itself, unchanged (same object).
    """
    if loss_fun == 'gaussian':
        # divide each row by its row sum, scale to per-million, then log2(1 + .)
        x_train = np.log2(1 + (counts_mat.T/np.sum(counts_mat, axis=1)).T * 1e+6)
        # centre and scale along axis 1, i.e. each row independently
        x_train = scale(x_train, axis=1, with_mean=True, with_std=True, copy=True)

    elif loss_fun == 'bernoulli':
        # binarise a copy: assigning through an alias of counts_mat would
        # overwrite the caller's data in place
        x_train = counts_mat.copy()
        x_train[x_train > 0] = 1

    else:
        x_train = counts_mat

    return x_train

############################################################################################
# sampling: sample encoded values of the input using latent space parameters
#
# Inputs:
#   - args: mean and log-variance parameters (z_mean, z_log_var) of the latent space distribution
# 
# Outputs:
#   - encoded points sampled from the latent space distribution
############################################################################################


In [45]:
# unzip cortex data, move it all to one directory
# (the archive name still carries the "?dl=0" suffix from the download URL)
!unzip cortex.zip?dl=0
!rm cortex.zip?dl=0
# mkdir prints "File exists" when the directory is already there
!mkdir cortex
!mv cell_info.csv cortex/
!mv cortex_expression.txt cortex/
!mv gene_info.csv cortex/

# move CBMC data to one directory
!mkdir CBMC_sparse
# drop the "?dl=0" suffix from the downloaded file name before moving it
!mv expression_sparse.txt?dl=0 expression_sparse.txt
!mv expression_sparse.txt CBMC_sparse/

# list the working directory to check the result
!ls

Archive:  cortex.zip?dl=0
  inflating: cell_info.csv           
  inflating: cortex_expression.txt   
  inflating: gene_info.csv           
mkdir: cannot create directory ‘cortex’: File exists
mkdir: cannot create directory ‘CBMC_sparse’: File exists
CBMC_sparse  cortex  datalab


In [48]:
# Load the cortex expression matrix, transpose it and cast to float32
# (vae documents its input as having individual cells as rows).
cortex_counts = np.loadtxt('cortex/cortex_expression.txt').T.astype('float32')

# Train the VAE with the gaussian loss and keep both the encoded data and
# the latent-mean tensor that vae returns.
encoded_cortex_counts, cortex_counts_z_mean = vae(
    cortex_counts,
    loss_fun='gaussian',
    latent_dim=50,
    intermediate_dim=500,
    batch_size=100,
    epochs=100,
    epsilon_std=1,
    nbshape=1,
)



__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_16 (InputLayer)           (None, 2000)         0                                            
__________________________________________________________________________________________________
dense_74 (Dense)                (None, 500)          1000500     input_16[0][0]                   
__________________________________________________________________________________________________
dense_75 (Dense)                (None, 50)           25050       dense_74[0][0]                   
__________________________________________________________________________________________________
dense_76 (Dense)                (None, 50)           25050       dense_74[0][0]                   
__________________________________________________________________________________________________
lambda_15 

Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100

Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
 100/2704 [>.............................] - ETA: 0s - loss: 1316.7052

Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
 500/2704 [====>.........................] - ETA: 0s - loss: 1301.5928

Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
 600/2704 [=====>........................] - ETA: 0s - loss: 1297.5171

Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
 500/2704 [====>.........................] - ETA: 0s - loss: 1290.5402

Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100

Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
