# Process recount2 data
This notebook does the following:

1. Selects template experiment
2. Downloads subset of recount2 data, including the template experiment (subset of random experiments + 1 template experiment)
3. Train VAE on subset of recount2 data

In [1]:
%load_ext autoreload
%load_ext rpy2.ipython
%autoreload 2

import os
import sys
import pandas as pd
import numpy as np
import rpy2
import seaborn as sns
from sklearn import preprocessing
import pickle

from ponyo import utils, train_vae_modules
from generic_expression_patterns_modules import process, calc

np.random.seed(123)

examples.directory is deprecated; in the future, examples will be found relative to the 'datapath' directory.
  "found relative to the 'datapath' directory.".format(key))
Using TensorFlow backend.


In [2]:
# Read in config variables
base_dir = os.path.abspath(os.path.join(os.getcwd(),"../"))

config_file = os.path.abspath(os.path.join(base_dir,
                                           "config_human.tsv"))
params = utils.read_config(config_file)

### Select template experiment

We manually selected bioproject [SRP012656](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE37764), which contains primary non-small cell lung adenocarcinoma tumors and adjacent normal tissues of 6 never-smoker Korean female patients with 2 replicates each.

In [3]:
# Load params
local_dir = params["local_dir"]
dataset_name = params['dataset_name']
NN_architecture = params['NN_architecture']
project_id = params['project_id']
num_recount2_experiments = params['num_recount2_experiments']
template_data_file = params['template_data_file']
original_compendium_file = params['compendium_data_file']
normalized_data_file = params['normalized_compendium_data_file']
shared_genes_file = params['shared_genes_file']
scaler_file = params['scaler_transform_file']

### Download subset of recount2 to use as a compendium
The compendium will be composed of random experiments + the selected template experiment

In [4]:
%%R
# Select 59
# Select a
# Run one time
#if (!requireNamespace("BiocManager", quietly = TRUE))
#    install.packages("BiocManager")
#BiocManager::install("derfinder")
#BiocManager::install("recount")

NULL


In [5]:
%%R
library('recount')






Attaching package: ‘BiocGenerics’



    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, parApply, parCapply, parLapply,
    parLapplyLB, parRapply, parSapply, parSapplyLB



    IQR, mad, sd, var, xtabs



    anyDuplicated, append, as.data.frame, basename, cbind, colnames,
    dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
    grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
    order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
    rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
    union, unique, unsplit, which, which.max, which.min



Attaching package: ‘S4Vectors’



    expand.grid






    Vignettes contain introductory material; view with
    'browseVignettes()'. To cite Bioconductor, see
    'citation("Biobase")', and for packages 'citation("pkgname")'.




Attaching package: ‘matrixStats’



    anyMissing, rowMedians



Attaching package: ‘DelayedArray’



    colMax

In [6]:
%%R -i project_id -i num_recount2_experiments -i local_dir -i base_dir

source('../generic_expression_patterns_modules/download_recount2_data.R')

get_recount2_compendium(project_id, num_recount2_experiments, local_dir, base_dir)









[1] "SRP012651"
Loading objects:
  rse_gene
[1] "SRP014574"
Loading objects:
  rse_gene
[1] "SRP000542"
Loading objects:
  rse_gene
[1] "SRP016059"
Loading objects:
  rse_gene
[1] "SRP001540"
Loading objects:
  rse_gene
[1] "SRP034626"
Loading objects:
  rse_gene
[1] "SRP062873"
Loading objects:
  rse_gene
[1] "SRP041956"
Loading objects:
  rse_gene
[1] "SRP046254"
Loading objects:
  rse_gene
[1] "SRP044956"
Loading objects:
  rse_gene
[1] "SRP045269"
Loading objects:
  rse_gene
[1] "SRP039357"
Loading objects:
  rse_gene
[1] "SRP049238"
Loading objects:
  rse_gene
[1] "SRP021130"
Loading objects:
  rse_gene
[1] "SRP018571"
Loading objects:
  rse_gene
[1] "SRP056201"
Loading objects:
  rse_gene
[1] "SRP060708"
Loading objects:
  rse_gene
[1] "SRP023262"
Loading objects:
  rse_gene
[1] "SRP037775"
Loading objects:
  rse_gene
[1] "SRP041531"
Loading objects:
  rse_gene
[1] "SRP034953"
Loading objects:
  rse_gene
[1] "SRP010041"
Loading objects:
  rse_gene
[1] "SRP038143"
Loading objects:

### Download expression data for selected project id

In [7]:
%%R -i project_id -i local_dir

source('../generic_expression_patterns_modules/download_recount2_data.R')

get_recount2_template_experiment(project_id, local_dir)

Loading objects:
  rse_gene


### Subset genes and convert gene names
For our downstream analysis we will be comparing our set of differentially expression genes against the set found in [Crow et. al. publication](https://www.pnas.org/content/pnas/116/13/6491.full.pdf), therefore we will limit our genes to include only those genes shared between our starting set of genes and those in publication. 

In [8]:
# Get generic genes identified by Crow et. al.
DE_prior_file = "https://raw.githubusercontent.com/maggiecrow/DEprior/master/DE_Prior.txt"

DE_prior = pd.read_csv(DE_prior_file,
                       header=0,
                       sep="\t")

DE_prior.head()

Unnamed: 0,Gene_Order,Gene_EntrezID,N_HitLists,DE_Prior_Rank,Gene_Name
0,1,7503,79,1.0,XIST
1,2,8653,64,0.999948,DDX3Y
2,3,9086,62,0.99987,EIF1AY
3,4,8284,52,0.99987,KDM5D
4,5,8287,46,0.999791,USP9Y


In [9]:
# Get list of published generic genes
published_generic_genes = list(DE_prior['Gene_Name'])

In [10]:
# Get list of our genes

# Read template data
template_data = pd.read_csv(
    template_data_file,
    header=0,
    sep='\t',
    index_col=0)

our_gene_ids = list(template_data.columns)

In [11]:
# File mapping ensembl ids to hgnc symbols
gene_id_file = os.path.join(
    local_dir,
    "ensembl_hgnc_mapping.tsv")

In [12]:
%%R
suppressWarnings(library("biomaRt"))

In [13]:
%%R -i template_data_file -i gene_id_file

# Get mapping between ensembl gene ids (ours) to HGNC gene symbols (published)

source('../generic_expression_patterns_modules/process_names.R')

if (file.exists(gene_id_file) == FALSE){
    gene_id_mapping <- get_ensembl_symbol_mapping(template_data_file, gene_id_file)
}

In [14]:
# Read gene id mapping
gene_id_mapping = pd.read_csv(
        gene_id_file,
        header=0,
        sep='\t',
        index_col=0)

print(gene_id_mapping.shape)
gene_id_mapping.head()

(57210, 2)


Unnamed: 0,ensembl_gene_id,hgnc_symbol
1,ENSG00000002330,BAD
2,ENSG00000003137,CYP26B1
3,ENSG00000003249,DBNDD1
4,ENSG00000004799,PDK4
5,ENSG00000006062,MAP3K14


In [15]:
# Get mapping between ensembl ids with and without version numbers
# Expressiond data uses ensembl gene ids with version number 
ensembl_gene_ids = pd.DataFrame(data={'ensembl_version': our_gene_ids,
                                      'ensembl_parsed': [gene_id.split('.')[0] for gene_id in our_gene_ids]})

print(ensembl_gene_ids.shape)
ensembl_gene_ids.head()

(58037, 2)


Unnamed: 0,ensembl_version,ensembl_parsed
0,ENSG00000000003.14,ENSG00000000003
1,ENSG00000000005.5,ENSG00000000005
2,ENSG00000000419.12,ENSG00000000419
3,ENSG00000000457.13,ENSG00000000457
4,ENSG00000000460.16,ENSG00000000460


In [16]:
# Map ensembl ids with version number to gene_id_mapping_df
gene_id_mapping = pd.merge(gene_id_mapping, 
                           ensembl_gene_ids, 
                           left_on='ensembl_gene_id',
                           right_on='ensembl_parsed', 
                           how='outer')

print(gene_id_mapping.shape)
gene_id_mapping.set_index('ensembl_version', inplace=True)
gene_id_mapping.head()

(58129, 4)


Unnamed: 0_level_0,ensembl_gene_id,hgnc_symbol,ensembl_parsed
ensembl_version,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSG00000002330.13,ENSG00000002330,BAD,ENSG00000002330
ENSG00000003137.8,ENSG00000003137,CYP26B1,ENSG00000003137
ENSG00000003249.13,ENSG00000003249,DBNDD1,ENSG00000003249
ENSG00000004799.7,ENSG00000004799,PDK4,ENSG00000004799
ENSG00000006062.13,ENSG00000006062,MAP3K14,ENSG00000006062


Since this experiment contains both RNA-seq and smRNA-seq samples which are in different ranges so we will drop smRNA samples so that samples are within the same range. The analysis identifying these two subsets of samples can be found in this [notebook](0_explore_input_data.ipynb)

In [17]:
# Replace ensembl ids with gene symbols
template_data = process.replace_ensembl_ids(template_data,
                                            gene_id_mapping)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return super(DataFrame, self).rename(**kwargs)


In [18]:
template_data.head()

Unnamed: 0,TSPAN6,TNMD,DPM1,SCYL3,C1orf112,FGR,CFH,FUCA2,GCLC,NFYA,...,MIR6830,MIR548AH,MYOCOS,MIR522,MIR6715B,MIR3116-2,MIR3202-2,HSFX3,VINAC1P,MIR4481
SRR493937,3.800154,0.01819,9.128459,1.585803,0.713952,84.990542,19.943002,12.065586,2.498822,7.713002,...,0.0,0.0,0.135649,0.0,0.0,0.0,0.0,0.588513,0.0,0.0
SRR493938,3.798473,0.0,9.157669,1.456017,0.738967,85.788323,19.621252,11.738855,2.435308,7.125808,...,0.005417,0.0,0.127873,0.0,0.0,0.0,0.0,0.452511,0.0,0.0
SRR493939,20.222134,0.0,15.522072,3.944919,2.512265,8.072948,47.246078,18.866957,2.44323,12.14367,...,0.0,0.0,0.011112,0.0,0.0,0.0,0.0,0.446833,0.0,0.0
SRR493940,19.988329,0.0,17.140304,3.977217,2.378849,7.599225,46.594458,18.92119,2.448276,12.04896,...,0.147412,0.0,0.037886,0.0,0.0,0.07756,0.0,0.462295,0.0,0.168159
SRR493941,3.239231,0.019398,9.069316,2.181582,1.008155,17.675885,19.371137,11.759756,1.970725,6.740705,...,0.00286,0.0,0.15622,0.0,0.0,0.0,0.070955,0.739403,0.0,0.0


In [19]:
# Get intersection of gene lists
our_gene_ids_hgnc = template_data.columns
shared_genes_hgnc = list(set(our_gene_ids_hgnc).intersection(published_generic_genes))
print(len(shared_genes_hgnc))

17785


In [20]:
# Save shared genes
outfile = open(shared_genes_file,'wb')
pickle.dump(shared_genes_hgnc,outfile)
outfile.close()

In [21]:
# Drop smRNA samples so that samples are within the same range
smRNA_samples = ["SRR493961",
                 "SRR493962",
                 "SRR493963",
                 "SRR493964",
                 "SRR493965",
                 "SRR493966",
                 "SRR493967",
                 "SRR493968",
                 "SRR493969",
                 "SRR493970",
                 "SRR493971",
                 "SRR493972"]

In [22]:
# Drop samples
template_data = template_data.drop(smRNA_samples)

In [23]:
# Drop genes
template_data = template_data[shared_genes_hgnc]

print(template_data.shape)
template_data.head()

(24, 17788)


Unnamed: 0,TRMT10C,RUFY3,CNOT1,SLC35F1,RBM26,IHH,GPR20,ADRB3,MFSD10,DCAF8,...,MRPL3,CEP152,C1QTNF7,RARA,HBD,LINC01526,MFSD14A,TFAP2B,NDUFA4,SULT2B1
SRR493937,4.665383,4.081109,16.805435,0.105384,3.345764,0.596137,0.700501,0.035386,22.347079,13.608744,...,9.813451,0.446482,1.483983,27.105052,0.388193,0.056681,16.617661,0.009494,10.581429,1.52656
SRR493938,4.796513,4.234347,16.808594,0.195406,3.056731,0.471109,0.699908,0.031939,22.376378,14.316637,...,9.553095,0.462525,1.29417,26.411033,0.433801,0.113388,16.912483,0.004779,10.738915,1.518376
SRR493939,3.284528,4.539443,15.327375,0.035567,3.547286,0.22792,0.152627,0.038542,57.659052,21.526116,...,12.157567,0.737306,1.450699,11.622323,0.265947,0.034443,13.683262,0.53662,17.572848,0.803435
SRR493940,3.812965,4.346555,15.367022,0.053466,3.388571,0.286209,0.371584,0.011037,56.854383,22.026572,...,13.030322,0.738708,1.447023,11.706894,0.316107,0.034396,14.27846,0.43527,18.259412,0.924137
SRR493941,2.733277,4.228729,16.461116,0.051296,4.19836,2.10566,2.876334,0.076718,33.46166,16.45275,...,8.254008,0.740339,4.223101,20.432419,0.098966,0.054669,15.831326,0.075468,12.470698,6.44916


In [24]:
print(len(template_data.columns) - len(shared_genes_hgnc))

3


*Note:* There is a difference in the number of `shared_genes_hgnc` and genes in the template experiment because 3 genes have 2 different ensembl gene ids have map to the same hgnc symbol (one forward, one reverse)

### Normalize compendium 

In [25]:
# Read data
original_compendium = pd.read_table(
    original_compendium_file,
    header=0,
    sep='\t',
    index_col=0)

print(original_compendium.shape)
original_compendium.head()

(3388, 58037)


Unnamed: 0,ENSG00000000003.14,ENSG00000000005.5,ENSG00000000419.12,ENSG00000000457.13,ENSG00000000460.16,ENSG00000000938.12,ENSG00000000971.15,ENSG00000001036.13,ENSG00000001084.10,ENSG00000001167.14,...,ENSG00000283690.1,ENSG00000283691.1,ENSG00000283692.1,ENSG00000283693.1,ENSG00000283694.1,ENSG00000283695.1,ENSG00000283696.1,ENSG00000283697.1,ENSG00000283698.1,ENSG00000283699.1
SRR493816,2.893395,0.0,18.298572,1.552609,1.880483,0.26444,1.632884,21.194289,10.014925,4.762246,...,0.0,0.540156,0.0,0.0,0.0,0.0,0.314956,0.358752,0.0,0.0
SRR493817,1.923686,0.0,16.401872,1.245607,1.52945,0.408554,0.071068,16.599504,11.79868,4.036652,...,0.0,0.188427,0.0,0.0,0.0,0.0,0.307766,0.512091,0.0,0.0
SRR547975,1.920405,0.0,42.61059,1.630213,3.371858,0.0,2.919197,64.923769,1.30654,17.485584,...,0.0,2.242607,0.0,0.0,0.0,0.0,0.080863,0.460912,0.0,0.0
SRR547973,6.344573,0.0,27.921543,3.25022,3.201103,0.0,0.0,17.733091,10.638572,4.659644,...,0.0,0.359655,0.0,0.0,0.118196,0.0,0.943308,0.254015,0.0,0.0
SRR547968,1.282905,0.0,16.224764,2.086982,2.213324,0.0,0.0,80.792929,7.692393,13.100063,...,0.0,0.0,0.0,0.0,0.0,0.039251,0.65829,0.794068,0.0,0.0


In [26]:
# Replace ensembl ids with gene symbols
original_compendium = process.replace_ensembl_ids(original_compendium,
                                                gene_id_mapping)

In [27]:
# Drop genes
original_compendium = original_compendium[shared_genes_hgnc]

original_compendium.head()

Unnamed: 0,TRMT10C,RUFY3,CNOT1,SLC35F1,RBM26,IHH,GPR20,ADRB3,MFSD10,DCAF8,...,MRPL3,CEP152,C1QTNF7,RARA,HBD,LINC01526,MFSD14A,TFAP2B,NDUFA4,SULT2B1
SRR493816,3.240427,1.487418,22.294177,0.013186,2.094926,0.126898,0.21086,0.0,27.114229,13.971167,...,15.714002,0.695362,0.013531,5.337377,0.000301,0.066481,9.192732,0.001145,13.334783,0.61818
SRR493817,2.009355,1.204978,24.093032,0.010005,1.367029,0.0,0.164727,0.0,21.579942,13.048861,...,10.849816,0.47643,0.00126,6.103372,0.012302,0.0,8.88537,0.0,11.365625,1.538041
SRR547975,10.485186,0.377008,53.087109,0.0,8.279572,0.020207,0.191273,0.0,10.92234,13.850461,...,51.864493,3.20208,0.0,3.681785,0.0,0.0,22.189305,0.0,27.861512,0.0
SRR547973,5.378727,2.14656,69.606976,0.0,4.24503,2.929866,0.009603,0.0,10.734198,12.349436,...,42.563711,2.007285,0.0,4.435816,0.0,0.0,8.012944,0.0,15.785109,3.258487
SRR547968,11.799357,0.088415,28.885471,0.003233,5.41721,39.274797,0.0,0.0,15.232692,11.469034,...,61.760986,4.224323,0.0,6.368031,0.0,0.0,19.540065,0.0,61.917697,3.690493


In [28]:
# 0-1 normalize per gene
scaler = preprocessing.MinMaxScaler()
original_data_scaled = scaler.fit_transform(original_compendium)
original_data_scaled_df = pd.DataFrame(original_data_scaled,
                                columns=original_compendium.columns,
                                index=original_compendium.index)

print(original_data_scaled_df.shape)
original_data_scaled_df.head()

(3388, 17788)


Unnamed: 0,TRMT10C,RUFY3,CNOT1,SLC35F1,RBM26,IHH,GPR20,ADRB3,MFSD10,DCAF8,...,MRPL3,CEP152,C1QTNF7,RARA,HBD,LINC01526,MFSD14A,TFAP2B,NDUFA4,SULT2B1
SRR493816,0.008236,0.008567,0.107553,2.2e-05,0.030474,0.002778,0.000114,0.0,0.229998,0.146681,...,0.052023,0.013885,0.000248,0.023876,2.364617e-08,0.006778,0.054133,9e-06,0.037151,0.008155
SRR493817,0.005107,0.006941,0.116232,1.6e-05,0.019886,0.0,8.9e-05,0.0,0.183053,0.136998,...,0.035919,0.009513,2.3e-05,0.027302,9.650961e-07,0.0,0.052323,0.0,0.031665,0.020289
SRR547975,0.02665,0.002172,0.256107,0.0,0.120439,0.000442,0.000103,0.0,0.092649,0.145414,...,0.171702,0.063938,0.0,0.01647,0.0,0.0,0.130666,0.0,0.077624,0.0
SRR547973,0.013671,0.012364,0.335804,0.0,0.06175,0.06415,5e-06,0.0,0.091053,0.129655,...,0.140911,0.040081,0.0,0.019843,0.0,0.0,0.047186,0.0,0.043978,0.042983
SRR547968,0.029991,0.000509,0.139352,5e-06,0.078802,0.859928,0.0,0.0,0.129212,0.120411,...,0.204466,0.08435,0.0,0.028486,0.0,0.0,0.115065,0.0,0.172506,0.048682


In [29]:
# Save data
template_data.to_csv(
    template_data_file, float_format='%.5f', sep='\t')

original_compendium.to_csv(
    original_compendium_file, float_format='%.3f', sep='\t')

original_data_scaled_df.to_csv(
    normalized_data_file, float_format='%.3f', sep='\t')

# Save scaler transform
outfile = open(scaler_file,'wb')
pickle.dump(scaler,outfile)
outfile.close()

### Train VAE 
Performed exploratory analysis of compendium data [here](../explore_data/viz_recount2_compendium.ipynb) to help interpret loss curve.

In [30]:
# Setup directories
# Create VAE directories
output_dirs = [os.path.join(base_dir, dataset_name, "models"),
               os.path.join(base_dir, dataset_name, "logs")]

# Check if analysis output directory exist otherwise create
for each_dir in output_dirs:
    if os.path.exists(each_dir) == False:
        print('creating new directory: {}'.format(each_dir))
        os.makedirs(each_dir, exist_ok=True)

# Check if NN architecture directory exist otherwise create
for each_dir in output_dirs:
    new_dir = os.path.join(each_dir, NN_architecture)
    if os.path.exists(new_dir) == False:
        print('creating new directory: {}'.format(new_dir))
        os.makedirs(new_dir, exist_ok=True)

In [31]:
# Train VAE on new compendium data
train_vae_modules.train_vae(config_file,
                   normalized_data_file)