# Process recount2 data
This notebook does the following:

1. Selects template experiment
2. Downloads subset of recount2 data, including the template experiment (50 random experiments + 1 template experiment)
3. Train VAE on subset of recount2 data

In [1]:
%load_ext autoreload
%load_ext rpy2.ipython
%autoreload 2

import os
import sys
import pandas as pd
import numpy as np
import random
import rpy2
import seaborn as sns
from sklearn import preprocessing
import pickle

from ponyo import generate_template_data, utils, pipeline
from generic_expression_patterns_modules import process, calc

from numpy.random import seed
random_state = 123
seed(random_state)

examples.directory is deprecated; in the future, examples will be found relative to the 'datapath' directory.
  "found relative to the 'datapath' directory.".format(key))
Using TensorFlow backend.


In [2]:
# Read in config variables
base_dir = os.path.abspath(os.path.join(os.getcwd(),"../"))

config_file = os.path.abspath(os.path.join(base_dir,
                                           "config_human.tsv"))
params = utils.read_config(config_file)

### Select template experiment

We manually selected bioproject [SRP012656](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE37764), which contains primary non-small cell lung adenocarcinoma tumors and adjacent normal tissues of 6 never-smoker Korean female patients with 2 replicates each.

In [3]:
# Load params
local_dir = params["local_dir"]
dataset_name = params['dataset_name']
NN_architecture = params['NN_architecture']
project_id = params['project_id']

### Download subset of recount2 to use as a compendium
The compendium will be composed of random experiments + the selected template experiment

In [4]:
%%R
# Select 59
# Select a
# Run one time
#if (!requireNamespace("BiocManager", quietly = TRUE))
#    install.packages("BiocManager")
#BiocManager::install("derfinder")
#BiocManager::install("recount")

NULL


In [5]:
%%R
library('recount')






Attaching package: ‘BiocGenerics’



    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, parApply, parCapply, parLapply,
    parLapplyLB, parRapply, parSapply, parSapplyLB



    plotMA



    IQR, mad, sd, var, xtabs



    anyDuplicated, append, as.data.frame, basename, cbind, colnames,
    dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
    grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
    order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
    rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
    union, unique, unsplit, which, which.max, which.min



Attaching package: ‘S4Vectors’



    expand.grid



Attaching package: ‘IRanges’



    collapse





    Vignettes contain introductory material; view with
    'browseVignettes()'. To cite Bioconductor, see
    'citation("Biobase")', and for packages 'citation("pkgname")'.




Attaching package: ‘matrixStats’



    anyMissing, 

In [6]:
#%%R -i project_id -i local_dir

#source('../generic_expression_patterns_modules/download_recount2_data.R')

#get_recount2_compendium(project_id, local_dir)

### Download expression data for selected project id

In [7]:
%%R -i project_id -i local_dir

source('../generic_expression_patterns_modules/download_recount2_data.R')

get_recount2_template_experiment(project_id, local_dir)









Loading objects:
  rse_gene


### Subset genes
For our downstream analysis we will be comparing our set of differentially expression genes against the set found in [Crow et. al. publication](https://www.pnas.org/content/pnas/116/13/6491.full.pdf), therefore we will limit our genes to include only those genes shared between our starting set of genes and those in publication. 

In [8]:
# Get generic genes identified by Crow et. al.
DE_prior_file = "https://raw.githubusercontent.com/maggiecrow/DEprior/master/DE_Prior.txt"

DE_prior = pd.read_csv(DE_prior_file,
                       header=0,
                       sep="\t")

DE_prior.head()

Unnamed: 0,Gene_Order,Gene_EntrezID,N_HitLists,DE_Prior_Rank,Gene_Name
0,1,7503,79,1.0,XIST
1,2,8653,64,0.999948,DDX3Y
2,3,9086,62,0.99987,EIF1AY
3,4,8284,52,0.99987,KDM5D
4,5,8287,46,0.999791,USP9Y


In [9]:
# Get list of published generic genes
published_generic_genes = list(DE_prior['Gene_Name'])

In [10]:
# Get list of our genes
# Load real template experiment
template_data_file = params['template_data_file']

# Read template data
template_data = pd.read_csv(
    template_data_file,
    header=0,
    sep='\t',
    index_col=0)

our_gene_ids = list(template_data.columns)

In [11]:
# File mapping ensembl ids to hgnc symbols
gene_id_file = os.path.join(
    local_dir,
    "ensembl_hgnc_mapping.tsv")

In [12]:
%%R
suppressWarnings(library("biomaRt"))

In [13]:
%%R -i template_data_file -i gene_id_file

# Get mapping between ensembl gene ids (ours) to HGNC gene symbols (published)

source('../generic_expression_patterns_modules/process_names.R')

if (file.exists(gene_id_file) == FALSE){
    gene_id_mapping <- get_ensembl_symbol_mapping(template_data_file, gene_id_file)
}

In [14]:
# Read gene id mapping
gene_id_mapping = pd.read_csv(
        gene_id_file,
        header=0,
        sep='\t',
        index_col=0)

print(gene_id_mapping.shape)
gene_id_mapping.head()

(57210, 2)


Unnamed: 0,ensembl_gene_id,hgnc_symbol
1,ENSG00000002330,BAD
2,ENSG00000003137,CYP26B1
3,ENSG00000003249,DBNDD1
4,ENSG00000004799,PDK4
5,ENSG00000006062,MAP3K14


In [15]:
# Get mapping between ensembl ids with and without version numbers
# Expressiond data uses ensembl gene ids with version number 
ensembl_gene_ids = pd.DataFrame(data={'ensembl_version': our_gene_ids,
                                      'ensembl_parsed': [gene_id.split('.')[0] for gene_id in our_gene_ids]})

print(ensembl_gene_ids.shape)
ensembl_gene_ids.head()

(58037, 2)


Unnamed: 0,ensembl_version,ensembl_parsed
0,ENSG00000000003.14,ENSG00000000003
1,ENSG00000000005.5,ENSG00000000005
2,ENSG00000000419.12,ENSG00000000419
3,ENSG00000000457.13,ENSG00000000457
4,ENSG00000000460.16,ENSG00000000460


In [16]:
# Map ensembl ids with version number to gene_id_mapping_df
gene_id_mapping = pd.merge(gene_id_mapping, 
                           ensembl_gene_ids, 
                           left_on='ensembl_gene_id',
                           right_on='ensembl_parsed', 
                           how='outer')

print(gene_id_mapping.shape)
gene_id_mapping.set_index('ensembl_version', inplace=True)
gene_id_mapping.head()

(58129, 4)


Unnamed: 0_level_0,ensembl_gene_id,hgnc_symbol,ensembl_parsed
ensembl_version,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSG00000002330.13,ENSG00000002330,BAD,ENSG00000002330
ENSG00000003137.8,ENSG00000003137,CYP26B1,ENSG00000003137
ENSG00000003249.13,ENSG00000003249,DBNDD1,ENSG00000003249
ENSG00000004799.7,ENSG00000004799,PDK4,ENSG00000004799
ENSG00000006062.13,ENSG00000006062,MAP3K14,ENSG00000006062


Since this experiment contains both RNA-seq and smRNA-seq samples which are in different ranges so we will drop smRNA samples so that samples are within the same range. The analysis identifying these two subsets of samples can be found in this [notebook](0_explore_input_data.ipynb)

In [17]:
# Replace ensembl ids with gene symbols
template_data = process.replace_ensembl_ids(template_data,
                                            gene_id_mapping)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return super(DataFrame, self).rename(**kwargs)


In [18]:
template_data.head()

Unnamed: 0,TSPAN6,TNMD,DPM1,SCYL3,C1orf112,FGR,CFH,FUCA2,GCLC,NFYA,...,MIR6830,MIR548AH,MYOCOS,MIR522,MIR6715B,MIR3116-2,MIR3202-2,HSFX3,VINAC1P,MIR4481
SRR493937,3.800154,0.01819,9.128459,1.585803,0.713952,84.990542,19.943002,12.065586,2.498822,7.713002,...,0.0,0.0,0.135649,0.0,0.0,0.0,0.0,0.588513,0.0,0.0
SRR493938,3.798473,0.0,9.157669,1.456017,0.738967,85.788323,19.621252,11.738855,2.435308,7.125808,...,0.005417,0.0,0.127873,0.0,0.0,0.0,0.0,0.452511,0.0,0.0
SRR493939,20.222134,0.0,15.522072,3.944919,2.512265,8.072948,47.246078,18.866957,2.44323,12.14367,...,0.0,0.0,0.011112,0.0,0.0,0.0,0.0,0.446833,0.0,0.0
SRR493940,19.988329,0.0,17.140304,3.977217,2.378849,7.599225,46.594458,18.92119,2.448276,12.04896,...,0.147412,0.0,0.037886,0.0,0.0,0.07756,0.0,0.462295,0.0,0.168159
SRR493941,3.239231,0.019398,9.069316,2.181582,1.008155,17.675885,19.371137,11.759756,1.970725,6.740705,...,0.00286,0.0,0.15622,0.0,0.0,0.0,0.070955,0.739403,0.0,0.0


In [19]:
# Get intersection of gene lists
our_gene_ids_hgnc = template_data.columns
shared_genes_hgnc = list(set(our_gene_ids_hgnc).intersection(published_generic_genes))
print(len(shared_genes_hgnc))

17785


In [20]:
# Save shared genes
shared_genes_file = os.path.join(
    local_dir,
    "shared_gene_ids.pickle")

outfile = open(shared_genes_file,'wb')
pickle.dump(shared_genes_hgnc,outfile)
outfile.close()

In [21]:
# Drop smRNA samples so that samples are within the same range
smRNA_samples = ["SRR493961",
                 "SRR493962",
                 "SRR493963",
                 "SRR493964",
                 "SRR493965",
                 "SRR493966",
                 "SRR493967",
                 "SRR493968",
                 "SRR493969",
                 "SRR493970",
                 "SRR493971",
                 "SRR493972"]

In [22]:
# Drop samples
template_data = template_data.drop(smRNA_samples)

In [23]:
# Drop genes
template_data = template_data[shared_genes_hgnc]

print(template_data.shape)
template_data.head()

(24, 17788)


Unnamed: 0,POLR3G,SSTR4,CORIN,BBOX1-AS1,PROSER2-AS1,LRRC37A3,SERPINA7,GABRA3,SNX22,MATN1,...,LRP5,SRD5A3-AS1,CFP,RHBDF1,SUPT5H,TEFM,DMRTC2,SLC35E3,SOD2,TMEM70
SRR493937,0.247983,0.092681,0.155784,0.022465,0.156563,1.039957,0.0,0.022215,20.697529,0.358349,...,17.995453,0.659922,3.781465,7.228667,19.568409,0.86979,0.0,0.667853,61.45739,9.29935
SRR493938,0.39993,0.041186,0.159909,0.032658,0.123457,1.010312,0.0,0.009993,20.712299,0.305661,...,18.666885,0.571155,4.032993,7.338301,19.84044,0.83997,0.008727,0.708261,61.391806,9.188426
SRR493939,1.115604,0.0,0.288992,1.351308,0.316169,1.792745,0.0,0.501358,37.996993,0.43392,...,32.938705,2.220845,0.419704,17.261978,30.272669,2.069097,0.152653,0.943442,10.63613,7.73625
SRR493940,1.150075,0.108467,0.23415,1.832712,0.174537,1.951469,0.026073,0.386864,37.201605,0.569087,...,33.044193,2.016507,0.578981,17.409587,30.783963,2.084962,0.07815,0.970944,10.656667,8.143106
SRR493941,0.50649,0.054713,0.341643,0.0,0.145713,1.273116,0.0,0.007448,16.055489,0.153819,...,36.473139,0.584874,3.538937,13.199591,15.164681,1.165465,0.0,0.717657,10.316851,4.318829


In [24]:
print(len(template_data.columns) - len(shared_genes_hgnc))

3


*Note:* There is a difference in the number of `shared_genes_hgnc` and genes in the template experiment because 3 genes have 2 different ensembl gene ids have map to the same hgnc symbol (one forward, one reverse)

In [25]:
# Save 
template_data.to_csv(template_data_file, float_format='%.5f', sep='\t')

### Normalize compendium 

In [26]:
# Load real gene expression data
original_compendium_file = params['compendium_data_file']

In [27]:
# Read data
original_compendium = pd.read_table(
    original_compendium_file,
    header=0,
    sep='\t',
    index_col=0)

print(original_compendium.shape)
original_compendium.head()

(4119, 17788)


Unnamed: 0,ANKRD36B,DMBT1,CFAP45,TP53I11,AARS2,KLRA1P,SOCS6,USP9X,ABI3BP,LCMT1-AS2,...,IL7,KCNJ3,SLC30A3,RTTN,FAM32A,ERBB3,SMYD5,BCAS1,TYRO3,ZNF283
SRR223496,0.967,0.154,0.918,12.729,7.732,0.53,3.206,23.838,0.033,0.085,...,0.24,0.0,0.17,0.861,23.54,7.742,11.772,0.206,2.488,0.935
SRR223497,1.458,55.136,0.177,0.616,8.166,0.608,4.714,7.537,0.006,0.218,...,0.513,0.014,0.118,2.169,26.574,4.492,12.299,0.007,2.343,0.474
SRR223506,0.53,0.006,3.627,9.054,11.241,0.119,2.736,6.762,0.033,0.039,...,0.001,0.001,0.397,1.904,47.685,3.783,8.768,0.085,4.555,2.435
SRR223524,0.355,0.013,0.244,0.687,5.419,0.154,4.513,16.814,0.294,0.022,...,0.161,0.0,0.181,1.229,59.911,0.057,7.475,0.012,1.04,0.686
SRR223525,1.346,0.91,0.448,6.607,4.814,0.297,2.515,5.427,0.115,0.064,...,0.033,0.0,0.317,1.214,24.41,7.884,10.431,0.069,0.617,0.603


In [28]:
# Replace ensembl ids with gene symbols
#original_compendium = process.replace_ensembl_ids(original_compendium,
#                                                gene_id_mapping)

In [29]:
# Drop genes
original_compendium = original_compendium[shared_genes_hgnc]

original_compendium.head()

Unnamed: 0,POLR3G,SSTR4,CORIN,BBOX1-AS1,PROSER2-AS1,LRRC37A3,SERPINA7,GABRA3,SNX22,MATN1,...,LRP5,SRD5A3-AS1,CFP,RHBDF1,SUPT5H,TEFM,DMRTC2,SLC35E3,SOD2,TMEM70
SRR223496,0.655,0.0,0.01,0.021,0.736,0.846,0.0,0.134,23.167,0.231,...,28.805,0.709,0.098,5.882,31.86,3.619,0.0,0.424,6.531,7.811
SRR223497,3.068,0.0,0.0,0.0,1.62,0.925,0.0,0.039,23.608,0.566,...,38.817,0.379,0.066,11.39,15.556,1.605,0.0,0.539,21.54,6.549
SRR223506,1.871,0.0,0.035,0.034,1.003,0.898,0.0,2.425,33.519,0.227,...,19.708,0.431,0.265,2.327,49.27,3.775,0.0,0.269,5.545,4.361
SRR223524,11.65,0.006,0.059,0.039,0.013,2.255,0.0,4.248,43.632,0.328,...,3.314,2.096,0.024,1.574,20.061,2.512,0.001,0.493,8.279,9.685
SRR223525,0.98,0.019,0.056,1.472,0.153,1.256,0.0,2.123,144.121,0.087,...,14.225,2.375,0.367,22.861,19.63,1.031,0.039,0.584,4.113,3.822


In [30]:
# 0-1 normalize per gene
scaler = preprocessing.MinMaxScaler()
original_data_scaled = scaler.fit_transform(original_compendium)
original_data_scaled_df = pd.DataFrame(original_data_scaled,
                                columns=original_compendium.columns,
                                index=original_compendium.index)

print(original_data_scaled_df.shape)
original_data_scaled_df.head()

(4119, 17785)


Unnamed: 0,POLR3G,SSTR4,CORIN,BBOX1-AS1,PROSER2-AS1,LRRC37A3,SERPINA7,GABRA3,SNX22,MATN1,...,LRP5,SRD5A3-AS1,CFP,RHBDF1,SUPT5H,TEFM,DMRTC2,SLC35E3,SOD2,TMEM70
SRR223496,0.006218,0.0,0.000388,0.000331,0.044985,0.019284,0.0,0.000138,0.082804,0.019822,...,0.189333,0.046446,0.000469,0.077203,0.091867,0.10981,0.0,0.000563,0.00736,0.015749
SRR223497,0.029124,0.0,0.0,0.0,0.099016,0.021085,0.0,4e-05,0.08438,0.048567,...,0.255142,0.024828,0.000316,0.149497,0.044855,0.0487,0.0,0.000716,0.024276,0.013204
SRR223506,0.017761,0.0,0.001359,0.000536,0.061304,0.02047,0.0,0.002489,0.119804,0.019478,...,0.129539,0.028235,0.001268,0.030542,0.142069,0.114543,0.0,0.000357,0.006249,0.008793
SRR223524,0.110591,0.000498,0.002292,0.000615,0.000795,0.051402,0.0,0.00436,0.15595,0.028145,...,0.021783,0.137308,0.000115,0.020659,0.057845,0.076221,1.5e-05,0.000655,0.009331,0.019527
SRR223525,0.009303,0.001577,0.002175,0.023217,0.009352,0.02863,0.0,0.002179,0.515119,0.007465,...,0.0935,0.155585,0.001756,0.300056,0.056603,0.031283,0.000589,0.000775,0.004635,0.007706


In [31]:
# Save data
normalized_data_file = params['normalized_compendium_data_file']

original_data_scaled_df.to_csv(
    normalized_data_file, float_format='%.3f', sep='\t')

original_compendium.to_csv(
    original_compendium_file, float_format='%.3f', sep='\t')

# Save scaler transform
scaler_file = params['scaler_transform_file']

outfile = open(scaler_file,'wb')
pickle.dump(scaler,outfile)
outfile.close()

### Train VAE 

In [32]:
# Setup directories
# Create VAE directories
output_dirs = [os.path.join(base_dir, dataset_name, "models"),
               os.path.join(base_dir, dataset_name, "logs")]

# Check if analysis output directory exist otherwise create
for each_dir in output_dirs:
    if os.path.exists(each_dir) == False:
        print('creating new directory: {}'.format(each_dir))
        os.makedirs(each_dir, exist_ok=True)

# Check if NN architecture directory exist otherwise create
for each_dir in output_dirs:
    new_dir = os.path.join(each_dir, NN_architecture)
    if os.path.exists(new_dir) == False:
        print('creating new directory: {}'.format(new_dir))
        os.makedirs(new_dir, exist_ok=True)

In [33]:
# Train VAE on new compendium data
#pipeline.train_vae(config_file,
#                   normalized_data_file)