# Process recount2 data
This notebook does the following:

1. Selects template experiment
2. Downloads subset of recount2 data, including the template experiment (50 random experiments + 1 template experiment)
3. Train VAE on subset of recount2 data

In [1]:
%load_ext autoreload
%load_ext rpy2.ipython
%autoreload 2

import os
import sys
import pandas as pd
import numpy as np
import random
import rpy2
import seaborn as sns
from sklearn import preprocessing
import pickle

from ponyo import generate_template_data, utils, pipeline
from generic_modules import process, calc

from numpy.random import seed
random_state = 123
seed(random_state)

examples.directory is deprecated; in the future, examples will be found relative to the 'datapath' directory.
  "found relative to the 'datapath' directory.".format(key))
Using TensorFlow backend.


In [2]:
# Read in config variables
base_dir = os.path.abspath(os.path.join(os.getcwd(),"../"))

config_file = os.path.abspath(os.path.join(base_dir,
                                           "config_human.tsv"))
params = utils.read_config(config_file)

### Select template experiment

We manually selected bioproject [SRP012656](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE37764), which contains primary non-small cell lung adenocarcinoma tumors and adjacent normal tissues of 6 never-smoker Korean female patients with 2 replicates each.

In [3]:
# Load params
local_dir = params["local_dir"]
dataset_name = params['dataset_name']
NN_architecture = params['NN_architecture']
project_id = params['project_id']

### Download subset of recount2 to use as a compendium
The compendium will be composed of random experiments + the selected template experiment

In [4]:
%%R
# Select 59
# Select a
# Run one time
#if (!requireNamespace("BiocManager", quietly = TRUE))
#    install.packages("BiocManager")
#BiocManager::install("derfinder")
#BiocManager::install("recount")

NULL


In [5]:
%%R
library('recount')

In [6]:
#%%R -i project_id -i local_dir

#source('../generic_modules/download_recount2_data.R')

#get_recount2_compendium(project_id, local_dir)

### Download expression data for selected project id

In [7]:
%%R -i project_id -i local_dir

source('../generic_modules/download_recount2_data.R')

get_recount2_template_experiment(project_id, local_dir)

Loading objects:
  rse_gene


### Subset genes
For our downstream analysis we will be comparing our set of differentially expression genes against the set found in [Crow et. al. publication](https://www.pnas.org/content/pnas/116/13/6491.full.pdf), therefore we will limit our genes to include only those genes shared between our starting set of genes and those in publication. 

In [8]:
# Get generic genes identified by Crow et. al.
DE_prior_file = "https://raw.githubusercontent.com/maggiecrow/DEprior/master/DE_Prior.txt"

DE_prior = pd.read_csv(DE_prior_file,
                       header=0,
                       sep="\t")

DE_prior.head()

Unnamed: 0,Gene_Order,Gene_EntrezID,N_HitLists,DE_Prior_Rank,Gene_Name
0,1,7503,79,1.0,XIST
1,2,8653,64,0.999948,DDX3Y
2,3,9086,62,0.99987,EIF1AY
3,4,8284,52,0.99987,KDM5D
4,5,8287,46,0.999791,USP9Y


In [9]:
# Get list of published generic genes
published_generic_genes = list(DE_prior['Gene_Name'])

In [10]:
# Get list of our genes
# Load real template experiment
template_data_file = params['template_data_file']

# Read template data
template_data = pd.read_csv(
    template_data_file,
    header=0,
    sep='\t',
    index_col=0)

our_gene_ids = list(template_data.columns)

In [11]:
# File mapping ensembl ids to hgnc symbols
gene_id_file = os.path.join(
    local_dir,
    "ensembl_hgnc_mapping.tsv")

In [12]:
%%R
suppressWarnings(library("biomaRt"))

In [13]:
%%R -i template_data_file -i gene_id_file

# Get mapping between ensembl gene ids (ours) to HGNC gene symbols (published)

source('../generic_modules/process_names.R')

if (file.exists(gene_id_file) == FALSE){
    gene_id_mapping <- get_ensembl_symbol_mapping(template_data_file, gene_id_file)
}

In [14]:
# Read gene id mapping
gene_id_mapping = pd.read_csv(
        gene_id_file,
        header=0,
        sep='\t',
        index_col=0)

print(gene_id_mapping.shape)
gene_id_mapping.head()

(57210, 2)


Unnamed: 0,ensembl_gene_id,hgnc_symbol
1,ENSG00000002330,BAD
2,ENSG00000003137,CYP26B1
3,ENSG00000003249,DBNDD1
4,ENSG00000004799,PDK4
5,ENSG00000006062,MAP3K14


In [15]:
# Get mapping between ensembl ids with and without version numbers
# Expressiond data uses ensembl gene ids with version number 
ensembl_gene_ids = pd.DataFrame(data={'ensembl_version': our_gene_ids,
                                      'ensembl_parsed': [gene_id.split('.')[0] for gene_id in our_gene_ids]})

print(ensembl_gene_ids.shape)
ensembl_gene_ids.head()

(58037, 2)


Unnamed: 0,ensembl_version,ensembl_parsed
0,ENSG00000000003.14,ENSG00000000003
1,ENSG00000000005.5,ENSG00000000005
2,ENSG00000000419.12,ENSG00000000419
3,ENSG00000000457.13,ENSG00000000457
4,ENSG00000000460.16,ENSG00000000460


In [16]:
# Map ensembl ids with version number to gene_id_mapping_df
gene_id_mapping = pd.merge(gene_id_mapping, 
                           ensembl_gene_ids, 
                           left_on='ensembl_gene_id',
                           right_on='ensembl_parsed', 
                           how='outer')

print(gene_id_mapping.shape)
gene_id_mapping.set_index('ensembl_version', inplace=True)
gene_id_mapping.head()

(58129, 4)


Unnamed: 0_level_0,ensembl_gene_id,hgnc_symbol,ensembl_parsed
ensembl_version,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSG00000002330.13,ENSG00000002330,BAD,ENSG00000002330
ENSG00000003137.8,ENSG00000003137,CYP26B1,ENSG00000003137
ENSG00000003249.13,ENSG00000003249,DBNDD1,ENSG00000003249
ENSG00000004799.7,ENSG00000004799,PDK4,ENSG00000004799
ENSG00000006062.13,ENSG00000006062,MAP3K14,ENSG00000006062


Since this experiment contains both RNA-seq and smRNA-seq samples which are in different ranges so we will drop smRNA samples so that samples are within the same range. The analysis identifying these two subsets of samples can be found in this [notebook](0_explore_input_data.ipynb)

In [17]:
# Replace ensembl ids with gene symbols
template_data = process.replace_ensembl_ids(template_data,
                                            gene_id_mapping)

In [18]:
template_data.head()

Unnamed: 0,TSPAN6,TNMD,DPM1,SCYL3,C1orf112,FGR,CFH,FUCA2,GCLC,NFYA,...,MIR6830,MIR548AH,MYOCOS,MIR522,MIR6715B,MIR3116-2,MIR3202-2,HSFX3,VINAC1P,MIR4481
SRR493937,3.800154,0.01819,9.128459,1.585803,0.713952,84.990542,19.943002,12.065586,2.498822,7.713002,...,0.0,0.0,0.135649,0.0,0.0,0.0,0.0,0.588513,0.0,0.0
SRR493938,3.798473,0.0,9.157669,1.456017,0.738967,85.788323,19.621252,11.738855,2.435308,7.125808,...,0.005417,0.0,0.127873,0.0,0.0,0.0,0.0,0.452511,0.0,0.0
SRR493939,20.222134,0.0,15.522072,3.944919,2.512265,8.072948,47.246078,18.866957,2.44323,12.14367,...,0.0,0.0,0.011112,0.0,0.0,0.0,0.0,0.446833,0.0,0.0
SRR493940,19.988329,0.0,17.140304,3.977217,2.378849,7.599225,46.594458,18.92119,2.448276,12.04896,...,0.147412,0.0,0.037886,0.0,0.0,0.07756,0.0,0.462295,0.0,0.168159
SRR493941,3.239231,0.019398,9.069316,2.181582,1.008155,17.675885,19.371137,11.759756,1.970725,6.740705,...,0.00286,0.0,0.15622,0.0,0.0,0.0,0.070955,0.739403,0.0,0.0


In [19]:
# Get intersection of gene lists
our_gene_ids_hgnc = template_data.columns
shared_genes_hgnc = list(set(our_gene_ids_hgnc).intersection(published_generic_genes))
print(len(shared_genes_hgnc))

17785


In [20]:
# Save shared genes
shared_genes_file = os.path.join(
    local_dir,
    "shared_gene_ids.pickle")

outfile = open(shared_genes_file,'wb')
pickle.dump(shared_genes_hgnc,outfile)
outfile.close()

In [21]:
# Drop smRNA samples so that samples are within the same range
smRNA_samples = ["SRR493961",
                 "SRR493962",
                 "SRR493963",
                 "SRR493964",
                 "SRR493965",
                 "SRR493966",
                 "SRR493967",
                 "SRR493968",
                 "SRR493969",
                 "SRR493970",
                 "SRR493971",
                 "SRR493972"]

In [22]:
# Drop samples
template_data = template_data.drop(smRNA_samples)

In [23]:
# Drop genes
template_data = template_data[shared_genes_hgnc]

print(template_data.shape)
template_data.head()

(24, 17788)


Unnamed: 0,RPL23A,HOPX,CNTLN,YEATS2,ZEB2,DAB2,LLGL2,TEKT3,CCNG1,HNRNPDL,...,TMEM52,SPINT3,ERICH3,GOLGA2P5,PIH1D2,GABARAPL2,PHKA2,AHCYL2,FAM170A,CAMSAP3
SRR493937,26.380195,7.594717,0.486687,3.244362,4.594237,10.716403,12.610643,1.133912,9.398151,31.639392,...,0.841445,0.0,3.098554,1.865488,1.264137,11.905335,3.857274,18.718648,0.0,2.89407
SRR493938,26.04399,7.449357,0.465863,3.453337,4.65546,10.348601,12.898778,1.17887,8.950152,31.57778,...,0.76263,0.0,3.248183,1.838183,1.335846,11.89156,3.79802,18.645412,0.0,2.979511
SRR493939,23.938892,17.296808,0.309277,2.77427,1.626023,4.724359,21.224896,0.599888,16.530086,34.01342,...,1.633144,0.0,0.602161,3.885646,1.750385,11.529645,5.524931,13.243615,0.0,8.817652
SRR493940,23.536737,17.226186,0.303837,2.888214,1.664242,4.715913,21.06764,0.587902,17.026152,35.165989,...,1.475681,0.0,0.640175,3.943578,1.805356,11.934644,5.547112,12.788488,0.025959,8.600757
SRR493941,19.06397,20.052478,0.572087,4.763826,5.982738,17.533812,18.588802,0.296801,12.384805,24.52243,...,0.605483,0.0,0.583209,1.57346,0.705457,15.842725,4.316776,18.247787,0.011332,7.776162


In [24]:
print(len(template_data.columns) - len(shared_genes_hgnc))

3


*Note:* There is a difference in the number of `shared_genes_hgnc` and genes in the template experiment because 3 genes have 2 different ensembl gene ids have map to the same hgnc symbol (one forward, one reverse)

In [25]:
# Save 
template_data.to_csv(template_data_file, float_format='%.5f', sep='\t')

### Normalize compendium 

In [26]:
# Load real gene expression data
original_compendium_file = params['compendium_data_file']

In [27]:
# Read data
original_compendium = pd.read_table(
    original_compendium_file,
    header=0,
    sep='\t',
    index_col=0)

print(original_compendium.shape)
original_compendium.head()

(1552, 17785)


Unnamed: 0,ARHGAP45,ZNF610,PPP1R3B,RBBP7,CCDC185,GPATCH3,C1orf159,DECR2,NAA11,KRTAP4-6,...,FBXL6,UBC,CD1A,OLFML2B,CACNA2D4,EID1,CHRNA10,SCYL2,SCRT1,POLR2L
SRR1604987,4.686,2.914,6.349,12.617,0.034,4.167,1.487,4.824,0.0,0.107,...,6.677,418.218,0.0,0.07,0.05,40.078,0.194,9.64,0.012,52.679
SRR1604988,4.968,2.993,6.243,12.53,0.011,4.031,1.397,4.778,0.006,0.065,...,6.247,413.291,0.0,0.04,0.042,40.601,0.09,9.914,0.007,51.503
SRR1604989,5.91,2.855,13.585,16.71,0.0,3.882,1.172,2.845,0.0,0.0,...,5.733,297.397,0.0,0.019,0.003,38.53,0.154,10.461,0.002,39.457
SRR1604990,5.737,2.884,14.17,16.927,0.016,3.883,1.199,2.745,0.0,0.0,...,5.552,298.423,0.0,0.038,0.012,37.747,0.114,10.902,0.0,38.281
SRR1604991,2.58,2.653,8.768,17.953,0.067,4.608,1.624,3.556,0.0,0.0,...,5.893,281.337,0.0,0.986,0.012,64.853,0.215,8.097,0.01,83.953


In [28]:
# Replace ensembl ids with gene symbols
#original_compendium = process.replace_ensembl_ids(original_compendium,
#                                                gene_id_mapping)

In [29]:
# Drop genes
original_compendium = original_compendium[shared_genes_hgnc]

original_compendium.head()

Unnamed: 0,RPL23A,HOPX,CNTLN,YEATS2,ZEB2,DAB2,LLGL2,TEKT3,CCNG1,HNRNPDL,...,TMEM52,SPINT3,ERICH3,GOLGA2P5,PIH1D2,GABARAPL2,PHKA2,AHCYL2,FAM170A,CAMSAP3
SRR1604987,75.874,0.469,1.026,6.637,0.02,0.108,8.724,0.148,21.094,32.1,...,0.507,0.0,0.0,0.768,2.511,25.181,1.838,10.666,0.017,8.872
SRR1604988,76.305,0.462,0.934,6.621,0.019,0.096,8.777,0.16,21.491,32.7,...,0.511,0.0,0.005,0.781,2.293,24.445,1.834,10.647,0.016,9.349
SRR1604989,87.869,0.069,1.145,14.261,0.044,0.17,8.784,0.157,29.383,30.95,...,0.382,0.0,0.0,1.125,1.918,21.214,2.574,4.115,0.0,10.486
SRR1604990,85.461,0.06,1.221,14.436,0.031,0.156,8.665,0.07,30.541,31.362,...,0.45,0.0,0.0,1.074,2.168,20.698,2.53,4.177,0.0,10.43
SRR1604991,67.554,0.088,1.208,9.838,1.624,6.82,5.938,0.179,33.542,37.241,...,0.269,0.0,0.0,0.36,1.616,23.423,1.885,5.375,0.0,4.421


In [30]:
# 0-1 normalize per gene
scaler = preprocessing.MinMaxScaler()
original_data_scaled = scaler.fit_transform(original_compendium)
original_data_scaled_df = pd.DataFrame(original_data_scaled,
                                columns=original_compendium.columns,
                                index=original_compendium.index)

print(original_data_scaled_df.shape)
original_data_scaled_df.head()

(1552, 17785)


Unnamed: 0,RPL23A,HOPX,CNTLN,YEATS2,ZEB2,DAB2,LLGL2,TEKT3,CCNG1,HNRNPDL,...,TMEM52,SPINT3,ERICH3,GOLGA2P5,PIH1D2,GABARAPL2,PHKA2,AHCYL2,FAM170A,CAMSAP3
SRR1604987,0.026935,0.002304,0.056414,0.286263,0.000169,0.000591,0.099707,0.007618,0.058015,0.190397,...,0.006762,0.0,0.0,0.075882,0.104425,0.076349,0.019088,0.385667,0.001143,0.196963
SRR1604988,0.027088,0.002269,0.051355,0.285573,0.000161,0.000526,0.100313,0.008236,0.059107,0.193956,...,0.006815,0.0,0.000387,0.077166,0.095359,0.074117,0.019047,0.38498,0.001075,0.207553
SRR1604989,0.031193,0.000339,0.062957,0.615096,0.000373,0.000931,0.100393,0.008082,0.080812,0.183576,...,0.005095,0.0,0.0,0.111155,0.079764,0.064321,0.026732,0.148792,0.0,0.232795
SRR1604990,0.030338,0.000295,0.067136,0.622644,0.000263,0.000854,0.099033,0.003603,0.083997,0.18602,...,0.006001,0.0,0.0,0.106116,0.090161,0.062756,0.026275,0.151034,0.0,0.231551
SRR1604991,0.023981,0.000432,0.066421,0.424326,0.013754,0.037351,0.067866,0.009214,0.09225,0.22089,...,0.003588,0.0,0.0,0.03557,0.067205,0.071018,0.019576,0.194352,0.0,0.098148


In [31]:
# Save data
normalized_data_file = params['normalized_compendium_data_file']

original_data_scaled_df.to_csv(
    normalized_data_file, float_format='%.3f', sep='\t')

original_compendium.to_csv(
    original_compendium_file, float_format='%.3f', sep='\t')

# Save scaler transform
scaler_file = params['scaler_transform_file']

outfile = open(scaler_file,'wb')
pickle.dump(scaler,outfile)
outfile.close()

### Train VAE 

In [32]:
# Setup directories
# Create VAE directories
output_dirs = [os.path.join(base_dir, dataset_name, "models"),
               os.path.join(base_dir, dataset_name, "logs")]

# Check if analysis output directory exist otherwise create
for each_dir in output_dirs:
    if os.path.exists(each_dir) == False:
        print('creating new directory: {}'.format(each_dir))
        os.makedirs(each_dir, exist_ok=True)

# Check if NN architecture directory exist otherwise create
for each_dir in output_dirs:
    new_dir = os.path.join(each_dir, NN_architecture)
    if os.path.exists(new_dir) == False:
        print('creating new directory: {}'.format(new_dir))
        os.makedirs(new_dir, exist_ok=True)

In [33]:
# Train VAE on new compendium data
#pipeline.train_vae(config_file,
#                   normalized_data_file)