# Process data
This notebook does the following:

1. Selects template experiment
2. Downloads subset of recount2 data, including the template experiment (50 random experiments + 1 template experiment)
3. Trains VAE on subset of recount2 data

In [1]:
%load_ext autoreload
%load_ext rpy2.ipython
%autoreload 2

import os
import sys
import pandas as pd
import numpy as np
import random
import rpy2
import seaborn as sns
from sklearn import preprocessing
import pickle

sys.path.append("../")
from functions import generate_labeled_data, utils, pipeline

from numpy.random import seed
# Fixed seed so that NumPy-based random draws are repeatable across runs.
# `seed` is numpy.random.seed, so only NumPy's global RNG is seeded here;
# Python's built-in `random` module (imported above) is not seeded in this cell.
randomState = 123
seed(randomState)

examples.directory is deprecated; in the future, examples will be found relative to the 'datapath' directory.
  "found relative to the 'datapath' directory.".format(key))
Using TensorFlow backend.


In [2]:
# Resolve the directory one level above the notebook's working directory,
# then read the analysis configuration stored beneath it
parent_of_cwd = os.path.join(os.getcwd(), "../")
base_dir = os.path.abspath(parent_of_cwd)

config_path_parts = (base_dir, "Rank_pathways", "init_config.tsv")
config_file = os.path.abspath(os.path.join(*config_path_parts))

params = utils.read_config(config_file)

### Select template experiment

We manually selected bioproject [SRP012656](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE37764), which contains primary non-small cell lung adenocarcinoma tumors and adjacent normal tissues of 6 never-smoker Korean female patients with 2 replicates each.

In [3]:
# Pull the configuration entries used by this notebook out of `params`
local_dir, dataset_name, NN_architecture, project_id = [
    params[key]
    for key in ("local_dir", "dataset_name", "NN_architecture", "project_id")
]

### Download subset of recount2 to use as a compendium
The compendium will be composed of random experiments + the selected template experiment

In [4]:
%%R
# One-time setup, intentionally left commented out: installs the Bioconductor
# `recount` package through BiocManager. Uncomment and run once if the package
# is not yet installed in the R environment used by this notebook.
# NOTE(review): "Select 59" presumably records the option chosen at an
# interactive prompt during installation (e.g. a mirror selection) -- confirm.
# Select 59
# Run one time
#if (!requireNamespace("BiocManager", quietly = TRUE))
#    install.packages("BiocManager")
#BiocManager::install("recount")

NULL


In [5]:
%%R
# Attach the `recount` package to the R session
library('recount')

In [6]:
#%%R -i project_id -i local_dir

# NOTE(review): this cell is disabled (every line is commented out). It looks
# like the step that downloads the compendium of experiments; presumably it was
# run once and then commented out to avoid repeating the download -- confirm
# that the compendium file read later in this notebook already exists before
# running the notebook end to end.
#source('../functions/download_recount2_data.R')

#get_recount2_compendium(project_id, local_dir)

### Download expression data for selected project id

In [7]:
%%R -i project_id -i local_dir

# `-i` makes the Python variables project_id and local_dir available in R.
# Load the project's recount2 download helpers.
source('../functions/download_recount2_data.R')

# Fetch the template experiment identified by project_id
# (presumably written under local_dir -- see the sourced helper to confirm)
get_recount2_template_experiment(project_id, local_dir)

Loading objects:
  rse_gene


### Subset genes
For our downstream analysis we will be comparing our set of differentially expressed genes against the set found in the [Crow et al. publication](https://www.pnas.org/content/pnas/116/13/6491.full.pdf). Therefore, we will limit our genes to include only those shared between our starting set of genes and those in the publication.

In [8]:
# Load the table of generic genes published by Crow et al.
# (tab-separated, first row is the header)
DE_prior_file = "https://raw.githubusercontent.com/maggiecrow/DEprior/master/DE_Prior.txt"

DE_prior = pd.read_csv(DE_prior_file, sep="\t", header=0)

DE_prior.head()

Unnamed: 0,Gene_Order,Gene_EntrezID,N_HitLists,DE_Prior_Rank,Gene_Name
0,1,7503,79,1.0,XIST
1,2,8653,64,0.999948,DDX3Y
2,3,9086,62,0.99987,EIF1AY
3,4,8284,52,0.99987,KDM5D
4,5,8287,46,0.999791,USP9Y


In [9]:
# Gene names (Gene_Name column) of the published generic genes, as a plain list
published_generic_genes = DE_prior['Gene_Name'].tolist()

In [10]:
# Location of the real template experiment inside local_dir
template_data_file = os.path.join(local_dir, "recount2_template_data.tsv")

# Load the template expression table; the first column becomes the row index
template_data = pd.read_csv(template_data_file, sep='\t', header=0, index_col=0)

# The column labels are our gene ids
our_gene_ids = template_data.columns.tolist()

In [11]:
# Path of the file that maps ensembl ids to hgnc symbols
gene_id_file = os.path.join(local_dir, "ensembl_hgnc_mapping.tsv")

In [12]:
%%R
# Attach biomaRt, silencing any warnings emitted while the package loads
suppressWarnings(library("biomaRt"))

In [13]:
%%R -i template_data_file -i gene_id_file

# Build the mapping from ensembl gene ids (ours) to HGNC gene symbols
# (published), but only when the mapping file does not exist yet

source('../functions/GSEA_analysis.R')

if (!file.exists(gene_id_file)) {
    gene_id_mapping <- get_ensembl_symbol_mapping(template_data_file, gene_id_file)
}

In [14]:
# Load the ensembl id -> hgnc symbol table from gene_id_file
# (tab-separated, header row, first column used as the index)
gene_id_mapping = pd.read_csv(gene_id_file, sep='\t', header=0, index_col=0)

print(gene_id_mapping.shape)
gene_id_mapping.head()

(57210, 2)


Unnamed: 0,ensembl_gene_id,hgnc_symbol
1,ENSG00000002330,BAD
2,ENSG00000003137,CYP26B1
3,ENSG00000003249,DBNDD1
4,ENSG00000004799,PDK4
5,ENSG00000006062,MAP3K14


In [15]:
# The expression data uses ensembl gene ids that carry a version suffix.
# Pair each versioned id with its unversioned form (the text before the first '.').
parsed_gene_ids = [gene_id.split('.')[0] for gene_id in our_gene_ids]

ensembl_gene_ids = pd.DataFrame(
    data={
        'ensembl_version': our_gene_ids,
        'ensembl_parsed': parsed_gene_ids,
    }
)

print(ensembl_gene_ids.shape)
ensembl_gene_ids.head()

(58037, 2)


Unnamed: 0,ensembl_version,ensembl_parsed
0,ENSG00000000003.14,ENSG00000000003
1,ENSG00000000005.5,ENSG00000000005
2,ENSG00000000419.12,ENSG00000000419
3,ENSG00000000457.13,ENSG00000000457
4,ENSG00000000460.16,ENSG00000000460


In [16]:
# Attach the versioned ensembl ids to the id mapping. The outer join keeps
# rows that appear in only one of the two tables.
# NOTE: gene_id_mapping is rebuilt from itself, so re-read it from
# gene_id_file before running this cell a second time.
gene_id_mapping = gene_id_mapping.merge(
    ensembl_gene_ids,
    how='outer',
    left_on='ensembl_gene_id',
    right_on='ensembl_parsed',
)

print(gene_id_mapping.shape)
gene_id_mapping = gene_id_mapping.set_index('ensembl_version')
gene_id_mapping.head()

(58129, 4)


Unnamed: 0_level_0,ensembl_gene_id,hgnc_symbol,ensembl_parsed
ensembl_version,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSG00000002330.13,ENSG00000002330,BAD,ENSG00000002330
ENSG00000003137.8,ENSG00000003137,CYP26B1,ENSG00000003137
ENSG00000003249.13,ENSG00000003249,DBNDD1,ENSG00000003249
ENSG00000004799.7,ENSG00000004799,PDK4,ENSG00000004799
ENSG00000006062.13,ENSG00000006062,MAP3K14,ENSG00000006062


In [17]:
# Look up the hgnc symbol of each of our genes, then keep the symbols that
# also appear in the published gene list
our_gene_ids_hgnc = gene_id_mapping.loc[our_gene_ids, 'hgnc_symbol']
shared_genes_hgnc = set(our_gene_ids_hgnc) & set(published_generic_genes)
print(len(shared_genes_hgnc))

17788


In [18]:
# Translate the shared hgnc symbols back to versioned ensembl ids, which are
# the index labels of gene_id_mapping. Several rows can carry the same symbol,
# so this list can be longer than the set of shared symbols.
is_shared_symbol = gene_id_mapping['hgnc_symbol'].isin(shared_genes_hgnc)
shared_genes = list(gene_id_mapping.loc[is_shared_symbol].index)
print(len(shared_genes))

17851


In [19]:
# Save the list of shared gene ids as a pickle file inside local_dir
shared_genes_file = os.path.join(
    local_dir,
    "shared_gene_ids.pickle")

# The context manager closes the file even if pickling raises, which the
# previous explicit open()/close() pair did not guarantee
with open(shared_genes_file, 'wb') as outfile:
    pickle.dump(shared_genes, outfile)

This experiment contains both RNA-seq and smRNA-seq samples, which are in different ranges, so we will drop the smRNA samples so that the remaining samples are within the same range. The analysis identifying these two subsets of samples can be found in this [notebook](0_explore_input_data.ipynb).

In [20]:
# Run accessions of the smRNA-seq samples of the template experiment.
# They are removed further down so that the remaining samples share one range.
smRNA_samples = [
    "SRR493961", "SRR493962", "SRR493963", "SRR493964",
    "SRR493965", "SRR493966", "SRR493967", "SRR493968",
    "SRR493969", "SRR493970", "SRR493971", "SRR493972",
]

In [21]:
# Remove the smRNA-seq samples; samples are the row labels of template_data
template_data = template_data.drop(smRNA_samples, axis=0)

In [22]:
# Drop genes: keep only the columns listed in shared_genes.
# Selecting with a list of labels returns those columns in list order.
# NOTE: template_data is reassigned from itself, so the frame shown by
# earlier cells no longer reflects the current contents of this variable.
template_data = template_data[shared_genes]

# Show the resulting dimensions and a preview of the first rows
print(template_data.shape)
template_data.head()

(24, 17851)


Unnamed: 0,ENSG00000002330.13,ENSG00000003137.8,ENSG00000003249.13,ENSG00000004799.7,ENSG00000006062.13,ENSG00000006282.20,ENSG00000006283.17,ENSG00000006327.13,ENSG00000007384.15,ENSG00000007968.6,...,ENSG00000203965.12,ENSG00000205352.10,ENSG00000206538.8,ENSG00000211450.9,ENSG00000213246.6,ENSG00000224470.7,ENSG00000244879.5,ENSG00000264364.2,ENSG00000226978.1,ENSG00000230292.5
SRR493937,12.990968,0.848728,0.809381,34.353087,12.810135,12.114271,0.280827,36.630775,7.228667,0.276567,...,0.634714,19.870212,1.752787,13.408707,16.808396,8.976874,0.976822,11.274722,0.0,0.0
SRR493938,12.513636,1.016285,0.907216,34.784134,12.080994,12.518965,0.283395,34.576562,7.338301,0.361873,...,0.538282,19.251368,1.746381,13.165898,16.929152,8.891573,0.912389,11.632596,0.0,0.0
SRR493939,25.954618,0.49441,1.870051,9.915847,6.725095,21.012343,0.223001,32.285838,17.261978,0.705615,...,1.356601,34.834635,0.962179,22.371422,23.873134,5.430164,1.49381,14.609568,0.0,0.0
SRR493940,25.763659,0.624823,1.736342,10.573514,6.58389,21.389096,0.17423,32.901649,17.409587,0.633884,...,1.452227,35.175937,0.971337,22.598672,24.089967,5.504614,1.512516,14.139525,0.0,0.0
SRR493941,18.379733,3.738637,1.44336,32.690667,5.856697,13.002362,0.189597,8.971186,13.199591,0.145056,...,0.932711,18.635415,4.78081,9.416897,19.599759,9.417604,0.960957,12.767854,0.0,0.0


In [23]:
# Write the filtered template experiment back to template_data_file, replacing
# the file that was read earlier (tab-separated, floats with 5 decimal places)
template_data.to_csv(template_data_file, sep='\t', float_format='%.5f')

### Normalize compendium 

In [24]:
# Path of the real gene expression compendium inside local_dir.
# NOTE(review): the file name is spelled "compedium"; presumably it matches the
# name used when the file was written -- confirm before renaming.
original_compendium_file = os.path.join(local_dir, "recount2_compedium_data.tsv")

In [25]:
# Read the compendium (tab-separated, header row, first column as the row index).
# pd.read_csv is used instead of pd.read_table so that this loader matches every
# other loader in this notebook; read_table was deprecated in pandas 0.24.x.
original_compendium = pd.read_csv(
    original_compendium_file,
    header=0,
    sep='\t',
    index_col=0)

# Drop genes: keep only the columns listed in shared_genes
original_compendium = original_compendium[shared_genes]

print(original_compendium.shape)
original_compendium.head()

(1552, 17851)


Unnamed: 0,ENSG00000002330.13,ENSG00000003137.8,ENSG00000003249.13,ENSG00000004799.7,ENSG00000006062.13,ENSG00000006282.20,ENSG00000006283.17,ENSG00000006327.13,ENSG00000007384.15,ENSG00000007968.6,...,ENSG00000203965.12,ENSG00000205352.10,ENSG00000206538.8,ENSG00000211450.9,ENSG00000213246.6,ENSG00000224470.7,ENSG00000244879.5,ENSG00000264364.2,ENSG00000226978.1,ENSG00000230292.5
SRR1604987,13.982,0.053,9.477,0.895,2.042,15.303,0.012,118.426,5.025,0.49,...,2.291,52.515,0.758,29.027,34.794,7.11,1.314,28.624,0.007,0.0
SRR1604988,13.665,0.074,9.05,1.025,1.951,16.045,0.016,116.722,5.386,0.468,...,2.362,53.092,0.655,29.024,35.853,7.036,1.251,28.456,0.0,0.0
SRR1604989,7.415,0.108,8.961,3.269,3.297,19.08,0.02,114.528,4.698,0.313,...,1.752,36.212,0.236,27.901,21.716,9.401,3.564,23.767,0.0,0.0
SRR1604990,7.149,0.094,8.979,3.733,3.227,18.335,0.022,115.94,5.013,0.262,...,1.676,35.664,0.296,27.573,23.04,9.456,3.49,24.948,0.0,0.0
SRR1604991,11.509,0.656,8.244,2.268,4.801,19.253,0.027,61.891,3.208,0.326,...,1.366,32.978,3.788,19.781,27.818,7.313,1.215,16.875,0.0,0.0


In [26]:
# Rescale every gene (column) to the 0-1 range. The fitted scaler object is
# kept in `scaler` so that it can be saved afterwards.
scaler = preprocessing.MinMaxScaler()
scaler.fit(original_compendium)
original_data_scaled = scaler.transform(original_compendium)

# Wrap the scaled array in a frame carrying the original row and column labels
original_data_scaled_df = pd.DataFrame(
    original_data_scaled,
    index=original_compendium.index,
    columns=original_compendium.columns,
)

print(original_data_scaled_df.shape)
original_data_scaled_df.head()

(1552, 17851)


Unnamed: 0,ENSG00000002330.13,ENSG00000003137.8,ENSG00000003249.13,ENSG00000004799.7,ENSG00000006062.13,ENSG00000006282.20,ENSG00000006283.17,ENSG00000006327.13,ENSG00000007384.15,ENSG00000007968.6,...,ENSG00000203965.12,ENSG00000205352.10,ENSG00000206538.8,ENSG00000211450.9,ENSG00000213246.6,ENSG00000224470.7,ENSG00000244879.5,ENSG00000264364.2,ENSG00000226978.1,ENSG00000230292.5
SRR1604987,0.030346,0.001769,0.183531,0.010066,0.071795,0.168457,0.000242,0.346751,0.135361,0.010723,...,0.203789,0.174118,0.01027,0.082616,0.186744,0.210119,0.02665,0.26023,7.5e-05,0.0
SRR1604988,0.029658,0.00247,0.175262,0.011528,0.068596,0.176625,0.000323,0.341762,0.145085,0.010242,...,0.210105,0.176031,0.008875,0.082608,0.192428,0.207932,0.025372,0.258703,0.0,0.0
SRR1604989,0.016093,0.003605,0.173538,0.036767,0.11592,0.210035,0.000404,0.335338,0.126552,0.00685,...,0.155844,0.120064,0.003198,0.079411,0.116553,0.277824,0.072283,0.216073,0.0,0.0
SRR1604990,0.015516,0.003138,0.173887,0.041985,0.113459,0.201834,0.000444,0.339472,0.135038,0.005734,...,0.149084,0.118247,0.004011,0.078478,0.123659,0.279449,0.070782,0.22681,0.0,0.0
SRR1604991,0.024979,0.021898,0.159653,0.025508,0.1688,0.211939,0.000545,0.181217,0.086415,0.007134,...,0.121509,0.109341,0.051324,0.0563,0.149303,0.216118,0.024642,0.153416,0.0,0.0


In [27]:
# Save the normalized compendium (tab-separated, floats with 3 decimal places)
normalized_data_file = os.path.join(
    local_dir,
    "normalized_recount2_compendium_data.tsv")

original_data_scaled_df.to_csv(
    normalized_data_file, float_format='%.3f', sep='\t')

# Write the gene-filtered compendium back to original_compendium_file.
# NOTE: this replaces the file that was read earlier with the reduced table.
original_compendium.to_csv(
    original_compendium_file, float_format='%.3f', sep='\t')

# Save scaler transform
scaler_file = os.path.join(
    local_dir,
    "scaler_transform.pickle")

# The context manager closes the file even if pickling raises, which the
# previous explicit open()/close() pair did not guarantee
with open(scaler_file, 'wb') as outfile:
    pickle.dump(scaler, outfile)

### Train VAE 

In [28]:
"""# Setup directories
# Create VAE directories
output_dirs = [os.path.join(base_dir, dataset_name, "models"),
               os.path.join(base_dir, dataset_name, "logs")]

# Check if analysis output directory exist otherwise create
for each_dir in output_dirs:
    if os.path.exists(each_dir) == False:
        print('creating new directory: {}'.format(each_dir))
        os.makedirs(each_dir, exist_ok=True)

# Check if NN architecture directory exist otherwise create
for each_dir in output_dirs:
    new_dir = os.path.join(each_dir, NN_architecture)
    if os.path.exists(new_dir) == False:
        print('creating new directory: {}'.format(new_dir))
        os.makedirs(new_dir, exist_ok=True)"""

'# Setup directories\n# Create VAE directories\noutput_dirs = [os.path.join(base_dir, dataset_name, "models"),\n               os.path.join(base_dir, dataset_name, "logs")]\n\n# Check if analysis output directory exist otherwise create\nfor each_dir in output_dirs:\n    if os.path.exists(each_dir) == False:\n        print(\'creating new directory: {}\'.format(each_dir))\n        os.makedirs(each_dir, exist_ok=True)\n\n# Check if NN architecture directory exist otherwise create\nfor each_dir in output_dirs:\n    new_dir = os.path.join(each_dir, NN_architecture)\n    if os.path.exists(new_dir) == False:\n        print(\'creating new directory: {}\'.format(new_dir))\n        os.makedirs(new_dir, exist_ok=True)'

In [29]:
"""# Train VAE on new compendium data
# Write out model to rank_pathways directory
pipeline.train_vae(config_file,
                   normalized_data_file)"""

'# Train VAE on new compendium data\n# Write out model to rank_pathways directory\npipeline.train_vae(config_file,\n                   normalized_data_file)'