# Process recount2 data
This notebook does the following:

1. Selects template experiment
2. Downloads subset of recount2 data, including the template experiment (subset of random experiments + 1 template experiment)
3. Train VAE on subset of recount2 data

In [1]:
%load_ext autoreload
%load_ext rpy2.ipython
%autoreload 2

import os
import sys
import pandas as pd
import numpy as np
import rpy2
import seaborn as sns
from sklearn import preprocessing
import pickle

from ponyo import utils, train_vae_modules
from generic_expression_patterns_modules import process, calc

np.random.seed(123)

examples.directory is deprecated; in the future, examples will be found relative to the 'datapath' directory.
  "found relative to the 'datapath' directory.".format(key))
Using TensorFlow backend.


In [2]:
# Read in config variables
base_dir = os.path.abspath(os.path.join(os.getcwd(),"../"))

config_file = os.path.abspath(os.path.join(base_dir,
                                           "configs",
                                           "config_test.tsv"))
params = utils.read_config(config_file)

### Select template experiment

We manually selected bioproject [SRP012656](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE37764), which contains primary non-small cell lung adenocarcinoma tumors and adjacent normal tissues of 6 never-smoker Korean female patients with 2 replicates each.

In [3]:
# Load params
local_dir = params["local_dir"]
dataset_name = params['dataset_name']
NN_architecture = params['NN_architecture']
project_id = params['project_id']
num_recount2_experiments = params['num_recount2_experiments_to_download']
template_data_file = params['template_data_file']
original_compendium_file = params['original_compendium_file']
normalized_data_file = params['normalized_compendium_file']
shared_genes_file = params['shared_genes_file']
scaler_file = params['scaler_transform_file']

In [4]:
# Check if analysis output directory exist otherwise create
if not os.path.exists(local_dir):
    os.makedirs(local_dir, exist_ok=True)

## Download data

In [5]:
%%R
suppressPackageStartupMessages(library('recount'))

In [6]:
%%R -i project_id -i num_recount2_experiments -i local_dir -i base_dir

source('../generic_expression_patterns_modules/download_recount2_data.R')

get_recount2_compendium(project_id, num_recount2_experiments, local_dir, base_dir)





























[1] "SRP012651"
Loading objects:
  rse_gene
[1] "SRP014574"
Loading objects:
  rse_gene
[1] "SRP000542"
Loading objects:
  rse_gene
[1] "SRP012656"
Loading objects:
  rse_gene


In [7]:
%%R -i project_id -i local_dir

source('../generic_expression_patterns_modules/download_recount2_data.R')

get_recount2_template_experiment(project_id, local_dir)




Loading objects:
  rse_gene


In [8]:
assert(os.path.exists(template_data_file))

True

## Process data

In [9]:
# Get generic genes identified by Crow et. al.
DE_prior_file = "https://raw.githubusercontent.com/maggiecrow/DEprior/master/DE_Prior.txt"

DE_prior = pd.read_csv(DE_prior_file,
                       header=0,
                       sep="\t")

In [10]:
# Get list of published generic genes
published_generic_genes = list(DE_prior['Gene_Name'])

In [11]:
# Get list of our genes

# Read template data
template_data = pd.read_csv(
    template_data_file,
    header=0,
    sep='\t',
    index_col=0)

our_gene_ids = list(template_data.columns)

In [12]:
template_data.head()

Unnamed: 0,ENSG00000000003.14,ENSG00000000005.5,ENSG00000000419.12,ENSG00000000457.13,ENSG00000000460.16,ENSG00000000938.12,ENSG00000000971.15,ENSG00000001036.13,ENSG00000001084.10,ENSG00000001167.14,...,ENSG00000283690.1,ENSG00000283691.1,ENSG00000283692.1,ENSG00000283693.1,ENSG00000283694.1,ENSG00000283695.1,ENSG00000283696.1,ENSG00000283697.1,ENSG00000283698.1,ENSG00000283699.1
SRR493937,622,1,398,394,154,10661,5865,1217,764,1061,...,0,3,0,0,0,0,22,25,0,0
SRR493938,622,0,399,362,159,10761,5770,1184,744,981,...,0,2,0,0,0,0,31,19,0,0
SRR493939,3077,0,629,911,503,941,12913,1768,694,1553,...,0,2,0,0,0,1,17,18,0,0
SRR493940,3041,0,694,918,476,886,12732,1773,695,1540,...,0,2,0,0,0,0,14,18,0,0
SRR493941,551,1,411,563,226,2303,5917,1232,625,963,...,0,1,0,0,0,0,22,33,0,0


In [13]:
# File mapping ensembl ids to hgnc symbols
gene_id_file = os.path.join(
    "data",
    "metadata",
    "ensembl_hgnc_mapping.tsv")

In [14]:
%%R
suppressWarnings(library("biomaRt"))

In [15]:
%%R -i template_data_file -i gene_id_file

# Get mapping between ensembl gene ids (ours) to HGNC gene symbols (published)

source('../generic_expression_patterns_modules/process_names.R')

if (file.exists(gene_id_file) == FALSE){
    gene_id_mapping <- get_ensembl_symbol_mapping(template_data_file, gene_id_file)
}

In [16]:
# Read gene id mapping
gene_id_mapping = pd.read_csv(
        gene_id_file,
        header=0,
        sep='\t',
        index_col=0)

print(gene_id_mapping.shape)

(57165, 2)


In [17]:
# Get mapping between ensembl ids with and without version numbers
# Expressiond data uses ensembl gene ids with version number 
ensembl_gene_ids = pd.DataFrame(data={'ensembl_version': our_gene_ids,
                                      'ensembl_parsed': [gene_id.split('.')[0] for gene_id in our_gene_ids]})

print(ensembl_gene_ids.shape)

(58037, 2)


In [18]:
# Map ensembl ids with version number to gene_id_mapping_df
gene_id_mapping = pd.merge(gene_id_mapping, 
                           ensembl_gene_ids, 
                           left_on='ensembl_gene_id',
                           right_on='ensembl_parsed', 
                           how='outer')

print(gene_id_mapping.shape)
gene_id_mapping.set_index('ensembl_version', inplace=True)

(58041, 4)


In [19]:
# Replace ensembl ids with gene symbols
template_data = process.replace_ensembl_ids(template_data,
                                            gene_id_mapping)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return super(DataFrame, self).rename(**kwargs)


In [20]:
# Get intersection of gene lists
our_gene_ids_hgnc = template_data.columns
shared_genes_hgnc = list(set(our_gene_ids_hgnc).intersection(published_generic_genes))
print(len(shared_genes_hgnc))

17784


In [21]:
# Remove subset of samples
smRNA_samples = ["SRR493961",
                 "SRR493962",
                 "SRR493963",
                 "SRR493964",
                 "SRR493965",
                 "SRR493966",
                 "SRR493967",
                 "SRR493968",
                 "SRR493969",
                 "SRR493970",
                 "SRR493971",
                 "SRR493972"]
template_data = template_data.drop(smRNA_samples)

In [22]:
# Drop genes
template_data = template_data[shared_genes_hgnc]

print(template_data.shape)

(24, 17787)


In [23]:
# Read data
print(template_data.shape)
assert(template_data.shape[0] == 24)
template_data.head()

(24, 17787)


Unnamed: 0,HIPK1-AS1,CHST4,SMARCAD1,CENPE,CREM,SSTR2,GCAT,BLACAT1,ATG13,RTKN2,...,TPPP2,GTF2E1,SRSF3,SLC7A9,ARV1,ELP2,OR7A10,BRIP1,ACCS,MARK3
SRR493937,19,16,670,73,2329,174,126,24,2237,754,...,15,210,5214,26,203,1402,0,41,1871,2674
SRR493938,17,19,657,61,2335,186,128,26,2156,750,...,12,212,5187,16,210,1442,0,55,1867,2592
SRR493939,9,6,1004,107,854,65,421,203,2284,266,...,15,262,7766,78,485,1500,0,124,1898,2434
SRR493940,10,18,966,121,763,88,433,206,2314,241,...,7,286,7575,73,533,1405,0,149,1838,2405
SRR493941,12,16,881,44,497,38,237,13,2894,7391,...,38,283,4390,8,392,2017,0,25,2260,2881


### Normalize compendium 

In [24]:
# Read data
original_compendium = pd.read_table(
    original_compendium_file,
    header=0,
    sep='\t',
    index_col=0)

print(original_compendium.shape)
original_compendium.head()

(92, 58037)


Unnamed: 0,ENSG00000000003.14,ENSG00000000005.5,ENSG00000000419.12,ENSG00000000457.13,ENSG00000000460.16,ENSG00000000938.12,ENSG00000000971.15,ENSG00000001036.13,ENSG00000001084.10,ENSG00000001167.14,...,ENSG00000283690.1,ENSG00000283691.1,ENSG00000283692.1,ENSG00000283693.1,ENSG00000283694.1,ENSG00000283695.1,ENSG00000283696.1,ENSG00000283697.1,ENSG00000283698.1,ENSG00000283699.1
SRR493816,525,0,884,428,449,37,532,2370,3393,727,...,0,2,0,0,0,0,13,17,0,0
SRR493817,348,0,790,342,364,57,23,1851,3986,614,...,0,1,0,0,0,0,12,24,0,0
SRR547975,335,0,1978,432,774,0,914,6974,425,2563,...,0,8,0,0,0,0,3,21,0,0
SRR547973,1111,0,1301,863,737,0,0,1912,3475,685,...,0,1,0,0,0,0,36,12,0,0
SRR547968,213,0,717,526,484,0,0,8267,2385,1829,...,0,0,0,0,0,0,24,34,0,0


In [25]:
# Replace ensembl ids with gene symbols
original_compendium = process.replace_ensembl_ids(original_compendium,
                                                gene_id_mapping)

In [26]:
# Drop genes
original_compendium = original_compendium[shared_genes_hgnc]

original_compendium.head()

Unnamed: 0,HIPK1-AS1,CHST4,SMARCAD1,CENPE,CREM,SSTR2,GCAT,BLACAT1,ATG13,RTKN2,...,TPPP2,GTF2E1,SRSF3,SLC7A9,ARV1,ELP2,OR7A10,BRIP1,ACCS,MARK3
SRR493816,13,4,285,132,127,3,758,902,7555,14,...,4,412,3832,4,254,1092,0,162,440,2864
SRR493817,9,0,184,63,283,4,379,842,6443,30,...,3,309,3700,14,196,964,0,118,388,3018
SRR547975,17,0,2186,2980,292,15,995,2,2932,5,...,0,642,9713,6,940,1747,0,910,479,2285
SRR547973,18,0,2262,2027,466,6,378,5,4454,558,...,1,472,5528,8,867,1417,0,283,767,3953
SRR547968,11,2,1967,1264,304,18,286,80,3406,1524,...,0,409,7416,5,562,2037,0,1159,474,2638


In [27]:
# 0-1 normalize per gene
scaler = preprocessing.MinMaxScaler()
original_data_scaled = scaler.fit_transform(original_compendium)
original_data_scaled_df = pd.DataFrame(original_data_scaled,
                                columns=original_compendium.columns,
                                index=original_compendium.index)

print(original_data_scaled_df.shape)
original_data_scaled_df.head()

(92, 17787)


Unnamed: 0,HIPK1-AS1,CHST4,SMARCAD1,CENPE,CREM,SSTR2,GCAT,BLACAT1,ATG13,RTKN2,...,TPPP2,GTF2E1,SRSF3,SLC7A9,ARV1,ELP2,OR7A10,BRIP1,ACCS,MARK3
SRR493816,0.323529,0.006803,0.087763,0.027082,0.035436,0.000447,0.57555,0.620358,1.0,0.000782,...,0.050633,0.315709,0.320047,0.002618,0.225979,0.155195,0.0,0.103514,0.13382,0.724024
SRR493817,0.205882,0.0,0.055992,0.012926,0.08038,0.000596,0.287775,0.579092,0.852754,0.001744,...,0.037975,0.236782,0.308993,0.009162,0.174377,0.136801,0.0,0.075399,0.118005,0.763051
SRR547975,0.441176,0.0,0.68575,0.611407,0.082973,0.002236,0.755505,0.001376,0.387844,0.000241,...,0.0,0.491954,0.81251,0.003927,0.836299,0.249317,0.0,0.58147,0.145681,0.577293
SRR547973,0.470588,0.0,0.709657,0.41588,0.133103,0.000895,0.287016,0.003439,0.58938,0.033498,...,0.012658,0.361686,0.462067,0.005236,0.771352,0.201897,0.0,0.180831,0.233273,1.0
SRR547968,0.264706,0.003401,0.616861,0.259335,0.08643,0.002684,0.21716,0.055021,0.450609,0.091592,...,0.0,0.31341,0.620164,0.003272,0.5,0.29099,0.0,0.740575,0.144161,0.666751


In [28]:
# Save data
template_data.to_csv(
    template_data_file, sep='\t')

original_compendium.to_csv(
    original_compendium_file, sep='\t')

original_data_scaled_df.to_csv(
    normalized_data_file, sep='\t')

# Save scaler transform
outfile = open(scaler_file,'wb')
pickle.dump(scaler,outfile)
outfile.close()

### Train VAE 
Performed exploratory analysis of compendium data [here](../explore_data/viz_recount2_compendium.ipynb) to help interpret loss curve.

In [29]:
# Setup directories
# Create VAE directories
output_dirs = [os.path.join(base_dir, dataset_name, "models"),
               os.path.join(base_dir, dataset_name, "logs")]

# Check if analysis output directory exist otherwise create
for each_dir in output_dirs:
    if os.path.exists(each_dir) == False:
        print('creating new directory: {}'.format(each_dir))
        os.makedirs(each_dir, exist_ok=True)

# Check if NN architecture directory exist otherwise create
for each_dir in output_dirs:
    new_dir = os.path.join(each_dir, NN_architecture)
    if os.path.exists(new_dir) == False:
        print('creating new directory: {}'.format(new_dir))
        os.makedirs(new_dir, exist_ok=True)

creating new directory: /home/alexandra/Documents/Repos/generic-expression-patterns/tests/models
creating new directory: /home/alexandra/Documents/Repos/generic-expression-patterns/tests/logs
creating new directory: /home/alexandra/Documents/Repos/generic-expression-patterns/tests/models/NN_2500_30
creating new directory: /home/alexandra/Documents/Repos/generic-expression-patterns/tests/logs/NN_2500_30


In [30]:
# Train VAE on new compendium data
#train_vae_modules.train_vae(config_file,
#                            normalized_data_file)

In [31]:
# Test reproducibility
template_path = "data/test_vae_logs.tsv"
output_path = "logs/NN_2500_30/tybalt_2layer_30latent_stats.tsv"
assert np.all(np.isclose(
    pd.read_csv(output_path, sep="\t").values,
    pd.read_csv(template_path, sep="\t").values
    ))

FileNotFoundError: File b'logs/NN_2500_30/tybalt_2layer_30latent_stats.tsv' does not exist