In [1]:
import random
import xml.etree.ElementTree as ET

import pandas
from plotnine import *
import requests
from tqdm import tqdm_notebook

%load_ext rpy2.ipython

In [2]:
tree = ET.parse('../data/tree.xml')
root = tree.getroot()

In [3]:
studies = root.findall('EXPERIMENT_PACKAGE_SET/EXPERIMENT_PACKAGE/STUDY')

study_accessions = [study.get('accession') for study in studies]
# Remove duplicate accessions
study_accessions = list(set(study_accessions))
print(study_accessions[:5])

['SRP026048', 'SRP044149', 'SRP050493', 'SRP083082', 'ERP012633']


In [4]:
%%R
library(recount)

# Download the metadata for all samples in recount, and write it to a file
metadata <- all_metadata()
write.table(metadata, '~/Desktop/brd-net/data/recount_metadata.csv', sep='\t', row.names=FALSE)






Attaching package: ‘BiocGenerics’



    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, parApply, parCapply, parLapply,
    parLapplyLB, parRapply, parSapply, parSapplyLB



    IQR, mad, sd, var, xtabs



    anyDuplicated, append, as.data.frame, basename, cbind, colMeans,
    colnames, colSums, dirname, do.call, duplicated, eval, evalq,
    Filter, Find, get, grep, grepl, intersect, is.unsorted, lapply,
    lengths, Map, mapply, match, mget, order, paste, pmax, pmax.int,
    pmin, pmin.int, Position, rank, rbind, Reduce, rowMeans, rownames,
    rowSums, sapply, setdiff, sort, table, tapply, union, unique,
    unsplit, which, which.max, which.min



Attaching package: ‘S4Vectors’



    expand.grid






    Vignettes contain introductory material; view with
    'browseVignettes()'. To cite Bioconductor, see
    'citation("Biobase")', and for packages 'citation("pkgname")'.




Attaching package: ‘matrixStats’



    anyMissing, rowMedians

In [5]:
# Load the metadata from calling recount's all_metadata function
recount2_metadata = pandas.read_csv('../data/recount_metadata.csv', sep='\t')

# Create the set of all project accessions present in recount2
recount2_study_accessions = set(recount2_metadata['project'])
print(len(recount2_study_accessions))

3219


In [6]:
# Keep only the accessions that can be downloaded from recount2
study_accessions = [accession for accession in study_accessions if accession in recount2_study_accessions]
print(len(study_accessions))

239


In [7]:
# Download information about a single study to show the data format returned from SRA
data = requests.get('http://metasra.biostat.wisc.edu/api/v01/samples.json?study={}'.format(study_accessions[0]))
print(data.json())

{'sampleCount': 7, 'terms': [{'sampleCount': 7, 'dterm': {'name': 'adult', 'ids': ['EFO:0001272']}}, {'sampleCount': 7, 'dterm': {'name': 'adult organism', 'ids': ['UBERON:0007023']}}, {'sampleCount': 7, 'dterm': {'name': 'dorsal telencephalon', 'ids': ['EFO:0003534']}}, {'sampleCount': 7, 'dterm': {'name': 'female organism', 'ids': ['UBERON:0003100']}}, {'sampleCount': 7, 'dterm': {'name': 'male organism', 'ids': ['UBERON:0003101']}}, {'sampleCount': 7, 'dterm': {'name': 'middle frontal gyrus', 'ids': ['UBERON:0002702']}}, {'sampleCount': 7, 'dterm': {'name': 'prefrontal cortex', 'ids': ['UBERON:0000451']}}], 'skip': 0, 'studyCount': 1, 'studies': [{'sampleCount': 7, 'study': {'title': 'Global epigenomic reconfiguration during mammalian brain development', 'id': 'SRP026048', 'recountId': 'SRP026048'}, 'sampleGroups': [{'type': {'type': 'tissue', 'conf': 0.9112536951752213}, 'samples': [{'experiments': [{'runs': ['SRR921940', 'SRR921941'], 'id': 'SRX314970'}], 'name': 'Middle frontal g

In [8]:
id_to_studies = {}
id_to_samples = {}
id_to_name = {}

total_samples = 0

# Find the disease ontology information associated with all the studies in both recount2 and 
# the output of find_studies.py
for accession in tqdm_notebook(study_accessions):
    data = requests.get('http://metasra.biostat.wisc.edu/api/v01/samples.json?study={}'.format(accession)).json()
    
    total_samples += data['sampleCount']
    
    for term in data['terms']:
        term_name = term['dterm']['name']
        term_ids = term['dterm']['ids']
        
        disease = False
        disease_id = None
        for id_ in term_ids:
            if 'DOID' in id_:
                disease_id = id_
                disease = True
                break
                
        if not disease:
            continue
        
        if disease_id in id_to_studies:
            id_to_studies[disease_id] += 1
            id_to_samples[disease_id] += term['sampleCount']
        else:
            id_to_studies[disease_id] = 1
            id_to_samples[disease_id] = term['sampleCount']
            id_to_name[disease_id] = term_name            

HBox(children=(IntProgress(value=0, max=239), HTML(value='')))




In [9]:
disease_dict = {}
for id_ in id_to_studies:
    disease_dict[id_] = [id_to_studies[id_], id_to_samples[id_], id_to_name[id_]]

disease_df = pandas.DataFrame.from_dict(disease_dict, orient='index', columns=['num_studies', 'num_samples', 'name'])

all_doids = set(id_to_studies.keys())

disease_df.sort_values('num_samples').head()

Unnamed: 0,num_studies,num_samples,name
DOID:12849,1,6,autistic disorder
DOID:11722,1,6,myotonic dystrophy type 1
DOID:1974,1,6,adenosarcoma
DOID:3965,1,8,Merkel cell carcinoma
DOID:397,1,8,restrictive cardiomyopathy


In [10]:
DOID_to_descendants = {}
def get_descendants(doid):
    '''This function calls the Disease Ontology API to find the ids of all descendants of a given doid'''
    if doid in DOID_to_descendants:
        return DOID_to_descendants[doid]
    
    descendants = []
    data = requests.get('http://www.disease-ontology.org/api/metadata/{}'.format(doid)).json()
    
    if 'children' not in data:
        return []
    
    for child in data['children']:
        child_doid = child[1]
        descendants.append(child_doid)
        
        # recursively traverse the ontology
        child_descendants = get_descendants(child_doid)
        descendants.extend(child_descendants)
        
    DOID_to_descendants[doid] = descendants
    return descendants

In [11]:
# Iterate through all terms found in the data, and only add them to a list if 
# none of their descendants are in the study

leaf_ids = []

# Because this function involves recursively traversing a tree, the estimated time
# given by tqdm will be way off. That said, it takes awhile to run (~ 20 minutes)

for doid in tqdm_notebook(id_to_name):
    descendants = get_descendants(doid)
    descendant_in_study = False
    for descendant in descendants:
        if descendant in all_doids:
            descendant_in_study = True
            break
    
    if not descendant_in_study:
        leaf_ids.append(doid)

# These aren't necessarily leaves of the ontology, but they are don't have any descendants
# in this dataset, so we'll call them leaves
print(len(leaf_ids))

HBox(children=(IntProgress(value=0, max=124), HTML(value='')))


102


In [12]:
disease_df.loc[leaf_ids].head()

Unnamed: 0,num_studies,num_samples,name
DOID:0060469,2,88,Miller-Dieker lissencephaly syndrome
DOID:0050908,1,22,myelodysplastic syndrome
DOID:2935,1,9,Chediak-Higashi syndrome
DOID:11162,1,83,respiratory failure
DOID:3910,3,150,lung adenocarcinoma


In [13]:
print(total_samples)
print(disease_df.loc[leaf_ids]['num_samples'].sum())

13437
8877


In [14]:
# Select studies at random from the list of all samples
# If the study has one of the leaf ids, add it to the list
# Stop once enough samples are included (~1000 valid samples)

# In a script, you would put the random seed at the top. Since each cell can be
# rerun in a notebook, manually setting the seed within each cell with a random
# call ensures that you get the same results regardless of the order in which
# the cells are run
random.seed(42)

leaf_set = set(leaf_ids)

samples_drawn = 0
plier_sample_accessions = []

# Select samples in a random order
indices = list(range(len(study_accessions)))
random.shuffle(indices)

for index in tqdm_notebook(indices):
    study = study_accessions[index]
    data = requests.get('http://metasra.biostat.wisc.edu/api/v01/samples.json?study={}'.format(study)).json()

    for term in data['terms']:
        term_ids = term['dterm']['ids']

        use_sample = False
        for id_ in term_ids:
            if id_ in leaf_set:
                plier_sample_accessions.append(study)
                
                samples_drawn += data['sampleCount']
                use_sample = True
                break
        
        if use_sample:
            break
            
    # Some of the samples will turn out to be invalid, so we use 3000 here instead of
    # a proportion of the total number of samples
    if samples_drawn > 3000:
        break
        
print(plier_sample_accessions)

HBox(children=(IntProgress(value=0, max=239), HTML(value='')))

['SRP053296', 'SRP002272', 'SRP056612', 'SRP056293', 'SRP012546', 'SRP003754', 'DRP000987', 'SRP007946', 'SRP017413', 'SRP023262', 'SRP007483', 'SRP014020', 'SRP056041', 'SRP065812', 'SRP002326', 'SRP015668', 'SRP033464', 'SRP043085', 'SRP017262', 'SRP042184', 'ERP001304', 'SRP065559', 'SRP049820', 'SRP052896', 'ERP001908', 'ERP012633', 'SRP064378', 'SRP051825', 'SRP052056', 'SRP002628', 'SRP009251', 'SRP041538', 'SRP039694', 'SRP011422', 'ERP009437', 'SRP007947', 'SRP063460', 'SRP033095', 'SRP040622', 'SRP035988', 'SRP063493', 'SRP057118', 'SRP033566', 'SRP047192', 'SRP000941', 'SRP045611', 'ERP004592', 'SRP056792', 'SRP041736', 'SRP045500', 'SRP015640', 'SRP061888', 'SRP035524', 'SRP055874', 'SRP029880', 'ERP002414', 'SRP028554', 'SRP007461', 'SRP034698', 'SRP031459', 'SRP042228']


In [15]:
# Get all runs from xml
experiment_packages = root.findall('EXPERIMENT_PACKAGE_SET/EXPERIMENT_PACKAGE')

plier_healthy = []
plier_disease = []
classifier_healthy = []
classifier_disease = []

recount_study_set = set(study_accessions)

# Find the run accessions and sort them into groups
for experiment_package in experiment_packages:
    study_accession = experiment_package.find('STUDY').get('accession')
    in_plier_set = study_accession in plier_sample_accessions
    
    case_control_status = experiment_package.get('category')
    
    if case_control_status == 'invalid' or study_accession not in recount_study_set:
        continue
    
    
    runs = experiment_package.findall('RUN_SET/RUN/IDENTIFIERS/PRIMARY_ID')
    for run in runs:
        run_id = '.'.join([study_accession, run.text])
        
        if case_control_status == 'control':
            if in_plier_set:
                plier_healthy.append(run_id)
            else:
                classifier_healthy.append(run_id)
        elif case_control_status == 'case':
            if in_plier_set:
                plier_disease.append(run_id)
            else:
                classifier_disease.append(run_id)
      

In [16]:
print(len(plier_healthy))
print(len(plier_disease))
    
print(len(classifier_healthy))
print(len(classifier_disease))

print(plier_healthy[:5])

300
805
3347
4047
['SRP041538.SRR1265536', 'SRP041538.SRR1265535', 'SRP041538.SRR1265534', 'SRP041538.SRR1265495', 'SRP041538.SRR1265533']


## Downloading Data
---
We'll now switch to R, because the only programmatic way to access recount2 data is through the recount bioconductor package. Thanks to the magic of rpy, we can use the list of run accessions we generated in the R code below.

The R portion of this notebook is based heavily on Qiewen Hu's script [here](https://github.com/greenelab/rheum-plier-data/blob/master/recount2/1-get_all_recount_dataset.R)

In [17]:
%%R 
`%>%` <- dplyr::`%>%`
library(recount)
library(xml2)

In [18]:
%%R
# Get RPKM value for each gene - adapted from recount package
getRPKM <- function(rse, length_var = 'bp_length', mapped_var = NULL) { 
  # Computes the RPKM value for each gene in the sample.
  #
  # Args: 
  #  rse: A RangedSummarizedExperiment-class object in recount package
  #  length_var: A length 1 character vector with the column name from rowData(rse) that has
  #              the coding length. For gene level objects from recount this is bp_length. If
  #              NULL, then it will use width(rowRanges(rse)) which should be used for exon RSEs.
  #  mapped_var: A length 1 character vector with the column name from colData(rse) that has
  #              the number of reads mapped. If NULL (default) then it will use the column 
  #              sums of the counts matrix
  # Returns:
  #   RPKM value for each sample
  if(!is.null(mapped_var)){
    mapped <- colData(rse)[, mapped_var] 
  } else {
    mapped <- colSums(assays(rse)$counts) 
  } 
  bg <- matrix(mapped, ncol = ncol(rse), nrow = nrow(rse), byrow = TRUE) 
  if(!is.null(length_var)){
    len <- rowData(rse)[, length_var] 
  } else {
    len <- width(rowRanges(rse))
  }
  wid <- matrix(len, nrow = nrow(rse), ncol = ncol(rse), byrow = FALSE) 
  rpkm <- assays(rse)$counts / (wid/1000) / (bg/1e6) 
  return(rpkm)
} 

getExperimentDf <- function(rpkm.list, id.vector) {
  # Extracts the runs found in id.vector from a list of dataframes, and combines
  # them into a single large dataframe
    
  rpkm.df <- do.call(base::cbind, c(rpkm.list, by = "id"))
  subset.df <- rpkm.df %>% dplyr::select(dplyr::one_of(id.vector))
  subset.df <- tibble::rownames_to_column(subset.df, "ENSG")
  
  return(subset.df)
}

In [19]:
%%R

data.dir <- file.path("../data")
recount.dir <- file.path(data.dir, "recount")
dir.create(recount.dir, recursive = TRUE, showWarnings = FALSE)

In [20]:
%%R -i study_accessions

included.study.list <- study_accessions

# Download all recount2 samples in included.study.list that are available
# This takes a while
lapply(included.study.list, 
       function(x) tryCatch(download_study(x, type = "rse-gene", 
                                           outdir = file.path(recount.dir, x)),
                            error= function(e) NULL))



























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































[[1]]
[1] "http://duffel.rail.bio/recount/v2/SRP026048/rse_gene.Rdata"

[[2]]
[1] "http://duffel.rail.bio/recount/v2/SRP050493/rse_gene.Rdata"

[[3]]
[1] "http://duffel.rail.bio/recount/v2/ERP012633/rse_gene.Rdata"

[[4]]
[1] "http://duffel.rail.bio/recount/v2/SRP049820/rse_gene.Rdata"

[[5]]
[1] "http://duffel.rail.bio/recount/v2/SRP043368/rse_gene.Rdata"

[[6]]
[1] "http://duffel.rail.bio/recount/v2/SRP028336/rse_gene.Rdata"

[[7]]
[1] "http://duffel.rail.bio/recount/v2/SRP041044/rse_gene.Rdata"

[[8]]
[1] "http://duffel.rail.bio/recount/v2/SRP022043/rse_gene.Rdata"

[[9]]
[1] "http://duffel.rail.bio/recount/v2/SRP051765/rse_gene.Rdata"

[[10]]
[1] "http://duffel.rail.bio/recount/v2/SRP056612/rse_gene.Rdata"

[[11]]
[1] "http://duffel.rail.bio/recount/v2/SRP042620/rse_gene.Rdata"

[[12]]
[1] "http://duffel.rail.bio/recount/v2/SRP058181/rse_gene.Rdata"

[[13]]
[1] "http://duffel.rail.bio/recount/v2/SRP044174/rse_gene.Rdata"

[[14]]
[1] "http://duffel.rail.bio/recount/v2/SRP033248/rse_

In [21]:
%%R

# get RPKM for each experiment and add to list
rpkm.list <- list()
for(experiment in included.study.list) {
  possibleError <- tryCatch(load(file.path(recount.dir, experiment, 'rse_gene.Rdata')), error=function(e) e)
  # If the file can't be loaded, it wasn't in recount, so we don't want to try to add it to our
  # gene expression matrix
  if (inherits(possibleError, 'error')){
    next
  }
  
  rpkm <- as.data.frame(getRPKM(rse_gene))
  rpkm$id <- rownames(rpkm)
  rpkm.list[[experiment]] <- rpkm
}

In [22]:
%%R -i plier_healthy,plier_disease,classifier_healthy,classifier_disease


# Convert our lists of strings to vectors of strings, and change their names to match
# R naming conventions
plier.healthy <- unlist(plier_healthy)
plier.disease <- unlist(plier_disease)
classifier.healthy <- unlist(classifier_healthy)
classifier.disease <- unlist(classifier_disease)

plier.healthy.df <- getExperimentDf(rpkm.list, plier.healthy)
plier.disease.df <- getExperimentDf(rpkm.list, plier.disease)
classifier.healthy.df <- getExperimentDf(rpkm.list, classifier.healthy)
classifier.disease.df <- getExperimentDf(rpkm.list, classifier.disease)

print(dim(plier.healthy.df))
print(dim(plier.disease.df))
print(dim(classifier.healthy.df))
print(dim(classifier.disease.df))

[1] 58037   300
[1] 58037   791
[1] 58037  3208
[1] 58037  2434


In [24]:
%%R

write.table(plier.healthy.df, file.path(data.dir, 'plier_healthy.tsv'), sep='\t', row.names=FALSE)
write.table(plier.disease.df, file.path(data.dir, 'plier_disease.tsv'), sep='\t', row.names=FALSE)
write.table(classifier.healthy.df, file.path(data.dir, 'classifier_healthy.tsv'), sep='\t', row.names=FALSE)
write.table(classifier.disease.df, file.path(data.dir, 'classifier_disease.tsv'), sep='\t', row.names=FALSE)