<h1><b>Compare performance between omics/expression/data and omics/expression/streamed-data on the GTEx dataset</b></h1>

<h2><b>Set instance and token</b></h2>

In [3]:
suppressMessages(library(tidyverse))
library(httr)
library(RJSONIO)
library(integrationCurator) # Genestack client library

# Connection settings, exported as environment variables (presumably the
# place where the integrationCurator client looks them up -- TODO confirm).
# NOTE(review): the API token below is stored in plain text in the notebook;
# consider supplying it from outside the file and rotating this value.
Sys.setenv(PRED_SPOT_HOST = "occam.genestack.com")
Sys.setenv(PRED_SPOT_TOKEN = "bd7ebdc3ea0ac8be98ecad5a7570589513885a81")
Sys.setenv(PRED_SPOT_VERSION = "default-released")

<h2><b>Set query parameters</b></h2>

In [4]:
# Inputs shared by both benchmarks below: the study and the gene to query.
studyID <- "GSF962829"
gene <- "ENSG00000014257"

# Study filter string of the form `genestack:accession=<study accession>`.
study_filter <- paste0("genestack:accession=", studyID)
# Expression query string: the gene plus a `MinValue=0` condition.
ex_query <- paste0("Gene=", gene, " MinValue=0")

<h2><b>Get expression data</b></h2>

<h3><b>Non-streaming data API</b></h3>

In [4]:

# Benchmark of the paged (non-streaming) expression endpoint: request page
# after page, following the cursor returned with each response, until a page
# comes back without data, then report row count and elapsed time.

start <- Sys.time()

# Safety cap so that an unexpected cursor behaviour cannot loop forever.
max_pages <- 1000L

pages <- list()
cursor <- NULL

for (page_no in seq_len(max_pages)) {
    # The first request is sent without a cursor; later ones resume from the
    # cursor handed back by the previous response.
    request_args <- list(study_filter = study_filter, ex_query = ex_query)
    if (!is.null(cursor)) {
        request_args$cursor <- cursor
    }
    res_ns <- do.call(OmicsQueriesApi_search_expression_data, request_args)$content

    # An empty page means the result set is exhausted.
    if (NROW(res_ns$data) == 0) {
        break
    }

    # cbind() over the elements of `data` yields one flat table per page.
    pages[[length(pages) + 1L]] <- as_tibble(do.call(cbind, res_ns$data))

    cursor <- res_ns$cursor
    if (is.null(cursor)) {
        break
    }
}

# Combine all pages in one go instead of growing the table inside the loop.
expressions <- if (length(pages) > 0) do.call(rbind, pages) else tibble()

# Force seconds: a bare `Sys.time() - start` switches to minutes past 60 s,
# which would contradict the unit printed in the message.
elapsed <- round(as.numeric(difftime(Sys.time(), start, units = "secs")))

cat(sprintf('Time to get %s expression values: %s seconds\n\n',
    nrow(expressions), elapsed))

head(expressions)


Time to get 11683 expression values: 21 seconds



itemId,metadata.Experimental Platform,metadata.Source ID,metadata.Genome Version,metadata.Scale,metadata.Normalization Method,metadata.Transcriptomics Source,metadata.Run Source ID,runId,groupId,gene,expression,sample
<chr>,<list>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<chr>
225817-ENSG00000014257,"Illumina TrueSeq.v1, HiSeq 2000 , HiSeq2500",GTEx V7,GRCh37,Unknown,TPM,GTEx,GTEX-1117F-0226-SM-5GZZ7,225817,GSF976981,ENSG00000014257,45,GSF962832
225818-ENSG00000014257,"Illumina TrueSeq.v1, HiSeq 2000 , HiSeq2500",GTEx V7,GRCh37,Unknown,TPM,GTEx,GTEX-111CU-1826-SM-5GZYN,225818,GSF976981,ENSG00000014257,138,GSF962862
225819-ENSG00000014257,"Illumina TrueSeq.v1, HiSeq 2000 , HiSeq2500",GTEx V7,GRCh37,Unknown,TPM,GTEx,GTEX-111FC-0226-SM-5N9B8,225819,GSF976981,ENSG00000014257,33,GSF962868
225820-ENSG00000014257,"Illumina TrueSeq.v1, HiSeq 2000 , HiSeq2500",GTEx V7,GRCh37,Unknown,TPM,GTEx,GTEX-111VG-2326-SM-5N9BK,225820,GSF976981,ENSG00000014257,65,GSF962892
225821-ENSG00000014257,"Illumina TrueSeq.v1, HiSeq 2000 , HiSeq2500",GTEx V7,GRCh37,Unknown,TPM,GTEx,GTEX-111YS-2426-SM-5GZZQ,225821,GSF976981,ENSG00000014257,51,GSF962921
225822-ENSG00000014257,"Illumina TrueSeq.v1, HiSeq 2000 , HiSeq2500",GTEx V7,GRCh37,Unknown,TPM,GTEx,GTEX-1122O-2026-SM-5NQ91,225822,GSF976981,ENSG00000014257,19,GSF962944


<h3><b>Streaming data API</b></h3>

In [6]:
# Benchmark of the streaming expression endpoint: look up the study's
# expression group, download the values for `gene` from
# omics/expression/streamed-data, and reshape them to one row per run.

# Reuse the connection settings exported with Sys.setenv() at the top of the
# notebook rather than repeating the host and API token here.
host <- Sys.getenv("PRED_SPOT_HOST")
token <- Sys.getenv("PRED_SPOT_TOKEN")
api_version <- Sys.getenv("PRED_SPOT_VERSION")
stopifnot(nzchar(host), nzchar(token), nzchar(api_version))
BASE_URL <- sprintf("/frontend/rs/genestack/integrationCurator/%s/", api_version)

start <- Sys.time()

# Get group accession and group metadata from a single request, so the timed
# region does not include a duplicated round trip.
parents <- as_tibble(ExpressionIntegrationApi_get_parents_by_study(id = studyID)$content)
group_accession <- parents$itemId

group_meta <- parents$metadata %>%
    rename_all(function(x) paste0("metadata.", x)) %>%
    mutate(groupId = group_accession)

# Get expression data
res <- httr::GET(
    sprintf("https://%s%somics/expression/streamed-data", host, BASE_URL),
    add_headers(accept = "gzip", `Genestack-API-Token` = token),
    query = list(
        groupAccession = group_accession,
        featureList = gene
    )
)
# Fail loudly on an HTTP error instead of parsing the error body as CSV.
httr::stop_for_status(res)

streaming_data <- as_tibble(
    read.csv(text = rawToChar(res$content), header = TRUE, stringsAsFactors = FALSE)
)

# Force seconds: a bare `Sys.time() - start` switches to minutes past 60 s,
# which would contradict the unit printed in the message.
end <- round(as.numeric(difftime(Sys.time(), start, units = "secs")))

# Wide (one column per run) -> long (one row per run), keep non-negative
# values, attach the group-level metadata to every row.
streaming_data_processed <- streaming_data %>%
    pivot_longer(cols = -NAME, names_to = "source.ID", values_to = "expression") %>%
    dplyr::filter(expression >= 0) %>%
    add_column(group_meta) %>%
    rename(`metadata.Run Source ID` = source.ID, gene = NAME) %>%
    # Presumably undoes read.csv()'s header sanitising, which turns "-" into
    # "." in column names -- confirm against the raw response header.
    mutate(`metadata.Run Source ID` = gsub(".", "-", `metadata.Run Source ID`, fixed = TRUE))

cat(sprintf('Time to get %s expression values: %s seconds\n\n',
    nrow(streaming_data_processed), end))

head(streaming_data_processed)
                         

Time to get 11688 expression values: 7 seconds



gene,metadata.Run Source ID,expression,metadata.Experimental Platform,metadata.Source ID,metadata.Genome Version,metadata.Scale,metadata.Normalization Method,metadata.Transcriptomics Source,groupId
<chr>,<chr>,<dbl>,<list>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
ENSG00000014257,GTEX-1117F-0226-SM-5GZZ7,45,"Illumina TrueSeq.v1, HiSeq 2000 , HiSeq2500",GTEx V7,GRCh37,Unknown,TPM,GTEx,GSF976981
ENSG00000014257,GTEX-1117F-0426-SM-5EGHI,15,"Illumina TrueSeq.v1, HiSeq 2000 , HiSeq2500",GTEx V7,GRCh37,Unknown,TPM,GTEx,GSF976981
ENSG00000014257,GTEX-1117F-0526-SM-5EGHJ,23,"Illumina TrueSeq.v1, HiSeq 2000 , HiSeq2500",GTEx V7,GRCh37,Unknown,TPM,GTEx,GSF976981
ENSG00000014257,GTEX-1117F-0626-SM-5N9CS,38,"Illumina TrueSeq.v1, HiSeq 2000 , HiSeq2500",GTEx V7,GRCh37,Unknown,TPM,GTEx,GSF976981
ENSG00000014257,GTEX-1117F-0726-SM-5GIEN,27,"Illumina TrueSeq.v1, HiSeq 2000 , HiSeq2500",GTEx V7,GRCh37,Unknown,TPM,GTEx,GSF976981
ENSG00000014257,GTEX-1117F-1326-SM-5EGHH,62,"Illumina TrueSeq.v1, HiSeq 2000 , HiSeq2500",GTEx V7,GRCh37,Unknown,TPM,GTEx,GSF976981


In [7]:
suppressMessages(library(janitor))
# Side-by-side listing of column names and types in the two result tables.
compare_df_cols(expressions, streaming_data_processed)

# Run Source IDs that occur in both tables.
dfSample <- intersect(
    expressions[["metadata.Run Source ID"]],
    streaming_data_processed[["metadata.Run Source ID"]]
)

# Rows of the streamed result whose Run Source ID has no counterpart in the
# paged result.
streaming_data_processed %>%
    dplyr::filter(!(`metadata.Run Source ID` %in% expressions[["metadata.Run Source ID"]]))

column_name,expressions,streaming_data_processed
<chr>,<chr>,<chr>
expression,numeric,numeric
gene,character,character
groupId,character,character
itemId,character,
metadata.Experimental Platform,list,list
metadata.Genome Version,character,character
metadata.Normalization Method,character,character
metadata.Run Source ID,character,character
metadata.Scale,character,character
metadata.Source ID,character,character


gene,metadata.Run Source ID,expression,metadata.Experimental Platform,metadata.Source ID,metadata.Genome Version,metadata.Scale,metadata.Normalization Method,metadata.Transcriptomics Source,groupId
<chr>,<chr>,<dbl>,<list>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
ENSG00000014257,GTEX-1399S-2326-SM-5K7YV,764,"Illumina TrueSeq.v1, HiSeq 2000 , HiSeq2500",GTEx V7,GRCh37,Unknown,TPM,GTEx,GSF976981
ENSG00000014257,GTEX-1477Z-2826-SM-5SI9J,19,"Illumina TrueSeq.v1, HiSeq 2000 , HiSeq2500",GTEx V7,GRCh37,Unknown,TPM,GTEx,GSF976981
ENSG00000014257,GTEX-18A67-1326-SM-7LT8V,83,"Illumina TrueSeq.v1, HiSeq 2000 , HiSeq2500",GTEx V7,GRCh37,Unknown,TPM,GTEx,GSF976981
ENSG00000014257,GTEX-R55D-0006-SM-3GIJS,256,"Illumina TrueSeq.v1, HiSeq 2000 , HiSeq2500",GTEx V7,GRCh37,Unknown,TPM,GTEx,GSF976981
ENSG00000014257,GTEX-XQ8I-1926-SM-4BOOK,60,"Illumina TrueSeq.v1, HiSeq 2000 , HiSeq2500",GTEx V7,GRCh37,Unknown,TPM,GTEx,GSF976981
