# Introduction

My comparison notebook has gotten too large, and it contains many things that don't
help communicate with the scientists. I'm thinking I should move all the complex data loading code over into a new notebook.

# Setup

* <a href="#Load-experiment-data">Load experiment data</a>
  * <a href="#Load-13-pg-tissue-experiments">Load 13 pg tissue experiments</a>
    * <a href="#List-loaded-13-pg-tissue-experiments">List loaded 13 pg tissue experiments</a>
    * <a href="#List-13-pg-tissue-libraries-and-files">List 13 pg tissue libraries and files</a>
    * <a href="#Example-13-pg-tissue-correlation-scores">Example 13 pg tissue correlation scores</a>
  * <a href="#Load-10-ng-tissue-experiments">Load 10 ng tissue experiments</a>
    * <a href="#List-loaded-10-ng-tissue-experiments">List loaded 10 ng tissue experiments</a>
    * <a href="#List-10-ng-tissue-libraries-and-files">List 10 ng tissue libraries and files</a>
    * <a href="#Example-10-ng-tissue-correlation-scores">Example 10 ng tissue correlation scores</a>
  * <a href="#Load-ENCODE-bulk-cell-line-experiments">Load ENCODE bulk cell line experiments</a>
    * <a href="#List-Bulk-ENCODE-bulk-cell-line-libraries-and-files">List ENCODE bulk cell line libraries and files</a>
    * <a href="#Example-ENCODE-bulk-cell-line-correlation-scores">Example ENCODE bulk cell line correlation scores</a> 
  * <a href="#Load-pool-split">Load C57Bl6 layer 5 pyramidal neuron pool/split examples</a>
* <a href="#Load-Gencode-Gene-ID-to-Gene-Name">Load gencode gene id to gene name translation table</a>
* <a href="#Generate-DCC-Library-to-Human-Readable-Names">Generate DCC Library to Human Readable Names</a>
* <a href="#Compute-scores-for-all-libraries-vs-all-libraries">Compute scores for all libraries vs all libraries by experiment set</a>

In [1]:
from __future__ import print_function, division

import pandas
import numpy
import scipy.stats
import collections

import os
import sys

import paramiko
import RDF

In [2]:
ROOT = os.path.expanduser('~/proj/encode3-curation')
if ROOT not in sys.path:
    sys.path.append(ROOT)

In [3]:
import curation_common
from curation_common import *

In [4]:
server = ENCODED('www.encodeproject.org')
server.load_netrc()

In [5]:
def load_experiments_by_library(model, libraries, verbose=False):
    """Load experiment information using DCC Library objects.
    
    The library objects aren't terribly useful as they don't
    point at the experiments and unfortunately files are attached
    to the experiments and not libraries or replicates.
    """
    for i, lib_id in enumerate(libraries):
        if verbose:
            print(lib_id, i, "of", len(libraries))
        libdata = server.get_jsonld(lib_id)
        load_jsonld_into_model(model, libdata)
        search_results = server.search_jsonld(searchTerm=lib_id)
        for g in search_results['@graph']:
            if 'Experiment' in g['@type']:
                experiment = server.get_jsonld(g['@id'])
                load_jsonld_into_model(model, experiment)

In [6]:
def load_experiments_by_dataset(model, dataset, verbose=False):
    """Load experiment information using DCC Library objects.
    
    The library objects aren't terribly useful as they don't
    point at the experiments and unfortunately files are attached
    to the experiments and not libraries or replicates.
    """
    
    jsonld = server.get_jsonld(dataset)
    load_jsonld_into_model(model, jsonld)    

    query = RDF.SPARQLQuery("""@PREFIX replicate <https://www.encodeproject.org/profiles/replicate.json#> .
select ?library
where {
    ?replicate replicate:Library ?library .
}
""")
    libraries = query.execute(model)
    for i, row in enumerate(libraries):
        lib_id = str(row['library'])
        if verbose:
            print(lib_id, i, "of", len(libraries))
        libdata = server.get_jsonld(lib_id)
        load_jsonld_into_model(model, libdata)


In [57]:
def load_quantifications(experiment_files, quantification_name='FPKM'):
    """Load FPKMs out of RSEM results into a pandas dataframe
    
    Columns will be library accession identifiers.
    """
    column_map = {
        'TPM': 5,
        'FPKM': 6
    }
    column = column_map[quantification_name]
    libraries = []
    quantifications = []
    libraries = []
    experiments = {}
    for i, row in enumerate(experiment_files):
        experiment = str(row['experiment'])
        library = str(row['library'])[-12:-1]
        experiments.setdefault(experiment, []).append(library)
        url = str(row['file_href'])
        print('loading: {} {}'.format(library, url))
        rsem = pandas.read_csv(url, sep='\t', index_col=0, usecols=[0, column])
        quantifications.append(rsem[quantification_name])
        libraries.append(library)

    for key in experiments:
        if len(experiments[key]) != 2:
            print("removing libs of exp {}. [{}]".format(key, ','.join(experiments[key])))
            for lib in experiments[key]:
                i = libraries.index(lib)
                del quantifications[i]
                del libraries[i]

    df = pandas.concat(quantifications, axis=1)
    df.columns = libraries
    return df

In [8]:
def replicate_scores(table, rep1_name, rep2_name, Acutoff=0):
    """Compute correlations, MAD, and SD replicate comparison scores
    """
    rep1 = table[rep1_name]
    rep2 = table[rep2_name]
    
    eitherzero = (rep1 == 0) | (rep2 == 0)
    replz1 = numpy.log2(rep1[eitherzero != True])
    replz2 = numpy.log2(rep2[eitherzero != True])
    
    M = replz1 - replz2
    A = (replz1 + replz2) / 2.0

    scores = pandas.Series({
        'total rows': len(table),
        'passed filter': len(replz1[A > Acutoff]),
            
        'Naïve Pearson': scipy.stats.pearsonr(rep1, rep2)[0],
        'Naïve Spearman': scipy.stats.spearmanr(rep1, rep2)[0],
            
        'Rafa Pearson': scipy.stats.pearsonr(replz1[A > Acutoff], replz2[A > Acutoff])[0],
        'Rafa Spearman': scipy.stats.spearmanr(replz1[A > Acutoff], replz2[A > Acutoff])[0],
        'MAD': numpy.round(numpy.median(numpy.abs(M)[A > Acutoff]) * 1.4826, 3),
        'SD': numpy.round(numpy.sqrt(numpy.mean(M[A > Acutoff] ** 2)), 3)
    },
    index = ['total rows', 'passed filter', 
             'Naïve Pearson', 'Naïve Spearman', 
             'Rafa Pearson', 'Rafa Spearman',
             'MAD', 'SD']
    )
    return scores

In [9]:
def compute_all_vs_all_scores(fpkms, Acutoff=0):
    """Compute all the scores of note for a FPKM table.
    """
    all_scores = collections.OrderedDict()
    shape = (len(fpkms.columns),len(fpkms.columns))
    for rep1 in fpkms.columns:
        for rep2 in fpkms.columns:
            scores = replicate_scores(fpkms, rep1, rep2, Acutoff)
            for name in scores.keys():
                if name not in all_scores:
                    all_scores[name] = pandas.DataFrame(
                        numpy.zeros(shape),
                        index=fpkms.columns,
                        columns=fpkms.columns
                    )
                all_scores[name][rep1][rep2] = scores[name]
    return pandas.Panel(all_scores)

# Load experiment data

## Load 13 pg tissue experiments

List of our library IDs that have been uploaded to the DCC

https://www.encodeproject.org/datasets/ENCSR901QHQ/

In [10]:
tissue_13pg_libs = [
    'barbara-wold:{}'.format(x) for x in [
        14626, 14629, 14495, 14630, 14627, 14631, 14628, 14632,
        14499, 14633, 14501, 14634, 14485, 14486, 
        14487, 14488, 14653, 14654, 14655, 14656, 
        # TODO: Why are these libraries missing not available as aliases?
        # TODO: 14665 = STL 010_liver_GITC_7/1/2014_13pgs
        # TODO: 14666 = STL 011_liver_GITC_7/1/2014_13pgs
        #14665, 14666, 
        14635, 14636
        ]
    ]

The load function is a bit complex as files are attached to experiments and not to their library, so we have to search for the library to find the related experiment, and then load the experiment into our model.

In [11]:
tissue_13pg_model = get_model(use_contexts=False)
load_experiments_by_library(tissue_13pg_model, tissue_13pg_libs)

### List loaded 13 pg tissue experiments

In [12]:
%%sparql -m tissue_13pg_model -c
select ?exp ?description
where {
  ?exp a experiment:Experiment ;
       rdf:description ?description .
}

Found 11 rows.


0,1
exp,description
experiments:ENCSR527RFK/,Total RNA-seq on postnatal 0 day mouse forebrain (13pgs)
experiments:ENCSR026ZRP/,Total RNA-seq on postnatal 0 day mouse midbrain (13pgs)
experiments:ENCSR861FGB/,Total RNA-seq on postnatal 0 day mouse hindbrain (13pgs)
experiments:ENCSR510ADJ/,Total RNA-seq on postnatal 0 day mouse heart (13pg)
experiments:ENCSR950BNG/,Total RNA-seq on postnatal 0 day mouse liver (13pgs)
experiments:ENCSR265YFZ/,Total RNA-seq on postnatal 0 day mouse skeletal muscle (13pgs)
experiments:ENCSR225BBK/,Total RNA-seq on PGP human bipolar spindle neurons (13pgs)
experiments:ENCSR368QPC/,Total RNA-seq on PGP human IPS cells (13pgs)
experiments:ENCSR000OXO/,Total RNA-seq on embryonic 11.5 day mouse forebrain


Find gene quantifications files for our current annotation (M4). This was needed as there are quantification runs for M2 and M3 for some of the libraries.

(Hopefully we weren't expecting human....)

In [13]:
%%sparql -m tissue_13pg_model -c -o tissue_13pg_files
select ?experiment ?library ?age ?bioname ?genome_annotation ?biorep ?techrep ?file_href 
where {
  ?experiment a experiment:Experiment ;
              experiment:files ?file .
  ?file file:output_type ?output_type ;
        file:href ?file_href ;
        file:genome_annotation ?genome_annotation ;
        file:replicate ?replicate .
  ?replicate replicate:library ?library ;
             replicate:biological_replicate_number ?biorep ;
             replicate:technical_replicate_number ?techrep .
  ?library library:aliases ?alias ;
           library:biosample ?biosample .
  ?biosample biosample:biosample_term_name ?bioname ;
             biosample:age ?age .
  filter(regex(?output_type, "gene quantifications"))
  filter(regex(?genome_annotation, "M4"))
}
order by ?experiment ?alias
limit 40


Found 18 rows.


### List 13 pg tissue libraries and files

In [14]:
tissue_13pg_files

0,1,2,3,4,5,6,7
experiment,library,age,bioname,genome_annotation,biorep,techrep,file_href
experiments:ENCSR000OXO/,libraries:ENCLB917PKP/,11.5,forebrain,M4,1,1,files:ENCFF005TWA/@@download/ENCFF005TWA.tsv
experiments:ENCSR000OXO/,libraries:ENCLB026BHP/,11.5,forebrain,M4,2,1,files:ENCFF550IBS/@@download/ENCFF550IBS.tsv
experiments:ENCSR026ZRP/,libraries:ENCLB719BQO/,0,midbrain,M4,1,1,files:ENCFF091FHP/@@download/ENCFF091FHP.tsv
experiments:ENCSR026ZRP/,libraries:ENCLB669AEL/,0,midbrain,M4,2,1,files:ENCFF447MON/@@download/ENCFF447MON.tsv
experiments:ENCSR265YFZ/,libraries:ENCLB304LFK/,0,skeletal muscle tissue,M4,1,1,files:ENCFF849RFV/@@download/ENCFF849RFV.tsv
experiments:ENCSR265YFZ/,libraries:ENCLB096HAH/,0,skeletal muscle tissue,M4,2,1,files:ENCFF021FHW/@@download/ENCFF021FHW.tsv
experiments:ENCSR510ADJ/,libraries:ENCLB652HKH/,0,heart,M4,1,1,files:ENCFF408DTF/@@download/ENCFF408DTF.tsv
experiments:ENCSR510ADJ/,libraries:ENCLB348BMH/,0,heart,M4,2,1,files:ENCFF393RNC/@@download/ENCFF393RNC.tsv
experiments:ENCSR527RFK/,libraries:ENCLB766UOB/,0,forebrain,M4,1,1,files:ENCFF923GRU/@@download/ENCFF923GRU.tsv


In [15]:
tissue_13pg_fpkms = load_quantifications(tissue_13pg_files, 'FPKM')

loading: ENCLB917PKP https://www.encodeproject.org/files/ENCFF005TWA/@@download/ENCFF005TWA.tsv
loading: ENCLB026BHP https://www.encodeproject.org/files/ENCFF550IBS/@@download/ENCFF550IBS.tsv
loading: ENCLB719BQO https://www.encodeproject.org/files/ENCFF091FHP/@@download/ENCFF091FHP.tsv
loading: ENCLB669AEL https://www.encodeproject.org/files/ENCFF447MON/@@download/ENCFF447MON.tsv
loading: ENCLB304LFK https://www.encodeproject.org/files/ENCFF849RFV/@@download/ENCFF849RFV.tsv
loading: ENCLB096HAH https://www.encodeproject.org/files/ENCFF021FHW/@@download/ENCFF021FHW.tsv
loading: ENCLB652HKH https://www.encodeproject.org/files/ENCFF408DTF/@@download/ENCFF408DTF.tsv
loading: ENCLB348BMH https://www.encodeproject.org/files/ENCFF393RNC/@@download/ENCFF393RNC.tsv
loading: ENCLB766UOB https://www.encodeproject.org/files/ENCFF923GRU/@@download/ENCFF923GRU.tsv
loading: ENCLB181TCJ https://www.encodeproject.org/files/ENCFF235UIN/@@download/ENCFF235UIN.tsv
loading: ENCLB449LBZ https://www.encodep

In [16]:
tissue_13pg_fpkms.columns

Index(['ENCLB917PKP', 'ENCLB026BHP', 'ENCLB719BQO', 'ENCLB669AEL',
       'ENCLB304LFK', 'ENCLB096HAH', 'ENCLB652HKH', 'ENCLB348BMH',
       'ENCLB766UOB', 'ENCLB181TCJ', 'ENCLB449LBZ', 'ENCLB905LVV',
       'ENCLB238LIR', 'ENCLB765HDK', 'ENCLB005HHX', 'ENCLB185MNU',
       'ENCLB356IIP', 'ENCLB791CRT'],
      dtype='object')

In [17]:
tissue_13pg_tpms = load_quantifications(tissue_13pg_files, 'TPM')

loading: ENCLB917PKP https://www.encodeproject.org/files/ENCFF005TWA/@@download/ENCFF005TWA.tsv
loading: ENCLB026BHP https://www.encodeproject.org/files/ENCFF550IBS/@@download/ENCFF550IBS.tsv
loading: ENCLB719BQO https://www.encodeproject.org/files/ENCFF091FHP/@@download/ENCFF091FHP.tsv
loading: ENCLB669AEL https://www.encodeproject.org/files/ENCFF447MON/@@download/ENCFF447MON.tsv
loading: ENCLB304LFK https://www.encodeproject.org/files/ENCFF849RFV/@@download/ENCFF849RFV.tsv
loading: ENCLB096HAH https://www.encodeproject.org/files/ENCFF021FHW/@@download/ENCFF021FHW.tsv
loading: ENCLB652HKH https://www.encodeproject.org/files/ENCFF408DTF/@@download/ENCFF408DTF.tsv
loading: ENCLB348BMH https://www.encodeproject.org/files/ENCFF393RNC/@@download/ENCFF393RNC.tsv
loading: ENCLB766UOB https://www.encodeproject.org/files/ENCFF923GRU/@@download/ENCFF923GRU.tsv
loading: ENCLB181TCJ https://www.encodeproject.org/files/ENCFF235UIN/@@download/ENCFF235UIN.tsv
loading: ENCLB449LBZ https://www.encodep

### Example 13 pg tissue correlation scores

Replicates from Experiment ENCSR000OXO 

Scores from <a href="http://wiki.encodedcc.org/index.php/File:Lrna_qc_all.xlsx">Lrna_qc_all.xlsx</a>, to provide at least some verification I reimplmented Rafa's algorithm correction.

<table>
  <tr><td>Rafa Pearson</td><td>MAD</td><td>Rafa Spearman</td><td>SD</td></tr>
  <tr><td>0.5428255</td><td>1.344</td><td>0.6686317</td><td>2.453</td></tr>
</table>

In [18]:
replicate_scores(tissue_13pg_fpkms, 'ENCLB917PKP', 'ENCLB026BHP')

total rows        69690.000000
passed filter      7888.000000
Naïve Pearson         0.969447
Naïve Spearman        0.795169
Rafa Pearson          0.542825
Rafa Spearman         0.668632
MAD                   1.344000
SD                    2.453000
dtype: float64

## Load 10 ng tissue experiments

In [19]:
tissue_10ng_libs = [
    'barbara-wold:{}'.format(x) for x in [
        '15492', '15491', '15490', '15489', '15486', '15485', '15484', '15483',
        '15480', '15479', '15478', '15477', '15476', '15475', '15018', '15017',
        '15016', '15015', '15012', '15011', '15010', '15009', '15008', '15007', 
    ]
]

tissue_10ng_model = get_model(use_contexts=False)
load_experiments_by_library(tissue_10ng_model, tissue_10ng_libs)

### List loaded 10 ng tissue experiments

In [20]:
%%sparql -m tissue_10ng_model -c
select distinct ?exp ?description
where {
  ?exp a experiment:Experiment ;
       rdf:description ?description .
}
group by ?exp
order by ?exp

Found 12 rows.


0,1
exp,description
experiments:ENCSR017JEG/,Total RNA-Seq on postnatal 0 day mouse hindbrain
experiments:ENCSR096STK/,RNA-seq on mouse liver
experiments:ENCSR160IIN/,Total RNA-seq on 10ng of embryonic 11.5 day mouse forebrain
experiments:ENCSR284AMY/,Total RNA-seq on 10ng of embryonic 11.5 day mouse liver
experiments:ENCSR307BCA/,RNA-seq on embryonic 11.5 day mouse midbrain
experiments:ENCSR362AIZ/,Total RNA-Seq on postnatal 0 day mouse forebrain
experiments:ENCSR438XCG/,Total RNA-Seq on postnatal 0 day mouse thymus
experiments:ENCSR526SEX/,Total RNA-Seq on postnatal 0 day mouse heart
experiments:ENCSR691OPQ/,RNA-seq on embryonic 11.5 day mouse heart


In [21]:
%%sparql -m tissue_10ng_model -c -o tissue_10ng_files
select ?experiment ?library ?age ?bioname ?genome_annotation ?biorep ?techrep ?output_type ?file_href
where {
    ?experiment a experiment:Experiment .
    ?experiment experiment:files ?file .
    ?file file:output_type ?output_type ;
          file:href ?file_href ;
          file:genome_annotation ?genome_annotation ;
          file:replicate ?replicate .
    ?replicate replicate:library ?library ;
               replicate:biological_replicate_number ?biorep ;
               replicate:technical_replicate_number ?techrep .
    ?library library:aliases ?alias ;
             library:biosample ?biosample .
    ?biosample biosample:biosample_term_name ?bioname ;
               biosample:age ?age .    
    filter(regex(?output_type, "gene quantifications"))        
    #filter(regex(?genome_annotation, "(M4)|(V19)"))
}
order by ?experiment ?alias
limit 40


Found 24 rows.


### List 10 ng tissue libraries and files

In [22]:
tissue_10ng_files

0,1,2,3,4,5,6,7,8
experiment,library,age,bioname,genome_annotation,biorep,techrep,output_type,file_href
experiments:ENCSR017JEG/,libraries:ENCLB274VUA/,0,hindbrain,M4,1,1,gene quantifications,files:ENCFF635MWR/@@download/ENCFF635MWR.tsv
experiments:ENCSR017JEG/,libraries:ENCLB441AFS/,0,hindbrain,M4,2,1,gene quantifications,files:ENCFF845TNV/@@download/ENCFF845TNV.tsv
experiments:ENCSR096STK/,libraries:ENCLB370ZFK/,0,liver,M4,1,1,gene quantifications,files:ENCFF872PTK/@@download/ENCFF872PTK.tsv
experiments:ENCSR096STK/,libraries:ENCLB273BPC/,0,liver,M4,2,1,gene quantifications,files:ENCFF122YPQ/@@download/ENCFF122YPQ.tsv
experiments:ENCSR160IIN/,libraries:ENCLB835LVO/,11.5,forebrain,M4,1,1,gene quantifications,files:ENCFF376TIM/@@download/ENCFF376TIM.tsv
experiments:ENCSR160IIN/,libraries:ENCLB471QMM/,11.5,forebrain,M4,2,1,gene quantifications,files:ENCFF033VVD/@@download/ENCFF033VVD.tsv
experiments:ENCSR284AMY/,libraries:ENCLB055KYV/,11.5,liver,M4,1,1,gene quantifications,files:ENCFF645HNE/@@download/ENCFF645HNE.tsv
experiments:ENCSR284AMY/,libraries:ENCLB357KYA/,11.5,liver,M4,2,1,gene quantifications,files:ENCFF081XJC/@@download/ENCFF081XJC.tsv
experiments:ENCSR307BCA/,libraries:ENCLB659UFY/,11.5,midbrain,M4,1,1,gene quantifications,files:ENCFF877LFX/@@download/ENCFF877LFX.tsv


In [23]:
tissue_10ng_fpkms = load_quantifications(tissue_10ng_files, 'FPKM')

loading: ENCLB274VUA https://www.encodeproject.org/files/ENCFF635MWR/@@download/ENCFF635MWR.tsv
loading: ENCLB441AFS https://www.encodeproject.org/files/ENCFF845TNV/@@download/ENCFF845TNV.tsv
loading: ENCLB370ZFK https://www.encodeproject.org/files/ENCFF872PTK/@@download/ENCFF872PTK.tsv
loading: ENCLB273BPC https://www.encodeproject.org/files/ENCFF122YPQ/@@download/ENCFF122YPQ.tsv
loading: ENCLB835LVO https://www.encodeproject.org/files/ENCFF376TIM/@@download/ENCFF376TIM.tsv
loading: ENCLB471QMM https://www.encodeproject.org/files/ENCFF033VVD/@@download/ENCFF033VVD.tsv
loading: ENCLB055KYV https://www.encodeproject.org/files/ENCFF645HNE/@@download/ENCFF645HNE.tsv
loading: ENCLB357KYA https://www.encodeproject.org/files/ENCFF081XJC/@@download/ENCFF081XJC.tsv
loading: ENCLB659UFY https://www.encodeproject.org/files/ENCFF877LFX/@@download/ENCFF877LFX.tsv
loading: ENCLB426JKF https://www.encodeproject.org/files/ENCFF227YZV/@@download/ENCFF227YZV.tsv
loading: ENCLB266LCY https://www.encodep

In [24]:
tissue_10ng_fpkms.columns

Index(['ENCLB274VUA', 'ENCLB441AFS', 'ENCLB370ZFK', 'ENCLB273BPC',
       'ENCLB835LVO', 'ENCLB471QMM', 'ENCLB055KYV', 'ENCLB357KYA',
       'ENCLB659UFY', 'ENCLB426JKF', 'ENCLB266LCY', 'ENCLB055JUC',
       'ENCLB074REG', 'ENCLB415KPR', 'ENCLB658ICO', 'ENCLB741KQB',
       'ENCLB601XLL', 'ENCLB347FRI', 'ENCLB080NNG', 'ENCLB180OTB',
       'ENCLB454ZUS', 'ENCLB061TDP', 'ENCLB847UDV', 'ENCLB704CYQ'],
      dtype='object')

In [25]:
tissue_10ng_tpms = load_quantifications(tissue_10ng_files, 'TPM')

loading: ENCLB274VUA https://www.encodeproject.org/files/ENCFF635MWR/@@download/ENCFF635MWR.tsv
loading: ENCLB441AFS https://www.encodeproject.org/files/ENCFF845TNV/@@download/ENCFF845TNV.tsv
loading: ENCLB370ZFK https://www.encodeproject.org/files/ENCFF872PTK/@@download/ENCFF872PTK.tsv
loading: ENCLB273BPC https://www.encodeproject.org/files/ENCFF122YPQ/@@download/ENCFF122YPQ.tsv
loading: ENCLB835LVO https://www.encodeproject.org/files/ENCFF376TIM/@@download/ENCFF376TIM.tsv
loading: ENCLB471QMM https://www.encodeproject.org/files/ENCFF033VVD/@@download/ENCFF033VVD.tsv
loading: ENCLB055KYV https://www.encodeproject.org/files/ENCFF645HNE/@@download/ENCFF645HNE.tsv
loading: ENCLB357KYA https://www.encodeproject.org/files/ENCFF081XJC/@@download/ENCFF081XJC.tsv
loading: ENCLB659UFY https://www.encodeproject.org/files/ENCFF877LFX/@@download/ENCFF877LFX.tsv
loading: ENCLB426JKF https://www.encodeproject.org/files/ENCFF227YZV/@@download/ENCFF227YZV.tsv
loading: ENCLB266LCY https://www.encodep

### Example 10 ng tissue correlation scores

replicate scores for experiment ENCSR760TOE

Scores from <a href="http://wiki.encodedcc.org/index.php/File:Lrna_qc_all.xlsx">Lrna_qc_all.xlsx</a>, to provide at least some verification I reimplmented Rafa's algorithm correction.

<table>
  <tr><td>Rafa Pearson</td><td>MAD</td><td>Rafa Spearman</td><td>SD</td></tr>
  <tr><td>0.9873127</td><td>0.238</td><td>0.9882345</td><td>0.331</td></tr>
</table>

(The Pearson is very slightly different, but could be plausible a difference in rounding.)

In [26]:
replicate_scores(tissue_10ng_fpkms, 'ENCLB454ZUS', 'ENCLB061TDP')

total rows        69690.000000
passed filter     13761.000000
Naïve Pearson         0.992520
Naïve Spearman        0.942254
Rafa Pearson          0.987313
Rafa Spearman         0.988234
MAD                   0.238000
SD                    0.331000
dtype: float64

## Load ENCODE bulk cell line experiments

In [27]:
bulk_cell_line_model = get_model()
bulk_cell_line_evaluation = server.get_jsonld('https://www.encodeproject.org/datasets/ENCSR000AJW/')
load_jsonld_into_model(bulk_cell_line_model, bulk_cell_line_evaluation)

Load in additional information about libraries so we can get at the nucelic acid and biosample term name.

In [28]:
%%sparql -m bulk_cell_line_model -c -o bulk_cell_line_libraries
select ?library
where {
    ?replicate replicate:library ?library .
}

Found 24 rows.


  results = Redland.librdf_query_execute(self._query,model._model)


In [29]:
for row in bulk_cell_line_libraries:
    load_jsonld_into_model(
        bulk_cell_line_model,
        server.get_jsonld(str(row['library']))
    )

In [30]:
load_jsonld_into_model(bulk_cell_line_model, server.get_jsonld("https://www.encodeproject.org/files/ENCFF782PCD/"))
load_jsonld_into_model(bulk_cell_line_model, server.get_jsonld("https://www.encodeproject.org/files/ENCFF902SEE/"))

In [43]:
%%sparql -m bulk_cell_line_model -c -o bulk_cell_line_files
select ?labname ?experiment ?library ?bioname ?na_term ?file_href ?output_type ?genome_annotation ?biorep ?techrep
     
where {
  ?experiment a experiment:Experiment ;
              experiment:lab ?lab ;
              experiment:files ?file .
  ?file file:output_type ?output_type ;
        file:href ?file_href ;
        file:genome_annotation ?genome_annotation ;
        file:replicate ?replicate . 
  ?replicate replicate:library ?library ;
             replicate:biological_replicate_number ?biorep ;
             replicate:technical_replicate_number ?techrep .
  ?library library:biosample ?biosample ;
           library:nucleic_acid_term_name ?na_term .
  ?biosample biosample:biosample_term_name ?bioname .
  ?lab <https://www.encodeproject.org/profiles/Lab.json#title> ?labname .
    
  filter(regex(?output_type, "gene quantifications"))
}
order by ?labname ?experiment ?library
limit 40


Found 24 rows.


## List Bulk ENCODE bulk cell line libraries and files

In [44]:
bulk_cell_line_files

0,1,2,3,4,5,6,7,8,9
labname,experiment,library,bioname,na_term,file_href,output_type,genome_annotation,biorep,techrep
"Barbara Wold, Caltech",experiments:ENCSR000AEG/,libraries:ENCLB043ZZZ/,GM12878,RNA,files:ENCFF486PVW/@@download/ENCFF486PVW.tsv,gene quantifications,V19,1,1
"Barbara Wold, Caltech",experiments:ENCSR000AEG/,libraries:ENCLB044ZZZ/,GM12878,RNA,files:ENCFF428CJQ/@@download/ENCFF428CJQ.tsv,gene quantifications,V19,2,1
"Barbara Wold, Caltech",experiments:ENCSR000AEH/,libraries:ENCLB045ZZZ/,GM12878,polyadenylated mRNA,files:ENCFF745IAF/@@download/ENCFF745IAF.tsv,gene quantifications,V19,1,1
"Barbara Wold, Caltech",experiments:ENCSR000AEH/,libraries:ENCLB046ZZZ/,GM12878,polyadenylated mRNA,files:ENCFF830IVF/@@download/ENCFF830IVF.tsv,gene quantifications,V19,2,1
"Barbara Wold, Caltech",experiments:ENCSR000AEP/,libraries:ENCLB061ZZZ/,K562,RNA,files:ENCFF771MAN/@@download/ENCFF771MAN.tsv,gene quantifications,V19,1,1
"Barbara Wold, Caltech",experiments:ENCSR000AEP/,libraries:ENCLB062ZZZ/,K562,RNA,files:ENCFF806RDV/@@download/ENCFF806RDV.tsv,gene quantifications,V19,2,1
"Barbara Wold, Caltech",experiments:ENCSR000AEQ/,libraries:ENCLB063ZZZ/,K562,polyadenylated mRNA,files:ENCFF176ACR/@@download/ENCFF176ACR.tsv,gene quantifications,V19,1,1
"Barbara Wold, Caltech",experiments:ENCSR000AEQ/,libraries:ENCLB064ZZZ/,K562,polyadenylated mRNA,files:ENCFF456PAW/@@download/ENCFF456PAW.tsv,gene quantifications,V19,2,1
"Brenton Graveley, UConn",experiments:ENCSR000AEE/,libraries:ENCLB039ZZZ/,GM12878,RNA,files:ENCFF009ZXH/@@download/ENCFF009ZXH.tsv,gene quantifications,V19,1,1


#### Load ENCODE Bull cell line fpkms

In [45]:
bulk_cell_line_fpkms = load_quantifications(bulk_cell_line_files, 'FPKM')

loading: ENCLB043ZZZ https://www.encodeproject.org/files/ENCFF486PVW/@@download/ENCFF486PVW.tsv
loading: ENCLB044ZZZ https://www.encodeproject.org/files/ENCFF428CJQ/@@download/ENCFF428CJQ.tsv
loading: ENCLB045ZZZ https://www.encodeproject.org/files/ENCFF745IAF/@@download/ENCFF745IAF.tsv
loading: ENCLB046ZZZ https://www.encodeproject.org/files/ENCFF830IVF/@@download/ENCFF830IVF.tsv
loading: ENCLB061ZZZ https://www.encodeproject.org/files/ENCFF771MAN/@@download/ENCFF771MAN.tsv
loading: ENCLB062ZZZ https://www.encodeproject.org/files/ENCFF806RDV/@@download/ENCFF806RDV.tsv
loading: ENCLB063ZZZ https://www.encodeproject.org/files/ENCFF176ACR/@@download/ENCFF176ACR.tsv
loading: ENCLB064ZZZ https://www.encodeproject.org/files/ENCFF456PAW/@@download/ENCFF456PAW.tsv
loading: ENCLB039ZZZ https://www.encodeproject.org/files/ENCFF009ZXH/@@download/ENCFF009ZXH.tsv
loading: ENCLB040ZZZ https://www.encodeproject.org/files/ENCFF219RWY/@@download/ENCFF219RWY.tsv
loading: ENCLB041ZZZ https://www.encodep

In [46]:
bulk_cell_line_tpms = load_quantifications(bulk_cell_line_files, 'TPM')

loading: ENCLB043ZZZ https://www.encodeproject.org/files/ENCFF486PVW/@@download/ENCFF486PVW.tsv
loading: ENCLB044ZZZ https://www.encodeproject.org/files/ENCFF428CJQ/@@download/ENCFF428CJQ.tsv
loading: ENCLB045ZZZ https://www.encodeproject.org/files/ENCFF745IAF/@@download/ENCFF745IAF.tsv
loading: ENCLB046ZZZ https://www.encodeproject.org/files/ENCFF830IVF/@@download/ENCFF830IVF.tsv
loading: ENCLB061ZZZ https://www.encodeproject.org/files/ENCFF771MAN/@@download/ENCFF771MAN.tsv
loading: ENCLB062ZZZ https://www.encodeproject.org/files/ENCFF806RDV/@@download/ENCFF806RDV.tsv
loading: ENCLB063ZZZ https://www.encodeproject.org/files/ENCFF176ACR/@@download/ENCFF176ACR.tsv
loading: ENCLB064ZZZ https://www.encodeproject.org/files/ENCFF456PAW/@@download/ENCFF456PAW.tsv
loading: ENCLB039ZZZ https://www.encodeproject.org/files/ENCFF009ZXH/@@download/ENCFF009ZXH.tsv
loading: ENCLB040ZZZ https://www.encodeproject.org/files/ENCFF219RWY/@@download/ENCFF219RWY.tsv
loading: ENCLB041ZZZ https://www.encodep

### Example ENCODE bulk cell line correlation scores

In [47]:
replicate_scores(bulk_cell_line_fpkms, 'ENCLB035ZZZ', 'ENCLB036ZZZ')

total rows        58540.000000
passed filter     12169.000000
Naïve Pearson         0.998247
Naïve Spearman        0.928167
Rafa Pearson          0.984208
Rafa Spearman         0.984904
MAD                   0.281000
SD                    0.365000
dtype: float64

# Load pool split

These haven't been run at the DCC yet. I have STAR/RSEM results that are based on a deprecated index.

In [50]:
def load_pool_split(quantification_name='FPKM'):
    column_map = {
        'TPM': 5,
        'FPKM': 6
    }
    column = column_map[quantification_name]
    
    pool_split_root="/woldlab/castor/home/diane/proj/submission/encode-y3q3/"
    pool_split_libraries=[
        "15304-LC_805_C57Bl6_layer_V_pyramidal_neuron_pool-split",
        "15305-LC_806_C57Bl6_layer_V_pyramidal_neuron_pool-split",
        "15306-LC_807_C57Bl6_layer_V_pyramidal_neuron_pool-split",
        "15307-LC_807_C57Bl6_layer_V_pyramidal_neuron_pool-split",
        "15308-LC_808_C57Bl6_layer_V_pyramidal_neuron_pool-split",
        "15309-LC_809_C57Bl6_layer_V_pyramidal_neuron_pool-split",
        "15310-LC_810_C57Bl6_layer_V_pyramidal_neuron_pool-split",
        "15311-LC_811_C57Bl6_layer_V_pyramidal_neuron_pool-split",
        "15356-LC_813_C57Bl6_layer_V_pyramidal_neuron_pool-split",
        "15357-LC_814_C57Bl6_layer_V_pyramidal_neuron_pool-split",
        "15358-LC_815_C57Bl6_layer_V_pyramidal_neuron_pool-split",
        "15359-LC_816_C57Bl6_layer_V_pyramidal_neuron_pool-split",
        "15360-LC_817_C57Bl6_layer_V_pyramidal_neuron_pool-split",
        "15361-LC_818_C57Bl6_layer_V_pyramidal_neuron_pool-split",
        "15362-LC_819_C57Bl6_layer_V_pyramidal_neuron_pool-split",
    ]
    
    host = paramiko.SSHClient()
    host.load_system_host_keys() 
    host.set_missing_host_key_policy(paramiko.WarningPolicy())
    host.connect('pongo.cacr.caltech.edu', username='diane')
    sftp = host.open_sftp()
    sftp.chdir(pool_split_root)
    
    results = []
    libraries = []
    for path in pool_split_libraries:
        rsem_file = sftp.open(os.path.join(path, 'Aligned.toTranscriptome.out_rsem.genes.results'))
        quantification = pandas.read_csv(rsem_file, sep='\t', index_col=0, usecols=[0, column])
        results.append(quantification)
        libraries.append(path[:5])
    df = pandas.concat(results, axis=1)
    df.columns = libraries
    return df


In [53]:
#pool_split_fpkms = load_pool_split('FPKM')

In [None]:
#pool_split_tpms = load_pool_split('TPM')

In [54]:
pool_split_libs = [
    'barbara-wold:{}'.format(x) for x in [
        "15304","15305","15306","15307","15308","15309","15310","15311",
        "15356","15357","15358","15359","15360","15361","15362",
    ]
]

pool_split_model = get_model(use_contexts=False)
load_experiments_by_library(pool_split_model, pool_split_libs)

In [55]:
%%sparql -m pool_split_model -c -o pool_split_files
select ?labname ?experiment ?library ?bioname ?na_term ?file_href ?output_type ?genome_annotation ?biorep ?techrep
     
where {
  ?experiment a experiment:Experiment ;
              experiment:lab ?lab ;
              experiment:files ?file .
  ?file file:output_type ?output_type ;
        file:href ?file_href ;
        file:genome_annotation ?genome_annotation ;
        file:replicate ?replicate . 
  ?replicate replicate:library ?library ;
             replicate:biological_replicate_number ?biorep ;
             replicate:technical_replicate_number ?techrep .
  ?library library:biosample ?biosample ;
           library:nucleic_acid_term_name ?na_term .
  ?biosample biosample:biosample_term_name ?bioname .
  ?lab <https://www.encodeproject.org/profiles/Lab.json#title> ?labname .
    
  filter(regex(?output_type, "gene quantifications"))
}
order by ?labname ?experiment ?library
limit 40


Found 15 rows.


In [58]:
pool_split_fpkms = load_quantifications(pool_split_files, 'FPKM')

loading: ENCLB151YVV https://www.encodeproject.org/files/ENCFF610SDG/@@download/ENCFF610SDG.tsv
loading: ENCLB231DLC https://www.encodeproject.org/files/ENCFF483LZZ/@@download/ENCFF483LZZ.tsv
loading: ENCLB264HUG https://www.encodeproject.org/files/ENCFF730LQE/@@download/ENCFF730LQE.tsv
loading: ENCLB282TJI https://www.encodeproject.org/files/ENCFF717YLM/@@download/ENCFF717YLM.tsv
loading: ENCLB292TWB https://www.encodeproject.org/files/ENCFF679JNF/@@download/ENCFF679JNF.tsv
loading: ENCLB301VGJ https://www.encodeproject.org/files/ENCFF358GCQ/@@download/ENCFF358GCQ.tsv
loading: ENCLB376XWO https://www.encodeproject.org/files/ENCFF678QBC/@@download/ENCFF678QBC.tsv
loading: ENCLB579FVG https://www.encodeproject.org/files/ENCFF504LYK/@@download/ENCFF504LYK.tsv
loading: ENCLB637WDO https://www.encodeproject.org/files/ENCFF996SVN/@@download/ENCFF996SVN.tsv
loading: ENCLB670TAW https://www.encodeproject.org/files/ENCFF757PZM/@@download/ENCFF757PZM.tsv
loading: ENCLB679XGZ https://www.encodep

ValueError: No objects to concatenate

In [None]:
pool_split_tpms = load_quantifications(pool_split_files, 'TPM')

# Generate DCC Library to Human Readable Names

The ENCODE ids are hard to think about, and some of our descriptive names are too long, so we need to define a set of translation tables and formatting functions to select what metadata to show on our plots.

In [None]:
lab_shorten = {
    'barbara-wold': 'wold',
    'brenton-graveley': 'grav',
    'thomas-gingeras': 'ging',
}

na_shorten = {
    'polyadenylated mRNA': 'PolyA',
    'RNA': 'Total'
}

age_fix = {
    '0': 'P0',
    '11.5': 'e11.5'
}

tissue_shorten = {
    'thymus': 'thymus',
    'skeletal muscle tissue': 'skel. musc.',
    'liver': 'liver',
    'heart': 'heart',
    'midbrain': 'midbrain',
    'hindbrain': 'hindbrain',
    'forebrain': 'forebrain',
    'liver': 'liver',
}

In [None]:
bulk_cell_line_labels = collections.OrderedDict()
for row in bulk_cell_line_files:
    library = str(row['library'])[-12:-1]
    if library in bulk_cell_line_fpkms.columns:
        bulk_cell_line_labels[library] = "{lab} {na} {bio} {rep}".format(
            lab=lab_shorten[str(row['labname'])],
            na=na_shorten[str(row['na_term'])],
            bio=str(row['bioname']),
            rep='r' + str(row['biorep']),
        )
bulk_cell_line_labels = pandas.Series(bulk_cell_line_labels)

In [None]:
bulk_cell_line_labels

In [None]:
tissue_13pg_labels = collections.OrderedDict()
for row in tissue_13pg_files:
    library = str(row['library'])[-12:-1]
    if library in tissue_13pg_fpkms.columns:    
        tissue_13pg_labels[library] = "{age} {bio} {rep}".format(
            age=age_fix[str(row['age'])],
            bio=tissue_shorten[str(row['bioname'])],
            rep='r' + str(row['biorep']),
        )
    print(library, tissue_13pg_labels[library])
tissue_13pg_labels = pandas.Series(tissue_13pg_labels)

In [None]:
tissue_13pg_labels

In [None]:
tissue_10ng_labels = collections.OrderedDict()
for row in tissue_10ng_files:
    library = str(row['library'])[-12:-1]
    if library in tissue_10ng_fpkms.columns:
        tissue_10ng_labels[library] = "{age} {bio} {rep}".format(
            age=age_fix[str(row['age'])],
            bio=tissue_shorten[str(row['bioname'])],
            rep='r' + str(row['biorep']),
        )
tissue_10ng_labels = pandas.Series(tissue_10ng_labels)

In [None]:
tissue_10ng_labels

# Compute Scores

In [None]:
tissue_13pg_fpkm_scores = compute_all_vs_all_scores(tissue_13pg_fpkms)
tissue_10ng_fpkm_scores = compute_all_vs_all_scores(tissue_10ng_fpkms)
bulk_cell_line_fpkm_scores = compute_all_vs_all_scores(bulk_cell_line_fpkms)
pool_split_fpkm_scores = compute_all_vs_all_scores(pool_split_fpkms)

In [None]:
tissue_13pg_tpm_scores = compute_all_vs_all_scores(tissue_13pg_tpms)
tissue_10ng_tpm_scores = compute_all_vs_all_scores(tissue_10ng_tpms)
bulk_cell_line_tpm_scores = compute_all_vs_all_scores(bulk_cell_line_tpms)
pool_split_tpm_scores = compute_all_vs_all_scores(pool_split_tpms)

# Save tables

In [None]:
store = pandas.HDFStore('comparison.h5', 'w', complevel=5, complib='bzip2')

In [None]:
store.put('tissue_13pg/fpkms', tissue_13pg_fpkms, format="table")
store.put('tissue_13pg/fpkm_scores', tissue_13pg_fpkm_scores, format="table")
store.put('tissue_13pg/tpms', tissue_13pg_tpms, format="table")
store.put('tissue_13pg/tpm_scores', tissue_13pg_tpm_scores, format="table")
store.put('tissue_13pg/labels', tissue_13pg_labels, format="table")

store.put('tissue_10ng/fpkms', tissue_10ng_fpkms, format="table")
store.put('tissue_10ng/fpkm_scores', tissue_10ng_fpkm_scores, format="table")
store.put('tissue_10ng/tpms', tissue_10ng_tpms, format="table")
store.put('tissue_10ng/tpm_scores', tissue_10ng_tpm_scores, format="table")
store.put('tissue_10ng/labels', tissue_10ng_labels, format="table")

store.put('c57bl6_purk_pool_split/fpkms', pool_split_fpkms, format="table")
store.put('c57bl6_purk_pool_split/fpkm_scores', pool_split_fpkm_scores, format="table")
store.put('c57bl6_purk_pool_split/tpms', pool_split_tpms, format="table")
store.put('c57bl6_purk_pool_split/tpm_scores', pool_split_tpm_scores, format="table")

store.put('bulk_cell_line/fpkms', bulk_cell_line_fpkms, format="table")
store.put('bulk_cell_line/fpkm_scores', bulk_cell_line_fpkm_scores, format="table")
store.put('bulk_cell_line/tpms', bulk_cell_line_tpms, format="table")
store.put('bulk_cell_line/tpm_scores', bulk_cell_line_tpm_scores, format="table")
store.put('bulk_cell_line/labels', bulk_cell_line_labels, format="table")

In [None]:
store.keys()

In [None]:
store.close()