# Introduction

I was wondering how much replication depends on having similar read depth between the two replicates.

In [2]:
import pandas
import collections
import sys
import os
import numpy
import pprint
import time

# Locations of project code that is not installed as a package:
# HTSW is the htsworkflow checkout, ROOT is the parent of the working directory.
HTSW = os.path.expanduser('~/proj/htsworkflow')
ROOT = os.path.abspath(os.path.join(os.getcwd(), '..'))

# Make both importable, without adding duplicate entries when the cell is re-run.
for extra_path in (HTSW, ROOT):
    if extra_path not in sys.path:
        sys.path.append(extra_path)

# Local module; only importable once the paths above are on sys.path.
import save_rnaseq_madqc

In [3]:
# Search query restricted to experiments whose assay is RNA-seq.
query_url = 'search/?type=experiment&assay_term_name=RNA-seq'
# The shelf file is kept in ROOT (the parent of the working directory).
cache_name = os.path.join(ROOT, 'rnaseq-experiments.shelf')
# `cache` is indexed by experiment accession in the cells below.
# NOTE(review): presumably the loader reuses the shelf when it exists and
# queries the server otherwise -- confirm in save_rnaseq_madqc.
cache = save_rnaseq_madqc.caching_encoded_experiment_loader(query_url, cache_name)

Reading 150 of 1504 records in 0.10522496799967485 seconds
Reading 300 of 1504 records in 0.1217044690001785 seconds
Reading 450 of 1504 records in 0.2341561310004181 seconds
Reading 600 of 1504 records in 0.13383693299874722 seconds
Reading 750 of 1504 records in 0.022397961000024225 seconds
Reading 900 of 1504 records in 0.21603940299974056 seconds
Reading 1050 of 1504 records in 0.03475890499976231 seconds
Reading 1200 of 1504 records in 0.35518614200009324 seconds
Reading 1350 of 1504 records in 0.25264025000069523 seconds
Reading 1500 of 1504 records in 0.12043020599958254 seconds
Read 1504 records in 1.5963753679989168 seconds


In [10]:
# Find the position of file ENCFF821HDC within the experiment's file list.
[
    (position, entry['@id'])
    for position, entry in enumerate(cache['ENCSR000AAT']['files'])
    if entry['@id'] == '/files/ENCFF821HDC/'
]

[(30, '/files/ENCFF821HDC/')]

In [13]:
# Which properties does a file record carry? Index 30 is ENCFF821HDC,
# as located by the lookup in the preceding cell.
sorted(cache['ENCSR000AAT']['files'][30].keys())

['@id',
 '@type',
 'accession',
 'aliases',
 'alternate_accessions',
 'analysis_step_version',
 'assembly',
 'award',
 'biological_replicates',
 'content_md5sum',
 'dataset',
 'date_created',
 'dbxrefs',
 'derived_from',
 'file_format',
 'file_size',
 'file_type',
 'flowcell_details',
 'genome_annotation',
 'href',
 'lab',
 'md5sum',
 'notes',
 'output_category',
 'output_type',
 'quality_metrics',
 'replicate',
 'schema_version',
 'status',
 'step_run',
 'submitted_by',
 'submitted_file_name',
 'title',
 'uuid']

In [14]:
# Drill into the embedded analysis_step_version object of the same file
# to see what it exposes (software_versions is the part used below).
sorted(cache['ENCSR000AAT']['files'][30]['analysis_step_version'].keys())

['@id',
 '@type',
 'aliases',
 'analysis_step',
 'date_created',
 'schema_version',
 'software_versions',
 'status',
 'submitted_by',
 'uuid',
 'version']

In [15]:
def get_software_name(experiment_file):
    """Yield the name of each software package recorded for a file.

    Names are read from
    ``experiment_file['analysis_step_version']['software_versions'][i]['software']['name']``.
    Nothing is yielded when ``analysis_step_version`` or
    ``software_versions`` is missing or empty.
    """
    step_version = experiment_file.get('analysis_step_version')
    if not step_version:
        return
    entries = step_version.get('software_versions')
    if not entries:
        return
    yield from (entry['software']['name'] for entry in entries)


In [18]:
def get_software(experiment_file):
    """Yield the full ``software`` object of each software version on a file.

    The objects are taken from
    ``experiment_file['analysis_step_version']['software_versions'][i]['software']``.
    Nothing is yielded when ``analysis_step_version`` or
    ``software_versions`` is missing or empty.
    """
    step_version = experiment_file.get('analysis_step_version')
    entries = step_version.get('software_versions') if step_version else None
    # A missing or empty list simply produces no items.
    for entry in entries or ():
        yield entry['software']

In [17]:
# Names of the software recorded for file 30 (ENCFF821HDC); the generator
# is wrapped in list() so the notebook displays the values.
list(get_software_name(cache['ENCSR000AAT']['files'][30]))

['rna-pipelines', 'lrna-align-star-pe', 'star', 'samtools']

In [19]:
# Materialise the software objects for file 30 so that individual
# entries can be inspected by index.
sw = list(get_software(cache['ENCSR000AAT']['files'][30]))

In [23]:
# Fourth entry; going by the name listing above this is the samtools record.
sw[3]

{'@id': '/software/samtools/',
 '@type': ['Software', 'Item'],
 'aliases': [],
 'award': '/awards/ENCODE/',
 'date_created': '2015-02-12T21:54:17.918472+00:00',
 'description': 'Samtools is a suite of programs for interacting with high-throughput sequencing data.  SAMtools implements various utilities for post-processing alignments in the SAM format, such as indexing, variant caller and alignment viewer, and thus provides universal tools for processing read alignments (PMID:19505943).',
 'lab': '/labs/encode-consortium/',
 'name': 'samtools',
 'references': [],
 'schema_version': '3',
 'software_type': ['other'],
 'source_url': 'http://sourceforge.net/projects/samtools/files/samtools/',
 'status': 'released',
 'submitted_by': '/users/81a6cc12-2847-4e2e-8f2c-f566699eb29e/',
 'title': 'Samtools',
 'url': 'http://www.htslib.org/',
 'used_by': ['ENCODE'],
 'uuid': 'ce18defa-8989-4067-9ac7-2f13734223da',
 'versions': ['/software-versions/d44db1ed-3af1-4e3b-ae7a-b0fb1085877f/',
  '/software-

In [32]:
# Survey which files of the experiment carry quality metrics, printing
# file index, metric index, the file's output_type and the metric's @type.
for findex, f in enumerate(cache['ENCSR000AAT']['files']):
    metrics = f.get('quality_metrics')
    if not metrics:
        # No metrics attached to this file (key absent or list empty).
        continue
    for qindex, qc in enumerate(metrics):
        print(findex, qindex, f['output_type'], qc['@type'])

14 0 alignments ['StarQualityMetric', 'QualityMetric', 'Item']
15 0 transcriptome alignments ['StarQualityMetric', 'QualityMetric', 'Item']
17 0 alignments ['StarQualityMetric', 'QualityMetric', 'Item']
18 0 transcriptome alignments ['StarQualityMetric', 'QualityMetric', 'Item']
19 0 gene quantifications ['MadQualityMetric', 'QualityMetric', 'Item']
28 0 gene quantifications ['MadQualityMetric', 'QualityMetric', 'Item']
30 0 alignments ['SamtoolsFlagstatsQualityMetric', 'QualityMetric', 'Item']
30 1 alignments ['StarQualityMetric', 'QualityMetric', 'Item']
31 0 transcriptome alignments ['SamtoolsFlagstatsQualityMetric', 'QualityMetric', 'Item']
31 1 transcriptome alignments ['StarQualityMetric', 'QualityMetric', 'Item']
32 0 alignments ['StarQualityMetric', 'QualityMetric', 'Item']
32 1 alignments ['SamtoolsFlagstatsQualityMetric', 'QualityMetric', 'Item']
34 0 transcriptome alignments ['SamtoolsFlagstatsQualityMetric', 'QualityMetric', 'Item']
34 1 transcriptome alignments ['StarQuali

In [36]:
# Full quality-metric records for file 30; the survey above lists a
# SamtoolsFlagstatsQualityMetric and a StarQualityMetric for this file.
cache['ENCSR000AAT']['files'][30]['quality_metrics']

[{'@id': '/samtools-flagstats-quality-metrics/3182d1d9-06a7-474b-84bf-494323861553/',
  '@type': ['SamtoolsFlagstatsQualityMetric', 'QualityMetric', 'Item'],
  'aliases': ['dnanexus:qc.star_genome_flagstat.job-Bq3qZ9j0J6Z6yG8fp5607g0Z'],
  'assay_term_id': 'OBI:0001271',
  'assay_term_name': 'RNA-seq',
  'attachment': {'download': 'ENCSR000AAT_rep1_1_star_genome_flagstat.txt',
   'href': '@@download/attachment/ENCSR000AAT_rep1_1_star_genome_flagstat.txt',
   'md5sum': '57f7ba1788d80cc7a226ea464443fdee',
   'size': 399,
   'type': 'text/plain'},
  'date_created': '2016-02-23T23:52:03.975169+00:00',
  'diff_chroms': 0,
  'diff_chroms_qc_failed': 0,
  'duplicates': 0,
  'duplicates_qc_failed': 0,
  'mapped': 557166990,
  'mapped_pct': '96.32%',
  'mapped_qc_failed': 0,
  'paired': 578444568,
  'paired_properly': 557166990,
  'paired_properly_pct': '96.32%',
  'paired_properly_qc_failed': 0,
  'paired_qc_failed': 0,
  'quality_metric_of': ['/files/ENCFF821HDC/'],
  'read1': 289222284,
  'r

In [39]:
# Biosample age for the first replicate of ENCSR160IIN. The value comes back
# as a string ('11.5'), not a number.
# NOTE(review): the units are not shown here -- check the biosample record.
cache['ENCSR160IIN']['replicates'][0]['library']['biosample']['age']

'11.5'