# Introduction

We need to submit the processed data that was used in our paper to the DCC so other people can investigate what we did, and unfortunately since my pipeline is slightly different from theirs I need to send my actual processed results.

In [1]:
import hashlib
import pandas
import sys
import os
import numpy
from base64 import b64encode
from pprint import pprint

In [2]:
from woldrnaseq import models
from woldrnaseq.common import validate_reference_type

In [3]:
HTSW=os.path.expanduser('~diane/proj/htsworkflow')
if HTSW not in sys.path:
    sys.path.append(HTSW)
from htsworkflow.submission import encoded

In [4]:
#server = encoded.ENCODED('www.encodeproject.org')
server = encoded.ENCODED('test.encodedcc.org')
server.load_netrc()
validator = encoded.DCCValidator(server)

In [5]:
uploaded_raw_sheet_filename = 'C1-mouse-forelimb-submission-201907-uploaded-production.xlsx'
uploaded = pandas.ExcelFile(uploaded_raw_sheet_filename)

In [6]:
uploaded.sheet_names

['Biosample', 'Library', 'Experiment', 'Replicate', 'File']

In [7]:
submitted_experiment = uploaded.parse('Experiment')
submitted_libraries = uploaded.parse('Library')
submitted_replicates = uploaded.parse('Replicate')
submitted_files = uploaded.parse('File')


In [8]:
metadata = submitted_replicates.merge(
    submitted_experiment[['accession', 'aliases:array']].rename({'aliases:array': 'aliases:experiment'}, axis=1),
    left_on='experiment', right_on='aliases:experiment').merge(
        submitted_files.rename({'accession': 'file_accession'}, axis=1),
        left_on='aliases:array', right_on='replicate')
metadata.head()

Unnamed: 0,uuid,experiment,biological_replicate_number:integer,technical_replicate_number:integer,library,aliases:array,accession,aliases:experiment,file_accession,dataset,...,flowcell_details:json,read_length:integer,file_size:integer,lab,award,file_format,output_type,platform,library_id:skip,replicate
0,bf273ae5-72b5-4237-b03b-a9e68fb56f5c,barbara-wold:c1_e10.5_mouse_limb_run2_17327_A1,1,1,barbara-wold:17327_A1,barbara-wold:17327_A1_b1_t1,ENCSR316KIY,barbara-wold:c1_e10.5_mouse_limb_run2_17327_A1,ENCFF791VYN,barbara-wold:c1_e10.5_mouse_limb_run2_17327_A1,...,"[{""machine"": ""http://jumpgate.caltech.edu/sequ...",50,76952708,barbara-wold,UM1HG009443,fastq,reads,encode:HiSeq2500,17327_A1,barbara-wold:17327_A1_b1_t1
1,bf273ae5-72b5-4237-b03b-a9e68fb56f5c,barbara-wold:c1_e10.5_mouse_limb_run2_17327_A1,1,1,barbara-wold:17327_A1,barbara-wold:17327_A1_b1_t1,ENCSR316KIY,barbara-wold:c1_e10.5_mouse_limb_run2_17327_A1,ENCFF272GSE,barbara-wold:c1_e10.5_mouse_limb_run2_17327_A1,...,"[{""machine"": ""http://jumpgate.caltech.edu/sequ...",50,78307437,barbara-wold,UM1HG009443,fastq,reads,encode:HiSeq2500,17327_A1,barbara-wold:17327_A1_b1_t1
2,af8e4fc1-7a4f-49c3-b5f6-8c4fbcae92de,barbara-wold:c1_e10.5_mouse_limb_run2_17327_A10,1,1,barbara-wold:17327_A10,barbara-wold:17327_A10_b1_t1,ENCSR541RSL,barbara-wold:c1_e10.5_mouse_limb_run2_17327_A10,ENCFF875SEA,barbara-wold:c1_e10.5_mouse_limb_run2_17327_A10,...,"[{""machine"": ""http://jumpgate.caltech.edu/sequ...",50,30471929,barbara-wold,UM1HG009443,fastq,reads,encode:HiSeq2500,17327_A10,barbara-wold:17327_A10_b1_t1
3,af8e4fc1-7a4f-49c3-b5f6-8c4fbcae92de,barbara-wold:c1_e10.5_mouse_limb_run2_17327_A10,1,1,barbara-wold:17327_A10,barbara-wold:17327_A10_b1_t1,ENCSR541RSL,barbara-wold:c1_e10.5_mouse_limb_run2_17327_A10,ENCFF065ITR,barbara-wold:c1_e10.5_mouse_limb_run2_17327_A10,...,"[{""machine"": ""http://jumpgate.caltech.edu/sequ...",50,29889475,barbara-wold,UM1HG009443,fastq,reads,encode:HiSeq2500,17327_A10,barbara-wold:17327_A10_b1_t1
4,2990357c-2cec-4bf7-ac5d-709a14f39ea8,barbara-wold:c1_e10.5_mouse_limb_run2_17327_A11,1,1,barbara-wold:17327_A11,barbara-wold:17327_A11_b1_t1,ENCSR736TCS,barbara-wold:c1_e10.5_mouse_limb_run2_17327_A11,ENCFF753EDK,barbara-wold:c1_e10.5_mouse_limb_run2_17327_A11,...,"[{""machine"": ""http://jumpgate.caltech.edu/sequ...",50,86476491,barbara-wold,UM1HG009443,fastq,reads,encode:HiSeq2500,17327_A11,barbara-wold:17327_A11_b1_t1


In [9]:
libraries = models.load_library_tables(['all_analysis_vdir/library-passing.tsv'])
experiments = models.load_experiments(['all_analysis_vdir/experiment_vdir.tsv'])

Unrecognized columns present. Is this intended?: analysis_name


In [10]:
import dask
from dask.distributed import Client
import hashlib

def import_htsw():
    HTSW = os.path.expanduser('~diane/proj/htsworkflow')
    if HTSW not in sys.path:
        sys.path.append(HTSW)
    from htsworkflow.submission.encoded import ENCODED
    from htsworkflow.submission.fastqname import FastqName
    
def get_md5sum(filename):
    block = 4096
    count = 10
    md5 = hashlib.md5()
    with open(filename, 'rb') as stream:
        while True:
            buffer = stream.read(block * count)
            if len(buffer) == 0:
                break
            md5.update(buffer)
    return (filename, md5.hexdigest())


c = Client('127.0.0.1:8786')
#c = Client()

c.run(import_htsw)

def create_md5s(libraries):
    files = []
    for i, lib in libraries.iterrows():
        files.append(models.find_library_bam_file(lib, 'genome'))
        files.append(models.find_library_bam_file(lib, 'transcriptome'))
        files.append(find_rsem(lib, 'genome'))
        files.append(find_rsem(lib, 'transcriptome'))
        files.append(find_bigwig(lib, 'uniq'))
        files.append(find_bigwig(lib, 'all'))
         
    md5s = c.map(get_md5sum, files)
    md5_values = c.gather(md5s)
    return md5_values

processed_md5_filename = 'all_analysis_vdir/processed_md5s.csv'
if not os.path.exists(processed_md5_filename):
    md5_values = create_md5s(libraries)
    pandas.DataFrame(md5_values, columns=['filename', 'md5sum']).to_csv(processed_md5_filename)
else:
    md5_values = pandas.read_csv(processed_md5_filename).set_index('filename')['md5sum'].to_dict()
print('md5s calculated', len(md5_values))

md5s calculated 6222


In [11]:
l = libraries.iloc[0]

In [12]:
assert l.stranded == 'unstranded'

In [13]:
metadata[metadata['library'] == 'barbara-wold:{}'.format(l.name)]

Unnamed: 0,uuid,experiment,biological_replicate_number:integer,technical_replicate_number:integer,library,aliases:array,accession,aliases:experiment,file_accession,dataset,...,flowcell_details:json,read_length:integer,file_size:integer,lab,award,file_format,output_type,platform,library_id:skip,replicate
818,51e96f22-533a-4550-9311-2d90cc98f888,barbara-wold:c1_e11.5_mouse_limb_run10_19906_A5,1,1,barbara-wold:19906_A5,barbara-wold:19906_A5_b1_t1,ENCSR741DDM,barbara-wold:c1_e11.5_mouse_limb_run10_19906_A5,ENCFF531EHX,barbara-wold:c1_e11.5_mouse_limb_run10_19906_A5,...,"[{""machine"": ""http://jumpgate.caltech.edu/sequ...",50,27870489,barbara-wold,UM1HG009443,fastq,reads,encode:HiSeq2500,19906_A5,barbara-wold:19906_A5_b1_t1
819,51e96f22-533a-4550-9311-2d90cc98f888,barbara-wold:c1_e11.5_mouse_limb_run10_19906_A5,1,1,barbara-wold:19906_A5,barbara-wold:19906_A5_b1_t1,ENCSR741DDM,barbara-wold:c1_e11.5_mouse_limb_run10_19906_A5,ENCFF506KKD,barbara-wold:c1_e11.5_mouse_limb_run10_19906_A5,...,"[{""machine"": ""http://jumpgate.caltech.edu/sequ...",50,27759563,barbara-wold,UM1HG009443,fastq,reads,encode:HiSeq2500,19906_A5,barbara-wold:19906_A5_b1_t1


In [14]:
fastq_rows = metadata[metadata['library'] == 'barbara-wold:{}'.format(l.name)]
'/experiment/{}/'.format(fastq_rows.loc[fastq_rows.first_valid_index()]['accession'])

'/experiment/ENCSR741DDM/'

In [15]:
models.find_library_bam_file(l)

'/woldlab/castor/home/sau/flowcells/H7CNTBCX2/19906_A5/19906_A5-mm10-M4-male_genome.bam'

In [16]:
models.find_library_bam_file(l, 'transcriptome')

'/woldlab/castor/home/sau/flowcells/H7CNTBCX2/19906_A5/19906_A5-mm10-M4-male_anno.bam'

In [17]:
l

analysis_dir        /woldlab/castor/home/sau/flowcells/H7CNTBCX2/1...
genome                                                           mm10
annotation                                                         M4
sex                                                              male
read_1                                            19906_A5_*.fastq.gz
reference_prefix                                                    -
analysis_name                                                19906_A5
stranded                                                   unstranded
Name: 19906_A5, dtype: object

In [18]:
def guess_flowcell(library):
    sau = os.path.expanduser('~sau/flowcells/')
    analysis_dir = library.analysis_dir.replace(sau, '')
    flowcell, subdir = os.path.split(analysis_dir)
    return flowcell
    
guess_flowcell(l)

'H7CNTBCX2'

In [19]:
def find_bigwig(library, track_type, analysis_root=None):
    assert library.stranded == 'unstranded'

    paths = [library.analysis_dir, os.path.expanduser('~sau/public_html/{}'.format(guess_flowcell(library)))]
    filename = library.analysis_name + '-' + models.genome_name_from_library(library) + "_" + track_type + '.bw'
    for p in paths:
        pathname = os.path.join(p, filename)
        if os.path.exists(pathname):
            return pathname
            break
        

In [20]:
find_bigwig(l, 'uniq')

'/woldlab/castor/home/sau/public_html/H7CNTBCX2/19906_A5-mm10-M4-male_uniq.bw'

In [21]:
def find_rsem(library, reference_type='genome', analysis_root=None):
    assert reference_type in ['genome', 'transcriptome'], "reference_type is either genome or transcriptome"
    if reference_type == 'genome':
        suffix = "_anno_rsem.genes.results"
    else: # reference_type == 'transcriptome':
        suffix = "_anno_rsem.isoforms.results"

    filename = library.analysis_name + '-' + models.genome_name_from_library(library) + suffix
    pathname = os.path.join(library.analysis_dir, filename)
    assert os.path.exists(pathname)
    return pathname

In [22]:
find_rsem(l, reference_type='transcriptome')

'/woldlab/castor/home/sau/flowcells/H7CNTBCX2/19906_A5/19906_A5-mm10-M4-male_anno_rsem.isoforms.results'

In [23]:
def create_analysis_step_run(server, validator, step, run, dry_run):
    obj = {
        'aliases': [run],
        'analysis_step_version': step,
        'status': 'in progress',
    }
    collection = '/analysis-step-runs/'
    validator.validate(obj, collection)
    #if not dry_run:
    #    return server.post_json(collection, obj)
    #else:
    return obj

def create_analysis_step_runs(server, validator, dry_run=True):
    steps = [
        'encode:wold-bulk-rnaseq-alignment-step-v1',
        'encode:wold-bulk-rnaseq-signal-generation-step-v1',
        'encode:wold-bulk-rnaseq-quantification-step-v1',
    ]
    for step in steps:
        run = step.replace('-step', '-run').replace('encode:', 'barbara-wold:')
        print('run', run)
        need_to_create = False
        try:
            obj = server.get_json(run)
            pprint(obj)
        except encoded.HTTPError as e:
            if e.response.status_code == 404:
                print(f'Need to create {run}')
                need_to_create = True
        if need_to_create:
            pprint(create_analysis_step_run(server, validator, step, run, dry_run))


create_analysis_step_runs(server, validator, dry_run=True)        

run barbara-wold:wold-bulk-rnaseq-alignment-run-v1
{'@context': '/terms/',
 '@id': '/analysis-step-runs/ecd38dae-7937-4865-8f82-0a3829f1b249/',
 '@type': ['AnalysisStepRun', 'Item'],
 'aliases': ['barbara-wold:wold-bulk-rnaseq-alignment-run-v1'],
 'analysis_step_version': {'@id': '/analysis-step-versions/wold-bulk-rnaseq-star-alignment-step-v-1-0/',
                           '@type': ['AnalysisStepVersion', 'Item'],
                           'aliases': ['encode:wold-bulk-rnaseq-alignment-step-v1'],
                           'analysis_step': {'@id': '/analysis-steps/wold-bulk-rnaseq-star-alignment-step-v-1/',
                                             '@type': ['AnalysisStep', 'Item'],
                                             'aliases': ['encode:wold-bulk-rnaseq-alignment-step'],
                                             'analysis_step_types': ['alignment',
                                                                     'QA '
                                            

In [24]:
def sanitize_submitted_filename(filename):
    if filename.startswith('/'):
        paths = [
            os.path.expanduser('~sau/flowcells/'),
            os.path.expanduser('~diane/proj/flowcells/'),
        ]
        for p in paths:
            if filename.startswith(p):
                return filename.replace(p, '')
        else:
            raise ValueError("Unrecognized filename {}".format(filename))
    else:
        return filename

In [25]:
sanitize_submitted_filename(find_rsem(l))

'H7CNTBCX2/19906_A5/19906_A5-mm10-M4-male_anno_rsem.genes.results'

In [26]:
def generate_bam_object(library, metadata, md5_cache, reference_type='genome'):
    validate_reference_type(reference_type)
    if reference_type == 'genome':
        output_type = 'alignments'
    else:
        output_type = 'transcriptome alignments'
    suffix_map = {
        'genome': 'genome',
        'transcriptome': 'anno',
    }
        
    alignment = models.find_library_bam_file(library, reference_type)
    star_index = '/file/ENCFF483PAE/'
    
    fastq_rows = metadata[metadata['library'] == 'barbara-wold:{}'.format(library.name)]
    dataset = '/experiments/{}/'.format(
        fastq_rows.loc[fastq_rows.first_valid_index()]['accession']
    )
    derived_from = ['/file/{}/'.format(x) for x in fastq_rows['file_accession']]
    derived_from.append(star_index)

    triplet = models.genome_name_from_library(library)
    suffix = suffix_map[reference_type]
    bam_alias = f'barbara-wold:{library.analysis_name}-{triplet}_{suffix}.bam'

    obj = {
        #'@type': ['File'],
        'aliases': [bam_alias],
        'dataset': dataset,
        'file_format': 'bam',
        #'output_category': 'alignment',
        'output_type': output_type,
        'assembly': 'mm10-minimal',
        'genome_annotation': library.annotation,
        'step_run': '/analysis-step-runs/ecd38dae-7937-4865-8f82-0a3829f1b249/',
        'derived_from': derived_from,
        'award': 'UM1HG009443',
        'lab': '/labs/barbara-wold/',
        'md5sum': md5_values[alignment],
        'file_size': os.stat(alignment).st_size,
        'submitted_file_name': alignment,
    }
    return obj

In [27]:
genome_bam = generate_bam_object(l, metadata, md5_values, 'genome')
validator.validate(genome_bam, 'File')
genome_bam

{'aliases': ['barbara-wold:19906_A5-mm10-M4-male_genome.bam'],
 'dataset': '/experiments/ENCSR741DDM/',
 'file_format': 'bam',
 'output_type': 'alignments',
 'assembly': 'mm10-minimal',
 'genome_annotation': 'M4',
 'step_run': '/analysis-step-runs/ecd38dae-7937-4865-8f82-0a3829f1b249/',
 'derived_from': ['/file/ENCFF531EHX/',
  '/file/ENCFF506KKD/',
  '/file/ENCFF483PAE/'],
 'award': 'UM1HG009443',
 'lab': '/labs/barbara-wold/',
 'md5sum': '225757e3e69fa8bf0412cc71c708dd3d',
 'file_size': 77642556,
 'submitted_file_name': '/woldlab/castor/home/sau/flowcells/H7CNTBCX2/19906_A5/19906_A5-mm10-M4-male_genome.bam'}

In [28]:
transcriptome_bam = generate_bam_object(l, metadata, md5_values, 'transcriptome')
validator.validate(transcriptome_bam, 'File')
transcriptome_bam

{'aliases': ['barbara-wold:19906_A5-mm10-M4-male_anno.bam'],
 'dataset': '/experiments/ENCSR741DDM/',
 'file_format': 'bam',
 'output_type': 'transcriptome alignments',
 'assembly': 'mm10-minimal',
 'genome_annotation': 'M4',
 'step_run': '/analysis-step-runs/ecd38dae-7937-4865-8f82-0a3829f1b249/',
 'derived_from': ['/file/ENCFF531EHX/',
  '/file/ENCFF506KKD/',
  '/file/ENCFF483PAE/'],
 'award': 'UM1HG009443',
 'lab': '/labs/barbara-wold/',
 'md5sum': '1da6f195906220e222191910c1b3121f',
 'file_size': 71730686,
 'submitted_file_name': '/woldlab/castor/home/sau/flowcells/H7CNTBCX2/19906_A5/19906_A5-mm10-M4-male_anno.bam'}

In [29]:
hashlib.md5(b'').hexdigest()

'd41d8cd98f00b204e9800998ecf8427e'

Alignment analysis_step_run: /analysis-step-runs/ecd38dae-7937-4865-8f82-0a3829f1b249/

Signal Generation analysis_step_run: /analysis-step-runs/6e5ac5c3-b9fd-4b10-a13e-47d4ab2e6bf8/

Quantification analysis_step_run: /analysis-step-runs/76310d0b-df30-45e6-bd5f-44c07e2d219e/

In [30]:
def get_attachment(filename, mime_type):
    with open(filename, 'rb') as instream:
        contents = instream.read()
        print(type(contents))
        payload = "data:{};base64,{}".format(
            mime_type, 
            b64encode(contents).decode("ascii"))
        obj = {
            "type": mime_type,
            "download": filename.split("/")[-1],
            "href": payload,
            "size": len(contents),
            "md5sum": hashlib.md5(contents).hexdigest(),
        }
        return obj

def generate_flagstat_object(library, reference_type):
    suffix_map = {
        'genome': 'genome',
        'transcriptome': 'anno',
    }
    triplet = models.genome_name_from_library(library)
    suffix = suffix_map[reference_type]
    bam_alias = f'barbara-wold:{library.analysis_name}-{triplet}_{suffix}.bam'
    flagstat_filename = f'{library.analysis_name}-{triplet}_{suffix}.flagstat'
    flagstat_pathname = os.path.join('all_analysis_vdir', flagstat_filename)
    flagstat = models.load_flagstat(flagstat_pathname)
    flagstat['@type'] = ['SamtoolsFlagstatsQualityMetric', 'QualityMetric', 'Item']
    flagstat['quality_metric_of'] = [bam_alias]
    flagstat['step_run'] = '/analysis-step-runs/ecd38dae-7937-4865-8f82-0a3829f1b249/'
    flagstat['attachment'] = get_attachment(flagstat_pathname, 'text/plain')
    flagstat['award'] = 'UM1HG009443'
    flagstat['lab'] = '/labs/barbara-wold/'
    return flagstat

In [31]:
genome_flagstat = generate_flagstat_object(l, 'genome')
pprint(genome_flagstat)
validator.validate(genome_flagstat, 'SamtoolsFlagstatsQualityMetric')

<class 'bytes'>
{'@type': ['SamtoolsFlagstatsQualityMetric', 'QualityMetric', 'Item'],
 'attachment': {'download': '19906_A5-mm10-M4-male_genome.flagstat',
                'href': 'data:text/plain;base64,MTY3NDI1NiArIDEwMTI4OCBpbiB0b3RhbCAoUUMtcGFzc2VkIHJlYWRzICsgUUMtZmFpbGVkIHJlYWRzKQo1MzQ3MjYgKyAzMDA4NyBzZWNvbmRhcnkKMCArIDAgc3VwcGxlbWVudGFyeQowICsgMCBkdXBsaWNhdGVzCjE2MzUwMDQgKyA4OTgwNyBtYXBwZWQgKDk3LjY2JSA6IDg4LjY2JSkKMCArIDAgcGFpcmVkIGluIHNlcXVlbmNpbmcKMCArIDAgcmVhZDEKMCArIDAgcmVhZDIKMCArIDAgcHJvcGVybHkgcGFpcmVkIChOL0EgOiBOL0EpCjAgKyAwIHdpdGggaXRzZWxmIGFuZCBtYXRlIG1hcHBlZAowICsgMCBzaW5nbGV0b25zIChOL0EgOiBOL0EpCjAgKyAwIHdpdGggbWF0ZSBtYXBwZWQgdG8gYSBkaWZmZXJlbnQgY2hyCjAgKyAwIHdpdGggbWF0ZSBtYXBwZWQgdG8gYSBkaWZmZXJlbnQgY2hyIChtYXBRPj01KQo=',
                'md5sum': '3874fbcd1575b784f6992b31cee99e8b',
                'size': 407,
                'type': 'text/plain'},
 'award': 'UM1HG009443',
 'duplicates': 0,
 'duplicates_qc_failed': 0,
 'lab': '/labs/barbara-wold/',
 'mapped': 163500

In [32]:
def generate_bigwig_object(library, bam, md5_cache, bigwig_type='uniq'):
    if bigwig_type == 'uniq':
        output_type = 'signal of unique reads'
    elif bigwig_type == 'all':
        output_type = 'signal of all reads'
    else:
        raise NotImplemented('Only unstranded implemented')
    
    assert bam['output_type'] == 'alignments'
    derived_from = [bam['aliases'][0]]
    bigwig = find_bigwig(library, bigwig_type)
    
    obj = {
        #'@type': ['File'],
        'aliases': [],
        'dataset': bam['dataset'],
        'file_format': 'bigWig',
        #'output_category': 'signal',
        'output_type': output_type,
        'assembly': 'mm10-minimal',
        'genome_annotation': library.annotation,
        'step_run': '/analysis-step-runs/6e5ac5c3-b9fd-4b10-a13e-47d4ab2e6bf8/',
        'derived_from': derived_from,
        'award': 'UM1HG009443',
        'lab': '/labs/barbara-wold/',
        'md5sum': md5_cache[bigwig],
        'file_size': os.stat(bigwig).st_size,
        'submitted_file_name': bigwig,
    }
    return obj

In [33]:
uniq_bigwig = generate_bigwig_object(l, genome_bam, md5_values, 'uniq')
validator.validate(uniq_bigwig, 'File')
uniq_bigwig

{'aliases': [],
 'dataset': '/experiments/ENCSR741DDM/',
 'file_format': 'bigWig',
 'output_type': 'signal of unique reads',
 'assembly': 'mm10-minimal',
 'genome_annotation': 'M4',
 'step_run': '/analysis-step-runs/6e5ac5c3-b9fd-4b10-a13e-47d4ab2e6bf8/',
 'derived_from': ['barbara-wold:19906_A5-mm10-M4-male_genome.bam'],
 'award': 'UM1HG009443',
 'lab': '/labs/barbara-wold/',
 'md5sum': '07cd28255fed50d810465402701034a8',
 'file_size': 7017411,
 'submitted_file_name': '/woldlab/castor/home/sau/public_html/H7CNTBCX2/19906_A5-mm10-M4-male_uniq.bw'}

In [34]:
all_bigwig = generate_bigwig_object(l, genome_bam, md5_values, 'all')
validator.validate(all_bigwig, 'File')
all_bigwig

{'aliases': [],
 'dataset': '/experiments/ENCSR741DDM/',
 'file_format': 'bigWig',
 'output_type': 'signal of all reads',
 'assembly': 'mm10-minimal',
 'genome_annotation': 'M4',
 'step_run': '/analysis-step-runs/6e5ac5c3-b9fd-4b10-a13e-47d4ab2e6bf8/',
 'derived_from': ['barbara-wold:19906_A5-mm10-M4-male_genome.bam'],
 'award': 'UM1HG009443',
 'lab': '/labs/barbara-wold/',
 'md5sum': 'f040d64033fea22bc7f27e6abe0fbcf3',
 'file_size': 12492215,
 'submitted_file_name': '/woldlab/castor/home/sau/public_html/H7CNTBCX2/19906_A5-mm10-M4-male_all.bw'}

In [35]:
def generate_rsem_object(library, bam, md5_cache, reference_type='genome'):
    validate_reference_type(reference_type)
    if reference_type == 'genome':
        output_type = 'gene quantifications'
    else:
        output_type = 'transcript quantifications'
    
    assert bam['output_type'] == 'transcriptome alignments'
    derived_from = [bam['aliases'][0], '/files/ENCFF064YNQ/']
    rsem = find_rsem(library, reference_type)
    
    obj = {
        #'@type': ['File'],
        'aliases': [],
        'dataset': bam['dataset'],
        'file_format': 'tsv',
        #'output_category': 'quantification',
        'output_type': output_type,
        'assembly': 'mm10-minimal',  #library.genome,
        'genome_annotation': library.annotation,
        'step_run': '/analysis-step-runs/76310d0b-df30-45e6-bd5f-44c07e2d219e/',
        'derived_from': derived_from,
        'award': 'UM1HG009443',
        'lab': '/labs/barbara-wold/',
        'md5sum': md5_cache[rsem],
        'file_size': os.stat(rsem).st_size,
        'submitted_file_name': rsem,
    }
    return obj

In [36]:
genome_rsem = generate_rsem_object(l, transcriptome_bam, md5_values, 'genome')
validator.validate(genome_rsem, 'File')
genome_rsem

{'aliases': [],
 'dataset': '/experiments/ENCSR741DDM/',
 'file_format': 'tsv',
 'output_type': 'gene quantifications',
 'assembly': 'mm10-minimal',
 'genome_annotation': 'M4',
 'step_run': '/analysis-step-runs/76310d0b-df30-45e6-bd5f-44c07e2d219e/',
 'derived_from': ['barbara-wold:19906_A5-mm10-M4-male_anno.bam',
  '/files/ENCFF064YNQ/'],
 'award': 'UM1HG009443',
 'lab': '/labs/barbara-wold/',
 'md5sum': '362e256e66652ebedaabd9c75928f3aa',
 'file_size': 9542941,
 'submitted_file_name': '/woldlab/castor/home/sau/flowcells/H7CNTBCX2/19906_A5/19906_A5-mm10-M4-male_anno_rsem.genes.results'}

In [37]:
transcriptome_rsem = generate_rsem_object(l, transcriptome_bam, md5_values, 'transcriptome')
validator.validate(transcriptome_rsem, 'File')
transcriptome_rsem

{'aliases': [],
 'dataset': '/experiments/ENCSR741DDM/',
 'file_format': 'tsv',
 'output_type': 'transcript quantifications',
 'assembly': 'mm10-minimal',
 'genome_annotation': 'M4',
 'step_run': '/analysis-step-runs/76310d0b-df30-45e6-bd5f-44c07e2d219e/',
 'derived_from': ['barbara-wold:19906_A5-mm10-M4-male_anno.bam',
  '/files/ENCFF064YNQ/'],
 'award': 'UM1HG009443',
 'lab': '/labs/barbara-wold/',
 'md5sum': 'e92a671bda0bfd524666652f2a99c43a',
 'file_size': 17413669,
 'submitted_file_name': '/woldlab/castor/home/sau/flowcells/H7CNTBCX2/19906_A5/19906_A5-mm10-M4-male_anno_rsem.isoforms.results'}

In [38]:
pandas.DataFrame([
    genome_bam,
    transcriptome_bam,
    uniq_bigwig,
    all_bigwig,
    genome_rsem,
    transcriptome_rsem,
]).rename({
    'aliases': 'aliases:array',
    'derived_from': 'derived_from:array',
    'quality_metrics': 'quality_metrics:json'
    
})

Unnamed: 0,aliases,assembly,award,dataset,derived_from,file_format,file_size,genome_annotation,lab,md5sum,output_type,step_run,submitted_file_name
0,[barbara-wold:19906_A5-mm10-M4-male_genome.bam],mm10-minimal,UM1HG009443,/experiments/ENCSR741DDM/,"[/file/ENCFF531EHX/, /file/ENCFF506KKD/, /file...",bam,77642556,M4,/labs/barbara-wold/,225757e3e69fa8bf0412cc71c708dd3d,alignments,/analysis-step-runs/ecd38dae-7937-4865-8f82-0a...,/woldlab/castor/home/sau/flowcells/H7CNTBCX2/1...
1,[barbara-wold:19906_A5-mm10-M4-male_anno.bam],mm10-minimal,UM1HG009443,/experiments/ENCSR741DDM/,"[/file/ENCFF531EHX/, /file/ENCFF506KKD/, /file...",bam,71730686,M4,/labs/barbara-wold/,1da6f195906220e222191910c1b3121f,transcriptome alignments,/analysis-step-runs/ecd38dae-7937-4865-8f82-0a...,/woldlab/castor/home/sau/flowcells/H7CNTBCX2/1...
2,[],mm10-minimal,UM1HG009443,/experiments/ENCSR741DDM/,[barbara-wold:19906_A5-mm10-M4-male_genome.bam],bigWig,7017411,M4,/labs/barbara-wold/,07cd28255fed50d810465402701034a8,signal of unique reads,/analysis-step-runs/6e5ac5c3-b9fd-4b10-a13e-47...,/woldlab/castor/home/sau/public_html/H7CNTBCX2...
3,[],mm10-minimal,UM1HG009443,/experiments/ENCSR741DDM/,[barbara-wold:19906_A5-mm10-M4-male_genome.bam],bigWig,12492215,M4,/labs/barbara-wold/,f040d64033fea22bc7f27e6abe0fbcf3,signal of all reads,/analysis-step-runs/6e5ac5c3-b9fd-4b10-a13e-47...,/woldlab/castor/home/sau/public_html/H7CNTBCX2...
4,[],mm10-minimal,UM1HG009443,/experiments/ENCSR741DDM/,"[barbara-wold:19906_A5-mm10-M4-male_anno.bam, ...",tsv,9542941,M4,/labs/barbara-wold/,362e256e66652ebedaabd9c75928f3aa,gene quantifications,/analysis-step-runs/76310d0b-df30-45e6-bd5f-44...,/woldlab/castor/home/sau/flowcells/H7CNTBCX2/1...
5,[],mm10-minimal,UM1HG009443,/experiments/ENCSR741DDM/,"[barbara-wold:19906_A5-mm10-M4-male_anno.bam, ...",tsv,17413669,M4,/labs/barbara-wold/,e92a671bda0bfd524666652f2a99c43a,transcript quantifications,/analysis-step-runs/76310d0b-df30-45e6-bd5f-44...,/woldlab/castor/home/sau/flowcells/H7CNTBCX2/1...


In [39]:
validator = encoded.DCCValidator(server)
processed_files = []
for i, lib in libraries.iterrows():
    gene_bam = generate_bam_object(lib, metadata, md5_values, 'genome')
    transcript_bam = generate_bam_object(lib, metadata, md5_values, 'transcriptome')
    all_bigwig = generate_bigwig_object(lib, gene_bam, md5_values, 'all')
    uniq_bigwig = generate_bigwig_object(lib, gene_bam, md5_values, 'uniq')
    gene_rsem = generate_rsem_object(lib, transcript_bam, md5_values, 'genome')
    transcript_rsem = generate_rsem_object(lib, transcript_bam, md5_values, 'transcriptome')
    for f in [gene_bam, transcript_bam, all_bigwig, uniq_bigwig, gene_rsem, transcript_rsem]:
        validator.validate(f, 'File')
        processed_files.append(f)

In [40]:
processed_files_df = pandas.DataFrame(processed_files).rename({
    'aliases': 'aliases:array',
    'derived_from': 'derived_from:array',
    'quality_metrics': 'quality_metrics:json',
    'file_size': 'file_size:integer',
}, axis=1)
processed_files_df.insert(0, 'accession', numpy.nan)
processed_files_df.insert(0, 'uuid', numpy.nan)

In [41]:
processed_files_df['aliases:array'] = processed_files_df['aliases:array'].apply(lambda x: ','.join(x))
processed_files_df['derived_from:array'] = processed_files_df['derived_from:array'].apply(lambda x: ','.join(x))

In [42]:
processed_files_df.head()

Unnamed: 0,uuid,accession,aliases:array,assembly,award,dataset,derived_from:array,file_format,file_size:integer,genome_annotation,lab,md5sum,output_type,step_run,submitted_file_name
0,,,barbara-wold:19906_A5-mm10-M4-male_genome.bam,mm10-minimal,UM1HG009443,/experiments/ENCSR741DDM/,"/file/ENCFF531EHX/,/file/ENCFF506KKD/,/file/EN...",bam,77642556,M4,/labs/barbara-wold/,225757e3e69fa8bf0412cc71c708dd3d,alignments,/analysis-step-runs/ecd38dae-7937-4865-8f82-0a...,/woldlab/castor/home/sau/flowcells/H7CNTBCX2/1...
1,,,barbara-wold:19906_A5-mm10-M4-male_anno.bam,mm10-minimal,UM1HG009443,/experiments/ENCSR741DDM/,"/file/ENCFF531EHX/,/file/ENCFF506KKD/,/file/EN...",bam,71730686,M4,/labs/barbara-wold/,1da6f195906220e222191910c1b3121f,transcriptome alignments,/analysis-step-runs/ecd38dae-7937-4865-8f82-0a...,/woldlab/castor/home/sau/flowcells/H7CNTBCX2/1...
2,,,,mm10-minimal,UM1HG009443,/experiments/ENCSR741DDM/,barbara-wold:19906_A5-mm10-M4-male_genome.bam,bigWig,12492215,M4,/labs/barbara-wold/,f040d64033fea22bc7f27e6abe0fbcf3,signal of all reads,/analysis-step-runs/6e5ac5c3-b9fd-4b10-a13e-47...,/woldlab/castor/home/sau/public_html/H7CNTBCX2...
3,,,,mm10-minimal,UM1HG009443,/experiments/ENCSR741DDM/,barbara-wold:19906_A5-mm10-M4-male_genome.bam,bigWig,7017411,M4,/labs/barbara-wold/,07cd28255fed50d810465402701034a8,signal of unique reads,/analysis-step-runs/6e5ac5c3-b9fd-4b10-a13e-47...,/woldlab/castor/home/sau/public_html/H7CNTBCX2...
4,,,,mm10-minimal,UM1HG009443,/experiments/ENCSR741DDM/,"barbara-wold:19906_A5-mm10-M4-male_anno.bam,/f...",tsv,9542941,M4,/labs/barbara-wold/,362e256e66652ebedaabd9c75928f3aa,gene quantifications,/analysis-step-runs/76310d0b-df30-45e6-bd5f-44...,/woldlab/castor/home/sau/flowcells/H7CNTBCX2/1...


In [43]:
processed_files_df.tail()

Unnamed: 0,uuid,accession,aliases:array,assembly,award,dataset,derived_from:array,file_format,file_size:integer,genome_annotation,lab,md5sum,output_type,step_run,submitted_file_name
6217,,,barbara-wold:20028_C3-mm10-M4-male_anno.bam,mm10-minimal,UM1HG009443,/experiments/ENCSR711YCM/,"/file/ENCFF074EFL/,/file/ENCFF323LPX/,/file/EN...",bam,87639601,M4,/labs/barbara-wold/,84e3f31ff39890bf1f280afddbb984ff,transcriptome alignments,/analysis-step-runs/ecd38dae-7937-4865-8f82-0a...,/woldlab/castor/home/sau/flowcells/HFNLTBCX2/2...
6218,,,,mm10-minimal,UM1HG009443,/experiments/ENCSR711YCM/,barbara-wold:20028_C3-mm10-M4-male_genome.bam,bigWig,13621699,M4,/labs/barbara-wold/,5b41aacdd38afe5677cdd5936652bbc8,signal of all reads,/analysis-step-runs/6e5ac5c3-b9fd-4b10-a13e-47...,/woldlab/castor/home/sau/public_html/HFNLTBCX2...
6219,,,,mm10-minimal,UM1HG009443,/experiments/ENCSR711YCM/,barbara-wold:20028_C3-mm10-M4-male_genome.bam,bigWig,8376618,M4,/labs/barbara-wold/,4600ddb609b805f325599fa3a14965bb,signal of unique reads,/analysis-step-runs/6e5ac5c3-b9fd-4b10-a13e-47...,/woldlab/castor/home/sau/public_html/HFNLTBCX2...
6220,,,,mm10-minimal,UM1HG009443,/experiments/ENCSR711YCM/,"barbara-wold:20028_C3-mm10-M4-male_anno.bam,/f...",tsv,9549180,M4,/labs/barbara-wold/,a5e719db1ac32e3e8b5d42a45dc4d63e,gene quantifications,/analysis-step-runs/76310d0b-df30-45e6-bd5f-44...,/woldlab/castor/home/sau/flowcells/HFNLTBCX2/2...
6221,,,,mm10-minimal,UM1HG009443,/experiments/ENCSR711YCM/,"barbara-wold:20028_C3-mm10-M4-male_anno.bam,/f...",tsv,17419077,M4,/labs/barbara-wold/,46e8a0b122d16109416d7153f838af0d,transcript quantifications,/analysis-step-runs/76310d0b-df30-45e6-bd5f-44...,/woldlab/castor/home/sau/flowcells/HFNLTBCX2/2...


In [44]:
processed_files_df.shape

(6222, 15)

In [45]:
processed_files_df.to_excel('all_analysis_vdir/processed_files.xlsx', 'File', index=False)