# Introduction

We need to submit the processed data used in our paper to the DCC so that other people can investigate what we did. Unfortunately, because my pipeline is slightly different from theirs, I need to send my actual processed results. This notebook posts the samtools flagstat quality metrics for the genome and transcriptome BAM files.

In [1]:
import hashlib
import pandas
import sys
import os
import numpy
from base64 import b64encode
from pprint import pprint

In [2]:
from woldrnaseq import models
from woldrnaseq.common import validate_reference_type

In [3]:
# The htsworkflow checkout under ~diane is appended to sys.path (if it is not
# already there) so that the import below can find it.
# NOTE(review): this path is specific to one user's home directory; the cell
# will fail on a machine without that checkout.
HTSW=os.path.expanduser('~diane/proj/htsworkflow')
if HTSW not in sys.path:
    sys.path.append(HTSW)
from htsworkflow.submission import encoded

In [4]:
# Connect to www.encodeproject.org; the commented-out line below points at
# test.encodedcc.org instead. Cells further down POST to whichever server is
# selected here.
server = encoded.ENCODED('www.encodeproject.org')
#server = encoded.ENCODED('test.encodedcc.org')
# presumably loads credentials for the server from the user's netrc file —
# TODO confirm against htsworkflow
server.load_netrc()
# Validator used to check objects before they are posted.
validator = encoded.DCCValidator(server)

In [5]:
# Workbook describing the raw-data submission (judging by its file name, the
# copy that was uploaded to production); opened once so several sheets can be
# parsed from it.
uploaded_raw_sheet_filename = 'C1-mouse-forelimb-submission-201907-uploaded-production.xlsx'
uploaded = pandas.ExcelFile(uploaded_raw_sheet_filename)

In [6]:
# Show which sheets the workbook contains.
uploaded.sheet_names

['Biosample', 'Library', 'Experiment', 'Replicate', 'File']

In [7]:
# Parse four of the workbook's sheets into DataFrames (the Biosample sheet is
# not loaded). submitted_libraries is not referenced by any later cell shown
# in this notebook.
submitted_experiment = uploaded.parse('Experiment')
submitted_libraries = uploaded.parse('Library')
submitted_replicates = uploaded.parse('Replicate')
submitted_files = uploaded.parse('File')


In [8]:
# Build one row per submitted file, annotated with its replicate and the
# accession of the experiment the replicate belongs to.

# Experiment accession keyed by the experiment alias; the alias column is
# renamed so it does not collide with the replicate sheet's 'aliases:array'.
experiment_accessions = submitted_experiment[['accession', 'aliases:array']].rename(
    columns={'aliases:array': 'aliases:experiment'})

# The file sheet has its own 'accession' column; rename it so it stays
# distinguishable from the experiment accession after merging.
files_by_replicate = submitted_files.rename(columns={'accession': 'file_accession'})

# replicate -> experiment (matching the replicate's 'experiment' alias) ...
replicates_with_experiment = submitted_replicates.merge(
    experiment_accessions,
    left_on='experiment',
    right_on='aliases:experiment')

# ... -> files (matching the replicate alias against the file's 'replicate').
metadata = replicates_with_experiment.merge(
    files_by_replicate,
    left_on='aliases:array',
    right_on='replicate')
metadata.head()

Unnamed: 0,uuid,experiment,biological_replicate_number:integer,technical_replicate_number:integer,library,aliases:array,accession,aliases:experiment,file_accession,dataset,...,flowcell_details:json,read_length:integer,file_size:integer,lab,award,file_format,output_type,platform,library_id:skip,replicate
0,bf273ae5-72b5-4237-b03b-a9e68fb56f5c,barbara-wold:c1_e10.5_mouse_limb_run2_17327_A1,1,1,barbara-wold:17327_A1,barbara-wold:17327_A1_b1_t1,ENCSR316KIY,barbara-wold:c1_e10.5_mouse_limb_run2_17327_A1,ENCFF791VYN,barbara-wold:c1_e10.5_mouse_limb_run2_17327_A1,...,"[{""machine"": ""http://jumpgate.caltech.edu/sequ...",50,76952708,barbara-wold,UM1HG009443,fastq,reads,encode:HiSeq2500,17327_A1,barbara-wold:17327_A1_b1_t1
1,bf273ae5-72b5-4237-b03b-a9e68fb56f5c,barbara-wold:c1_e10.5_mouse_limb_run2_17327_A1,1,1,barbara-wold:17327_A1,barbara-wold:17327_A1_b1_t1,ENCSR316KIY,barbara-wold:c1_e10.5_mouse_limb_run2_17327_A1,ENCFF272GSE,barbara-wold:c1_e10.5_mouse_limb_run2_17327_A1,...,"[{""machine"": ""http://jumpgate.caltech.edu/sequ...",50,78307437,barbara-wold,UM1HG009443,fastq,reads,encode:HiSeq2500,17327_A1,barbara-wold:17327_A1_b1_t1
2,af8e4fc1-7a4f-49c3-b5f6-8c4fbcae92de,barbara-wold:c1_e10.5_mouse_limb_run2_17327_A10,1,1,barbara-wold:17327_A10,barbara-wold:17327_A10_b1_t1,ENCSR541RSL,barbara-wold:c1_e10.5_mouse_limb_run2_17327_A10,ENCFF875SEA,barbara-wold:c1_e10.5_mouse_limb_run2_17327_A10,...,"[{""machine"": ""http://jumpgate.caltech.edu/sequ...",50,30471929,barbara-wold,UM1HG009443,fastq,reads,encode:HiSeq2500,17327_A10,barbara-wold:17327_A10_b1_t1
3,af8e4fc1-7a4f-49c3-b5f6-8c4fbcae92de,barbara-wold:c1_e10.5_mouse_limb_run2_17327_A10,1,1,barbara-wold:17327_A10,barbara-wold:17327_A10_b1_t1,ENCSR541RSL,barbara-wold:c1_e10.5_mouse_limb_run2_17327_A10,ENCFF065ITR,barbara-wold:c1_e10.5_mouse_limb_run2_17327_A10,...,"[{""machine"": ""http://jumpgate.caltech.edu/sequ...",50,29889475,barbara-wold,UM1HG009443,fastq,reads,encode:HiSeq2500,17327_A10,barbara-wold:17327_A10_b1_t1
4,2990357c-2cec-4bf7-ac5d-709a14f39ea8,barbara-wold:c1_e10.5_mouse_limb_run2_17327_A11,1,1,barbara-wold:17327_A11,barbara-wold:17327_A11_b1_t1,ENCSR736TCS,barbara-wold:c1_e10.5_mouse_limb_run2_17327_A11,ENCFF753EDK,barbara-wold:c1_e10.5_mouse_limb_run2_17327_A11,...,"[{""machine"": ""http://jumpgate.caltech.edu/sequ...",50,86476491,barbara-wold,UM1HG009443,fastq,reads,encode:HiSeq2500,17327_A11,barbara-wold:17327_A11_b1_t1


In [9]:
# Load the library and experiment tables from the analysis directory.
# The "Unrecognized columns present ... analysis_name" message is printed when
# this cell runs; analysis_name is the attribute generate_flagstat_object reads
# from each library row to build file names.
libraries = models.load_library_tables(['all_analysis_vdir/library-passing.tsv'])
experiments = models.load_experiments(['all_analysis_vdir/experiment_vdir.tsv'])

Unrecognized columns present. Is this intended?: analysis_name


In [10]:
def get_attachment(filename, mime_type):
    """Build an attachment dictionary that embeds a file as a base64 data URI.

    Parameters
    ----------
    filename : str
        Path of the file to embed; it is read in binary mode.
    mime_type : str
        MIME type stored under "type" and used as the data URI media type.

    Returns
    -------
    dict
        "type" is mime_type, "download" is the last "/"-separated component
        of filename with '.txt' appended, and "href" is a
        "data:<mime_type>;base64,<contents>" URI holding the whole file.
    """
    with open(filename, 'rb') as instream:
        raw_bytes = instream.read()

    encoded_text = b64encode(raw_bytes).decode("ascii")
    download_name = filename.split("/")[-1] + '.txt'

    # "size" and "md5sum" are not included in the attachment:
    #   "size": len(raw_bytes),
    #   "md5sum": hashlib.md5(raw_bytes).hexdigest(),
    return {
        "type": mime_type,
        "download": download_name,
        "href": "data:{};base64,{}".format(mime_type, encoded_text),
    }

def generate_flagstat_object(library, reference_type):
    """Build the flagstat quality metric object for one of a library's BAMs.

    Parameters
    ----------
    library
        Library record; its analysis_name attribute is read, and the record
        is passed to models.genome_name_from_library.
    reference_type : str
        'genome' or 'transcriptome'; any other value raises KeyError.

    Returns
    -------
    The object returned by models.load_flagstat for the matching
    all_analysis_vdir/<analysis_name>-<triplet>_<suffix>.flagstat file, with
    quality_metric_of, step_run, attachment, award and lab filled in.
    """
    triplet = models.genome_name_from_library(library)
    # File names use "anno" for alignments against the transcriptome.
    suffix = {'genome': 'genome', 'transcriptome': 'anno'}[reference_type]

    # The BAM alias and the flagstat file share everything but the extension.
    stem = f'{library.analysis_name}-{triplet}_{suffix}'
    flagstat_pathname = os.path.join('all_analysis_vdir', stem + '.flagstat')

    metric = models.load_flagstat(flagstat_pathname)
    #metric['@type'] = ['SamtoolsFlagstatsQualityMetric', 'QualityMetric', 'Item']
    metric['quality_metric_of'] = [f'barbara-wold:{stem}.bam']
    metric['step_run'] = '/analysis-step-runs/ecd38dae-7937-4865-8f82-0a3829f1b249/'
    # The raw flagstat text is attached alongside the parsed numbers.
    metric['attachment'] = get_attachment(flagstat_pathname, 'text/plain')
    metric['award'] = 'UM1HG009443'
    metric['lab'] = '/labs/barbara-wold/'
    return metric

In [11]:
def has_flagstat(server, bam):
    """Tell whether a file on the server already has a flagstat metric.

    Parameters
    ----------
    server
        Object providing get_json(path, datastore=...).
    bam
        Identifier of the file to look up; passed to server.get_json as is.

    Returns
    -------
    bool
        True when any entry of the object's 'quality_metrics' list has
        'SamtoolsFlagstatsQualityMetric' in its '@type'; False otherwise,
        including when the object has no 'quality_metrics' key.
    """
    # datastore='database' is forwarded to the server unchanged; presumably it
    # requests the current stored record — TODO confirm in htsworkflow.
    file_object = server.get_json(bam, datastore='database')
    attached_metrics = file_object.get('quality_metrics', [])
    return any(
        'SamtoolsFlagstatsQualityMetric' in metric['@type']
        for metric in attached_metrics
    )


In [12]:
# Dry run on the first library only: build its genome flagstat object, print
# it, and validate it as a SamtoolsFlagstatsQualityMetric. Nothing is posted
# by this cell.
genome_flagstat = generate_flagstat_object(libraries.iloc[0], 'genome')
pprint(genome_flagstat)
validator.validate(genome_flagstat, 'SamtoolsFlagstatsQualityMetric')

{'attachment': {'download': '19906_A5-mm10-M4-male_genome.flagstat.txt',
                'href': 'data:text/plain;base64,MTY3NDI1NiArIDEwMTI4OCBpbiB0b3RhbCAoUUMtcGFzc2VkIHJlYWRzICsgUUMtZmFpbGVkIHJlYWRzKQo1MzQ3MjYgKyAzMDA4NyBzZWNvbmRhcnkKMCArIDAgc3VwcGxlbWVudGFyeQowICsgMCBkdXBsaWNhdGVzCjE2MzUwMDQgKyA4OTgwNyBtYXBwZWQgKDk3LjY2JSA6IDg4LjY2JSkKMCArIDAgcGFpcmVkIGluIHNlcXVlbmNpbmcKMCArIDAgcmVhZDEKMCArIDAgcmVhZDIKMCArIDAgcHJvcGVybHkgcGFpcmVkIChOL0EgOiBOL0EpCjAgKyAwIHdpdGggaXRzZWxmIGFuZCBtYXRlIG1hcHBlZAowICsgMCBzaW5nbGV0b25zIChOL0EgOiBOL0EpCjAgKyAwIHdpdGggbWF0ZSBtYXBwZWQgdG8gYSBkaWZmZXJlbnQgY2hyCjAgKyAwIHdpdGggbWF0ZSBtYXBwZWQgdG8gYSBkaWZmZXJlbnQgY2hyIChtYXBRPj01KQo=',
                'type': 'text/plain'},
 'award': 'UM1HG009443',
 'duplicates': 0,
 'duplicates_qc_failed': 0,
 'lab': '/labs/barbara-wold/',
 'mapped': 1635004,
 'mapped_pct': '97.66',
 'mapped_qc_failed': 89807,
 'quality_metric_of': ['barbara-wold:19906_A5-mm10-M4-male_genome.bam'],
 'step_run': '/analysis-step-runs/ecd38dae-793

In [13]:
# Fetch the server's record for the first library's genome BAM, looked up by
# the same alias that generate_flagstat_object puts in quality_metric_of.
obj = server.get_json('barbara-wold:19906_A5-mm10-M4-male_genome.bam')

In [14]:
# Quality metrics currently attached to that BAM.
obj['quality_metrics']

[]

In [15]:
# Look at a different file record and list the '@type' of each attached
# quality metric; this is the list has_flagstat searches.
obj = server.get_json('/files/ENCFF704SWF/')
[ x['@type'] for x in obj['quality_metrics']]

[['SamtoolsFlagstatsQualityMetric', 'QualityMetric', 'Item'],
 ['StarQualityMetric', 'QualityMetric', 'Item']]

In [16]:
# Spot-check has_flagstat on the first library's genome BAM.
has_flagstat(server, 'barbara-wold:19906_A5-mm10-M4-male_genome.bam')

True

In [19]:
# Accumulators filled by the posting loop in the next cell. They live in their
# own cell, so re-run this cell before re-running the loop or the lists keep
# growing.
added_flagstat = []      # BAM aliases a flagstat metric was posted for
flagstat_present = []    # BAM aliases that already had a flagstat metric
responses = {}           # BAM alias -> value returned by server.post_json

In [20]:
# Post a flagstat quality metric for every genome and transcriptome BAM that
# does not already have one on the server.
#
# This cell writes to the configured server. It also appends to the
# added_flagstat / flagstat_present / responses accumulators defined in the
# previous cell, so reset those before re-running it.
validator = encoded.DCCValidator(server)
processed_files = []  # not used by this cell
for library_id, lib in libraries.iterrows():
    for aligned_to in ['genome', 'transcriptome']:
        flagstat = generate_flagstat_object(lib, aligned_to)
        bam = flagstat['quality_metric_of'][0]

        if has_flagstat(server, bam):
            flagstat_present.append(bam)
            continue

        # Printed once per reference type, hence two lines per library.
        print(f'adding {library_id}')
        validator.validate(flagstat, 'SamtoolsFlagstatsQualityMetric')
        responses[bam] = server.post_json('/samtools-flagstats-quality-metrics/', flagstat)
        # Record the BAM only after the POST has returned, so that an
        # exception from validation or from the upload does not leave it
        # counted as added.
        added_flagstat.append(bam)

print('added', len(added_flagstat))
print('present', len(flagstat_present))

adding 20028_C1
adding 20028_C1
adding 18263_A5
adding 18263_A5
adding 20031_F6
adding 20031_F6
adding 18317_G7
adding 18317_G7
adding 18276_G11
adding 18276_G11
adding 20026_A12
adding 20026_A12
adding 19916_C6
adding 19916_C6
adding 20036_D12
adding 20036_D12
adding 20037_E3
adding 20037_E3
adding 18265_C3
adding 18265_C3
adding 18312_B6
adding 18312_B6
adding 18316_F3
adding 18316_F3
adding 18256_F11
adding 18256_F11
adding 17328_B3
adding 17328_B3
adding 17328_B9
adding 17328_B9
adding 20027_B5
adding 20027_B5
adding 19910_E10
adding 19910_E10
adding 18253_C10
adding 18253_C10
adding 18275_F10
adding 18275_F10
adding 19911_F4
adding 19911_F4
adding 19913_H5
adding 19913_H5
adding 18042_A8
adding 18042_A8
adding 18270_A10
adding 18270_A10
adding 18088_G11
adding 18088_G11
adding 18256_F8
adding 18256_F8
adding 17331_E4
adding 17331_E4
adding 20034_B4
adding 20034_B4
adding 20041_C2
adding 20041_C2
adding 19917_D5
adding 19917_D5
adding 18044_C3
adding 18044_C3
adding 18087_F12
addin

adding 20046_C3
adding 20046_C3
adding 18264_B9
adding 18264_B9
adding 18256_F5
adding 18256_F5
adding 18266_D1
adding 18266_D1
adding 18049_H11
adding 18049_H11
adding 19909_D11
adding 19909_D11
adding 18272_C1
adding 18272_C1
adding 18270_A1
adding 18270_A1
adding 18253_C7
adding 18253_C7
adding 20044_A4
adding 20044_A4
adding 20032_G5
adding 20032_G5
adding 18314_D8
adding 18314_D8
adding 18266_D11
adding 18266_D11
adding 20035_C1
adding 20035_C1
adding 19916_C8
adding 19916_C8
adding 19914_A7
adding 19914_A7
adding 20047_D11
adding 20047_D11
adding 20028_C6
adding 20028_C6
adding 18313_C11
adding 18313_C11
adding 18312_B5
adding 18312_B5
adding 18049_H6
adding 18049_H6
adding 18276_G3
adding 18276_G3
adding 18258_A12
adding 18258_A12
adding 17327_A1
adding 17327_A1
adding 18261_D12
adding 18261_D12
adding 17328_B5
adding 17328_B5
adding 20046_C8
adding 20046_C8
adding 18266_D6
adding 18266_D6
adding 18316_F9
adding 18316_F9
adding 18254_D11
adding 18254_D11
adding 18314_D11
adding 

adding 20039_A10
adding 20046_C2
adding 20046_C2
adding 18265_C10
adding 18265_C10
adding 18264_B10
adding 18264_B10
adding 18254_D8
adding 18254_D8
adding 20033_A11
adding 20033_A11
adding 19907_B1
adding 19907_B1
adding 19909_D2
adding 19909_D2
adding 20026_A10
adding 20026_A10
adding 20035_C3
adding 20035_C3
adding 19910_E4
adding 19910_E4
adding 17331_E9
adding 17331_E9
adding 18316_F8
adding 18316_F8
adding 20045_B10
adding 20045_B10
adding 20042_D2
adding 20042_D2
adding 17329_C8
adding 17329_C8
adding 18087_F7
adding 18087_F7
adding 18255_E12
adding 18255_E12
adding 18258_A6
adding 18258_A6
adding 18316_F2
adding 18316_F2
adding 18273_D4
adding 18273_D4
adding 18271_B9
adding 18271_B9
adding 18088_G3
adding 18088_G3
adding 20047_D12
adding 20047_D12
adding 20046_C10
adding 20046_C10
adding 20044_A12
adding 20044_A12
adding 18255_E7
adding 18255_E7
adding 19909_D3
adding 19909_D3
adding 18313_C2
adding 18313_C2
adding 18317_G3
adding 18317_G3
adding 19906_A10
adding 19906_A10
add

adding 20090_C4
adding 20090_C4
adding 19912_G6
adding 19912_G6
adding 19914_A12
adding 19914_A12
adding 18269_G5
adding 18269_G5
adding 20048_E9
adding 20048_E9
adding 18262_E1
adding 18262_E1
adding 20090_C5
adding 20090_C5
adding 20026_A4
adding 20026_A4
adding 20028_C2
adding 20028_C2
adding 20039_A9
adding 20039_A9
adding 18259_B10
adding 18259_B10
adding 18275_F5
adding 18275_F5
adding 18251_A8
adding 18251_A8
adding 20036_D7
adding 20036_D7
adding 20033_A3
adding 20033_A3
adding 20028_C3
adding 20028_C3
added 1546
present 528


In [None]:
# NOTE(review): unexecuted scratch cell; it only displays the function object.
has_flagstat

In [None]:
# NOTE(review): unexecuted scratch cell; `i` is not assigned in any cell shown
# in this notebook, so this would raise NameError on a fresh kernel.
i

In [None]:
# NOTE(review): unexecuted scratch cell. The posting loop stores responses
# under the BAM alias ('barbara-wold:<analysis_name>-<triplet>_<suffix>.bam'),
# not the library id, so this lookup by '19906_A5' would raise KeyError.
responses['19906_A5']

In [None]:
# Unexecuted spot check: does this file already have a flagstat metric?
has_flagstat(server, '/files/ENCFF385LEF/')