In [2]:
#setup dependencies, install metric parser
import sys
!{sys.executable} -m pip install --no-cache-dir --upgrade  qc-metric-aggregator

import os
import firecloud.api as fapi

Collecting qc-metric-aggregator
  Downloading qc_metric_aggregator-0.1.3-py3-none-any.whl (17 kB)
Installing collected packages: qc-metric-aggregator
  Attempting uninstall: qc-metric-aggregator
    Found existing installation: qc-metric-aggregator 0.1.2
    Uninstalling qc-metric-aggregator-0.1.2:
      Successfully uninstalled qc-metric-aggregator-0.1.2
Successfully installed qc-metric-aggregator-0.1.3
You should consider upgrading via the '/usr/local/bin/python3 -m pip install --upgrade pip' command.


In [3]:
#set up constants
bucket = os.environ['WORKSPACE_BUCKET']
workspace_namespace = os.environ['WORKSPACE_NAMESPACE']
workspace_name = os.environ['WORKSPACE_NAME']
threshold_file_name = "thresholds.yml"
final_output_file_name = "qc_results.tsv"

#we should pull this from a central place rather than a workspace specific bucket
master_thresholds_file = bucket + "/" + threshold_file_name
#copy the thresholds file to the notebook env
!gsutil cp $master_thresholds_file .

Copying gs://fc-secure-7c06e5ec-83e2-4b1d-a6e5-e3f86794651f/thresholds.yml...
/ [1 files][  748.0 B/  748.0 B]                                                
Operation completed over 1 objects/748.0 B.                                      


In [4]:
#fetch sample ids from the terra workspace table
#get_entities hands back an HTTP response; stop here with the HTTP error if the
#request was rejected, rather than failing on an error payload in the list
#comprehension below
samples_response = fapi.get_entities(workspace_namespace, workspace_name, "sample")
samples_response.raise_for_status()
samples = samples_response.json()

#each entity record in the response carries its id under the 'name' key
sample_ids = [s['name'] for s in samples]

In [5]:
#figure out which files from cromwell runs we need to localize
#this heuristic can definitely be futher optimized
files_in_bucket = !gsutil ls -r $bucket/**
files_to_localize = [f for f in files_in_bucket if any(sample_id in f for sample_id in sample_ids)]

In [6]:
#create input file for gsutil file localization
with open('files_to_localize', 'w') as fout:
    fout.write("\n".join(files_to_localize))

!mkdir localized_files

In [7]:
#run the metric aggregator for each sample and write out the results
from process_metrics.threshold_file_parser import ThresholdFileParser
from process_metrics.qc_validator import QcValidator
from process_metrics.metrics import AvailableMetrics
from process_metrics.report_generator import ReportGenerator

pass_fail_thresholds = ThresholdFileParser(threshold_file_name).thresholds()

qc_results = []
first_sample = True
with open(final_output_file_name, 'w') as fout:   
    for sample_id in sample_ids:
        
        #localize files relevant to this sample
        !cat files_to_localize | grep $sample_id | grep -v .cram | gsutil -m cp -I ./localized_files
        
        metrics = AvailableMetrics(sample_id)
        validator = QcValidator("localized_files/")
        res = ReportGenerator(sample_id, pass_fail_thresholds, metrics, validator).gather_metrics()
        headers = res[0]
        values = res[1]
        if first_sample:
            first_sample = False
            headers[0] = "entity:qc_result_sample_id"
            lowercased_headers = [h.lower() for h in headers]
            print(str.join("\t", lowercased_headers), file=fout)
        print(str.join("\t", values), file=fout)
        
        #clean up localized files for this sample
        !rm -rf localized_files/*

Copying gs://fc-secure-7c06e5ec-83e2-4b1d-a6e5-e3f86794651f/3a6a3ee6-ef5c-464a-9331-80ada805ac2c/SingleSampleQc/29127f27-d10d-46cc-9c0e-fef14c16b35c/call-CheckContamination/C2020_CZEMTHPRG_EE_3_v1_Exome_GCP.verify_bam_id.selfSM...
Copying gs://fc-secure-7c06e5ec-83e2-4b1d-a6e5-e3f86794651f/3a6a3ee6-ef5c-464a-9331-80ada805ac2c/SingleSampleQc/29127f27-d10d-46cc-9c0e-fef14c16b35c/call-CollectAggregationMetrics/C2020_CZEMTHPRG_EE_3_v1_Exome_GCP.alignment_summary_metrics...
Copying gs://fc-secure-7c06e5ec-83e2-4b1d-a6e5-e3f86794651f/3a6a3ee6-ef5c-464a-9331-80ada805ac2c/SingleSampleQc/29127f27-d10d-46cc-9c0e-fef14c16b35c/call-CollectAggregationMetrics/C2020_CZEMTHPRG_EE_3_v1_Exome_GCP.bait_bias_detail_metrics...
Copying gs://fc-secure-7c06e5ec-83e2-4b1d-a6e5-e3f86794651f/3a6a3ee6-ef5c-464a-9331-80ada805ac2c/SingleSampleQc/29127f27-d10d-46cc-9c0e-fef14c16b35c/call-CollectAggregationMetrics/C2020_CZEMTHPRG_EE_3_v1_Exome_GCP.bait_bias_summary_metrics...
Copying gs://fc-secure-7c06e5ec-83e2-4b1d

Copying gs://fc-secure-7c06e5ec-83e2-4b1d-a6e5-e3f86794651f/5bd72be4-95c1-4a91-951a-d95f6df38a3c/SingleSampleQc/0a8ba24b-7102-49f5-abba-3a7fc5890d10/call-CollectQualityYieldMetrics/cacheCopy/C2020_CZEMTHPRG_EE_3_v1_Exome_GCP.quality_yield_metrics...
Copying gs://fc-secure-7c06e5ec-83e2-4b1d-a6e5-e3f86794651f/5bd72be4-95c1-4a91-951a-d95f6df38a3c/SingleSampleQc/0a8ba24b-7102-49f5-abba-3a7fc5890d10/call-ValidateSamFile/cacheCopy/C2020_CZEMTHPRG_EE_3_v1_Exome_GCP.validation_report...
Copying gs://fc-secure-7c06e5ec-83e2-4b1d-a6e5-e3f86794651f/f8514ab7-2cc9-4c2e-9de8-30c94b87cef6/SingleSampleQc/9983a1e2-3221-4e38-afff-220c8f76f2ad/call-CollectAggregationMetrics/C2020_CZEMTHPRG_EE_3_v1_Exome_GCP.alignment_summary_metrics...
Copying gs://fc-secure-7c06e5ec-83e2-4b1d-a6e5-e3f86794651f/f8514ab7-2cc9-4c2e-9de8-30c94b87cef6/SingleSampleQc/9983a1e2-3221-4e38-afff-220c8f76f2ad/call-CollectAggregationMetrics/C2020_CZEMTHPRG_EE_3_v1_Exome_GCP.bait_bias_detail_metrics...
Copying gs://fc-secure-7c06e5e

Copying gs://fc-secure-7c06e5ec-83e2-4b1d-a6e5-e3f86794651f/5bd72be4-95c1-4a91-951a-d95f6df38a3c/SingleSampleQc/3de88734-e138-44b4-b517-e56b3043fb24/call-CollectQualityYieldMetrics/C2020_CZEMTHPRG_EE_4_v1_Exome_GCP.quality_yield_metrics...
Copying gs://fc-secure-7c06e5ec-83e2-4b1d-a6e5-e3f86794651f/5bd72be4-95c1-4a91-951a-d95f6df38a3c/SingleSampleQc/3de88734-e138-44b4-b517-e56b3043fb24/call-ValidateSamFile/C2020_CZEMTHPRG_EE_4_v1_Exome_GCP.validation_report...
\ [18/18 files][140.3 KiB/140.3 KiB] 100% Done                                  
Operation completed over 18 objects/140.3 KiB.                                   
Copying gs://fc-secure-7c06e5ec-83e2-4b1d-a6e5-e3f86794651f/5bd72be4-95c1-4a91-951a-d95f6df38a3c/SingleSampleQc/38487578-fb72-49d4-98cb-d032226b84e2/call-CollectAggregationMetrics/C2020_CZEMTHPRG_EE_6_v1_Exome_GCP.alignment_summary_metrics...
Copying gs://fc-secure-7c06e5ec-83e2-4b1d-a6e5-e3f86794651f/5bd72be4-95c1-4a91-951a-d95f6df38a3c/SingleSampleQc/38487578-fb72-49d

Copying gs://fc-secure-7c06e5ec-83e2-4b1d-a6e5-e3f86794651f/5bd72be4-95c1-4a91-951a-d95f6df38a3c/SingleSampleQc/a0fe74be-19c4-485e-abbc-0a0567fd224a/call-CollectDuplicateMetrics/C2020_CZEMTHPRG_West_3_v1_Exome_GCP.duplication_metrics...
Copying gs://fc-secure-7c06e5ec-83e2-4b1d-a6e5-e3f86794651f/5bd72be4-95c1-4a91-951a-d95f6df38a3c/SingleSampleQc/a0fe74be-19c4-485e-abbc-0a0567fd224a/call-CollectHsMetrics/C2020_CZEMTHPRG_West_3_v1_Exome_GCP.hs_metrics...
Copying gs://fc-secure-7c06e5ec-83e2-4b1d-a6e5-e3f86794651f/5bd72be4-95c1-4a91-951a-d95f6df38a3c/SingleSampleQc/a0fe74be-19c4-485e-abbc-0a0567fd224a/call-CollectQualityYieldMetrics/C2020_CZEMTHPRG_West_3_v1_Exome_GCP.quality_yield_metrics...
Copying gs://fc-secure-7c06e5ec-83e2-4b1d-a6e5-e3f86794651f/5bd72be4-95c1-4a91-951a-d95f6df38a3c/SingleSampleQc/a0fe74be-19c4-485e-abbc-0a0567fd224a/call-ValidateSamFile/C2020_CZEMTHPRG_West_3_v1_Exome_GCP.validation_report...
- [18/18 files][142.7 KiB/142.7 KiB] 100% Done                           

Copying gs://fc-secure-7c06e5ec-83e2-4b1d-a6e5-e3f86794651f/5bd72be4-95c1-4a91-951a-d95f6df38a3c/SingleSampleQc/b2ebed23-c082-40ce-b4a7-52531c2bd246/call-CollectAggregationMetrics/C2020_CZEMTHPRG_West_5_v1_Exome_GCP.pre_adapter_summary_metrics...
Copying gs://fc-secure-7c06e5ec-83e2-4b1d-a6e5-e3f86794651f/5bd72be4-95c1-4a91-951a-d95f6df38a3c/SingleSampleQc/b2ebed23-c082-40ce-b4a7-52531c2bd246/call-CollectAggregationMetrics/C2020_CZEMTHPRG_West_5_v1_Exome_GCP.quality_distribution.pdf...
Copying gs://fc-secure-7c06e5ec-83e2-4b1d-a6e5-e3f86794651f/5bd72be4-95c1-4a91-951a-d95f6df38a3c/SingleSampleQc/b2ebed23-c082-40ce-b4a7-52531c2bd246/call-CollectAggregationMetrics/C2020_CZEMTHPRG_West_5_v1_Exome_GCP.quality_distribution_metrics...
Copying gs://fc-secure-7c06e5ec-83e2-4b1d-a6e5-e3f86794651f/5bd72be4-95c1-4a91-951a-d95f6df38a3c/SingleSampleQc/b2ebed23-c082-40ce-b4a7-52531c2bd246/call-CollectDuplicateMetrics/C2020_CZEMTHPRG_West_5_v1_Exome_GCP.duplication_metrics...
Copying gs://fc-secure-7

Copying gs://fc-secure-7c06e5ec-83e2-4b1d-a6e5-e3f86794651f/5bd72be4-95c1-4a91-951a-d95f6df38a3c/SingleSampleQc/6116bc68-b821-49c8-9ed1-5e87f3171b13/call-CollectAggregationMetrics/C2020_CZEMTHPRG_West_9_v1_Exome_GCP.insert_size_metrics...
Copying gs://fc-secure-7c06e5ec-83e2-4b1d-a6e5-e3f86794651f/5bd72be4-95c1-4a91-951a-d95f6df38a3c/SingleSampleQc/6116bc68-b821-49c8-9ed1-5e87f3171b13/call-CollectAggregationMetrics/C2020_CZEMTHPRG_West_9_v1_Exome_GCP.pre_adapter_detail_metrics...
Copying gs://fc-secure-7c06e5ec-83e2-4b1d-a6e5-e3f86794651f/5bd72be4-95c1-4a91-951a-d95f6df38a3c/SingleSampleQc/6116bc68-b821-49c8-9ed1-5e87f3171b13/call-CollectAggregationMetrics/C2020_CZEMTHPRG_West_9_v1_Exome_GCP.pre_adapter_summary_metrics...
Copying gs://fc-secure-7c06e5ec-83e2-4b1d-a6e5-e3f86794651f/5bd72be4-95c1-4a91-951a-d95f6df38a3c/SingleSampleQc/6116bc68-b821-49c8-9ed1-5e87f3171b13/call-CollectAggregationMetrics/C2020_CZEMTHPRG_West_9_v1_Exome_GCP.quality_distribution.pdf...
Copying gs://fc-secure-7

Copying gs://fc-secure-7c06e5ec-83e2-4b1d-a6e5-e3f86794651f/5bd72be4-95c1-4a91-951a-d95f6df38a3c/SingleSampleQc/36035ac0-8483-4907-b1bc-eb4bedda1dcd/call-CollectAggregationMetrics/C2021_CZEMTHPRG_EE_11_v1_Exome_GCP.gc_bias.pdf...
Copying gs://fc-secure-7c06e5ec-83e2-4b1d-a6e5-e3f86794651f/5bd72be4-95c1-4a91-951a-d95f6df38a3c/SingleSampleQc/36035ac0-8483-4907-b1bc-eb4bedda1dcd/call-CollectAggregationMetrics/C2021_CZEMTHPRG_EE_11_v1_Exome_GCP.gc_bias.summary_metrics...
Copying gs://fc-secure-7c06e5ec-83e2-4b1d-a6e5-e3f86794651f/5bd72be4-95c1-4a91-951a-d95f6df38a3c/SingleSampleQc/36035ac0-8483-4907-b1bc-eb4bedda1dcd/call-CollectAggregationMetrics/C2021_CZEMTHPRG_EE_11_v1_Exome_GCP.insert_size_histogram.pdf...
Copying gs://fc-secure-7c06e5ec-83e2-4b1d-a6e5-e3f86794651f/5bd72be4-95c1-4a91-951a-d95f6df38a3c/SingleSampleQc/36035ac0-8483-4907-b1bc-eb4bedda1dcd/call-CollectAggregationMetrics/C2021_CZEMTHPRG_EE_11_v1_Exome_GCP.insert_size_metrics...
Copying gs://fc-secure-7c06e5ec-83e2-4b1d-a6e5

Copying gs://fc-secure-7c06e5ec-83e2-4b1d-a6e5-e3f86794651f/5bd72be4-95c1-4a91-951a-d95f6df38a3c/SingleSampleQc/9a9301f2-0f7f-4dc1-b356-b7f8a3493b67/call-CollectAggregationMetrics/C2021_CZEMTHPRG_EE_14_v1_Exome_GCP.bait_bias_summary_metrics...
Copying gs://fc-secure-7c06e5ec-83e2-4b1d-a6e5-e3f86794651f/5bd72be4-95c1-4a91-951a-d95f6df38a3c/SingleSampleQc/9a9301f2-0f7f-4dc1-b356-b7f8a3493b67/call-CollectAggregationMetrics/C2021_CZEMTHPRG_EE_14_v1_Exome_GCP.error_summary_metrics...
Copying gs://fc-secure-7c06e5ec-83e2-4b1d-a6e5-e3f86794651f/5bd72be4-95c1-4a91-951a-d95f6df38a3c/SingleSampleQc/9a9301f2-0f7f-4dc1-b356-b7f8a3493b67/call-CollectAggregationMetrics/C2021_CZEMTHPRG_EE_14_v1_Exome_GCP.gc_bias.detail_metrics...
Copying gs://fc-secure-7c06e5ec-83e2-4b1d-a6e5-e3f86794651f/5bd72be4-95c1-4a91-951a-d95f6df38a3c/SingleSampleQc/9a9301f2-0f7f-4dc1-b356-b7f8a3493b67/call-CollectAggregationMetrics/C2021_CZEMTHPRG_EE_14_v1_Exome_GCP.gc_bias.pdf...
Copying gs://fc-secure-7c06e5ec-83e2-4b1d-a6e

Copying gs://fc-secure-7c06e5ec-83e2-4b1d-a6e5-e3f86794651f/5bd72be4-95c1-4a91-951a-d95f6df38a3c/SingleSampleQc/97e3f6ee-613d-4b74-9e5f-545bcf0624be/call-CheckContamination/C2021_CZEMTHPRG_MAE_2_v1_Exome_GCP.verify_bam_id.selfSM...
Copying gs://fc-secure-7c06e5ec-83e2-4b1d-a6e5-e3f86794651f/5bd72be4-95c1-4a91-951a-d95f6df38a3c/SingleSampleQc/97e3f6ee-613d-4b74-9e5f-545bcf0624be/call-CollectAggregationMetrics/C2021_CZEMTHPRG_MAE_2_v1_Exome_GCP.alignment_summary_metrics...
Copying gs://fc-secure-7c06e5ec-83e2-4b1d-a6e5-e3f86794651f/5bd72be4-95c1-4a91-951a-d95f6df38a3c/SingleSampleQc/97e3f6ee-613d-4b74-9e5f-545bcf0624be/call-CollectAggregationMetrics/C2021_CZEMTHPRG_MAE_2_v1_Exome_GCP.bait_bias_detail_metrics...
Copying gs://fc-secure-7c06e5ec-83e2-4b1d-a6e5-e3f86794651f/5bd72be4-95c1-4a91-951a-d95f6df38a3c/SingleSampleQc/97e3f6ee-613d-4b74-9e5f-545bcf0624be/call-CollectAggregationMetrics/C2021_CZEMTHPRG_MAE_2_v1_Exome_GCP.bait_bias_summary_metrics...
Copying gs://fc-secure-7c06e5ec-83e2-

Copying gs://fc-secure-7c06e5ec-83e2-4b1d-a6e5-e3f86794651f/5bd72be4-95c1-4a91-951a-d95f6df38a3c/SingleSampleQc/9ce6ed93-ec4c-4ddc-a8ec-783a13a40019/call-CollectQualityYieldMetrics/C2021_CZEMTHPRG_MAE_4_v1_Exome_GCP.quality_yield_metrics...
Copying gs://fc-secure-7c06e5ec-83e2-4b1d-a6e5-e3f86794651f/5bd72be4-95c1-4a91-951a-d95f6df38a3c/SingleSampleQc/9ce6ed93-ec4c-4ddc-a8ec-783a13a40019/call-ValidateSamFile/C2021_CZEMTHPRG_MAE_4_v1_Exome_GCP.validation_report...
- [18/18 files][143.6 KiB/143.6 KiB] 100% Done                                  
Operation completed over 18 objects/143.6 KiB.                                   
Copying gs://fc-secure-7c06e5ec-83e2-4b1d-a6e5-e3f86794651f/5bd72be4-95c1-4a91-951a-d95f6df38a3c/SingleSampleQc/31642e97-f2a5-46dd-b701-98220123f53d/call-CheckContamination/C2021_CZEMTHPRG_MAE_5_v1_Exome_GCP.verify_bam_id.selfSM...
Copying gs://fc-secure-7c06e5ec-83e2-4b1d-a6e5-e3f86794651f/5bd72be4-95c1-4a91-951a-d95f6df38a3c/SingleSampleQc/31642e97-f2a5-46dd-b701-98

In [8]:
#copy the results into terra as a datatable
fapi.upload_entities_tsv(workspace_namespace, workspace_name, final_output_file_name, "flexible")

#copy the TSV to the workspace bucket
uploaded_tsv = bucket + '/' + final_output_file_name
!gsutil cp $final_output_file_name $uploaded_tsv

Copying file://qc_results.tsv [Content-Type=text/tab-separated-values]...
/ [1 files][  2.6 KiB/  2.6 KiB]                                                
Operation completed over 1 objects/2.6 KiB.                                      
